In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer

In [20]:
df = pd.read_csv(
    r"C:\Users\HP\Desktop\Coffee\data\cleaned\coffee-cleaned.csv",
    sep=';',             
    encoding='utf-8',   
    header =0
)
df = df.dropna(subset=["coffee_name"])
df.reset_index(drop=True, inplace=True)

In [21]:
text_features = ["coffee_name"]
numeric_features = ["total_score", "agtron_roast", "agtron_ground", "price_per_100g"]
categorical_features = ["origin_country", "roaster_country", "roast_level"]

In [24]:
preprocessor = ColumnTransformer(
    transformers=[
        ("text", 
         Pipeline([
             ("imputer", SimpleImputer(strategy='constant', fill_value='')),
             ("tfidf", TfidfVectorizer(stop_words='english'))
         ]),
         "coffee_name"),
        
        ("num",
         Pipeline([
             ("imputer", SimpleImputer(strategy='median')),
             ("scaler", StandardScaler())
         ]),
         numeric_features),
        
        ("cat",
         Pipeline([
             ("imputer", SimpleImputer(strategy='constant', fill_value='Unknown')),
             ("onehot", OneHotEncoder(handle_unknown="ignore"))
         ]),
         categorical_features)
    ]
)

In [25]:
feature_matrix = preprocessor.fit_transform(df)

ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.

In [None]:
similarity_matrix = cosine_similarity(feature_matrix)