Content-based filtering recommendation system using cosine similarity

In [35]:
import polars as pl
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the processed dataset produced by the previous pipeline step
df_raw = pl.read_csv('../1_process_data/data/processed_data.csv')

# Columns that are one-hot encoded below
categorical_cols = ['maker', 'region', 'spicy', 'soupy', 'base']

# Fit the encoder on the categorical columns and produce a dense array
encoder = OneHotEncoder(sparse_output=False)
categorical_values = df_raw.select(categorical_cols).to_numpy()
encoded_array = encoder.fit_transform(categorical_values)

# Wrap the encoded array in a Polars DataFrame, named after the encoder's output features
encoded_column_names = encoder.get_feature_names_out(categorical_cols).tolist()
encoded_df = pl.DataFrame(encoded_array, schema=encoded_column_names)

# Cast every encoded column to Int64
encoded_df = encoded_df.with_columns(pl.all().cast(pl.Int64))

# Swap the original categorical columns for their encoded counterparts,
# then remove 'price' and 'weight' from the result
df_encoded = (
    df_raw
    .drop(categorical_cols)
    .hstack(encoded_df)
    .drop(['price', 'weight'])
)


In [44]:
# Function to get similar noodle products
def recommend_noodles(product_nood, df_noods, cosine_sim_matrix):
    """Rank every other product by its similarity to ``product_nood``.

    Args:
        product_nood: Value to look up in the 'product' column of ``df_noods``.
            If it occurs more than once, the first matching row is used.
        df_noods: DataFrame with a 'product' column. Row ``i`` is paired with
            row/column ``i`` of ``cosine_sim_matrix``.
        cosine_sim_matrix: Pairwise similarity scores, indexable as
            ``cosine_sim_matrix[i]`` to get the scores of row ``i`` against
            every row.

    Returns:
        A list of ``(product, similarity)`` tuples for every row except the
        queried one, sorted by descending similarity. Rows with equal scores
        keep their original relative order.

    Raises:
        IndexError: If ``product_nood`` is not found in the 'product' column.
    """
    products = df_noods['product'].to_list()
    try:
        idx = products.index(product_nood)  # Row index of the queried product
    except ValueError:
        # Same exception type as the previous positional lookup, but with a
        # message that says what actually went wrong.
        raise IndexError(
            f"Product {product_nood!r} not found in the 'product' column"
        ) from None

    # Exclude the query by its row index. Dropping the top-ranked entry
    # instead is wrong when another product ties the query's score: the sort
    # is stable, so a tied product at a lower row index would be removed and
    # the query would be returned as its own recommendation.
    sim_scores = [
        (i, score)
        for i, score in enumerate(cosine_sim_matrix[idx])
        if i != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    return [(products[i], score) for i, score in sim_scores]  # (product, similarity)

# Feature matrix: every column of df_encoded except 'product', as a NumPy array
features = df_encoded.drop('product').to_numpy()

# Pairwise cosine similarity between all rows of the feature matrix
cosine_sim = cosine_similarity(features)

# Rank the other noodle products against a sample query product
recommended_noodles = recommend_noodles(
    "Sanyo Sapporo Ichiban Miso Ramen (USA)",
    df_encoded,
    cosine_sim,
)

# Print the first five recommendations with their similarity scores
for noods, score in recommended_noodles[:5]:
    print(f"Recommended: {noods} (Similarity: {score:.3f})")

Recommended: Sanyo Foods Pokemon Soy Sauce Ramen (Similarity: 1.000)
Recommended: Nissin Demae Iccho Sesame Ramen (EU) (Similarity: 0.800)
Recommended: Nissin Yakisoba (UK) (Similarity: 0.800)
Recommended: Itsuki Kyoto Style Miso Tonkotsu Ramen (Similarity: 0.800)
Recommended: Itsuki Sapporo Style Miso Ramen (Similarity: 0.800)
