In [1]:
import numpy as np
import pandas as pd

# For Content-Based Filtering Model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# For Collaborative-Based FIltering Model
from surprise import Dataset, Reader
from surprise import SVD

---

In [2]:
# Load DF of all wine products and user interactions:
df_all_interactions = pd.read_csv('../data/df_all_interactions.csv')

## Popularity Model:

Popularity model - not actually personalized. Recommends to a user the most popular items that the user has not previously consumed. Uses 'wisdom of the crows'.

In [3]:
# Computing the most popular items:
df_most_popular = df_all_interactions.groupby('product_id')['event_strength'].sum().sort_values(ascending= False).reset_index()
df_most_popular.head(10)

Unnamed: 0,product_id,event_strength
0,1145574,70.0
1,1405250,55.5
2,938102,44.0
3,583739,40.5
4,1232190,39.0
5,202370,31.0
6,1017295,30.0
7,202479,26.0
8,583740,22.0
9,117139,19.0


In [4]:
def recommend_top_10(user_id, df_interactions, df_most_popular):
    # Get the list of products the user has already interacted with
    user_interactions = df_interactions[df_interactions['user_id'] == user_id]['product_id'].unique()
    
    # Filter out products that the user has already interacted with
    top_10_recommendations = df_most_popular[~df_most_popular['product_id'].isin(user_interactions)].head(10)
    
    return top_10_recommendations

In [5]:
# Example: Calling the function
user_id = '121711'
top_10_recommendations = recommend_top_10(user_id, df_all_interactions, df_most_popular)

# Print the top 10 recommendations for the user
print(top_10_recommendations)

   product_id  event_strength
0     1145574            70.0
1     1405250            55.5
2      938102            44.0
3      583739            40.5
4     1232190            39.0
5      202370            31.0
6     1017295            30.0
7      202479            26.0
8      583740            22.0
9      117139            19.0


## Content-Based Filtering Model:

-Write description
-Cold-strart
-Info retrieval

In [6]:
vectorizer = TfidfVectorizer(
                    analyzer='word',
                    ngram_range=(1, 2),  # considers unigrams and bigrams
                    min_df=0.003, # min document frequency for a term to be included in the vocabulary. Ignore terms that appear < 0.3% of the documents.
                    max_df=0.5, # max document frequency. Terms that appear in > 50% of the documents will be ignored, too common.
                    max_features=5000,
                    stop_words='english')

In [7]:
item_ids = df_all_interactions['product_id'].tolist()
tfidf_matrix = vectorizer.fit_transform(df_all_interactions['wine_name'] + df_all_interactions['wine_type'] + df_all_interactions['wine_origin'])
tfidf_feature_names = vectorizer.get_feature_names_out()
tfidf_matrix

<1025x1012 sparse matrix of type '<class 'numpy.float64'>'
	with 14442 stored elements in Compressed Sparse Row format>

In [8]:
# Step 1: Feature Extraction
# Combine textual features into a single column and handle missing values
df_all_interactions['wine_features'] = df_all_interactions['wine_name'] + ' ' + df_all_interactions['wine_type'] + ' ' + df_all_interactions['wine_origin']
df_all_interactions['wine_features'].fillna('', inplace=True)

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
wine_tfidf_matrix = tfidf_vectorizer.fit_transform(df_all_interactions['wine_features'])

# Step 2: User Profiles
def build_user_profile(user_id):
    user_interactions = df_all_interactions[df_all_interactions['user_id'] == user_id].copy()
    
    # Handle missing interaction strengths (NaN) by replacing with zeros
    user_interactions['event_strength'].fillna(0, inplace=True)
    user_item_strengths = user_interactions['event_strength'].values.reshape(-1, 1)
    
    user_profile = (user_item_strengths.T @ wine_tfidf_matrix[user_interactions.index].toarray()) / max(user_item_strengths.sum(), 1)
    return user_profile

# Step 3: Wine Profiles
wine_profiles = wine_tfidf_matrix

# Step 4: Recommendation Generation
def recommend_wines(user_id, num_recommendations=10):
    user_profile = build_user_profile(user_id)
    
    # Calculate cosine similarity between user profile and all wine profiles
    cosine_similarities = linear_kernel(user_profile, wine_profiles).flatten()
    
    # Get indices of top N wines with highest similarity scores
    top_indices = cosine_similarities.argsort()[-num_recommendations:][::-1]
    
    # Get the corresponding wine IDs
    recommended_wine_ids = df_all_interactions.iloc[top_indices]['product_id'].tolist()
    
    return recommended_wine_ids

In [12]:
# Example: Recommend 10 wines for a user with user_id '121711'
user_id_to_recommend = '121711'
recommended_wines = recommend_wines(user_id_to_recommend, num_recommendations=10)
print(recommended_wines)

[1230356, 1153834, 583740, 583740, 583740, 583740, 583740, 583740, 583740, 583740]


## Collaborative-Based Filtering Model:

-Write description

In [10]:
# pip install scikit-surprise

In [13]:
# Step 1: Preprocess the data
reader = Reader(rating_scale=(0, 5))  # Define the rating scale
data = Dataset.load_from_df(df_all_interactions[['user_id', 'product_id', 'event_strength']], reader)

# Step 2: Create a user-item interaction matrix 
# done implicitly by Surprise

# Step 3: Apply collaborative filtering algorithm
# Choose a collaborative filtering algorithm (e.g., SVD)
algo = SVD()

# Train the model on the data
trainset = data.build_full_trainset()
algo.fit(trainset)

# Example: Recommend wines for a specific user (user_id)
user_id_to_recommend = '121711'  
items_to_ignore = df_all_interactions[df_all_interactions['user_id'] == user_id_to_recommend]['product_id'].tolist()

# Get top N recommendations for the user
top_n = 10  # Number of recommendations to retrieve
user_recommendations = []

for product_id in df_all_interactions['product_id'].unique():
    if product_id not in items_to_ignore:
        predicted_rating = algo.predict(user_id_to_recommend, product_id).est
        user_recommendations.append((product_id, predicted_rating))

# Sort recommendations by predicted rating (higher is better)
user_recommendations.sort(key=lambda x: x[1], reverse=True)

# Get the top N recommendations
top_recommendations = user_recommendations[:top_n]

# Print or use top_recommendations as needed
print(top_recommendations)


[(878447, 2.1125384100281934), (1220861, 2.1121085617210236), (1393318, 2.09657245713216), (1193738, 2.0700567355690755), (534558, 2.069045645129811), (202370, 2.0618055914075164), (1070730, 2.055649476379271), (1069468, 2.0411214683891346), (931216, 2.0320598864918336), (1232190, 2.0296291966656606)]
