In [59]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm

In [60]:
# Read the sampled ratings table (columns: rating, product_id, user_id).
ratings_df = pd.read_parquet("sample_user_ratings.parquet")

# Show the first rows, then the (n_rows, n_columns) shape on the next line.
print(ratings_df.head(), ratings_df.shape, sep="\n")

   rating  product_id                       user_id
0     5.0  B09TR3DQ8R  AHHSGIZIOTG5UB5E535ZSYS3ZAZA
1     5.0  B003I4F6NG  AGBU4666V6SPXZ5CU3BO6OF3OUHA
2     5.0  B078JYNSCR  AFPKQFRRTCUUI52KWYNWZDWLG37A
3     3.0  B08GKY6N3X  AE3C6BG455J24M3ZFG3LTOJ6NMQQ
4     5.0  B07T9NS94T  AFHPAUGWK56E2AE6J5BL7IERSY5Q
(100000, 3)


In [61]:
# user_means = ratings_df.groupby('user_id')['rating'].mean()

# # Add a mean-centered column: each rating minus that user's mean rating
# ratings_df['adjusted_rating'] = ratings_df['rating'] - ratings_df['user_id'].map(user_means)


In [62]:
# User item matrix
# user_item_matrix = ratings_df.pivot_table(index='user_id', columns='product_id', values='rating')
# user_item_matrix = user_item_matrix.fillna(0)
# print(user_item_matrix.shape)

# The dense pivot above materializes a full users x products matrix (mostly zeros) and crashes the runtime,
# so a sparse matrix is built instead.


In [63]:
# Collapse repeated (user, product) ratings to their mean before building the
# matrix. coo_matrix stores duplicate coordinates as separate entries, and they
# are summed when the matrix is converted to CSR/CSC, so two 5-star ratings of
# the same product would otherwise become a single rating of 10.
# NOTE(review): averaging is one reasonable policy; confirm it is the intended
# one (vs. keeping the most recent rating). Without duplicates this is a no-op.
pair_ratings = (
    ratings_df
    .groupby(['user_id', 'product_id'], sort=False, as_index=False)['rating']
    .mean()
)

# Unique IDs in order of first appearance; an ID's position is its matrix index.
user_ids = ratings_df['user_id'].unique()
product_ids = ratings_df['product_id'].unique()

user_to_idx = {user: i for i, user in enumerate(user_ids)}
product_to_idx = {prod: i for i, prod in enumerate(product_ids)}

# Map the user and product IDs to indices
row_indices = pair_ratings['user_id'].map(user_to_idx)
col_indices = pair_ratings['product_id'].map(product_to_idx)
data = pair_ratings['rating'].values

# Create a sparse user-item matrix in COO format (rows = users, columns = products)
user_item_sparse = coo_matrix((data, (row_indices, col_indices)), shape=(len(user_ids), len(product_ids)))
print("Matrix Shape: ", user_item_sparse.shape)

Matrix Shape:  (45927, 69877)


In [64]:
# Number of latent factors to keep (tune as needed).
n_factors = 20

# Factorize the sparse user-item matrix; random_state is pinned so reruns match.
svd = TruncatedSVD(random_state=42, n_components=n_factors)
user_latent = svd.fit_transform(user_item_sparse)  # one row of factors per user
item_latent = np.transpose(svd.components_)        # one row of factors per product

print("User latent factors shape:", np.shape(user_latent))
print("Item latent factors shape:", np.shape(item_latent))

User latent factors shape: (45927, 20)
Item latent factors shape: (69877, 20)


In [65]:
# Get Recommendations for users
def get_recommendations(user_id, user_to_idx, product_ids, user_latent, item_latent, ratings_df, top_n=10):
    """Return the top-N products a user has not rated, ranked by predicted score.

    The score for each product is the dot product of the user's latent vector
    with that product's latent vector.

    Parameters
    ----------
    user_id : hashable
        Identifier of the user to recommend for.
    user_to_idx : mapping
        Maps each known user ID to its row position in ``user_latent``.
    product_ids : array-like
        Product IDs, aligned with the rows of ``item_latent``.
    user_latent : array-like, indexable by row
        Latent factors, one row per user.
    item_latent : 2-D array
        Latent factors, one row per product.
    ratings_df : pandas.DataFrame
        Observed ratings; must contain ``user_id`` and ``product_id`` columns.
        Used only to exclude products the user already rated.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_id`` and ``predicted_rating``, highest score first.
        The index holds each product's position in ``product_ids``. For an
        unknown user a message is printed and an empty frame with the same
        columns is returned.
    """
    if user_id not in user_to_idx:
        print("User not found.")
        # Keep the result schema stable so callers can select columns
        # without special-casing unknown users.
        return pd.DataFrame(columns=['product_id', 'predicted_rating'])

    user_index = user_to_idx[user_id]

    # One predicted score per product: user vector times every item vector.
    predicted_ratings = np.dot(user_latent[user_index], item_latent.T)

    # Products this user has already rated must not be recommended again.
    rated_products = ratings_df.loc[ratings_df['user_id'] == user_id, 'product_id'].unique()

    predictions = pd.DataFrame({
        'product_id': product_ids,
        'predicted_rating': predicted_ratings
    })

    recommendations = predictions[~predictions['product_id'].isin(rated_products)]

    # nlargest returns the top_n rows in descending order without sorting the
    # whole catalogue; ties keep the product that appears first.
    return recommendations.nlargest(top_n, 'predicted_rating')

In [66]:
# Example: recommendations for the first user ID in user_ids.
test_user = user_ids[0]
print("Recommendations for test user:", test_user)
test_recs = get_recommendations(
    user_id=test_user,
    user_to_idx=user_to_idx,
    product_ids=product_ids,
    user_latent=user_latent,
    item_latent=item_latent,
    ratings_df=ratings_df,
)
print(test_recs)

Recommendations for test user: AHHSGIZIOTG5UB5E535ZSYS3ZAZA
       product_id  predicted_rating
809    B0C7LZYYWQ      3.792713e-06
801    B0B69XTPVL      3.335994e-06
337    B0B53DWRVW      1.675591e-06
31555  B0B3MNTBLV      1.399080e-06
1114   B0CGY43Y3P      1.332715e-06
8636   B08JLVZBXZ      1.266972e-06
3107   B09CQ7R1D8      1.128499e-06
270    B0BZJP9VGZ      1.006931e-06
3120   B08K22FTF1      9.071973e-07
1480   B09Q8J51BB      8.773022e-07


In [67]:
# One record per (user, recommended product), collected across all users.
all_recommendations = []

for user_id in tqdm(user_ids, desc="Generating recommendations for all users"):
    recs = get_recommendations(user_id, user_to_idx, product_ids, user_latent, item_latent, ratings_df, top_n=10)
    # Rows arrive best-first, so the 1-based position is the recommendation rank.
    all_recommendations.extend(
        {
            'user_id': user_id,
            'product_id': rec.product_id,
            'predicted_rating': rec.predicted_rating,
            'rank': position,
        }
        for position, rec in enumerate(recs.itertuples(index=False), start=1)
    )

Generating recommendations for all users: 100%|██████████| 45927/45927 [54:42<00:00, 13.99it/s]


In [68]:
# Gather every user's ranked recommendations into one table and write it to
# CSV without the index column.
all_recs_df = pd.DataFrame(data=all_recommendations)
all_recs_df.to_csv(path_or_buf="top10_recommendations_all_users_sample.csv", index=False)