In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
import pandas as pd

# Load the parquet file
ratings_df = pd.read_parquet("sample_user_ratings.parquet")

# Save it as a CSV file
ratings_df.to_csv("sample_user_ratings.csv", index=False)

In [3]:
# Get Data
ratings_df = pd.read_parquet("sample_user_ratings.parquet")

print("Ratings Sample:")
print(ratings_df.head())
print("Ratings matrix shape:", ratings_df.shape)

user_ids = ratings_df['user_id'].unique()
product_ids = ratings_df['product_id'].unique()

user_to_idx = {user: i for i, user in enumerate(user_ids)}
product_to_idx = {prod: i for i, prod in enumerate(product_ids)}

Ratings Sample:
   rating  product_id                       user_id
0     5.0  B09TR3DQ8R  AHHSGIZIOTG5UB5E535ZSYS3ZAZA
1     5.0  B003I4F6NG  AGBU4666V6SPXZ5CU3BO6OF3OUHA
2     5.0  B078JYNSCR  AFPKQFRRTCUUI52KWYNWZDWLG37A
3     3.0  B08GKY6N3X  AE3C6BG455J24M3ZFG3LTOJ6NMQQ
4     5.0  B07T9NS94T  AFHPAUGWK56E2AE6J5BL7IERSY5Q
Ratings matrix shape: (100000, 3)


In [4]:
# Compute Global Mean, Biases, and the Residual Matrix

# Global Mean
global_mean = ratings_df['rating'].mean()
print("Global Mean Rating:", global_mean)

# User Biases
user_bias = ratings_df.groupby('user_id')['rating'].mean() - global_mean

# Item Biases
def compute_item_bias(df, user_bias, global_mean):
    df_with_bias = df.copy()
    df_with_bias['user_bias'] = df_with_bias['user_id'].map(user_bias)
    df_with_bias['residual'] = df_with_bias['rating'] - global_mean - df_with_bias['user_bias']
    return df_with_bias.groupby('product_id')['residual'].mean()

item_bias = compute_item_bias(ratings_df, user_bias, global_mean)



# Map biases back to df
ratings_df['user_bias'] = ratings_df['user_id'].map(user_bias)
ratings_df['item_bias'] = ratings_df['product_id'].map(item_bias)
ratings_df['rating_residual'] = ratings_df['rating'] - (global_mean + ratings_df['user_bias'] + ratings_df['item_bias'])

print("Sample of ratings with biases and residuals:")
print(ratings_df.head())



Global Mean Rating: 4.34643
Sample of ratings with biases and residuals:
   rating  product_id                       user_id  user_bias  item_bias  \
0     5.0  B09TR3DQ8R  AHHSGIZIOTG5UB5E535ZSYS3ZAZA   0.653570   0.000000   
1     5.0  B003I4F6NG  AGBU4666V6SPXZ5CU3BO6OF3OUHA  -1.346430   1.000000   
2     5.0  B078JYNSCR  AFPKQFRRTCUUI52KWYNWZDWLG37A   0.367856   0.285714   
3     3.0  B08GKY6N3X  AE3C6BG455J24M3ZFG3LTOJ6NMQQ  -2.346430   1.000000   
4     5.0  B07T9NS94T  AFHPAUGWK56E2AE6J5BL7IERSY5Q   0.653570   0.000000   

   rating_residual  
0              0.0  
1              1.0  
2              0.0  
3              0.0  
4              0.0  


In [5]:
# User-Item Matrix
row_indices = ratings_df['user_id'].map(user_to_idx)
col_indices = ratings_df['product_id'].map(product_to_idx)
data = ratings_df['rating_residual'].values

user_item_sparse = coo_matrix((data, (row_indices, col_indices)), shape=(len(user_ids), len(product_ids)))
print("Matrix Shape:", user_item_sparse.shape)

Matrix Shape: (45927, 69877)


In [6]:
# Num latent factors (can change as needed)

n_factors = 20
svd = TruncatedSVD(n_components=n_factors, random_state=42)
user_latent = svd.fit_transform(user_item_sparse)
item_latent = svd.components_.T

print("User latent factors shape:", user_latent.shape)
print("Item latent factors shape:", item_latent.shape)

User latent factors shape: (45927, 20)
Item latent factors shape: (69877, 20)


In [7]:
# Get Recs with SVD (with bias)

def get_recommendations_svd(user_id, user_to_idx, product_ids, ratings_df,
                            user_latent, item_latent, top_n=10):
    if user_id not in user_to_idx:
        print("User not found.")
        return pd.DataFrame()

    user_index = user_to_idx[user_id]
    predicted_residuals = np.dot(user_latent[user_index], item_latent.T)

    user_b = user_bias.get(user_id, 0.0)

    pred_df = pd.DataFrame({
        'product_id': product_ids,
        'item_bias': [item_bias.get(prod, 0.0) for prod in product_ids],
        'predicted_residual': predicted_residuals
    })

    # Predicted rating = global_mean + user_bias + item_bias + predicted_residual
    pred_df['predicted_rating'] = global_mean + user_b + pred_df['item_bias'] + pred_df['predicted_residual']

    # Remove products the user already rated
    rated_products = ratings_df[ratings_df['user_id'] == user_id]['product_id'].unique()
    recommendations = pred_df[~pred_df['product_id'].isin(rated_products)]
    recommendations = recommendations.sort_values('predicted_rating', ascending=False)

    return recommendations.head(top_n)


In [8]:
# Test
test_user = user_ids[0]
print("SVD-based Recommendations for test user:", test_user)
test_recs_svd = get_recommendations_svd(test_user, user_to_idx, product_ids, ratings_df, user_latent, item_latent)
print(test_recs_svd)

SVD-based Recommendations for test user: AHHSGIZIOTG5UB5E535ZSYS3ZAZA
       product_id  item_bias  predicted_residual  predicted_rating
27676  B0054K55I8   3.333333                 0.0          8.333333
1467   B01LZYD15F   3.125000                 0.0          8.125000
46897  B093TH4WM6   3.000000                 0.0          8.000000
58512  B0141N64YM   3.000000                 0.0          8.000000
66438  B08Z48PMGM   3.000000                 0.0          8.000000
13721  B0014CQN5E   3.000000                 0.0          8.000000
704    B016WVV2VG   3.000000                 0.0          8.000000
15257  B005WZBXKE   2.833333                 0.0          7.833333
46617  B075MR6QGZ   2.800000                 0.0          7.800000
39934  B01LW19G2Z   2.750000                 0.0          7.750000


In [9]:
all_recommendations = []

for user_id in tqdm(user_ids, desc="Generating recommendations"):
    recs = get_recommendations_svd(user_id, user_to_idx, product_ids, ratings_df,
                                  user_latent, item_latent, top_n=10)
    for _, row in recs.iterrows():
        all_recommendations.append({
            'user_id': user_id,
            'product_id': row['product_id'],
            'predicted_rating': row['predicted_rating']
        })

recommendations_df = pd.DataFrame(all_recommendations)
recommendations_df.to_csv('user_recommendations.csv', index=False)

from IPython.display import FileLink
FileLink('user_recommendations.csv')

Generating recommendations: 100%|██████████| 45927/45927 [3:45:54<00:00,  3.39it/s]


In [43]:
from IPython.display import HTML

def auto_download(file_path):
    return HTML(f"""
        <html>
            <body>
                <script>
                    var link = document.createElement('a');
                    link.href = '{file_path}';
                    link.download = '{file_path}';
                    document.body.appendChild(link);
                    link.click();
                    document.body.removeChild(link);
                </script>
                <p>Downloading <strong>{file_path}</strong>...</p>
            </body>
        </html>
    """)

auto_download("user_recommendations.csv")


In [8]:
# Slope One Implementation

def build_diff_matrix(ratings_df):
    """
    Build the difference matrix and frequency counts used for Slope One.
    For each item pair (i, j), compute the average difference in ratings among users who rated both.
    """
    diff = {}
    freq = {}

    user_groups = ratings_df.groupby('user_id')

    for user, group in tqdm(user_groups, desc="Building diff matrix"):
        ratings = group[['product_id', 'rating']].set_index('product_id')['rating'].to_dict()
        for i, rating_i in ratings.items():
            diff.setdefault(i, {})
            freq.setdefault(i, {})
            for j, rating_j in ratings.items():
                diff[i].setdefault(j, 0.0)
                freq[i].setdefault(j, 0)
                diff[i][j] += rating_i - rating_j
                freq[i][j] += 1

    for i in diff:
        for j in diff[i]:
            diff[i][j] /= freq[i][j]

    return diff, freq

# Build the difference and frequency matrices for Slope One
diff_matrix, freq_matrix = build_diff_matrix(ratings_df)

def predict_rating_slope_one(user_id, product_id, ratings_df, diff_matrix, freq_matrix):
    """
    Predict the rating for a given user and product using the Slope One algorithm.
    """
    user_ratings = ratings_df[ratings_df['user_id'] == user_id][['product_id', 'rating']].set_index('product_id')['rating']

    numerator = 0.0
    denominator = 0
    # For every item the user has rated, add the weighted contribution to the prediction.
    for other_item, user_rating in user_ratings.items():
        if product_id in diff_matrix.get(other_item, {}) and freq_matrix[other_item].get(product_id, 0) > 0:
            weight = freq_matrix[other_item][product_id]
            numerator += (diff_matrix[other_item][product_id] + user_rating) * weight
            denominator += weight
    if denominator > 0:
        return numerator / denominator
    else:
        return global_mean

def get_recommendations_slope_one(user_id, ratings_df, diff_matrix, freq_matrix, product_ids, top_n=10):
    """
    Get top-N recommendations for a user using the Slope One algorithm.
    """
    rated_products = set(ratings_df[ratings_df['user_id'] == user_id]['product_id'])
    predictions = []
    for prod in product_ids:
        if prod in rated_products:
            continue
        pred_rating = predict_rating_slope_one(user_id, prod, ratings_df, diff_matrix, freq_matrix)
        predictions.append((prod, pred_rating))

    pred_df = pd.DataFrame(predictions, columns=['product_id', 'predicted_rating'])
    pred_df = pred_df.sort_values('predicted_rating', ascending=False)

    return pred_df.head(top_n)

Building diff matrix: 100%|██████████| 45927/45927 [00:54<00:00, 846.84it/s] 


In [9]:
# Test
print("Slope One Recommendations for test user:", test_user)
test_recs_slope = get_recommendations_slope_one(test_user, ratings_df, diff_matrix, freq_matrix, product_ids)
print(test_recs_slope)

Slope One Recommendations for test user: AHHSGIZIOTG5UB5E535ZSYS3ZAZA
       product_id  predicted_rating
119    B076F1C3SG               5.0
10549  B09BK97PWT               5.0
67590  B07MQK65Z5               5.0
30101  B09JM43CHM               5.0
37564  B07N17SW7L               5.0
6614   B094BYWB2F               5.0
19342  B0914T36CH               5.0
8815   B08SQG62VB               5.0
13823  B088DB19RM               5.0
43963  B002VFLGBE               5.0


In [11]:
# Combining SVD and SlopeOne

def get_combined_prediction(user_id, product_id,
                            user_to_idx, product_ids, ratings_df,
                            user_latent, item_latent,
                            diff_matrix, freq_matrix, alpha=0.5):

    # SVD-based Prediction
    if user_id not in user_to_idx:
        svd_pred = global_mean
    else:
        user_index = user_to_idx[user_id]
        predicted_residuals = np.dot(user_latent[user_index], item_latent.T)
        user_b = user_bias.get(user_id, 0.0)
        try:
            item_idx = list(product_ids).index(product_id)
            item_b = item_bias.get(product_id, 0.0)
            svd_pred = global_mean + user_b + item_b + predicted_residuals[item_idx]
        except ValueError:
            svd_pred = global_mean

    # Slope One Prediction
    slope_pred = predict_rating_slope_one(user_id, product_id, ratings_df, diff_matrix, freq_matrix)

    # Combined Prediction
    combined_rating = alpha * svd_pred + (1 - alpha) * slope_pred

    return combined_rating


In [26]:
# Test
test_product = product_ids[8]
test_user = user_ids[5]
combined = get_combined_prediction(test_user, test_product,
                                   user_to_idx, product_ids, ratings_df,
                                   user_latent, item_latent,
                                   diff_matrix, freq_matrix, alpha=0.6)
print("Combined Prediction for user {} on product {}: {:.4f}".format(test_user, test_product, combined))

Combined Prediction for user AEPMSKC2NVUFLSDQRRLMM2OUE3MQ on product B09NJKH8TT: 4.5964


In [31]:
def get_combined_recommendations(user_id,
                                 user_to_idx, product_ids, ratings_df,
                                 user_latent, item_latent,
                                 diff_matrix, freq_matrix,
                                 top_n=10, alpha=0.5):
    rated_products = set(ratings_df[ratings_df['user_id'] == user_id]['product_id'])
    predictions = []

    for prod in product_ids:
        if prod in rated_products:
            continue
        pred = get_combined_prediction(user_id, prod,
                                       user_to_idx, product_ids, ratings_df,
                                       user_latent, item_latent,
                                       diff_matrix, freq_matrix, alpha)
        predictions.append((prod, pred))

    pred_df = pd.DataFrame(predictions, columns=['product_id', 'predicted_rating'])
    pred_df = pred_df.sort_values('predicted_rating', ascending=False)
    return pred_df.head(top_n)

In [32]:
#Test
test_user = user_ids[0]
print("Combined Recommendations for test user:", test_user)
test_recs_combined = get_combined_recommendations(test_user, user_to_idx, product_ids, ratings_df, user_latent, item_latent, diff_matrix, freq_matrix, top_n=10, alpha=0.5)
print(test_recs_combined)

Combined Recommendations for test user: AHHSGIZIOTG5UB5E535ZSYS3ZAZA
       product_id  predicted_rating
27674  B0054K55I8          6.339882
1466   B01LZYD15F          6.235715
703    B016WVV2VG          6.173215
66436  B08Z48PMGM          6.173215
58510  B0141N64YM          6.173215
13720  B0014CQN5E          6.173215
46895  B093TH4WM6          6.173215
15256  B005WZBXKE          6.089882
46615  B075MR6QGZ          6.073215
54956  B07S6BZ88P          6.048215
