In [18]:
import os
import numpy as np
import pandas as pd
from hashlib import sha1
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

In [19]:
data = pd.read_csv("data/amazon_reviews_subset_60.csv")
data.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,50774468,REYUKJGWI1EGL,0380007789,856556145,The Wolf and the Dove,Books,3.0,12.0,17.0,N,N,The most palpable Woodiwiss novel I have read ...,Anything written by Kathleen E. Woodiwiss is a...,2005-10-14
1,US,53080748,R1A7RF31MUT168,0446532436,967980395,True Believer,Books,2.0,4.0,8.0,N,N,Wait for the Movie!,Not all that good. I was waiting for a revela...,2005-10-14
2,US,50230169,R23MCAR8GSV3T0,0451526341,380925201,Animal farm: A Fairy Story,Books,4.0,2.0,2.0,N,N,Simple Yet Profound,"A generation ago, the sight of the cover of Ge...",2005-10-14
3,US,50776149,RUCZYTA3MP0MR,038551428X,970964974,"The Traveler (Fourth Realm Trilogy, Book 1)",Books,5.0,2.0,5.0,N,N,Great Marketing for a Pretty Good Book,The most interesting thing about this book is ...,2005-10-14
4,US,12598621,RCL2ARHKWH6RL,059035342X,667539744,Harry Potter and the Sorcerer's Stone,Books,5.0,2.0,2.0,N,N,I Think Part Of The Charm Is You Feel Like You...,Even though this is the shortest book in the H...,2005-10-14


In [20]:
ratings = data[["customer_id", "product_id", "star_rating"]]

In [21]:
user_key = "customer_id"
item_key = "product_id"
N = len(ratings[user_key].unique())
M = len(ratings[item_key].unique())
print(f"Number of users (N) : {N}")
print(f"Number of Products (M) : {M}")


Number of users (N) : 2493
Number of Products (M) : 3763


In [22]:
non_nan_ratings_percentage = (ratings.shape[0] / (N * M)) * 100
print(f"Non-nan ratings percentage: {np.round(non_nan_ratings_percentage,3)}")

Non-nan ratings percentage: 0.59


In [23]:
avg_nratings_per_user = ratings.groupby(user_key).size().mean()
avg_nratings_per_product = ratings.groupby(item_key).size().mean()

print(f"Average number of ratings per user : {avg_nratings_per_user}")
print(f"Average number of ratings per product: {avg_nratings_per_product}")

Average number of ratings per user : 22.21099077416767
Average number of ratings per product: 14.71485516874834


In [24]:
X = ratings.copy()
y = ratings[user_key]
X_train, X_valid, y_train, y_valid = train_test_split(
X, y, test_size=0.2, random_state=123
)
X_train.shape, X_valid.shape

((44297, 3), (11075, 3))

In [25]:
y_train

24150    48296637
30027    52159798
17928    52009378
22487    52626966
43771    52613129
           ...   
54370    51403161
17730    52947290
28030    52999835
15725    52697458
52734    52856625
Name: customer_id, Length: 44297, dtype: int64

In [26]:
user_mapper = dict(zip(np.unique(ratings[user_key]), list(range(N))))
item_mapper = dict(zip(np.unique(ratings[item_key]), list(range(M))))
user_inverse_mapper = dict(zip(list(range(N)), np.unique(ratings[user_key])))
item_inverse_mapper = dict(zip(list(range(M)), np.unique(ratings[item_key])))

def create_Y_from_ratings(
    data, N, M, user_mapper, item_mapper, user_key="customer_id", item_key="product_id"
):  # Function to create a dense utility matrix
    Y = np.zeros((N, M))
    Y.fill(np.nan)
    for index, val in data.iterrows():
        n = user_mapper[val[user_key]]
        m = item_mapper[val[item_key]]
        Y[n, m] = val["star_rating"]

    return Y


train_mat = create_Y_from_ratings(X_train, N, M, user_mapper, item_mapper)
valid_mat = create_Y_from_ratings(X_valid, N, M, user_mapper, item_mapper)

In [27]:
valid_mat

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [28]:
nnn_train_mat = np.count_nonzero(~np.isnan(train_mat))
nnn_valid_mat = np.count_nonzero(~np.isnan(valid_mat))

print(f"Number of non-nan elements in train_mat: {nnn_train_mat}")
print(f"Number of non-nan elements in valid_mat: {nnn_valid_mat}")

Number of non-nan elements in train_mat: 43068
Number of non-nan elements in valid_mat: 10976


In [29]:
def error(Y1, Y2):
    """
    Given two matrices of the same shape, 
    returns the root mean squared error (RMSE).
    """
    return np.sqrt(np.nanmean((Y1 - Y2) ** 2))


def evaluate(pred_Y, train_mat, valid_mat, model_name="Global average"):
    """
    Given predicted utility matrix and train and validation utility matrices 
    print train and validation RMSEs.
    """
    print("%s train RMSE: %0.2f" % (model_name, error(pred_Y, train_mat)))
    print("%s valid RMSE: %0.2f" % (model_name, error(pred_Y, valid_mat)))

In [30]:
# Global Average Method

avg = np.nanmean(train_mat)
pred_g = np.zeros(train_mat.shape) + avg
evaluate(pred_g, train_mat, valid_mat, model_name="Global average")

Global average train RMSE: 1.08
Global average valid RMSE: 1.09


In [31]:
# Per User Average

pred_n = np.zeros(train_mat.shape)
avg_rate = []
for i in range (train_mat.shape[0]):
    pred_n[i, :] = np.nanmean(train_mat[i, :])

evaluate(pred_n, train_mat, valid_mat, model_name="Per-user average")

Per-user average train RMSE: 0.95
Per-user average valid RMSE: 1.02


  pred_n[i, :] = np.nanmean(train_mat[i, :])


In [32]:
# Per product Average

pred_m = np.zeros(train_mat.shape)
for j in range(train_mat.shape[1]):
    column_mean = np.nanmean(train_mat[:, j])
    if np.isnan(column_mean):
        pred_m[:, j] = avg
    else:
        pred_m[:, j] = column_mean

evaluate(pred_m, train_mat, valid_mat, model_name="Per-product average")

  column_mean = np.nanmean(train_mat[:, j])


Per-product average train RMSE: 0.95
Per-product average valid RMSE: 1.06


In [33]:
# Per-user and per-product average 

pred_n_m = np.zeros(train_mat.shape)
for i in range(train_mat.shape[0]):
    for j in range(train_mat.shape[1]):
        pred_n_m[i, j] = (pred_n[i, j] + pred_m[i, j]) / 2

evaluate(pred_n_m, train_mat, valid_mat, model_name="Average of per-user and per-product average")

Average of per-user and per-product average train RMSE: 0.90
Average of per-user and per-product average valid RMSE: 0.98


In [34]:
# k-nearest neighbours imputation

from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5, keep_empty_features=True)
train_mat_imp = imputer.fit_transform(train_mat)
evaluate(train_mat_imp, train_mat, valid_mat, model_name="KNN imputer n_neighbors=2")

KNN imputer n_neighbors=2 train RMSE: 0.00
KNN imputer n_neighbors=2 valid RMSE: 1.08


In [35]:
# TruncatedSVD by replacing missing values with zeros

nan_indices = np.isnan(train_mat)
train_mat_sub_avg = np.where(nan_indices, np.nan, train_mat-pred_n_m)
train_mat_sub_avg = np.nan_to_num(train_mat_sub_avg, nan=0)

n_components = [2,4,10,12]
for number in n_components:
    model = TruncatedSVD(n_components=number, random_state=42)
    model.fit(train_mat_sub_avg)
    Z = model.transform(train_mat_sub_avg)
    W = model.components_
    reconstructed = Z@W + pred_n_m
    evaluate(reconstructed, train_mat, valid_mat, model_name=f"TruncatedSVD with {number} Components")

TruncatedSVD with 2 Components train RMSE: 0.89
TruncatedSVD with 2 Components valid RMSE: 0.98
TruncatedSVD with 4 Components train RMSE: 0.88
TruncatedSVD with 4 Components valid RMSE: 0.97
TruncatedSVD with 10 Components train RMSE: 0.86
TruncatedSVD with 10 Components valid RMSE: 0.97
TruncatedSVD with 12 Components train RMSE: 0.85
TruncatedSVD with 12 Components valid RMSE: 0.97


In [36]:
# Collaborative filtering with surprise package

import surprise
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import cross_validate

reader = Reader()
data = Dataset.load_from_df(ratings, reader)

trainset, validset = surprise.model_selection.train_test_split(
    data, test_size=0.2, random_state=42
)

k = 2
algo = SVD(n_factors=k, random_state=123)
algo.fit(trainset)
preds = algo.test(trainset.build_testset())

cross_validate(algo, data, measures=["RMSE"], cv=5, verbose=True);

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9661  0.9499  0.9547  0.9535  0.9460  0.9541  0.0068  
Fit time          0.05    0.05    0.05    0.05    0.05    0.05    0.00    
Test time         0.02    0.02    0.02    0.02    0.07    0.03    0.02    
