In [89]:
import os
import numpy as np
import pandas as pd
from hashlib import sha1
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

In [90]:
data = pd.read_csv("data/amazon_reviews_subset.csv")
data.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,38484071,RWCTJ6DAIGZ7T,61087122,683956970,Master of Seduction (Sea Wolves Series),Books,4.0,7.0,10.0,N,N,Good book,I have only read two books by Kinley MacGregor...,2002-07-06
1,US,51165577,R1H1YNIAYYLSKF,345422805,234414967,Team Rodent : How Disney Devours the World,Books,4.0,7.0,9.0,N,N,the shady side of Disney,I read this book several years ago and since t...,2003-01-11
2,US,40669813,R1BQAW6T2KKOI4,679764410,864803147,American Sphinx: The Character of Thomas Jeffe...,Books,5.0,46.0,56.0,N,N,Sphinx?,Five star effort by Ellis for what he did. If ...,2001-11-04
3,US,50358298,RVKRFPUQGVE30,679879250,872660777,"The Subtle Knife (His Dark Materials, Book 2)",Books,5.0,2.0,2.0,N,N,"Where there's Will, there's a way...",This book was a worthy successor to The Golden...,2001-01-02
4,US,20997233,R3RW172U8DSP4O,684840057,505830594,Radical Son: A Generational Odyssey,Books,5.0,19.0,23.0,N,N,Exceptionally Truthful Examination of a Tumult...,"\\""Radical Son\\"" is the well-written and brut...",2004-10-01


In [91]:
ratings = data[["customer_id", "product_id", "star_rating"]]

In [92]:
user_key = "customer_id"
item_key = "product_id"
N = len(ratings[user_key].unique())
M = len(ratings[item_key].unique())
print(f"Number of users (N) : {N}")
print(f"Number of Products (M) : {M}")


Number of users (N) : 1210
Number of Products (M) : 1615


In [93]:
non_nan_ratings_percentage = (ratings.shape[0] / (N * M)) * 100
print(f"Non-nan ratings percentage: {np.round(non_nan_ratings_percentage,3)}")

Non-nan ratings percentage: 0.231


In [94]:
avg_nratings_per_user = ratings.groupby(user_key).size().mean()
avg_nratings_per_product = ratings.groupby(item_key).size().mean()

print(f"Average number of ratings per user : {avg_nratings_per_user}")
print(f"Average number of ratings per product: {avg_nratings_per_product}")

Average number of ratings per user : 3.727272727272727
Average number of ratings per product: 2.7925696594427243


In [95]:
X = ratings.copy()
y = ratings[user_key]
X_train, X_valid, y_train, y_valid = train_test_split(
X, y, test_size=0.2, random_state=123
)
X_train.shape, X_valid.shape

((3608, 3), (902, 3))

In [96]:
user_mapper = dict(zip(np.unique(ratings[user_key]), list(range(N))))
item_mapper = dict(zip(np.unique(ratings[item_key]), list(range(M))))
user_inverse_mapper = dict(zip(list(range(N)), np.unique(ratings[user_key])))
item_inverse_mapper = dict(zip(list(range(M)), np.unique(ratings[item_key])))

def create_Y_from_ratings(
    data, N, M, user_mapper, item_mapper, user_key="customer_id", item_key="product_id"
):  # Function to create a dense utility matrix
    Y = np.zeros((N, M))
    Y.fill(np.nan)
    for index, val in data.iterrows():
        n = user_mapper[val[user_key]]
        m = item_mapper[val[item_key]]
        Y[n, m] = val["star_rating"]

    return Y


train_mat = create_Y_from_ratings(X_train, N, M, user_mapper, item_mapper)
valid_mat = create_Y_from_ratings(X_valid, N, M, user_mapper, item_mapper)

In [97]:
nnn_train_mat = np.count_nonzero(~np.isnan(train_mat))
nnn_valid_mat = np.count_nonzero(~np.isnan(valid_mat))

print(f"Number of non-nan elements in train_mat: {nnn_train_mat}")
print(f"Number of non-nan elements in valid_mat: {nnn_valid_mat}")

Number of non-nan elements in train_mat: 3507
Number of non-nan elements in valid_mat: 892


In [98]:
def error(Y1, Y2):
    """
    Given two matrices of the same shape, 
    returns the root mean squared error (RMSE).
    """
    return np.sqrt(np.nanmean((Y1 - Y2) ** 2))


def evaluate(pred_Y, train_mat, valid_mat, model_name="Global average"):
    """
    Given predicted utility matrix and train and validation utility matrices 
    print train and validation RMSEs.
    """
    print("%s train RMSE: %0.2f" % (model_name, error(pred_Y, train_mat)))
    print("%s valid RMSE: %0.2f" % (model_name, error(pred_Y, valid_mat)))

In [99]:
# Global Average Method

avg = np.nanmean(train_mat)
pred_g = np.zeros(train_mat.shape) + avg
evaluate(pred_g, train_mat, valid_mat, model_name="Global average")

Global average train RMSE: 1.15
Global average valid RMSE: 1.16


In [100]:
# Per User Average

pred_n = np.zeros(train_mat.shape)
avg_rate = []
for i in range (train_mat.shape[0]):
    pred_n[i, :] = np.nanmean(train_mat[i, :])

evaluate(pred_n, train_mat, valid_mat, model_name="Per-user average")

Per-user average train RMSE: 0.81
Per-user average valid RMSE: 1.20


  pred_n[i, :] = np.nanmean(train_mat[i, :])


In [101]:
# Per product Average

pred_m = np.zeros(train_mat.shape)
for j in range(train_mat.shape[1]):
    column_mean = np.nanmean(train_mat[:, j])
    if np.isnan(column_mean):
        pred_m[:, j] = avg
    else:
        pred_m[:, j] = column_mean

evaluate(pred_m, train_mat, valid_mat, model_name="Per-product average")

Per-product average train RMSE: 0.81
Per-product average valid RMSE: 1.29


  column_mean = np.nanmean(train_mat[:, j])


In [102]:
# Per-user and per-product average 

pred_n_m = np.zeros(train_mat.shape)
for i in range(train_mat.shape[0]):
    for j in range(train_mat.shape[1]):
        pred_n_m[i, j] = (pred_n[i, j] + pred_m[i, j]) / 2

evaluate(pred_n_m, train_mat, valid_mat, model_name="Average of per-user and per-product average")

Average of per-user and per-product average train RMSE: 0.70
Average of per-user and per-product average valid RMSE: 1.10
