# Recommender System for Amazon's Books Section

In [1]:
import os
import numpy as np
import pandas as pd
from hashlib import sha1
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("data/amazon_reviews_subset_100.csv")
data.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,50230169,R23MCAR8GSV3T0,0451526341,380925201,Animal farm: A Fairy Story,Books,4.0,2.0,2.0,N,N,Simple Yet Profound,"A generation ago, the sight of the cover of Ge...",2005-10-14
1,US,50776149,RUCZYTA3MP0MR,038551428X,970964974,"The Traveler (Fourth Realm Trilogy, Book 1)",Books,5.0,2.0,5.0,N,N,Great Marketing for a Pretty Good Book,The most interesting thing about this book is ...,2005-10-14
2,US,12598621,RCL2ARHKWH6RL,059035342X,667539744,Harry Potter and the Sorcerer's Stone,Books,5.0,2.0,2.0,N,N,I Think Part Of The Charm Is You Feel Like You...,Even though this is the shortest book in the H...,2005-10-14
3,US,49770667,R2P4B3STC980QP,1594480001,659516630,The Kite Runner,Books,5.0,4.0,4.0,N,N,Praiseworthy first novel,Well I thoroughly enjoyed this book. Although ...,2005-10-14
4,US,49828549,RM0CSYVWKHW5W,0671027360,141370518,Angels & Demons,Books,1.0,31.0,39.0,N,N,Preposterous,"Early in this novel, our hero finds out that a...",2005-10-14


In [3]:
ratings = data[["customer_id", "product_id", "star_rating"]]

In [4]:
user_key = "customer_id"
item_key = "product_id"
N = len(ratings[user_key].unique())
M = len(ratings[item_key].unique())
print(f"Number of users (N) : {N}")
print(f"Number of Products (M) : {M}")


Number of users (N) : 1230
Number of Products (M) : 1672


In [5]:
non_nan_ratings_percentage = (ratings.shape[0] / (N * M)) * 100
print(f"Non-nan ratings percentage: {np.round(non_nan_ratings_percentage,3)}")

Non-nan ratings percentage: 1.19


In [6]:
avg_nratings_per_user = ratings.groupby(user_key).size().mean()
avg_nratings_per_product = ratings.groupby(item_key).size().mean()

print(f"Average number of ratings per user : {avg_nratings_per_user}")
print(f"Average number of ratings per product: {avg_nratings_per_product}")

Average number of ratings per user : 19.891056910569105
Average number of ratings per product: 14.632775119617225


In [7]:
X = ratings.copy()
y = ratings[user_key]
X_train, X_valid, y_train, y_valid = train_test_split(
X, y, test_size=0.2, random_state=123
)
X_train.shape, X_valid.shape

((19572, 3), (4894, 3))

In [8]:
y_train

4555     52947077
8687     33382577
7789     48135836
23583    50780673
1650     50732546
           ...   
15377    51801617
21602    50442542
17730    50452306
15725    49985562
19966    51823791
Name: customer_id, Length: 19572, dtype: int64

In [9]:
user_mapper = dict(zip(np.unique(ratings[user_key]), list(range(N))))
item_mapper = dict(zip(np.unique(ratings[item_key]), list(range(M))))
user_inverse_mapper = dict(zip(list(range(N)), np.unique(ratings[user_key])))
item_inverse_mapper = dict(zip(list(range(M)), np.unique(ratings[item_key])))

def create_Y_from_ratings(
    data, N, M, user_mapper, item_mapper, user_key="customer_id", item_key="product_id"
):  # Function to create a dense utility matrix
    Y = np.zeros((N, M))
    Y.fill(np.nan)
    for index, val in data.iterrows():
        n = user_mapper[val[user_key]]
        m = item_mapper[val[item_key]]
        Y[n, m] = val["star_rating"]

    return Y


train_mat = create_Y_from_ratings(X_train, N, M, user_mapper, item_mapper)
valid_mat = create_Y_from_ratings(X_valid, N, M, user_mapper, item_mapper)

In [10]:
valid_mat

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [11]:
nnn_train_mat = np.count_nonzero(~np.isnan(train_mat))
nnn_valid_mat = np.count_nonzero(~np.isnan(valid_mat))

print(f"Number of non-nan elements in train_mat: {nnn_train_mat}")
print(f"Number of non-nan elements in valid_mat: {nnn_valid_mat}")

Number of non-nan elements in train_mat: 19085
Number of non-nan elements in valid_mat: 4855


In [12]:
def error(Y1, Y2):
    """
    Given two matrices of the same shape, 
    returns the root mean squared error (RMSE).
    """
    return np.sqrt(np.nanmean((Y1 - Y2) ** 2))


def evaluate(pred_Y, train_mat, valid_mat, model_name="Global average"):
    """
    Given predicted utility matrix and train and validation utility matrices 
    print train and validation RMSEs.
    """
    print("%s train RMSE: %0.2f" % (model_name, error(pred_Y, train_mat)))
    print("%s valid RMSE: %0.2f" % (model_name, error(pred_Y, valid_mat)))

## Baseline Approaches

In [13]:
# Global Average Method

avg = np.nanmean(train_mat)
pred_g = np.zeros(train_mat.shape) + avg
evaluate(pred_g, train_mat, valid_mat, model_name="Global average")

Global average train RMSE: 1.07
Global average valid RMSE: 1.06


In [14]:
# Per User Average

pred_n = np.zeros(train_mat.shape)
avg_rate = []
for i in range (train_mat.shape[0]):
    pred_n[i, :] = np.nanmean(train_mat[i, :])

evaluate(pred_n, train_mat, valid_mat, model_name="Per-user average")

Per-user average train RMSE: 0.95
Per-user average valid RMSE: 1.01


  pred_n[i, :] = np.nanmean(train_mat[i, :])


In [15]:
# Per product Average

pred_m = np.zeros(train_mat.shape)
for j in range(train_mat.shape[1]):
    column_mean = np.nanmean(train_mat[:, j])
    if np.isnan(column_mean):
        pred_m[:, j] = avg
    else:
        pred_m[:, j] = column_mean

evaluate(pred_m, train_mat, valid_mat, model_name="Per-product average")

Per-product average train RMSE: 0.94
Per-product average valid RMSE: 1.03


  column_mean = np.nanmean(train_mat[:, j])


In [16]:
# Per-user and per-product average 

pred_n_m = np.zeros(train_mat.shape)
for i in range(train_mat.shape[0]):
    for j in range(train_mat.shape[1]):
        pred_n_m[i, j] = (pred_n[i, j] + pred_m[i, j]) / 2

evaluate(pred_n_m, train_mat, valid_mat, model_name="Average of per-user and per-product average")

Average of per-user and per-product average train RMSE: 0.89
Average of per-user and per-product average valid RMSE: 0.96


In [17]:
# k-nearest neighbours imputation

from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5, keep_empty_features=True)
train_mat_imp = imputer.fit_transform(train_mat)
evaluate(train_mat_imp, train_mat, valid_mat, model_name="KNN imputer n_neighbors=2")

KNN imputer n_neighbors=2 train RMSE: 0.00
KNN imputer n_neighbors=2 valid RMSE: 1.06


## Content-Based Approach (Using Ridge)

In [18]:
summarized_review = pd.read_csv("data/summarized_review.csv")
summarized_review.head()

Unnamed: 0,product_id,average_rating,aggregated_reviews,summarized_reviews
0,0020425651,5.0,susan cooper dark rising sequence joined pryda...,susan cooper dark rising sequence joined pryda...
1,0028610105,4.4,sheer diversity recipe japanese thai indian fr...,sheer diversity recipe japanese thai indian fr...
2,006001203X,4.1,health care proffesional tell way traumatising...,health care proffesional tell way traumatising...
3,0060096195,4.428571,started reading one bathtub get id gotten fina...,started reading one bathtub get id gotten fina...
4,006016848X,3.5625,really like book time everyone want equality s...,really like book time everyone want equality s...


In [19]:
vectorized_data = pd.read_csv("data/vectorized_data.csv")
vectorized_data.head()

Unnamed: 0,product_id,average_rating,aggregated_reviews,summarized_reviews,0,1,2,3,4,5,...,374,375,376,377,378,379,380,381,382,383
0,0020425651,5.0,susan cooper dark rising sequence joined pryda...,susan cooper dark rising sequence joined pryda...,-0.070799,-0.061837,-0.003631,0.012133,-0.035551,0.093195,...,0.065155,0.053727,0.003597,0.088892,-0.042067,0.041044,0.070728,-0.043085,-0.064512,0.038242
1,0028610105,4.4,sheer diversity recipe japanese thai indian fr...,sheer diversity recipe japanese thai indian fr...,-0.073018,-0.023593,0.066772,0.036159,0.006666,-0.010815,...,0.042942,0.037886,-0.001067,-0.009074,0.065551,-0.054624,0.067726,0.079832,-0.015437,-0.041357
2,006001203X,4.1,health care proffesional tell way traumatising...,health care proffesional tell way traumatising...,0.004559,0.033951,0.020969,0.073957,-0.02227,0.056416,...,-0.005109,0.098458,0.004446,0.010148,-0.050168,0.036669,0.133147,0.01329,0.06339,0.043042
3,0060096195,4.428571,started reading one bathtub get id gotten fina...,started reading one bathtub get id gotten fina...,-0.066638,-0.083743,0.053587,0.066727,-0.009095,0.022768,...,0.046725,0.029233,0.014859,0.084216,-0.085885,0.048461,0.023749,0.003057,-0.0757,-0.034523
4,006016848X,3.5625,really like book time everyone want equality s...,really like book time everyone want equality s...,-0.061388,0.048214,0.015069,-0.003492,-0.092024,0.02274,...,0.041296,0.018716,0.018985,0.015178,-0.019459,0.000856,0.134094,-0.078086,0.004868,-0.001025


In [20]:
vec_list = [str(i) for i in range(384)]

vec_dataframe = vectorized_data[vec_list]
vec_dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.070799,-0.061837,-0.003631,0.012133,-0.035551,0.093195,-0.016502,-0.043888,0.047747,0.004803,...,0.065155,0.053727,0.003597,0.088892,-0.042067,0.041044,0.070728,-0.043085,-0.064512,0.038242
1,-0.073018,-0.023593,0.066772,0.036159,0.006666,-0.010815,0.028194,-0.013484,0.044148,-0.062531,...,0.042942,0.037886,-0.001067,-0.009074,0.065551,-0.054624,0.067726,0.079832,-0.015437,-0.041357
2,0.004559,0.033951,0.020969,0.073957,-0.02227,0.056416,0.097014,0.059796,-0.071576,0.020534,...,-0.005109,0.098458,0.004446,0.010148,-0.050168,0.036669,0.133147,0.01329,0.06339,0.043042
3,-0.066638,-0.083743,0.053587,0.066727,-0.009095,0.022768,0.044251,-0.016787,-0.010836,0.024886,...,0.046725,0.029233,0.014859,0.084216,-0.085885,0.048461,0.023749,0.003057,-0.0757,-0.034523
4,-0.061388,0.048214,0.015069,-0.003492,-0.092024,0.02274,0.012945,-0.029013,-0.006583,0.110357,...,0.041296,0.018716,0.018985,0.015178,-0.019459,0.000856,0.134094,-0.078086,0.004868,-0.001025


In [21]:
item_feats = vec_dataframe.to_numpy()
item_feats.shape

(1672, 384)

In [22]:
from collections import defaultdict


def get_X_y_per_user(ratings, d=item_feats.shape[1]):
    """
    Returns X and y for each user.

    Parameters:
    ----------
    ratings : pandas.DataFrame
         ratings data as a dataframe

    d : int
        number of item features

    Return:
    ----------
        dictionaries containing X and y for all users
    """
    lr_y = defaultdict(list)
    lr_X = defaultdict(list)

    for index, val in ratings.iterrows():
        n = user_mapper[val[user_key]]
        m = item_mapper[val[item_key]]
        lr_X[n].append(item_feats[m])
        lr_y[n].append(val["star_rating"])

    for n in lr_X:
        lr_X[n] = np.array(lr_X[n])
        lr_y[n] = np.array(lr_y[n])

    return lr_X, lr_y

In [23]:
X_train_lr, y_train_lr = get_X_y_per_user(X_train)
print(len(X_train_lr))
print(len(y_train_lr))

1214
1214


In [24]:
# Find users with no reviews (empty arrays)
users_with_no_reviews = [user_id for user_id, ratings in X_train_lr.items() if len(ratings) == 0]

print(f"Users with no reviews: {users_with_no_reviews}")

Users with no reviews: []


In [25]:
empty = 0
for i in range(len(X_train_lr)):
    value = len(X_train_lr[i])
    if value == 0:
        empty += 1

print(empty)

16


In [26]:
max = 0
for i in range(len(X_train_lr)):
    value = len(X_train_lr[i])
    if value > max:
        max = value
        position = i

print(f"MAX user ID: {position}")
print(f"The count of ratings for MAX: {max}")

MAX user ID: 1105
The count of ratings for MAX: 136


In [27]:
min = 100000
for i in range(len(X_train_lr)):
    value = len(X_train_lr[i])
    if value < min:
        min = value
        position = i

print(f"MIN user ID: {position}")
print(f"The count of ratings for MIN: {min}")

MIN user ID: 59
The count of ratings for MIN: 0


In [28]:
from sklearn.linear_model import Ridge
import numpy as np
import pandas as pd

Z = vec_dataframe.to_numpy()

models = dict()
pred_lin_reg = np.zeros((N, M))

for n in range(N):
    X_train_df = pd.DataFrame(X_train_lr[n])
    y_train_df = pd.DataFrame(y_train_lr[n])
    
    X_train_df.dropna(inplace=True)
    y_train_df.dropna(inplace=True)
    
    X_train_lr[n] = X_train_df.to_numpy()
    y_train_lr[n] = y_train_df.to_numpy()

    if X_train_lr[n].shape[0] > 0 and y_train_lr[n].shape[0] > 0:
        models[n] = Ridge()
        models[n].fit(X_train_lr[n], y_train_lr[n])
        pred_lin_reg[n] = models[n].predict(Z).flatten()
    else:
        print(f"Skipping model fitting for index {n} due to empty training data.")

evaluate(pred_lin_reg, train_mat, valid_mat, model_name="Content-based recommender")


Skipping model fitting for index 59 due to empty training data.
Skipping model fitting for index 100 due to empty training data.
Skipping model fitting for index 101 due to empty training data.
Skipping model fitting for index 250 due to empty training data.
Skipping model fitting for index 304 due to empty training data.
Skipping model fitting for index 367 due to empty training data.
Skipping model fitting for index 419 due to empty training data.
Skipping model fitting for index 424 due to empty training data.
Skipping model fitting for index 454 due to empty training data.
Skipping model fitting for index 571 due to empty training data.
Skipping model fitting for index 706 due to empty training data.
Skipping model fitting for index 846 due to empty training data.
Skipping model fitting for index 970 due to empty training data.
Skipping model fitting for index 1092 due to empty training data.
Skipping model fitting for index 1098 due to empty training data.
Skipping model fitting f