# **1. Libraries:**

In [None]:
# Import the required libraries
import numpy as np 
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds

# **2. Load Data Set**

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset is read from
# a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the ratings CSV, assigning the column names below to its four fields.
# Only the three columns used by this notebook are parsed (`usecols`), so the
# unused `timestamp` column is never loaded into memory and no in-place drop
# is needed afterwards.
columns=['userId', 'productId', 'ratings','timestamp']
electronics_dataset = pd.read_csv(
    '/content/drive/MyDrive/AI Final Project/Copy of ratings_Electronics.csv',
    names=columns,
    usecols=['userId', 'productId', 'ratings'],
)

# Quick look at the first rows and at the column dtypes / memory footprint
electronics_dataset.head()
electronics_dataset.info()

Unnamed: 0,userId,productId,ratings
0,AKM1MP6P0OYPR,132793040,5.0
1,A2CX7LUOHB2NDG,321732944,5.0
2,A2NWSAGRHCP8N5,439886341,1.0
3,A2WNBOD3WNDNKT,439886341,3.0
4,A1GI0U4ZRJA8WN,439886341,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     object 
 1   productId  object 
 2   ratings    float64
dtypes: float64(1), object(2)
memory usage: 179.1+ MB


In [None]:
# Report the (rows, columns) dimensions of the full dataset
print('shape of the dataset (row,col):',electronics_dataset.shape)

# Show the dtype of each column
electronics_dataset.dtypes

shape of the dataset (row,col): (7824482, 3)


userId        object
productId     object
ratings      float64
dtype: object

In [None]:
# Restrict the analysis to the first 50,000 rows (all columns) of the dataset
electronics_dataset_subset = electronics_dataset.head(50000)
electronics_dataset_subset.info()
print('\n')

# Descriptive statistics of the rating values
electronics_dataset_subset.ratings.describe().T

# Smallest and largest rating present in the subset
print('\n')
print('Minimum:', electronics_dataset_subset['ratings'].min())
print('Maximum:', electronics_dataset_subset['ratings'].max())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     50000 non-null  object 
 1   productId  50000 non-null  object 
 2   ratings    50000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.1+ MB




count    50000.00000
mean         4.03524
std          1.35555
min          1.00000
25%          4.00000
50%          5.00000
75%          5.00000
max          5.00000
Name: ratings, dtype: float64



Minimum: 1.0
Maximum: 5.0


In [None]:
# Number of distinct users and distinct products appearing in the subset
print('unique users = ', electronics_dataset_subset.userId.nunique())
print('unique product = ', electronics_dataset_subset.productId.nunique())

unique users =  46554
unique product =  3446


# **3. Taking the top 15 users**

In [None]:
# Rank users by how many ratings they submitted and keep the 15 most active
unique_users = electronics_dataset_subset.groupby('userId')
most_rated = (
    unique_users
    .size()
    .sort_values(ascending=False)
    .iloc[:15]
)
print('Top 15 users based on ratings: \n',most_rated)

Top 15 users based on ratings: 
 userId
A231WM2Z2JL0U3    37
AY8Q1X7G96HV5     31
ALUNVOQRXOZIA     20
A1NVD0TKNS1GT5    19
A243HY69GIAHFI    18
A1RPTVW5VEOSI     17
A1ISUNUWG0K02V    16
A1MJMYLRTZ76ZX    16
A7Y6AVS576M03     15
A3MEIR72XKQY88    15
A23ZO1BVFFLGHO    15
A3IBOQ8R44YG9L    14
A6ZPLVAUQ6695     13
ARXU3FESTWMJJ     13
A1WVMDRJU19AFD    12
dtype: int64


In [None]:
# Keep only the "active" users: those with at least 15 ratings in the subset.
# This is a threshold on ratings per user, not a top-15 cut, so the number of
# users kept depends on the data.
# electronics_dataset_final holds every rating row written by those users.

counts=electronics_dataset_subset.userId.value_counts()
electronics_dataset_final=electronics_dataset_subset[
    electronics_dataset_subset.userId.isin(counts[counts>=15].index)
]

# len() of the frame is the number of rating rows, not the number of users;
# the distinct user count is reported on the next line.
print('Number of ratings given by users who have rated 15 or more items =', len(electronics_dataset_final))
print('Number of unique users in the final data = ', electronics_dataset_final['userId'].nunique())
print('Number of unique products in the final data = ', electronics_dataset_final['productId'].nunique())

Number of users who have rated 15 or more items = 219
Number of unique users in the final data =  11
Number of unique products in the final data =  186


In [None]:
# Build the user x product ratings matrix used by the algorithm.
# (user, product) pairs without a rating are filled with 0.
final_ratings_matrix = (
    electronics_dataset_final
    .pivot(index='userId', columns='productId', values='ratings')
    .fillna(0)
)
final_ratings_matrix

print('Shape of final_ratings_matrix: ', final_ratings_matrix.shape)
# The matrix is sparse: most of its cells hold the 0 fill value.

productId,1400599997,B00000DM9M,B00000J061,B00000J08C,B00000J0A2,B00000J0E8,B00000J1QZ,B00000J1US,B00000J3H5,B00000J3HB,...,B00004TDWY,B00004TE75,B00004TEN2,B00004TH2W,B00004TH2Y,B00004THCX,B00004THCY,B00004THCZ,B00004THDE,B00004THM6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1ISUNUWG0K02V,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1MJMYLRTZ76ZX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1NVD0TKNS1GT5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1RPTVW5VEOSI,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
A231WM2Z2JL0U3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,5.0,5.0,0.0,4.0,0.0,0.0,0.0,5.0,0.0
A23ZO1BVFFLGHO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A243HY69GIAHFI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A3MEIR72XKQY88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A7Y6AVS576M03,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ALUNVOQRXOZIA,1.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Shape of final_ratings_matrix:  (11, 186)


# **4. Splitting the data**

In [None]:
# Randomly split the rating rows into train and test sets in a 70:30 ratio
# using train_test_split; random_state is fixed so the split is repeatable.
train_data, test_data = train_test_split(
    electronics_dataset_final,
    test_size=0.3,
    random_state=0,
)

print('Shape of training data: ', train_data.shape)
print('Shape of testing data: ', test_data.shape)

Shape of training data:  (153, 3)
Shape of testing data:  (66, 3)


# **5. Building Collaborative Filtering recommender model**

In [None]:
# Stack the train and test rows back into a single frame. reset_index() moves
# the original row labels into an 'index' column and renumbers the rows from 0.
electronics_dataset_final_CF = pd.concat([train_data, test_data], axis=0).reset_index(drop=False)
electronics_dataset_final_CF.head()

Unnamed: 0,index,userId,productId,ratings
0,17509,AY8Q1X7G96HV5,B00000JSES,4.0
1,11968,A243HY69GIAHFI,B00000J3Q7,3.0
2,35533,A1RPTVW5VEOSI,B00003WGP5,5.0
3,31480,A1NVD0TKNS1GT5,B00002JXFH,4.0
4,13526,A23ZO1BVFFLGHO,B00000J570,5.0


## User-based collaborative model

In [None]:
# User x product ratings matrix fed to the model; unrated pairs become 0.
real_pivot_table = (
    electronics_dataset_final_CF
    .pivot(index='userId', columns='productId', values='ratings')
    .fillna(0)
)

# Replace the userId row labels with a positional 0..n-1 index called 'user_index'
real_pivot_table.index = pd.Index(
    np.arange(0, real_pivot_table.shape[0], 1),
    name='user_index',
)

## Singular Value Decomposition
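The pivot table above is a sparse matrix: most of its cells are filled with 0 because each user has rated only a few of the products.
Because the matrix is sparse, we factorise it with a truncated Singular Value Decomposition (SVD).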

In [None]:
# Truncated Singular Value Decomposition of the ratings matrix.
# svds() is given a plain float ndarray instead of the DataFrame: its
# documented inputs are ndarrays, sparse matrices and LinearOperators, and a
# DataFrame may be rejected by newer SciPy releases.
# The default solver needs k < min(n_users, n_items), so the requested rank of
# 10 is capped when the matrix is smaller than that.
P, sigma, Qt = svds(
    real_pivot_table.to_numpy(dtype=float),
    k = min(10, min(real_pivot_table.shape) - 1),
)
# svds returns the singular values as a 1-D array; expand them into the
# diagonal matrix needed for the matrix product P . sigma . Qt
sigma = np.diag(sigma)

In [None]:
# Rebuild the (approximate) ratings matrix from its SVD factors: P . sigma . Qt
all_user_predicted_ratings = P.dot(sigma).dot(Qt)
# Wrap the reconstruction in a DataFrame carrying the same product columns
predicate_pivot_table = pd.DataFrame(
    data=all_user_predicted_ratings,
    columns=real_pivot_table.columns,
)

In [None]:
# Show the actual and the predicted rating tables one after the other.
# Actual ratings given by users (0 = no rating for that user/product pair):
print("Pivot Table:")
real_pivot_table

# Predicted ratings, i.e. the matrix rebuilt from the SVD factors:
print("Predicate Table:")
predicate_pivot_table

Pivot Table:


productId,1400599997,B00000DM9M,B00000J061,B00000J08C,B00000J0A2,B00000J0E8,B00000J1QZ,B00000J1US,B00000J3H5,B00000J3HB,...,B00004TDWY,B00004TE75,B00004TEN2,B00004TH2W,B00004TH2Y,B00004THCX,B00004THCY,B00004THCZ,B00004THDE,B00004THM6
user_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,5.0,5.0,0.0,4.0,0.0,0.0,0.0,5.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Predicate Table:


productId,1400599997,B00000DM9M,B00000J061,B00000J08C,B00000J0A2,B00000J0E8,B00000J1QZ,B00000J1US,B00000J3H5,B00000J3HB,...,B00004TDWY,B00004TE75,B00004TEN2,B00004TH2W,B00004TH2Y,B00004THCX,B00004THCY,B00004THCZ,B00004THDE,B00004THM6
0,-0.005077,-0.010023,0.219864,-0.010023,-0.02031,-0.006562,2.962929,0.026594,4.938216,4.938216,...,-0.025387,0.026594,0.026594,0.219864,0.021275,-0.008018,-0.006014,-0.008018,0.026594,-0.008018
1,-0.000261,-0.000515,0.0113,-0.000515,-0.001044,-0.000337,-0.001905,0.001367,-0.003176,-0.003176,...,-0.001305,0.001367,0.001367,0.0113,0.001093,-0.000412,-0.000309,-0.000412,0.001367,-0.000412
2,-0.0163,-0.032177,0.705839,-0.032177,-0.065201,-0.021066,-0.119009,0.085374,-0.198349,-0.198349,...,-0.081502,0.085374,0.085374,0.705839,0.068299,-0.025742,-0.019306,-0.025742,0.085374,-0.025742
3,0.018068,0.035667,4.217599,0.035667,0.072274,0.023351,0.131918,-0.094635,0.219864,0.219864,...,0.090342,-0.094635,-0.094635,4.217599,-0.075708,0.028534,0.0214,0.028534,-0.094635,0.028534
4,0.002185,0.004314,-0.094635,0.004314,0.008742,0.002824,0.015956,4.988553,0.026594,0.026594,...,0.010927,4.988553,4.988553,-0.094635,3.990843,0.003451,0.002588,0.003451,4.988553,0.003451
5,-0.0009,-0.001777,0.038989,-0.001777,-0.003602,-0.001164,-0.006574,0.004716,-0.010956,-0.010956,...,-0.004502,0.004716,0.004716,0.038989,0.003773,-0.001422,-0.001066,-0.001422,0.004716,-0.001422
6,0.014638,0.028895,-0.633852,0.028895,0.058552,0.018918,0.106872,-0.076667,0.17812,0.17812,...,0.07319,-0.076667,-0.076667,-0.633852,-0.061334,0.023116,0.017337,0.023116,-0.076667,0.023116
7,-0.035258,-0.0696,1.526746,-0.0696,-0.141032,-0.045567,-0.25742,0.184667,-0.429034,-0.429034,...,-0.17629,0.184667,0.184667,1.526746,0.147733,-0.05568,-0.04176,-0.05568,0.184667,-0.05568
8,-0.000539,-0.001065,0.023351,-0.001065,-0.002157,4.999303,-0.003937,0.002824,-0.006562,-0.006562,...,-0.002696,0.002824,0.002824,0.023351,0.00226,-0.000852,-0.000639,-0.000852,0.002824,-0.000852
9,0.997914,-0.004118,0.090342,-0.004118,3.991655,-0.002696,-0.015232,0.010927,-0.025387,-0.025387,...,4.989568,0.010927,0.010927,0.090342,0.008742,-0.003295,-0.002471,-0.003295,0.010927,-0.003295


In [None]:
# Recommend the items with the highest predicted ratings
def recommend_items(userID, real_pivot_table, predicate_pivot_table, num_recommendations):
    """Print and return the best-predicted items a user has not rated yet.

    Parameters
    ----------
    userID : int
        1-based position of the user: row ``userID - 1`` of both tables is used.
    real_pivot_table : pandas.DataFrame
        Actual ratings, one row per user and one column per item. A value of
        0 is treated as "not rated".
    predicate_pivot_table : pandas.DataFrame
        Predicted ratings. Rows are matched to ``real_pivot_table`` by
        position, items by column label.
    num_recommendations : int
        Maximum number of items to report.

    Returns
    -------
    pandas.DataFrame
        The printed table: index named ``'Recommended Items'``, columns
        ``user_ratings`` and ``user_predictions``, ordered by descending
        prediction. Items with equal predictions keep their column order.

    Raises
    ------
    IndexError
        If ``userID`` is smaller than 1, or (raised by ``iloc``) larger than
        the number of rows.
    """
    if userID < 1:
        # Without this guard, iloc would silently wrap 0 / negative positions
        # around to the last rows and report another user's items.
        raise IndexError('userID must be >= 1, got {}'.format(userID))

    # userID is 1-based, row positions start at 0
    user_index = userID - 1

    # concat(axis=1) aligns the two rows on their item labels, so no
    # pre-sorting of either row is needed.
    temp = pd.concat(
        [real_pivot_table.iloc[user_index], predicate_pivot_table.iloc[user_index]],
        axis=1,
    )
    temp.index.name = 'Recommended Items'
    temp.columns = ['user_ratings', 'user_predictions']

    # Keep only items the user has not rated, best prediction first. A stable
    # sort makes the order of tied predictions deterministic.
    temp = temp.loc[temp.user_ratings == 0]
    temp = temp.sort_values('user_predictions', ascending=False, kind='mergesort')
    top_items = temp.head(num_recommendations)

    print('\nBelow are the recommended items for user(user_id = {}):\n'.format(userID))
    print(top_items)
    return top_items

In [None]:
# real_pivot_table      --> actual ratings (before SVD)
# predicate_pivot_table --> ratings rebuilt from the SVD factors (after SVD)
# userID is 1-based: recommend_items() reads row userID - 1 of both tables.
userID = 4
num_recommendations = 5
recommend_items(userID, real_pivot_table, predicate_pivot_table, num_recommendations) 


Below are the recommended items for user(user_id = 4):

                   user_ratings  user_predictions
Recommended Items                                
B00004T1WZ                  0.0          1.526746
B00000JYLO                  0.0          1.526746
B00000J4ER                  0.0          1.526746
B00001P4XA                  0.0          1.526746
B00004T1WX                  0.0          1.526746


# **6. Evaluation**

In [None]:
# Actual ratings: first rows of the user x product matrix
real_pivot_table.head()
# Average ACTUAL rating per product: column means over all users, where the
# 0 fill values of unrated entries are included in the average
real_pivot_table.mean().head()

# Predicted ratings: first rows of the matrix rebuilt from the SVD factors
predicate_pivot_table.head()
# Average PREDICTED rating per product (column means over all users)
predicate_pivot_table.mean().head()

productId,1400599997,B00000DM9M,B00000J061,B00000J08C,B00000J0A2,B00000J0E8,B00000J1QZ,B00000J1US,B00000J3H5,B00000J3HB,...,B00004TDWY,B00004TE75,B00004TEN2,B00004TH2W,B00004TH2Y,B00004THCX,B00004THCY,B00004THCZ,B00004THDE,B00004THM6
user_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,5.0,5.0,0.0,4.0,0.0,0.0,0.0,5.0,0.0


productId
1400599997    0.090909
B00000DM9M    0.454545
B00000J061    0.454545
B00000J08C    0.454545
B00000J0A2    0.363636
dtype: float64

productId,1400599997,B00000DM9M,B00000J061,B00000J08C,B00000J0A2,B00000J0E8,B00000J1QZ,B00000J1US,B00000J3H5,B00000J3HB,...,B00004TDWY,B00004TE75,B00004TEN2,B00004TH2W,B00004TH2Y,B00004THCX,B00004THCY,B00004THCZ,B00004THDE,B00004THM6
0,-0.005077,-0.010023,0.219864,-0.010023,-0.02031,-0.006562,2.962929,0.026594,4.938216,4.938216,...,-0.025387,0.026594,0.026594,0.219864,0.021275,-0.008018,-0.006014,-0.008018,0.026594,-0.008018
1,-0.000261,-0.000515,0.0113,-0.000515,-0.001044,-0.000337,-0.001905,0.001367,-0.003176,-0.003176,...,-0.001305,0.001367,0.001367,0.0113,0.001093,-0.000412,-0.000309,-0.000412,0.001367,-0.000412
2,-0.0163,-0.032177,0.705839,-0.032177,-0.065201,-0.021066,-0.119009,0.085374,-0.198349,-0.198349,...,-0.081502,0.085374,0.085374,0.705839,0.068299,-0.025742,-0.019306,-0.025742,0.085374,-0.025742
3,0.018068,0.035667,4.217599,0.035667,0.072274,0.023351,0.131918,-0.094635,0.219864,0.219864,...,0.090342,-0.094635,-0.094635,4.217599,-0.075708,0.028534,0.0214,0.028534,-0.094635,0.028534
4,0.002185,0.004314,-0.094635,0.004314,0.008742,0.002824,0.015956,4.988553,0.026594,0.026594,...,0.010927,4.988553,4.988553,-0.094635,3.990843,0.003451,0.002588,0.003451,4.988553,0.003451


productId
1400599997    0.088513
B00000DM9M    0.449816
B00000J061    0.558292
B00000J08C    0.449816
B00000J0A2    0.354053
dtype: float64

In [None]:
# Per product, put the mean actual rating next to the mean predicted rating.
# The actual means are taken from real_pivot_table, the matrix that was
# factorised to produce predicate_pivot_table, so both columns always describe
# the same users and products.
rmse_df = pd.concat([real_pivot_table.mean(), predicate_pivot_table.mean()], axis=1)
rmse_df.columns = ['Avg_REAL_ratings', 'Avg_PREDICATE_ratings']
rmse_df['item_index'] = np.arange(0, rmse_df.shape[0], 1)
rmse_df.head()


# NOTE: this is the RMSE between per-product *average* ratings (unrated
# entries count as 0 in those averages), not an element-wise error measured on
# held-out ratings.
RMSE = round((((rmse_df.Avg_REAL_ratings - rmse_df.Avg_PREDICATE_ratings) ** 2).mean() ** 0.5), 5)
print('\nRMSE = {} \n'.format(RMSE))

Unnamed: 0_level_0,Avg_REAL_ratings,Avg_PREDICATE_ratings,item_index
productId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1400599997,0.090909,0.088513,0
B00000DM9M,0.454545,0.449816,1
B00000J061,0.454545,0.558292,2
B00000J08C,0.454545,0.449816,3
B00000J0A2,0.363636,0.354053,4



RMSE = 0.05854 

