<a href="https://colab.research.google.com/github/avyaktawrat/Evaluat-inator/blob/master/Singular_Value_Decomposition(SVD).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collaborative filtering technique using Singular Value Decomposition

**All the EDA has been saved in another colab file**

# Data Pre-processing

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [0]:
# 1M MovieLens dataset has been used
movies = pd.read_csv('https://github.com/avyaktawrat/Evaluat-inator/raw/master/data/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv('https://github.com/avyaktawrat/Evaluat-inator/raw/master/data/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv('https://github.com/avyaktawrat/Evaluat-inator/blob/master/data/ratings.dat?raw=true', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

movies.columns = ['movieId', 'title', 'genres']
users.columns = ['userId',	'gender',	'age_desc',	'occ_desc', 'zipcode']
ratings.columns = ['userId',	'movieId',	'rating',	'timestamp']

In [3]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   userId     1000209 non-null  int64
 1   movieId    1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [0]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(ratings, test_size=0.2)

In [5]:
n_users = ratings.userId.unique().shape[0]
n_movies = ratings.movieId.unique().shape[0]
print ('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

Number of users = 6040 | Number of movies = 3706


In [6]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [0]:
# Creating a dataframe having userId as index and movieId as columns
# Ratings are on scale of 1-5, unrated moveis are rated 0 initially

index = ratings.userId.unique()
index.sort()

columns = ratings.movieId.astype(np.int64).unique()
columns.sort()

Ratings = pd.DataFrame(np.zeros((index.shape[0], columns.shape[0])), index, columns)
for i in train_data.itertuples():
    Ratings.loc[i[1], i[2]] = i[3]

Ratings_test = pd.DataFrame(np.zeros((index.shape[0], columns.shape[0])), index, columns)
for i in test_data.itertuples():
    Ratings_test.loc[i[1], i[2]] = i[3]

total = Ratings
for i in test_data.itertuples():
    total.loc[i[1], i[2]] = i[3]

In [0]:
R = Ratings.to_numpy()
user_ratings_mean = np.mean(R, axis = 1)
Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [9]:
Ratings_demeaned[:4,:4]

array([[ 4.94009714, -0.05990286, -0.05990286, -0.05990286],
       [-0.12924987, -0.12924987, -0.12924987, -0.12924987],
       [-0.05369671, -0.05369671, -0.05369671, -0.05369671],
       [-0.02374528, -0.02374528, -0.02374528, -0.02374528]])

# Matrix Factorization based Collaborative Filtering
This section covers buliding the model manually. Next section uses surprise library

**Goals:**


*   learn the latent preferences of users and the latent attributes of items from known ratings and then predict the unknown ratings through the dot product of the latent features of users and items.
*   by doing matrix factorization, you can restructure the user-item matrix into low-rank structure, and you can represent the matrix by the multiplication of two low-rank matrices, where the rows contain the latent vector
*   fit this matrix to approximate your original matrix, as closely as possible, by multiplying the low-rank matrices together, which fills in the entries missing in the original matrix





Sparsity level = total zeros/ total elements

In [10]:
sparsity = round(1.0 - len(ratings) / float(n_users * n_movies), 3)
print ('The sparsity level of MovieLens1M dataset is ' +  str(sparsity * 100) + '%')

The sparsity level of MovieLens1M dataset is 95.5%


In [11]:
Ratings_demeaned.shape

(6040, 3706)

In [0]:
# Reducing N = 3706 features to k = 50
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(Ratings_demeaned, k=50)

In [0]:
sigma = np.diag(sigma)

In [0]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [16]:
preds = pd.DataFrame(all_user_predicted_ratings, columns = Ratings.columns)
preds.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,3913,3914,3915,3916,3917,3918,3919,3920,3921,3922,3923,3924,3925,3926,3927,3928,3929,3930,3931,3932,3933,3934,3935,3936,3937,3938,3939,3940,3941,3942,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.288861,0.143055,-0.19508,-0.018843,0.012232,-0.176604,-0.07412,0.141358,-0.059553,-0.19595,0.512867,-0.089172,0.310181,-0.002005,-0.052401,-0.189827,0.23836,0.006466,-0.099315,-0.069682,-0.321492,0.111577,0.034795,0.320576,-0.118217,-0.012647,0.065573,-0.098318,0.064081,-0.005914,0.091936,0.180563,-0.009566,2.641693,-0.012495,0.765179,0.019784,0.002917,0.053079,0.014856,...,0.01881,-0.018782,0.022249,0.227852,-0.067653,-0.046039,-0.023574,-0.019405,-0.005116,-0.032921,-0.008259,-0.019157,0.007527,-0.008687,-0.02563,-0.013563,0.01524,-0.044665,-0.009568,-0.043549,-0.003131,-0.008221,-0.005948,0.031885,-0.003424,-0.001159,-0.002124,-0.002827,0.010393,-0.001068,0.027807,0.00164,0.026395,-0.022024,-0.085415,0.403529,0.105579,0.031912,0.05045,0.08891
1,0.744716,0.169659,0.335418,0.000758,0.022475,1.35305,0.051426,0.071258,0.161601,1.567246,0.772656,0.046179,-0.054562,0.042344,0.04839,0.347313,1.074905,-0.099782,0.008163,0.250869,2.186638,0.018789,-0.002199,0.218934,0.824475,0.139274,-0.007135,0.053071,-0.156952,0.044739,-0.00296,0.453298,-0.007484,0.920325,0.016566,1.335129,-0.015066,-0.045602,0.034649,0.12201,...,-0.042363,-0.137822,-0.112071,0.380783,-0.036273,-0.016174,0.00292,-0.148021,-0.017614,-0.033474,0.086133,0.008153,-0.126819,0.109208,0.001798,0.151866,0.014118,0.032897,0.005764,0.042259,0.022404,0.00326,0.010556,0.137181,-0.042184,0.006759,-0.005789,0.00034,0.002024,0.016013,-0.056502,-0.013733,-0.01058,0.062576,-0.016248,0.15579,-0.418737,-0.101102,-0.054098,-0.140188
2,1.818824,0.456136,0.090978,-0.043037,-0.025694,-0.158617,-0.131778,0.098977,0.030551,0.73547,-0.023476,0.034796,0.065942,0.008661,0.110348,-0.002952,-0.122061,0.063974,0.061033,0.081799,0.329471,0.149579,0.095352,-0.161493,0.022545,-0.009284,-0.002677,-0.14271,0.012345,-0.085331,0.076139,-0.355795,-0.008579,1.046871,-0.088946,0.383583,-0.018144,-0.038618,0.113984,0.006942,...,0.007233,-0.047221,0.066474,-0.179455,0.097428,0.034113,0.008098,-0.024784,-0.012749,-0.007394,-0.01722,0.004719,0.113348,-0.074943,-0.145795,0.128619,0.112567,0.0455,-0.018027,-0.058946,-0.00277,-0.035276,-0.008085,0.132182,-0.017005,0.014383,0.006598,-0.006217,-0.000342,0.000518,0.040481,-0.005301,0.012832,0.029349,0.020866,0.121532,0.076205,0.012345,0.015148,-0.109956
3,0.408057,-0.07296,0.039642,0.089363,0.04195,0.237753,-0.049426,0.009467,0.045469,-0.11137,-0.375831,0.068658,0.011199,0.069699,-0.037529,-0.238788,0.060607,-0.043418,0.053152,0.078237,0.357185,-0.096005,-0.028243,-0.067169,0.246164,-0.020379,0.034461,-0.022225,-0.012327,0.009182,0.01473,0.215893,-0.019687,-0.293933,-0.011511,0.145326,-0.029213,0.030029,-0.045409,-0.030684,...,-0.015077,-0.030208,0.028357,-0.072643,-0.135727,-0.053318,-0.012962,-0.054465,0.00587,-0.018048,-0.006836,-0.008222,-0.027214,-0.071677,-0.094072,-0.010745,-0.103191,-0.031297,-0.02392,-0.015053,-0.017914,-0.029561,-0.024299,-0.057678,-0.11145,-0.015473,-0.007123,-0.007416,-0.011508,-0.010038,0.008571,-0.005425,-0.0085,-0.003417,-0.083982,0.094512,0.057557,-0.02605,0.014841,-0.034224
4,1.574272,0.021239,-0.0513,0.246884,-0.032406,1.552281,-0.19963,-0.01492,-0.060498,0.450512,-0.251178,0.012337,-0.084051,0.258937,0.01657,0.980536,1.267869,0.275619,-0.008139,-0.038832,1.849627,0.107649,-0.168424,0.386541,1.790343,0.192379,-0.054356,0.267566,1.027817,0.374665,-0.010445,1.94798,0.017468,2.784035,0.274397,1.422393,0.040553,0.022926,1.3458,0.104507,...,0.075475,0.330767,0.15047,-0.261636,0.085163,-0.014229,-0.029247,0.124172,0.092875,0.061895,0.034757,0.054386,0.047055,0.048403,0.082926,0.129035,-0.174646,0.102727,0.024732,0.04728,0.017818,0.041451,0.041595,-0.007138,-0.080448,0.018639,0.034068,0.026941,0.035905,0.024459,0.110151,0.04601,0.006934,-0.01594,-0.05008,-0.052539,0.507189,0.03383,0.125706,0.199244


In [17]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


Function to return the movies with the highest predicted rating that the specified user hasn't already rated

In [0]:
dataset = (pd.merge(movies, ratings))

In [0]:
# dataset.head()

In [0]:
# Building the recommendation system manually
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
  
  id_movies = original_ratings.loc[userID].sort_values(ascending = False)
  # top 10 rated movies by user
  user_full = dataset[dataset.movieId.isin(id_movies.index[:10])* dataset.userId.isin([userID])]
  user_full = user_full.loc[user_full.rating.sort_values(ascending = False).index]
  
  # df series having recomm movieId as index and rating as values
  df = predictions.loc[userID][Ratings.loc[userID] ==0].sort_values(ascending = False).to_frame()
  id_pred = df.index[:num_recommendations]
  recommendations = movies[movies.movieId.isin(id_pred)].dropna()
  recommendations = recommendations.set_index(recommendations.movieId)

  df_recom_ratings = df.loc[id_pred].sort_index()

  recommendations = pd.concat([recommendations, df_recom_ratings], axis =1)
  recommendations = recommendations.loc[recommendations.sort_values(by=userID, ascending=False).index]

  return user_full, recommendations

In [31]:
already_rated, predictions = recommend_movies(preds, 110, movies, total, 20)

  f"evaluating in Python space because the {repr(op_str)} "


In [32]:
predictions

Unnamed: 0,movieId,title,genres,110
318,318,"Shawshank Redemption, The (1994)",Drama,3.433425
1704,1704,Good Will Hunting (1997),Drama,3.327164
1265,1265,Groundhog Day (1993),Comedy|Romance,3.109517
1784,1784,As Good As It Gets (1997),Comedy|Drama,3.049993
593,593,"Silence of the Lambs, The (1991)",Drama|Thriller,2.917356
2762,2762,"Sixth Sense, The (1999)",Thriller,2.762926
1641,1641,"Full Monty, The (1997)",Comedy,2.716587
357,357,Four Weddings and a Funeral (1994),Comedy|Romance,2.607189
1923,1923,There's Something About Mary (1998),Comedy,2.581997
150,150,Apollo 13 (1995),Drama,2.511755


In [22]:
already_rated

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
992511,3897,Almost Famous (2000),Comedy|Drama,110,5,977514279
933908,3578,Gladiator (2000),Action|Drama,110,5,977514932
766652,2858,American Beauty (1999),Comedy|Drama,110,5,977514902
510543,1912,Out of Sight (1998),Action|Crime|Romance,110,5,977514377
469460,1673,Boogie Nights (1997),Drama,110,5,977515330
456216,1617,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller,110,5,977514344
267276,1089,Reservoir Dogs (1992),Crime|Thriller,110,5,977514344
172045,608,Fargo (1996),Crime|Drama|Thriller,110,5,977514344
23328,50,"Usual Suspects, The (1995)",Crime|Thriller,110,5,977514344
21782,47,Seven (Se7en) (1995),Crime|Thriller,110,5,977514421


## Evaluation

In [0]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

In [24]:
# RMSE on train data
rmse(all_user_predicted_ratings, Ratings.to_numpy())

2.236214927907401

In [25]:
# RMSE on total data
rmse(all_user_predicted_ratings, Ratings_test.to_numpy())

2.2345772086550983

# Building better SVD model using Surprise Library

In [0]:
# !pip install surprise

In [0]:
# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

# Load Reader library
reader = Reader()

# Load ratings dataset with Dataset library
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split the dataset for 5-fold evaluation
# data.split(n_folds=5)

In [27]:
# Use the SVD algorithm.
svd = SVD()

# Compute the RMSE of the SVD algorithm.
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8717  0.8714  0.8748  0.8759  0.8753  0.8738  0.0019  
MAE (testset)     0.6840  0.6844  0.6866  0.6873  0.6872  0.6859  0.0014  
Fit time          48.94   48.49   49.74   48.69   49.31   49.03   0.45    
Test time         2.72    2.11    2.66    2.18    2.62    2.46    0.26    


{'fit_time': (48.9361047744751,
  48.48717021942139,
  49.741254806518555,
  48.688448905944824,
  49.30824685096741),
 'test_mae': array([0.68399274, 0.68436791, 0.68655253, 0.68734698, 0.68722883]),
 'test_rmse': array([0.8716679 , 0.87135291, 0.87477667, 0.87586284, 0.87531126]),
 'test_time': (2.7163233757019043,
  2.1096761226654053,
  2.6595633029937744,
  2.1825783252716064,
  2.6225593090057373)}

In [28]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f34ef4d8dd8>

In [29]:
ratings[ratings['userId'] == 10]

Unnamed: 0,userId,movieId,rating,timestamp
799,10,2622,5,978228212
800,10,648,4,978224925
801,10,2628,3,978228408
802,10,3358,5,978226378
803,10,3359,3,978227125
...,...,...,...,...
1195,10,2045,3,978228575
1196,10,2046,4,978228966
1197,10,2047,4,978229459
1198,10,1247,3,979167795


In [30]:
svd.predict(110,0)

Prediction(uid=110, iid=0, r_ui=None, est=3.355135987931979, details={'was_impossible': False})