In [1]:
# Math libraries
import numpy as np
import pandas as pd

In [2]:
# Preprocessing Libraries
from surprise.model_selection import cross_validate, train_test_split
from surprise import Reader, Dataset

In [3]:
# SVD algorithm Library
from surprise import SVD

In [4]:
# Analysis Libraries
from surprise import accuracy

In [5]:
# Ratings dataset
ratings = pd.read_csv('movielens/ratings.csv', usecols=['userId','movieId','rating','timestamp'])

In [6]:
# Movies dataset
movies = pd.read_csv('movielens/movies.csv', usecols=['movieId','title','genres'])

# EDA

### Ratings Dataset

In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [8]:
# Number of unique users in the ratings dataset
n_users = ratings.userId.unique().shape[0]
n_users

7120

In [9]:
ratings.shape

(1048575, 4)

In [10]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

### Movies Dataset

In [11]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
# Number of unique movies in the movies dataset
n_movies = movies.movieId.unique().shape[0]
n_movies

27278

In [13]:
movies.shape

(27278, 3)

In [14]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

# Pre-processing

In [15]:
# Create a User-Movie-Ratings matrix
Ratings = ratings.pivot(index='userId',columns='movieId',values='rating').fillna(0)   # Fill missing values with 0
Ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,129350,129354,129428,129707,130052,130073,130219,130462,130490,130642
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Converting the dataset into Surprise's format
reader = Reader()   # Initialize the Reader object
data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader)

# Cross Validations on SVD Algorithm

Performing cross-validation to compute RMSE and MAE for Analysis of the SVD algorithm

In [17]:
# Initialize the SVD algorithm
svd = SVD()

In [18]:
# Cross-validation with 5 folds
cv_results_5 = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8358  0.8343  0.8326  0.8320  0.8338  0.8337  0.0013  
MAE (testset)     0.6390  0.6380  0.6379  0.6356  0.6378  0.6376  0.0011  
Fit time          6.28    5.68    5.76    5.64    5.28    5.73    0.32    
Test time         0.91    0.77    0.88    0.78    0.75    0.82    0.06    


In [19]:
# Cross-validation with 3 folds
cv_results_3 = cross_validate(svd, data, measures=['RMSE','MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8455  0.8447  0.8444  0.8448  0.0005  
MAE (testset)     0.6477  0.6470  0.6473  0.6473  0.0003  
Fit time          4.10    4.00    4.21    4.10    0.09    
Test time         1.29    1.14    1.25    1.23    0.06    


# Finding user_1's Preferences

In [20]:
# Creating ratings_1 with user_1's ratings 

    # Movies rated 4 stars by user_1
ratings_1 = ratings[(ratings['userId']==1) & (ratings['rating']==4)]

    # movieId as the index for easier joining
ratings_1 = ratings_1.set_index('movieId')

    # Combine with the movies dataset to get movie titles
ratings_1 = ratings_1.join(movies)['title']

In [21]:
# Top 10 movies rated 4 stars by user_1
pd.DataFrame(ratings_1.head(10))

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
151,Batman Forever (1995)
223,Dream Man (1995)
253,Junior (1994)
260,Ladybird Ladybird (1994)
293,Pulp Fiction (1994)
296,Priest (1994)
318,Strawberry and Chocolate (Fresa y chocolate) (...
541,Harem (1985)
1036,Jude (1996)
1079,Top Gun (1986)


# Generating Recommendations for user_1

In [22]:
# Creating a copy of the movies dataset for user 1
user_1 = movies.copy()

In [23]:
# Reset the index for easier manipulation
user_1 = user_1.reset_index()

In [24]:
user_1

Unnamed: 0,index,movieId,title,genres
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,2,Jumanji (1995),Adventure|Children|Fantasy
2,2,3,Grumpier Old Men (1995),Comedy|Romance
3,3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...,...
27273,27273,131254,Kein Bund für's Leben (2007),Comedy
27274,27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,27275,131258,The Pirates (2014),Adventure
27276,27276,131260,Rentun Ruusu (2001),(no genres listed)


In [25]:
# Training the SVD model on the full dataset
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x16a55d690>

In [26]:
# Predictions

    # Estimated ratings for all movies for user 1
user_1['Estimate_Score'] = user_1['movieId'].apply(lambda x: svd.predict(1,x).est)

    # Drop unnecessary columns
user_1 = user_1.drop(['movieId','genres','index'], axis=1)

    # Sort movies by estimated score in descending order
user_1 = user_1.sort_values('Estimate_Score', ascending=False)

In [27]:
# Top 10 movies with the highest estimated ratings for user 1
user_1.head(10)

Unnamed: 0,title,Estimate_Score
7416,Pride and Prejudice (1995),4.588133
8937,"Decalogue, The (Dekalog) (1989)",4.563421
9448,Head-On (Gegen die Wand) (2004),4.518761
2849,"Lady Eve, The (1941)",4.501951
315,"Shawshank Redemption, The (1994)",4.501051
15208,Cosmos (1980),4.491059
10462,"Short Film About Love, A (Krótki film o milosc...",4.483029
18990,Black Mirror (2011),4.473775
7356,Band of Brothers (2001),4.472526
17877,"Separation, A (Jodaeiye Nader az Simin) (2011)",4.445787


# Model Evaluation

In [28]:
# Re-splitting and training the SVD model to create a testset
trainset, testset = train_test_split(data, test_size=0.2)
svd.fit(trainset) 

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x16a55d690>

In [29]:
# Making predictions on the test set to evaluate the model
predictions = svd.test(testset)

In [30]:
model_analysis = pd.DataFrame({
    'cv_3': [np.mean(cv_results_3['test_rmse']), np.mean(cv_results_3['test_mae'])],
    'cv_5': [np.mean(cv_results_5['test_rmse']), np.mean(cv_results_5['test_mae'])],
    'Final': [accuracy.rmse(predictions, verbose=False), accuracy.mae(predictions, verbose=False)]
}, index=['RMSE', 'MAE'])

model_analysis

Unnamed: 0,cv_3,cv_5,Final
RMSE,0.844839,0.833684,0.833551
MAE,0.647318,0.637644,0.637786
