### Movie recommendation system 

#### Dataset used: csv files in the folder 

In [2]:
#Importing dependencies
import pandas as pd
import numpy as np

#creating dataframes to hold required data
movies_data = pd.read_csv('movies.csv')
ratings_data =pd.read_csv('ratings.csv')



users = ratings_data['userId'].unique() 
movies =movies_data['movieId'].unique() 
print("Number of users:", len(users))
print("Number of movies:", len(movies))

Number of users: 7120
Number of movies: 27278


In [3]:
#count of rows,columns in movies_data  dataset

print(movies_data.shape)
movies_data.head(3)

(27278, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [4]:
#count of rows,columns in ratings_data  dataset

print(ratings_data.shape)
ratings_data.head(3)


(1048575, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819


In [5]:
# Creating a  ratings  matrix with rows as movies and columns as users.

Ratings = np.ndarray(shape=(np.max(ratings_data.movieId.values), np.max(ratings_data.userId.values)),
    dtype=np.uint8)

Ratings[ratings_data.movieId.values-1, ratings_data.userId.values-1] = ratings_data.rating.values
Ratings


array([[0, 0, 4, ..., 0, 5, 4],
       [3, 0, 0, ..., 0, 0, 4],
       [0, 4, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

####  loading  the 'ratings' dataset as SVD’s dataset object and computing a  3-fold cross-validation using the SVD object 

In [8]:
#using the scikit based Surprise library to import Singular-value decomposition algorithm

from surprise import SVD
from surprise import Dataset,Reader
from surprise.model_selection import cross_validate

# ratings_data['rating'].max()

reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(ratings_data[['userId','movieId','rating']],reader)
algo = SVD()

#3-fold cross-validation 
cross_validate(algo, data, cv=3, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8436  0.8440  0.8481  0.8452  0.0020  
MAE (testset)     0.6462  0.6473  0.6494  0.6477  0.0013  
Fit time          5.51    5.56    6.20    5.76    0.31    
Test time         1.90    1.77    6.33    3.33    2.12    


{'test_rmse': array([0.84357549, 0.8440295 , 0.84812036]),
 'test_mae': array([0.64624808, 0.64730416, 0.6494061 ]),
 'fit_time': (5.5117340087890625, 5.561073541641235, 6.19551944732666),
 'test_time': (1.8995249271392822, 1.7653532028198242, 6.3250532150268555)}

In [9]:
##check the data types of the columns in 'ratings' dataset

ratings_data.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [11]:

print('total number of 5 star rated movies:',ratings_data[ratings_data['rating']==5].shape[0])


total number of 5 star rated movies: 152562


#### Finding all the movies rated as 5 stars by user id '5' and storing it in 'ratings_1' data frame 

In [12]:
#user 5 --5star rated movies
filter = (ratings_data['userId']==5) & (ratings_data['rating'] == 5.0)



ratings_1 = ratings_data[filter]
# ratings_1 = ratings_data[ratings_data['rating']==5.0]
print('No.of movies rated 5 stars by user5:', ratings_1.shape[0],'movies')
ratings_1.head(3)


No.of movies rated 5 stars by user5: 38 movies


Unnamed: 0,userId,movieId,rating,timestamp
452,5,11,5.0,851527751
455,5,62,5.0,851526935
459,5,141,5.0,851526935


#### Creating a shallow copy of the 'movies' dataset and storing the result in the 'user_5'

In [13]:

import copy
user_5 = copy.copy(movies_data)
user_5.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


#### Training  a recommender system using the SVD object and predicting the ratings for user id '5'


In [14]:
# using the scikit-surprise library for building the recommender system

from surprise import dataset,Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold


d1 = ratings_data[ratings_data['userId']==5]
# d1.head()
reader = Reader(rating_scale=(1,5)) 


#creating a class to read the data with likewise ratings
class MyDataset(dataset.DatasetAutoFolds):

    def __init__(self, d1, reader):
        self.raw_ratings = [(uid, mid, r, None) for (uid, mid, r) in
                            zip(d1['userId'], d1['movieId'], d1['rating'])]
        self.reader=reader


data = MyDataset(d1, reader)

kf = KFold(n_splits=3)

algo = SVD()

trainset, testset = train_test_split(data, test_size=0.30)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)

 ####  Predicting the top 10 movie recommendations for the user id '5'

In [15]:
# Top 10 recommendations for user 5

df = pd.DataFrame(predictions)
# df.head()
new_df = df.iloc[:,0:3]
new_df.rename(columns = {'uid':'userId','iid':'movieId','r_ui':'rating'},inplace=True)
# new_df.head()

top_10 = new_df.sort_values(by='rating',ascending=False)[:10]
top_10
s = pd.Series(range(1,11))
top_10.merge(user_5,on='movieId',how='left').set_index(s)

Unnamed: 0,userId,movieId,rating,title,genres
1,5,141,5.0,"Birdcage, The (1996)",Comedy
2,5,500,5.0,Mrs. Doubtfire (1993),Comedy|Drama
3,5,531,5.0,"Secret Garden, The (1993)",Children|Drama
4,5,590,5.0,Dances with Wolves (1990),Adventure|Drama|Western
5,5,11,5.0,"American President, The (1995)",Comedy|Drama|Romance
6,5,1079,5.0,"Fish Called Wanda, A (1988)",Comedy|Crime
7,5,780,5.0,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller
8,5,377,5.0,Speed (1994),Action|Romance|Thriller
9,5,1196,5.0,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
10,5,150,5.0,Apollo 13 (1995),Adventure|Drama|IMAX
