# Movie Recommendation System


## Importing required libraries

In [1]:
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import SVD
from surprise import SVDpp
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

## Importing data

In [2]:
import os

# Every input CSV is resolved relative to the current working directory.
path = os.getcwd()

# Read the four input tables in a fixed order: user ratings, movie
# titles/genres, the TMDB metadata table, and the movieId -> imdbId/tmdbId links.
ratings_df, movies_df, tmdb_df, links_df = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ('ratings.csv', 'movies.csv', 'tmdb_data_combine.csv', 'links.csv')
)

In [3]:
# Creating a master db
# Drop link rows with any missing id, then cast tmdbId to int so it can be
# joined against tmdb_df['id']. The cast goes through .assign() rather than
# item assignment on the dropna() result, which on some pandas versions emits
# a SettingWithCopyWarning.
links_df = links_df.dropna(axis=0).assign(
    tmdbId=lambda links: links['tmdbId'].astype('int')
)

# ratings -> links -> TMDB metadata -> titles/genres.
# Each left merge is followed by dropna(axis=0), so ratings without a match
# (rows left with NaN) are discarded before the next step.
# NOTE(review): the dropna() after the TMDB merge removes a rating if ANY
# column is NaN, not only the join key. If tmdb_df has optional fields that
# may be empty, ratings for those movies are dropped too; confirm this is
# intended.
master_df = (
    ratings_df
    .merge(links_df, how='left', on='movieId')
    .dropna(axis=0)
    # The left merge upcasts the id columns to float when NaN is present;
    # restore integer ids once the incomplete rows are gone.
    .astype({'imdbId': 'int', 'tmdbId': 'int'})
    .merge(tmdb_df, how='left', left_on='tmdbId', right_on='id')
    .dropna(axis=0)
    .merge(movies_df[['movieId', 'title', 'genres']], how='left', on='movieId')
)

In [4]:
# Preview the first rows of the merged ratings + metadata table.
master_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId,overview,popularity,original_title,runtime,release_date,vote_average,vote_count,status,tagline,spoken_languages,cast,id,title,genres
0,1,1,4.0,964982703,114709,862,"Led by Woody, Andy's toys live happily in his ...",100.954,Toy Story,81.0,1995-10-30,7.97,17277.0,Released,Hang on for the comedy that goes to infinity a...,en,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,113228,15602,A family wedding reignites the ancient feud be...,12.686,Grumpier Old Men,101.0,1995-12-22,6.494,350.0,Released,Still Yelling. Still Fighting. Still Ready for...,en,Walter Matthau|Jack Lemmon|Ann-Margret|Sophia ...,15602.0,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,113277,949,Obsessive master thief Neil McCauley leads a t...,58.478,Heat,170.0,1995-12-15,7.908,6620.0,Released,A Los Angeles crime saga.,en|es,Al Pacino|Robert De Niro|Val Kilmer|Jon Voight...,949.0,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,114369,807,Two homicide detectives are on a desperate hun...,60.472,Se7en,127.0,1995-09-22,8.369,19575.0,Released,Seven deadly sins. Seven ways to die.,en,Morgan Freeman|Brad Pitt|Gwyneth Paltrow|Andre...,807.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,114814,629,"Held in an L.A. interrogation room, Verbal Kin...",39.571,The Usual Suspects,106.0,1995-07-19,8.193,9684.0,Released,Five criminals. One line up. No coincidence.,es|en|fr|hu,Gabriel Byrne|Kevin Spacey|Stephen Baldwin|Kev...,629.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [5]:
# Keep only the (user, movie, rating) triple; copy so later edits to `df`
# never touch master_df.
df = master_df.loc[:, ['userId', 'movieId', 'rating']].copy()

In [6]:
# Preview the first rows of the (user, movie, rating) table.
df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [7]:
# Rename to the column names used in the rest of the notebook. An explicit
# old -> new mapping is used; 'rating' keeps its name.
df = df.rename(columns={'userId': 'userID', 'movieId': 'item'})

In [8]:
# (number of ratings, number of columns)
df.shape

(96014, 3)

In [9]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96014 entries, 0 to 96013
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   userID  96014 non-null  int64  
 1   item    96014 non-null  int64  
 2   rating  96014 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.2 MB


In [10]:
# Report the table size, then print every `sample_step`-th row as a spot check.
sample_step = 20000
print('Dataset shape: {}'.format(df.shape))
print('-Dataset examples-')
print(df.iloc[::sample_step])

Dataset shape: (96014, 3)
-Dataset examples-
       userID   item  rating
0           1      1     4.0
20000     140    553     4.0
40000     287    355     0.5
60000     414   1663     5.0
80000     534  65685     4.0


## EDA

### Ratings Distribution

In [11]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

# How often each rating value occurs, highest rating first.
data = df['rating'].value_counts().sort_index(ascending=False)

# Bar labels: each rating value's share of all ratings, as a percentage.
n_ratings = df.shape[0]
share_labels = ['{:.1f} %'.format(pct) for pct in (data.values / n_ratings * 100)]

# One bar per rating value, labelled with its share.
trace = go.Bar(
    x=data.index,
    y=data.values,
    text=share_labels,
    textposition='auto',
    textfont=dict(color='#000000'),
)

# Title carries the total number of ratings.
layout = dict(
    title='Distribution Of {} ratings'.format(n_ratings),
    xaxis=dict(title='Rating'),
    yaxis=dict(title='Count'),
)

fig = go.Figure(data=[trace], layout=layout)
fig.show()

### Ratings Distribution By Item

In [12]:
# Number of ratings each movie (item) received; counts above 50 are capped
# at 50 so the long tail collapses into a single value.
data = df.groupby('item')['rating'].count().clip(upper=50)

# Histogram of those counts in bins of width 2 over [0, 50].
bin_spec = dict(start=0, end=50, size=2)
trace = go.Histogram(x=data.values, name='Ratings', xbins=bin_spec)

layout = go.Layout(
    title='Distribution Of Number of Ratings Per Item (Clipped at 50)',
    xaxis=dict(title='Number of Ratings Per Item'),
    yaxis=dict(title='Count'),
    bargap=0.2,
)

fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [13]:
# The ten items with the most ratings (the 'rating' column holds the count).
(
    df.groupby('item')['rating']
    .count()
    .reset_index()
    .sort_values('rating', ascending=False)
    .head(10)
)

Unnamed: 0,item,rating
286,356,329
253,318,317
239,296,307
463,593,279
1778,2571,278
208,260,251
381,480,238
88,110,237
460,589,224
417,527,220


### Ratings Distribution By User

In [14]:
# Number of ratings each user gave; counts above 50 are capped at 50 so the
# heaviest raters collapse into a single value.
data = df.groupby('userID')['rating'].count().clip(upper=50)

# Histogram of those counts in bins of width 2 over [0, 50].
bin_spec = dict(start=0, end=50, size=2)
trace = go.Histogram(x=data.values, name='Ratings', xbins=bin_spec)

layout = go.Layout(
    title='Distribution Of Number of Ratings Per User (Clipped at 50)',
    xaxis=dict(title='Ratings Per User'),
    yaxis=dict(title='Count'),
    bargap=0.2,
)

fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [15]:
# The ten users who rated the most items (the 'rating' column holds the count).
(
    df.groupby('userID')['rating']
    .count()
    .reset_index()
    .sort_values('rating', ascending=False)
    .head(10)
)

Unnamed: 0,userID,rating
413,414,2529
598,599,2227
473,474,1922
447,448,1745
273,274,1310
609,610,1218
67,68,1214
379,380,1152
287,288,1023
248,249,1013


### Dimensionality

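To reduce the dimensionality of the dataset, we filter out rarely rated movies. A matching filter for users who have rated only a few movies is included in the code below but is currently commented out, so only the movie filter is applied.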

In [16]:
# Keep only items with MORE than `min_ratings` ratings (the comparison is
# strict, so an item needs at least min_ratings + 1 ratings to survive).
min_ratings = 5
item_counts = df['item'].value_counts()
filter_items = item_counts[item_counts > min_ratings].index.tolist()

# Optional, currently disabled: the same filter applied to users.
# min_user_ratings = 5
# filter_users = df['userID'].value_counts() > min_user_ratings
# filter_users = filter_users[filter_users].index.tolist()

# df_new = df[(df['item'].isin(filter_items)) & (df['userID'].isin(filter_users))]
keep_row = df['item'].isin(filter_items)
df_new = df[keep_row]

print('The original data frame shape:\t{}'.format(df.shape))
print('The new data frame shape:\t{}'.format(df_new.shape))

The original data frame shape:	(96014, 3)
The new data frame shape:	(86007, 3)


## Surprise

To load a dataset from a pandas dataframe, we use the `load_from_df()` method. It also needs a `Reader` object, for which the `rating_scale` parameter must be specified. The dataframe must have three columns, corresponding to the user ids, the item ids, and the ratings, in this order. Each row thus corresponds to a given rating.

In [17]:

# The Reader carries the rating bounds; load_from_df is given the columns in
# (user, item, rating) order.
rating_columns = ['userID', 'item', 'rating']
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(df_new[rating_columns], reader)

### Basic algorithms

With the Surprise library, we will benchmark the following algorithms



#### NormalPredictor

* The NormalPredictor algorithm predicts a random rating drawn from the distribution of the training set, which is assumed to be normal. It is one of the most basic algorithms and does very little work, which makes it a useful baseline.


#### SVD

* The SVD algorithm is a matrix-factorization method. When baselines (biases) are not used, it is equivalent to Probabilistic Matrix Factorization (http://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf)

#### SVDpp

* The SVDpp algorithm is an extension of SVD that takes into account implicit ratings.


We use rmse as our accuracy metric for the predictions.

In [18]:
# 3-fold cross-validated RMSE for the random baseline.
cross_validate(
    algo=NormalPredictor(),
    data=data,
    measures=['RMSE'],
    cv=3,
    verbose=False,
)

{'test_rmse': array([1.41089467, 1.40582082, 1.3985662 ]),
 'fit_time': (0.03587937355041504, 0.03855085372924805, 0.03635907173156738),
 'test_time': (0.07608485221862793, 0.0713040828704834, 0.10525894165039062)}

In [19]:
# 3-fold cross-validated RMSE for SVD with default hyper-parameters.
cross_validate(
    algo=SVD(),
    data=data,
    measures=['RMSE'],
    cv=3,
    verbose=False,
)

{'test_rmse': array([0.8635445 , 0.86141166, 0.86038796]),
 'fit_time': (0.4461991786956787, 0.46120405197143555, 0.4756948947906494),
 'test_time': (0.13310694694519043, 0.09882688522338867, 0.12195992469787598)}

In [20]:
# 3-fold cross-validated RMSE for SVD++ with default hyper-parameters.
cross_validate(
    algo=SVDpp(),
    data=data,
    measures=['RMSE'],
    cv=3,
    verbose=False,
)

{'test_rmse': array([0.84323028, 0.84915307, 0.8559815 ]),
 'fit_time': (13.630270957946777, 14.278714895248413, 13.479437112808228),
 'test_time': (5.306225299835205, 5.0642077922821045, 4.9998109340667725)}

In [21]:
# Larger search space, kept for reference:
# param_grid = {
#     "n_epochs": [5, 10, 15, 20, 30, 40, 50, 100],
#     "lr_all": [0.001, 0.002, 0.005],
#     "reg_all": [0.02, 0.08, 0.4, 0.6]
# }

# smaller grid for testing
param_grid = dict(
    n_epochs=[20],
    lr_all=[0.002, 0.005],
    reg_all=[0.02, 0.1],
)

# 5-fold grid search over SVD++ hyper-parameters, scored on RMSE and MAE.
gs = GridSearchCV(
    algo_class=SVDpp,
    param_grid=param_grid,
    measures=["rmse", "mae"],
    cv=5,
    refit=True,
)
gs.fit(data)

# The RMSE-optimal combination is what the final model is trained with.
best_scores = gs.best_score
training_parameters = gs.best_params["rmse"]

print("BEST RMSE: \t", best_scores["rmse"])
print("BEST MAE: \t", best_scores["mae"])
print("BEST params: \t", training_parameters)

BEST RMSE: 	 0.841990914950542
BEST MAE: 	 0.6453074542700078
BEST params: 	 {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}


In [22]:
from datetime import datetime

# Train the final SVD++ model on ALL filtered ratings, using the
# hyper-parameters selected by the grid search.
print(training_parameters)
reader = Reader(rating_scale=(0.5, 5.0))

print("\n\n\t\t STARTING\n\n")
start = datetime.now()

print("> Loading data...")
data = Dataset.load_from_df(df_new[['userID', 'item', 'rating']], reader)
print("> OK")

print("> Creating trainset...")
# No hold-out split here: every rating is used for fitting.
trainset = data.build_full_trainset()
print("> OK")


startTraining = datetime.now()
print("> Training...")

algo = SVDpp(
    n_epochs=training_parameters['n_epochs'],
    lr_all=training_parameters['lr_all'],
    reg_all=training_parameters['reg_all'],
)

algo.fit(trainset)

endTraining = datetime.now()
# timedelta.seconds is only the seconds component (0..86399) and silently
# wraps for runs longer than a day; total_seconds() is the full elapsed time.
# int() keeps the printed value a whole number of seconds as before.
print("> OK \t\t It Took: ", int((endTraining - startTraining).total_seconds()), "seconds")

end = datetime.now()
print (">> DONE \t\t It Took", int((end - start).total_seconds()), "seconds" )

{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}


		 STARTING


> Loading data...
> OK
> Creating trainset...
> OK
> Training...
> OK 		 It Took:  31 seconds
>> DONE 		 It Took 31 seconds


In [23]:
## SAVING TRAINED MODEL
from surprise import dump
import os

# Destination of the serialized model, relative to the working directory.
model_filename = "./model.pickle"

print (">> Starting dump")
# Only the fitted algorithm is written (no predictions are passed).
# expanduser() resolves a leading "~" if the path ever has one.
dump.dump(os.path.expanduser(model_filename), algo=algo)
print (">> Dump done")
print(model_filename)

>> Starting dump
>> Dump done
./model.pickle


In [24]:
## LOAD SAVED MODEL
def load_model(model_filename):
    """Load an algorithm previously written with ``surprise.dump.dump``.

    Args:
        model_filename: Path of the dump file; a leading ``~`` is expanded.

    Returns:
        The second element of the pair returned by ``dump.load`` (the
        algorithm). The first element is discarded.
    """
    print (">> Loading dump")
    # Imported locally so the function does not depend on which earlier
    # cells have been run.
    from surprise import dump
    import os
    # NOTE(review): Surprise's dump module is pickle-based, so only load
    # files from a trusted source.
    loaded_model = dump.load(os.path.expanduser(model_filename))[1]
    print (">> Loaded dump")
    return loaded_model

In [25]:
# Reload the model from the path it was dumped to above, so the
# recommendations below use the persisted artifact.
svd_load_model = load_model('./model.pickle')

>> Loading dump
>> Loaded dump


In [26]:
import numpy as np
def generate_recommendation(model, user_id, ratings_df, movies_df, n_items):
    """Print the ``n_items`` unrated movies with the highest predicted rating.

    Candidates are every ``movieId`` in ``ratings_df`` that ``user_id`` has
    not rated. Each candidate is scored with ``model.test`` and the best
    ``n_items`` are printed as ``<title> <predicted rating>``, best first.

    Args:
        model: Object exposing ``test(testset)`` that returns one prediction
            per test row, each with an ``est`` attribute (for example a
            fitted Surprise algorithm).
        user_id: Id of the user to recommend for, as used in
            ``ratings_df["userId"]``.
        ratings_df: DataFrame with ``userId`` and ``movieId`` columns.
        movies_df: DataFrame with ``movieId`` and ``title`` columns.
        n_items: Maximum number of recommendations to print.

    Returns:
        None. The recommendations are printed.

    NOTE(review): candidates come from the ``ratings_df`` passed in. If it
    contains movies the model was not trained on, their scores are whatever
    the model returns for unseen items; confirm this is intended.
    """
    # All movie ids in the dataset.
    movie_ids = ratings_df["movieId"].unique()

    # Movie ids this user has already rated.
    movie_ids_user = ratings_df.loc[ratings_df["userId"] == user_id, "movieId"]

    # Candidates: movies the user has not rated yet (sorted, unique).
    movie_ids_to_pred = np.setdiff1d(movie_ids, movie_ids_user)

    # The third field is a placeholder "true" rating, present only because the
    # test-set format is (user, item, rating). Its value does not affect `est`.
    test_set = [[user_id, movie_id, 1] for movie_id in movie_ids_to_pred]

    # Predict a rating for every candidate.
    predictions = model.test(test_set)
    pred_ratings = np.array([pred.est for pred in predictions], dtype=float)
    print("Top {0} item recommendations for user {1}:".format(n_items, user_id))

    # Highest predictions first. A stable sort makes the order of tied
    # predictions deterministic (ascending movie id).
    index_max = np.argsort(-pred_ratings, kind="stable")[:n_items]

    # Build the id -> title lookup once instead of scanning movies_df per
    # item. The first title wins if a movieId appears more than once.
    title_by_id = movies_df.drop_duplicates("movieId").set_index("movieId")["title"]

    for i in index_max:
        movie_id = movie_ids_to_pred[i]
        # A rated movie may be missing from movies_df; print a placeholder
        # rather than failing on the lookup.
        title = title_by_id.get(
            movie_id, "<unknown title, movieId={}>".format(movie_id)
        )
        print(title, pred_ratings[i])
 
 
# User to recommend for. This id is not present in the dataset, so the model
# has nothing user-specific to draw on: every such user gets the same
# recommendations, the globally highest-scoring movies.
userID = 1427142714373147

# How many top-ranked movies to recommend.
n_items = 10

# Generate recommendations with the model reloaded from disk.
generate_recommendation(
    model=svd_load_model,
    user_id=userID,
    ratings_df=ratings_df,
    movies_df=movies_df,
    n_items=n_items,
)

Top 10 item recommendations for user 1427142714373147:
Shawshank Redemption, The (1994) 4.336908174630239
Streetcar Named Desire, A (1951) 4.254423234991092
Life Is Beautiful (La Vita è bella) (1997) 4.239513163403128
Philadelphia Story, The (1940) 4.2319064669270325
Seventh Seal, The (Sjunde inseglet, Det) (1957) 4.2302303870056175
Guess Who's Coming to Dinner (1967) 4.228690523091266
Hoop Dreams (1994) 4.218805258722267
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) 4.215050212706568
Rear Window (1954) 4.210666486067483
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.194871531599587


In [27]:
# This user is present in the data, so the recommendations are personalised
# and differ from those for the unknown user.
userID = 2

# How many top-ranked movies to recommend.
n_items = 10

# Generate recommendations with the model reloaded from disk.
generate_recommendation(
    model=svd_load_model,
    user_id=userID,
    ratings_df=ratings_df,
    movies_df=movies_df,
    n_items=n_items,
)

Top 10 item recommendations for user 2:
Seventh Seal, The (Sjunde inseglet, Det) (1957) 4.395348706344615
Rear Window (1954) 4.354243155717697
Lawrence of Arabia (1962) 4.354225356225093
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) 4.353067632621399
Streetcar Named Desire, A (1951) 4.343921654886927
Paths of Glory (1957) 4.320817538336384
Ran (1985) 4.307089369187342
Guess Who's Coming to Dinner (1967) 4.306197290281669
Hoop Dreams (1994) 4.301064969830309
Forrest Gump (1994) 4.300371068836011


In [28]:
# This user is present in the data, so the recommendations are personalised
# and differ from those for the unknown user.
userID = 7

# How many top-ranked movies to recommend.
n_items = 10

# Generate recommendations with the model reloaded from disk.
generate_recommendation(
    model=svd_load_model,
    user_id=userID,
    ratings_df=ratings_df,
    movies_df=movies_df,
    n_items=n_items,
)

Top 10 item recommendations for user 7:
Godfather, The (1972) 4.367156425029181
Pulp Fiction (1994) 4.311689743096045
Schindler's List (1993) 4.300415494458508
Shawshank Redemption, The (1994) 4.22882576095356
Matrix, The (1999) 4.144597326284393
Philadelphia Story, The (1940) 4.104930118478092
Rear Window (1954) 4.091856510173147
12 Angry Men (1957) 4.066064112305794
American History X (1998) 4.051790109026935
My Fair Lady (1964) 4.0261240504667555
