In [1]:
# Mount Google Drive at /content/drive; the CSV inputs read later live under this path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 5.3 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1619416 sha256=4a8b69f799b5ad219d5a108e67a9133b24661520b5eafc8825658b34a0de7799
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [3]:
from datetime import datetime
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style("whitegrid")
import os
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error


import xgboost as xgb
from surprise import Reader, Dataset
from surprise import BaselineOnly
from surprise import KNNBaseline
from surprise import SVD
from surprise import accuracy
from surprise import SVDpp
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV


In [4]:
# Load the ratings (headerless CSV, so column names are supplied) and the movie table.
df = pd.read_csv("/content/drive/MyDrive/NetflixProject/Netflix/archive/NetflixRatings.csv", sep=",", names = ["MovieID","CustID", "Ratings", "Date"])
df["Date"] = pd.to_datetime(df["Date"])
df_movie=pd.read_csv('/content/drive/MyDrive/Netflix/Data/additions/updated_movie_titles.csv')

# Movies whose release year is missing get the year of their earliest rating.
missing_year = df_movie['Year'].isnull()
ID_values = df_movie.loc[missing_year, 'ID'].values

# One grouped pass over only the affected ratings, instead of filtering and
# sorting the whole ratings table once per movie.
first_rating_year = (
    df.loc[df['MovieID'].isin(ID_values)]
      .groupby('MovieID')['Date']
      .min()
      .dt.year
)

# Work on an explicit copy and assign the column directly; writing through
# `.values` of a filtered frame is chained assignment and fails when pandas
# hands back a read-only array (Copy-on-Write).
# A movie with no ratings at all keeps a missing Year instead of raising.
temp1 = df_movie.loc[missing_year].copy()
temp1['Year'] = temp1['ID'].map(first_rating_year)

# Same row selection as before: drop every row containing any NaN, then add
# back the rows whose Year was just filled, restoring the original row order.
df_movie = pd.concat([df_movie.dropna(), temp1]).sort_index()
df_movie = df_movie.rename(columns={'ID': 'MovieID'})

In [5]:
# Copy the ratings frame so later work does not touch `df`.
# NOTE(review): the next cell rebinds `dfnew` to a merge result (a new frame),
# so this full copy may be unnecessary — confirm before removing.
dfnew=df.copy()

In [6]:
# Join the movie table's columns onto every rating; the default inner join
# keeps only ratings whose MovieID appears in df_movie.
dfnew = pd.merge(dfnew, df_movie, on='MovieID')
dfnew.head()

Unnamed: 0,MovieID,CustID,Ratings,Date,Year,Name
0,1,1488844,3,2005-09-06,2003.0,Dinosaur Planet
1,1,822109,5,2005-05-13,2003.0,Dinosaur Planet
2,1,885013,4,2005-10-19,2003.0,Dinosaur Planet
3,1,30878,4,2005-12-26,2003.0,Dinosaur Planet
4,1,823519,3,2004-05-03,2003.0,Dinosaur Planet


In [7]:
# NOTE: this rebinds `train_test_split`. The name was imported earlier from
# surprise.model_selection; from here on it refers to scikit-learn's function,
# which is the one applied to the DataFrame below.
from sklearn.model_selection import train_test_split

In [8]:
# Draw a random subset of at most SAMPLE_SIZE rated rows.
# random_state makes the subset repeatable across kernel restarts (the global
# seeds are only set further down the notebook), and capping the size at
# len(dfnew) avoids the ValueError pandas raises when asked for more rows
# than exist without replacement.
SAMPLE_SIZE = 30_000_000
data_sample = dfnew.sample(n=min(SAMPLE_SIZE, len(dfnew)), random_state=15)

In [9]:
# Hold out 30% of the sampled ratings for testing.
# random_state pins the shuffle so the same rows land in train/test on every run.
train, test = train_test_split(
    data_sample[['CustID', 'MovieID', 'Ratings']],
    test_size=0.3,
    random_state=15,
)

In [10]:
# (rows, columns) of the training split.
train.shape

(21000000, 3)

In [11]:
# (rows, columns) of the held-out test split.
test.shape

(9000000, 3)

In [12]:
# Preview the first training rows (CustID, MovieID, Ratings).
train.head()

Unnamed: 0,CustID,MovieID,Ratings
70257360,815079,12732,5
53400632,974317,9728,2
40572998,1635135,7193,3
43982736,2172301,7763,4
92284896,1398961,16377,4


In [13]:
# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1,5))
# Wrap the training DataFrame (CustID, MovieID, Ratings columns, in that order)
# as a Surprise Dataset.
train_data_mf = Dataset.load_from_df(train[['CustID', 'MovieID', 'Ratings']], reader)

# Build the Surprise trainset object from the whole training Dataset;
# this is what `algo.fit` receives later.
trainset = train_data_mf.build_full_trainset()

In [14]:
# Test ratings as a list of (CustID, MovieID, Ratings) tuples — the form handed to `algo.test` later.
testset = list(zip(test["CustID"].values, test["MovieID"].values, test["Ratings"].values))

In [15]:
# Peek at the first five test tuples.
testset[:5]

[(1251005, 9757, 5),
 (1235309, 14364, 5),
 (710029, 11677, 4),
 (716034, 5671, 4),
 (235224, 16357, 5)]

In [16]:
# Result stores filled in after each model run, keyed by model name:
# one for training-set metrics, one for test-set metrics.
model_train_evaluation = {}
model_test_evaluation = {}

def get_ratings(predictions):
    """Split prediction records into true and estimated rating arrays.

    Each element of ``predictions`` must expose ``r_ui`` (the true rating)
    and ``est`` (the estimated rating). Returns ``(actual, predicted)`` as
    two NumPy arrays in the same order as the input.
    """
    true_ratings = []
    estimated_ratings = []
    for prediction in predictions:
        true_ratings.append(prediction.r_ui)
        estimated_ratings.append(prediction.est)
    return np.array(true_ratings), np.array(estimated_ratings)

def get_error(predictions):
    """Return ``(rmse, mape)`` for a collection of prediction records.

    RMSE is the square root of the mean squared error between true and
    estimated ratings. MAPE is the mean absolute error relative to the true
    rating, expressed in percent; it divides by the true rating, so true
    ratings of zero are not handled.
    """
    truth, estimate = get_ratings(predictions)
    mse = mean_squared_error(truth, estimate)
    relative_error = abs((truth - estimate) / truth)
    return np.sqrt(mse), np.mean(relative_error) * 100

# Seed Python's and NumPy's global random number generators.
# NOTE(review): this cell runs after the sampling and train/test split cells
# earlier in the notebook, so seeding here does not by itself make those
# steps repeatable.
my_seed = 15
random.seed(my_seed)
np.random.seed(my_seed)

def _report_predictions(predictions):
    """Print RMSE/MAPE for ``predictions`` and return them with the estimates."""
    _, predicted = get_ratings(predictions)
    rmse, mape = get_error(predictions)
    print("RMSE = {}".format(rmse))
    print("MAPE = {}".format(mape))
    print("-"*50)
    return {"RMSE": rmse, "MAPE": mape, "Prediction": predicted}


def run_surprise(algo, trainset, testset, model_name):
    """Fit ``algo`` and report its error on the training and test data.

    Parameters
    ----------
    algo : object with ``fit(trainset)`` and ``test(testset)`` methods.
    trainset : training set passed to ``algo.fit``; its ``build_testset()``
        output is used to score the model on the data it was trained on.
    testset : held-out ratings passed to ``algo.test``.
    model_name : label for the model; not used inside this function at
        present, kept so existing callers keep working.

    Returns
    -------
    (train, test) : two dicts, each with keys ``"RMSE"``, ``"MAPE"`` and
        ``"Prediction"`` (array of estimated ratings).

    Side effects: prints the metrics for both splits and the total elapsed time.
    """
    startTime = datetime.now()

    algo.fit(trainset)

    print("-"*50)
    print("TRAIN DATA")
    train = _report_predictions(algo.test(trainset.build_testset()))

    print("TEST DATA")
    test = _report_predictions(algo.test(testset))

    print("Time Taken = "+str(datetime.now() - startTime))

    return train, test

In [17]:

#param_grid  = {'n_factors': [5,7,10,15,20,25,35,50,70,90]}   # n_factors is the latent dimension 'd' of the factorization:
# the n*m rating matrix 'A' is broken into two matrices, 'b' of dimension n*d and 'c' of dimension m*d.

#gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

#gs.fit(train_data_mf)

# best RMSE score
#print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
#print(gs.best_params['rmse'])


In [18]:
# Train and evaluate a biased SVD model (100 factors, 20 epochs).
# The seed comes from `my_seed` rather than a second hard-coded literal,
# so there is a single place to change it.
algo = SVD(n_factors=100, n_epochs=20, biased=True, random_state=my_seed, verbose=True)

train_result, test_result = run_surprise(algo, trainset, testset, "SVD")

# Keep the metrics under the model's name in the result stores.
model_train_evaluation["SVD"] = train_result
model_test_evaluation["SVD"] = test_result

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
--------------------------------------------------
TRAIN DATA
RMSE = 0.7204859116599033
MAPE = 21.446647138618097
--------------------------------------------------
TEST DATA
RMSE = 0.9002603029889283
MAPE = 27.11465518280302
--------------------------------------------------
Time Taken = 0:26:36.445206
