# Netflix 

## Load libraries

In [None]:
# Useful starting lines
%matplotlib inline

import numpy as np
import seaborn as sns 
import pandas as pd

from scipy import sparse
from scipy.sparse import csr_matrix

import scipy
import scipy.io
import scipy.sparse as sp
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity

import models as m
import utils as u

from surprise.dataset import * 
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise import BaselineOnly,CoClustering,SVD,SVDpp,NMF,SlopeOne,KNNBasic

import xgboost as xgb

%load_ext autoreload
%autoreload 2

## Load data

In [None]:
# Read the raw ratings file, run it through u.preprocess, then flatten the
# index and discard the original 'Id' column.
raw_data = pd.read_csv(
    'data/data_train.csv',
    header=0,
    index_col=0,
    names=['Id', 'rating'],
)
data = (
    u.preprocess(raw_data)
    .reset_index()
    .drop(columns=['Id'])
)

In [None]:
# Headline counts of the preprocessed ratings table.
print("Total data ")
print("-" * 50)
print("\nTotal no of ratings :", len(data))
print("Total No of Users   :", np.unique(data["user"]).size)
print("Total No of movies  :", np.unique(data["item"]).size)

## Surprise models

### Surprise Reader

In [None]:
# Describe the rating scale to Surprise, then wrap the (user, item, rating)
# columns of `data` in a Surprise Dataset.
reader = Reader(rating_scale=(1.0, 5.0))
formatted_data = Dataset.load_from_df(
    data.loc[:, ['user', 'item', 'rating']],
    reader,
)

In [None]:


# ---- Hyper-parameters of the Surprise models -------------------------------

# SVD (the factor count, learning rate and regularisation are also passed to SVD++)
n_factorsSVD = 80
n_epochsSVD = 800
lr_allSVD = 0.001667
reg_allSVD = 0.1

# SVD++ only overrides the number of epochs
epochs_SVDpp = 30

# NMF -- the training cell reads these six names, but they were never assigned
# anywhere in the notebook, so the NMF(...) call raised NameError. The values
# below are the defaults documented for surprise.NMF; tune them as needed.
n_factorsNMF = 15
n_epochsNMF = 50
reg_puNMF = 0.06
reg_qiNMF = 0.06
reg_buNMF = 0.02
reg_biNMF = 0.02

# Co-clustering
n_cltr_uCC = 13
n_cltr_iCC = 13
n_epochsCC = 200

# BaselineOnly
bsl_options = {'method': 'als', 'n_epochs': 10, 'reg_u': 15, 'reg_i': 5}

# KNN, user-based variant
model_parameters_user = {
      'name': 'pearson',
      'user_based': True
    }

k_user = 100

# KNN, item (movie)-based variant
model_parameters_movie = {
      'name': 'pearson',
      'user_based': False
    }

k_movie = 300

In [None]:
# Hold out 20% of the ratings: the individual models are fitted on `trainset`,
# the blending model on `blending_trainset`.
print("Seperating the data in 2 datasets: one for training the models and one for training the blending model:")
trainset, blending_trainset = train_test_split(
    formatted_data,
    test_size=.2,
    random_state=1,
)

In [None]:
import utils as u

# Convert the Surprise trainset to a DataFrame and lay the ratings out as a
# sparse matrix with movies on the rows and users on the columns.
df_trainset = u.trainset_from_surprise_to_df(trainset)
ratings = sparse.csr_matrix(
    (
        df_trainset['Rating'].values,
        (df_trainset['Movie'].values, df_trainset['User'].values),
    )
)
print("The training matrix shape is : (movie, user) : ", ratings.shape)

In [None]:
# Summary statistics of the (movie, user) matrix from u.stat_data; both results
# are passed to m.split_data in the next cell.
# presumably rating counts per user / per item, judging by the names -- TODO confirm
num_items_per_user, num_users_per_item = u.stat_data(ratings)

In [None]:
# Split the (movie, user) matrix into the train/test pair used by the
# hand-written models below (p_test=0.1; the sampling itself lives in m.split_data).
train, test = m.split_data(ratings, num_items_per_user, num_users_per_item, p_test=0.1)

In [None]:
# Fit the SGD matrix factorisation from the project's models module. The two
# returned arrays are indexed below as [:, user_id] and [:, movie_id].
print("-Training MF SGD")
user_sgd, movie_sgd = m.matrix_factorization_SGD(train, test)

In [None]:
# Predict every (user, movie) pair of the blending set with the SGD factors.
# assumes user_sgd / movie_sgd are 2-D arrays whose columns are per-user /
# per-movie feature vectors, so each product below is a scalar -- TODO confirm
sgd_predictions = []

for uid, iid, _ in blending_trainset:  # (user, movie, rating)
    user_data = user_sgd[:, uid]
    movie_data = movie_sgd[:, iid]

    sgd_predictions.append(movie_data @ user_data.T)

# One row per blending sample, one column. Wrapping the list in another list
# (pd.DataFrame([preds])) produced a single row with N columns, which cannot be
# aligned with the other per-sample prediction frames in the column-wise concat.
dfMFSGD = pd.DataFrame(sgd_predictions)

In [None]:
# Fit the ALS matrix factorisation from the project's models module. The two
# returned arrays are indexed below as [:, user_id] and [:, movie_id].
user_als,movie_als = m.ALS(train, test)

In [None]:
# Predict every (user, movie) pair of the blending set with the ALS factors.
# assumes user_als / movie_als are 2-D arrays whose columns are per-user /
# per-movie feature vectors, so each product below is a scalar -- TODO confirm
als_predictions = []

for uid, iid, _ in blending_trainset:  # (user, movie, rating)
    user_data = user_als[:, uid]
    movie_data = movie_als[:, iid]

    als_predictions.append(movie_data @ user_data.T)

# One row per blending sample, one column. Wrapping the list in another list
# (pd.DataFrame([preds])) produced a single row with N columns, which cannot be
# aligned with the other per-sample prediction frames in the column-wise concat.
dfMFALS = pd.DataFrame(als_predictions)

In [None]:
# Global-mean baseline from the models module; the returned value is repeated
# once per blending sample in the next cell.
print("-Training global baseline")
baseline_global=m.baseline_global_mean(train, test)

In [None]:
# Constant feature column: the global-baseline value repeated for every
# sample of the blending set.
dfBLGlobal = pd.DataFrame( [baseline_global] *len(blending_trainset) )

In [None]:
# Per-user mean baseline from the models module; read in the next cell as
# baseline_user[0, user].
print("-Training user baseline")
baseline_user=m.baseline_user_mean(train, test)

In [None]:
# User-mean baseline looked up for each sample of the blending set.
dfBLUser = pd.DataFrame(
    [baseline_user[0, user] for user, _, _ in blending_trainset]
)

In [None]:
# Per-movie mean baseline from the models module; read in the next cell as
# baseline_movie[movie, 0].
print("-Training movie baseline")
baseline_movie=m.baseline_movie_mean(train, test)

In [None]:
# Movie-mean baseline looked up for each sample of the blending set.
dfBLMovie = pd.DataFrame(
    [baseline_movie[movie, 0] for _, movie, _ in blending_trainset]
)

In [None]:
# Targets of the blending model: the true rating (third tuple element) of
# every sample in the second split, as a list and as a DataFrame.
label_blending_trainset = [rating for _, _, rating in blending_trainset]

df_label_blending_trainset = pd.DataFrame(label_blending_trainset)

In [None]:
# --- Fit every Surprise model on the first split ----------------------------
print("-Training CoCluster")
algoCC= CoClustering(n_cltr_i=n_cltr_iCC, n_cltr_u=n_cltr_uCC, n_epochs=n_epochsCC)
algoCC.fit(trainset)

print("-Training Baseline")
algoBL=BaselineOnly(bsl_options=bsl_options)
algoBL.fit(trainset)

print("-Training SVD")
algoSVD=SVD( n_factors=n_factorsSVD, n_epochs=n_epochsSVD, lr_all=lr_allSVD,reg_all=reg_allSVD)
algoSVD.fit(trainset)

print("-Training SVD++")
algoSVDpp = SVDpp(n_factors=n_factorsSVD, n_epochs=epochs_SVDpp, lr_all=lr_allSVD,reg_all=reg_allSVD)
algoSVDpp.fit(trainset)

print("-Training NMF")
algoNMF = NMF(n_factors=n_factorsNMF, n_epochs=n_epochsNMF, reg_pu=reg_puNMF, reg_qi=reg_qiNMF, reg_bu=reg_buNMF, reg_bi=reg_biNMF)
algoNMF.fit(trainset)

# Surprise reads the similarity configuration of its k-NN algorithms from the
# `sim_options` keyword. An unrecognised keyword such as `model_parameters` is
# swallowed by **kwargs and ignored, which left both KNN models on the library
# defaults (so the "movie" model was not item-based at all).
print("-Training KNN on movie")
algoKNNMovie =KNNBasic(sim_options=model_parameters_movie, k=k_movie)
algoKNNMovie.fit(trainset)

print("-Training KNN on user")
algoKNNUser =KNNBasic(sim_options=model_parameters_user,k=k_user)
algoKNNUser.fit(trainset)

print("-Training Slope One")
algoSO = SlopeOne()
algoSO.fit(trainset)

# --- Predict the second split with each fitted model ------------------------
# Each prediction list is converted to a DataFrame by u.pred_from_suprise_to_df
# so it can become one input column of the blending model.
print("-For the Blending algorithm, we predict on the second dataset using the trained models")
predCC=algoCC.test(blending_trainset)
dfCC=u.pred_from_suprise_to_df(predCC)

predBL=algoBL.test(blending_trainset)
dfBL=u.pred_from_suprise_to_df(predBL)

predSVD=algoSVD.test(blending_trainset)
dfSVD=u.pred_from_suprise_to_df(predSVD)

predSVDpp=algoSVDpp.test(blending_trainset)
dfSVDpp=u.pred_from_suprise_to_df(predSVDpp)

predNMF=algoNMF.test(blending_trainset)
dfNMF=u.pred_from_suprise_to_df(predNMF)

predKNNMovie=algoKNNMovie.test(blending_trainset)
dfKNNMovie=u.pred_from_suprise_to_df(predKNNMovie)

predKNNUser=algoKNNUser.test(blending_trainset)
dfKNNUser=u.pred_from_suprise_to_df(predKNNUser)

predSO=algoSO.test(blending_trainset)
dfSO=u.pred_from_suprise_to_df(predSO)

## Sparse Matrix Training 

In [None]:
# Same training ratings, this time with users on the rows and movies on the
# columns.
sparse_matrix = sparse.csr_matrix(
    (
        df_trainset['Rating'].values,
        (df_trainset['User'].values, df_trainset['Movie'].values),
    )
)
print("The training matrix shape is : (user, movie) : ", sparse_matrix.shape)

In [None]:
# Share of empty cells in the (user, movie) matrix, as a percentage.
users, movies = sparse_matrix.shape
elem = sparse_matrix.count_nonzero()

filled_fraction = elem / (users * movies)
print("Sparsity of the training matrix : {0} % ".format((1 - filled_fraction) * 100))

## Rating's averages

### Rating's average over all data

In [None]:
# Mean over the non-zero entries only: sum of all ratings / number of ratings.
rating_total = sparse_matrix.sum()
rating_count = sparse_matrix.count_nonzero()
global_average = rating_total / rating_count
print("The average rating over all movies of trainset is : {0} ".format(global_average))

### Rating's average per user

In [None]:
print("Computing the rating's average per user")

# user_mean[i] is the mean of the non-zero ratings in row i of sparse_matrix,
# or 0 when that row holds no rating.
# It is computed from row sums and row counts instead of slicing the matrix
# row by row: this is faster and no longer overwrites the notebook-level
# `ratings` matrix with a loop temporary.
user_rating_sums = np.asarray(sparse_matrix.sum(axis=1), dtype=float).ravel()
user_rating_counts = np.asarray((sparse_matrix != 0).sum(axis=1)).ravel()

user_mean = np.divide(
    user_rating_sums,
    user_rating_counts,
    out=np.zeros_like(user_rating_sums),  # value kept where the count is 0
    where=user_rating_counts > 0,
).tolist()

### Rating's average per movie

In [None]:
print("Computing the rating's average per movie")

# movie_mean[j] is the mean of the non-zero ratings in column j of
# sparse_matrix, or 0 when that column holds no rating.
# It is computed from column sums and column counts instead of slicing the
# matrix column by column: this is faster and no longer overwrites the
# notebook-level `ratings` matrix with a loop temporary.
movie_rating_sums = np.asarray(sparse_matrix.sum(axis=0), dtype=float).ravel()
movie_rating_counts = np.asarray((sparse_matrix != 0).sum(axis=0)).ravel()

movie_mean = np.divide(
    movie_rating_sums,
    movie_rating_counts,
    out=np.zeros_like(movie_rating_sums),  # value kept where the count is 0
    where=movie_rating_counts > 0,
).tolist()

### Similarity Matrix 

In [None]:
# Row (user) and column (movie) indices that hold at least one rating,
# de-duplicated and sorted in ascending order.
row_ind, col_ind = (
    sorted(set(axis_indices)) for axis_indices in sparse_matrix.nonzero()
)

#### User-User similarity 

In [None]:
# Number of most-similar users kept per user by the similarity loop that follows.
top = 5 
print("Computing top",top,"similar user for each user")

In [None]:
# user_simil_matrix[i] holds the indices of the `top` users most similar to
# user i (cosine similarity between rows of sparse_matrix).
# Every row index is visited, including rows without any rating, so that
# position i of the list always corresponds to user id i. Building the list
# from the non-empty rows only shifted the entries as soon as one id was
# unused, while later cells index this list directly by user id.
user_simil_matrix = []

for row in range(sparse_matrix.shape[0]):
    # similarity of this user with all users
    simil = cosine_similarity(sparse_matrix.getrow(row), sparse_matrix).ravel()

    # rank by decreasing similarity, then drop the user itself explicitly
    # (with ties it is not guaranteed to be ranked first)
    ranked = np.argsort(simil)[::-1]
    top_users = ranked[ranked != row][:top]
    user_simil_matrix.append(top_users)

#### Movie-Movie similarity

In [None]:
# Number of most-similar movies kept per movie by the similarity loop that follows.
top = 5 
print("Computing top",top,"similar movie for each movie")

In [None]:
# movie_simil_matrix[j] holds the indices of the `top` movies most similar to
# movie j (cosine similarity between columns of sparse_matrix).
# Every column index is visited, including movies without any rating, so that
# position j of the list always corresponds to movie id j; later cells index
# this list directly by movie id.
movie_major = sparse_matrix.T.tocsr()  # transposed once instead of per iteration
movie_simil_matrix = []

for col in range(movie_major.shape[0]):
    # similarity of this movie with all movies
    simil = cosine_similarity(movie_major.getrow(col), movie_major).ravel()

    # rank by decreasing similarity, then drop the movie itself explicitly
    # (with ties it is not guaranteed to be ranked first)
    ranked = np.argsort(simil)[::-1]
    top_movies = ranked[ranked != col][:top]
    movie_simil_matrix.append(top_movies)

### Featurizing the trainset

Global_Average : Average rating of all the ratings
 
User_Average : User's Average rating

Movie_Average : Average rating of this movie

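Similar users' ratings of this movie:
SimUser1, SimUser2, SimUser3, SimUser4, SimUser5 (ratings given to this movie by the user's top 5 most similar users; a similar user's average rating is used when they have not rated the movie)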

Similar movies rated by this user:
SimMovie1, SimMovie2, SimMovie3, SimMovie4, SimMovie5 (ratings given by this user to the movie's top 5 most similar movies; a similar movie's average rating is used when the user has not rated it)

In [None]:
# Coordinates of every non-zero entry of the (user, movie) matrix: one
# (row, column) pair per rating. This rebinds row_ind / col_ind, which
# previously held the sorted unique indices.
row_ind, col_ind = sparse_matrix.nonzero()

In [None]:
# One row per training rating: its user, its movie, the rating itself
# ('Grade') and the global average repeated on every row.
# NOTE(review): pairs nonzero() coordinates with sparse_matrix.data, which
# assumes both are in the same order and that no explicit zero is stored -- confirm.
df_featured_data = pd.DataFrame({'User': row_ind, 'Movie' : col_ind, 'Grade' : sparse_matrix.data, 'Global_Average' : global_average })

In [None]:
# Attach the per-user and per-movie mean rating to every training row.
for feature_name, id_column, means in (
    ('User_Average', 'User', user_mean),
    ('Movie_Average', 'Movie', movie_mean),
):
    df_featured_data[feature_name] = df_featured_data[id_column].map(
        lambda idx, means=means: means[idx]
    )

Get the indices of the similar users

In [None]:
# Seed SimUser1..SimUser5 with the index of each row's k-th most similar user.
for rank in range(5):
    df_featured_data[f'SimUser{rank + 1}'] = df_featured_data['User'].map(
        lambda x, rank=rank: int(user_simil_matrix[x][rank])
    )

For each similar user, we look up the rating they gave to that movie; if it is not available, we use that user's average rating as an estimate.

In [None]:
def Userfunction1(row):
    """Return the rating the 1st most similar user gave to this row's movie.

    `row` is one DataFrame row whose 'SimUser1' entry still holds a user
    index and whose 'Movie' entry holds a movie index. When sparse_matrix
    stores no rating for that pair (the entry is 0), the similar user's mean
    rating from `user_mean` is returned instead.
    """
    # DataFrame.apply(axis=1) hands the row over as floats when the frame
    # mixes dtypes; cast once so the sparse lookup and the list lookup both
    # receive plain integers.
    sim_user = int(row['SimUser1'])
    movie = int(row['Movie'])

    rating = sparse_matrix[sim_user, movie]  # single lookup instead of two
    if rating == 0:
        return user_mean[sim_user]
    return rating

In [None]:
# Overwrite the neighbour index stored in 'SimUser1' with the value returned by
# Userfunction1 for each row. Not re-runnable on its own: afterwards the
# column no longer holds indices.
df_featured_data['SimUser1'] = df_featured_data.apply(Userfunction1,axis=1)

In [None]:
def Userfunction2(row):
    """Return the rating the 2nd most similar user gave to this row's movie.

    `row` is one DataFrame row whose 'SimUser2' entry still holds a user
    index and whose 'Movie' entry holds a movie index. When sparse_matrix
    stores no rating for that pair (the entry is 0), the similar user's mean
    rating from `user_mean` is returned instead.
    """
    # DataFrame.apply(axis=1) hands the row over as floats when the frame
    # mixes dtypes; cast once so the sparse lookup and the list lookup both
    # receive plain integers.
    sim_user = int(row['SimUser2'])
    movie = int(row['Movie'])

    rating = sparse_matrix[sim_user, movie]  # single lookup instead of two
    if rating == 0:
        return user_mean[sim_user]
    return rating

In [None]:
# Overwrite the neighbour index stored in 'SimUser2' with the value returned by
# Userfunction2 for each row. Not re-runnable on its own: afterwards the
# column no longer holds indices.
df_featured_data['SimUser2'] = df_featured_data.apply(Userfunction2,axis=1)

In [None]:
def Userfunction3(row):
    """Return the rating the 3rd most similar user gave to this row's movie.

    `row` is one DataFrame row whose 'SimUser3' entry still holds a user
    index and whose 'Movie' entry holds a movie index. When sparse_matrix
    stores no rating for that pair (the entry is 0), the similar user's mean
    rating from `user_mean` is returned instead.
    """
    # DataFrame.apply(axis=1) hands the row over as floats when the frame
    # mixes dtypes; cast once so the sparse lookup and the list lookup both
    # receive plain integers.
    sim_user = int(row['SimUser3'])
    movie = int(row['Movie'])

    rating = sparse_matrix[sim_user, movie]  # single lookup instead of two
    if rating == 0:
        return user_mean[sim_user]
    return rating

In [None]:
# Overwrite the neighbour index stored in 'SimUser3' with the value returned by
# Userfunction3 for each row. Not re-runnable on its own: afterwards the
# column no longer holds indices.
df_featured_data['SimUser3'] = df_featured_data.apply(Userfunction3,axis=1)

In [None]:
def Userfunction4(row):
    """Return the rating the 4th most similar user gave to this row's movie.

    `row` is one DataFrame row whose 'SimUser4' entry still holds a user
    index and whose 'Movie' entry holds a movie index. When sparse_matrix
    stores no rating for that pair (the entry is 0), the similar user's mean
    rating from `user_mean` is returned instead.
    """
    # DataFrame.apply(axis=1) hands the row over as floats when the frame
    # mixes dtypes; cast once so the sparse lookup and the list lookup both
    # receive plain integers.
    sim_user = int(row['SimUser4'])
    movie = int(row['Movie'])

    rating = sparse_matrix[sim_user, movie]  # single lookup instead of two
    if rating == 0:
        return user_mean[sim_user]
    return rating

In [None]:
# Overwrite the neighbour index stored in 'SimUser4' with the value returned by
# Userfunction4 for each row. Not re-runnable on its own: afterwards the
# column no longer holds indices.
df_featured_data['SimUser4'] = df_featured_data.apply(Userfunction4,axis=1)

In [None]:
def Userfunction5(row):
    """Return the rating the 5th most similar user gave to this row's movie.

    `row` is one DataFrame row whose 'SimUser5' entry still holds a user
    index and whose 'Movie' entry holds a movie index. When sparse_matrix
    stores no rating for that pair (the entry is 0), the similar user's mean
    rating from `user_mean` is returned instead.
    """
    # DataFrame.apply(axis=1) hands the row over as floats when the frame
    # mixes dtypes; cast once so the sparse lookup and the list lookup both
    # receive plain integers.
    sim_user = int(row['SimUser5'])
    movie = int(row['Movie'])

    rating = sparse_matrix[sim_user, movie]  # single lookup instead of two
    if rating == 0:
        return user_mean[sim_user]
    return rating

In [None]:
# Overwrite the neighbour index stored in 'SimUser5' with the value returned by
# Userfunction5 for each row. Not re-runnable on its own: afterwards the
# column no longer holds indices.
df_featured_data['SimUser5'] = df_featured_data.apply(Userfunction5,axis=1)

Get the indices of the similar movies

In [None]:
# Seed SimMovie1..SimMovie5 with the index of each row's k-th most similar movie.
for rank in range(5):
    df_featured_data[f'SimMovie{rank + 1}'] = df_featured_data['Movie'].map(
        lambda x, rank=rank: int(movie_simil_matrix[x][rank])
    )

For each similar movie, we look up the rating that the user gave to it; if it is not available, we use the similar movie's average rating instead.

In [None]:
def Moviefunction1(row):
    """Return the rating this row's user gave to the 1st most similar movie.

    `row` is one DataFrame row whose 'SimMovie1' entry still holds a movie
    index and whose 'User' entry holds a user index. When sparse_matrix
    stores no rating for that pair (the entry is 0), the similar movie's mean
    rating from `movie_mean` is returned instead.
    """
    # DataFrame.apply(axis=1) hands the row over as floats when the frame
    # mixes dtypes; cast once so the sparse lookup and the list lookup both
    # receive plain integers.
    user = int(row['User'])
    sim_movie = int(row['SimMovie1'])

    rating = sparse_matrix[user, sim_movie]  # single lookup instead of two
    if rating == 0:
        return movie_mean[sim_movie]
    return rating

In [None]:
# Overwrite the neighbour index stored in 'SimMovie1' with the value returned by
# Moviefunction1 for each row. Not re-runnable on its own: afterwards the
# column no longer holds indices.
df_featured_data['SimMovie1'] = df_featured_data.apply(Moviefunction1,axis=1)

In [None]:
def Moviefunction2(row):
    """Return the rating this row's user gave to the 2nd most similar movie.

    `row` is one DataFrame row whose 'SimMovie2' entry still holds a movie
    index and whose 'User' entry holds a user index. When sparse_matrix
    stores no rating for that pair (the entry is 0), the similar movie's mean
    rating from `movie_mean` is returned instead.
    """
    # DataFrame.apply(axis=1) hands the row over as floats when the frame
    # mixes dtypes; cast once so the sparse lookup and the list lookup both
    # receive plain integers.
    user = int(row['User'])
    sim_movie = int(row['SimMovie2'])

    rating = sparse_matrix[user, sim_movie]  # single lookup instead of two
    if rating == 0:
        return movie_mean[sim_movie]
    return rating

In [None]:
# Overwrite the neighbour index stored in 'SimMovie2' with the value returned by
# Moviefunction2 for each row. Not re-runnable on its own: afterwards the
# column no longer holds indices.
df_featured_data['SimMovie2'] = df_featured_data.apply(Moviefunction2,axis=1)

In [None]:
def Moviefunction3(row):
    """Return the rating this row's user gave to the 3rd most similar movie.

    `row` is one DataFrame row whose 'SimMovie3' entry still holds a movie
    index and whose 'User' entry holds a user index. When sparse_matrix
    stores no rating for that pair (the entry is 0), the similar movie's mean
    rating from `movie_mean` is returned instead.
    """
    # DataFrame.apply(axis=1) hands the row over as floats when the frame
    # mixes dtypes; cast once so the sparse lookup and the list lookup both
    # receive plain integers.
    user = int(row['User'])
    sim_movie = int(row['SimMovie3'])

    rating = sparse_matrix[user, sim_movie]  # single lookup instead of two
    if rating == 0:
        return movie_mean[sim_movie]
    return rating

In [None]:
# Overwrite the neighbour index stored in 'SimMovie3' with the value returned by
# Moviefunction3 for each row. Not re-runnable on its own: afterwards the
# column no longer holds indices.
df_featured_data['SimMovie3'] = df_featured_data.apply(Moviefunction3,axis=1)

In [None]:
def Moviefunction4(row):
    """Return the rating this row's user gave to the 4th most similar movie.

    `row` is one DataFrame row whose 'SimMovie4' entry still holds a movie
    index and whose 'User' entry holds a user index. When sparse_matrix
    stores no rating for that pair (the entry is 0), the similar movie's mean
    rating from `movie_mean` is returned instead.
    """
    # DataFrame.apply(axis=1) hands the row over as floats when the frame
    # mixes dtypes; cast once so the sparse lookup and the list lookup both
    # receive plain integers.
    user = int(row['User'])
    sim_movie = int(row['SimMovie4'])

    rating = sparse_matrix[user, sim_movie]  # single lookup instead of two
    if rating == 0:
        return movie_mean[sim_movie]
    return rating

In [None]:
# Overwrite the neighbour index stored in 'SimMovie4' with the value returned by
# Moviefunction4 for each row. Not re-runnable on its own: afterwards the
# column no longer holds indices.
df_featured_data['SimMovie4'] = df_featured_data.apply(Moviefunction4,axis=1)

In [None]:
def Moviefunction5(row):
    """Return the rating this row's user gave to the 5th most similar movie.

    `row` is one DataFrame row whose 'SimMovie5' entry still holds a movie
    index and whose 'User' entry holds a user index. When sparse_matrix
    stores no rating for that pair (the entry is 0), the similar movie's mean
    rating from `movie_mean` is returned instead.
    """
    # DataFrame.apply(axis=1) hands the row over as floats when the frame
    # mixes dtypes; cast once so the sparse lookup and the list lookup both
    # receive plain integers.
    user = int(row['User'])
    sim_movie = int(row['SimMovie5'])

    rating = sparse_matrix[user, sim_movie]  # single lookup instead of two
    if rating == 0:
        return movie_mean[sim_movie]
    return rating

In [None]:
# Overwrite the neighbour index stored in 'SimMovie5' with the value returned by
# Moviefunction5 for each row. Not re-runnable on its own: afterwards the
# column no longer holds indices.
df_featured_data['SimMovie5'] = df_featured_data.apply(Moviefunction5,axis=1)

### Featurizing the blending trainset

In [None]:
# Turn the blending split into a DataFrame with one (User, Movie, Rating) row
# per sample.
# The loop variables must not be called `u` / `m`: those names are the aliases
# of the utils and models modules, and rebinding them here made the later
# u.predict_on_all_models_and_features_xgb / u.create_csv_submission calls fail.
df_blending_trainset = [
    [user_id, movie_id, rating] for user_id, movie_id, rating in blending_trainset
]

df_featured_blending_trainset = pd.DataFrame(df_blending_trainset)
df_featured_blending_trainset = df_featured_blending_trainset.rename({0:'User',1:'Movie',2:'Rating'},axis =1)

In [None]:
# Attach the per-user and per-movie mean rating to every blending row.
for feature_name, id_column, means in (
    ('User_Average', 'User', user_mean),
    ('Movie_Average', 'Movie', movie_mean),
):
    df_featured_blending_trainset[feature_name] = df_featured_blending_trainset[id_column].map(
        lambda idx, means=means: means[idx]
    )

Get the indices of the similar users

In [None]:
# Seed SimUser1..SimUser5 with the index of each row's k-th most similar user.
for rank in range(5):
    df_featured_blending_trainset[f'SimUser{rank + 1}'] = df_featured_blending_trainset['User'].map(
        lambda x, rank=rank: int(user_simil_matrix[x][rank])
    )

For each similar user, we look up the rating they gave to that movie; if it is not available, we use that user's average rating as an estimate.

In [None]:
# Replace each stored neighbour index by the value its lookup function
# returns, one column at a time and in the same order as before.
for column, lookup in (
    ('SimUser1', Userfunction1),
    ('SimUser2', Userfunction2),
    ('SimUser3', Userfunction3),
    ('SimUser4', Userfunction4),
    ('SimUser5', Userfunction5),
):
    df_featured_blending_trainset[column] = df_featured_blending_trainset.apply(lookup, axis=1)

In [None]:
# Seed SimMovie1..SimMovie5 with the index of each row's k-th most similar movie.
for rank in range(5):
    df_featured_blending_trainset[f'SimMovie{rank + 1}'] = df_featured_blending_trainset['Movie'].map(
        lambda x, rank=rank: int(movie_simil_matrix[x][rank])
    )

For each similar movie, we look up the rating that the user gave to it; if it is not available, we use the similar movie's average rating instead.

In [None]:
# Replace each stored neighbour index by the value its lookup function
# returns, one column at a time and in the same order as before.
for column, lookup in (
    ('SimMovie1', Moviefunction1),
    ('SimMovie2', Moviefunction2),
    ('SimMovie3', Moviefunction3),
    ('SimMovie4', Moviefunction4),
    ('SimMovie5', Moviefunction5),
):
    df_featured_blending_trainset[column] = df_featured_blending_trainset.apply(lookup, axis=1)

In [None]:
# Force the ten similarity features to float, users first and then movies.
for column in (
    'SimUser1', 'SimUser2', 'SimUser3', 'SimUser4', 'SimUser5',
    'SimMovie1', 'SimMovie2', 'SimMovie3', 'SimMovie4', 'SimMovie5',
):
    df_featured_blending_trainset[column] = df_featured_blending_trainset[column].astype(float)

In [None]:
# Remove the target column so it cannot leak into the blender's features.
# Reassigning with errors='ignore' keeps the cell re-runnable; the in-place
# drop raised KeyError on a second execution.
df_featured_blending_trainset = df_featured_blending_trainset.drop(columns=['Rating'], errors='ignore')

In [None]:
# Keep df_featured_blending_trainset itself intact: it still carries 'User' and
# 'Movie' and is the frame handed to the prediction helper at the end of the
# notebook. A second frame without those two id columns is built for the blender.
df_featured_blending_trainset_no_user_movie=df_featured_blending_trainset.drop(['User','Movie'], axis=1)

## xgboosting case

In [None]:

# Put the predictions of every first-stage model side by side (one column per
# model), then give the first thirteen positional columns readable names.
blend_column_names = [
    'dfCC', 'dfBL', 'dfSVD', 'dfSVDpp', 'dfNMF', 'dfKNNMovie', 'dfKNNUser',
    'dfSO', 'dfMFSGD', 'dfMFALS', 'dfBLGlobal', 'dfBLMovie', 'dfBLUser',
]
df_val = pd.concat(
    [dfCC, dfBL, dfSVD, dfSVDpp, dfNMF, dfKNNMovie, dfKNNUser, dfSO,
     dfMFSGD, dfMFALS, dfBLGlobal, dfBLMovie, dfBLUser],
    ignore_index=True,
    axis=1,
)
df_val = df_val.rename(columns=dict(enumerate(blend_column_names)))

In [None]:
# Append the handcrafted features (averages and similarity features, without
# the User/Movie ids) to the model predictions, column-wise.
df_val=pd.concat([df_val,df_featured_blending_trainset_no_user_movie],axis=1)


In [None]:
# Second-stage learner: gradient-boosted trees fitted on the first-stage
# predictions plus the handcrafted features, with the held-out ratings as target.
# NOTE(review): `silent` and passing `eval_metric` to fit() are deprecated in
# recent xgboost releases -- confirm against the installed version.
model_xgb = xgb.XGBRegressor(
    silent=True,
    n_jobs=25,
    random_state=1,
    n_estimators=100,
)

model_xgb.fit(df_val, label_blending_trainset, eval_metric='rmse')


In [None]:
# Produce the final predictions with the project helper, handing it the
# blender, the fitted Surprise models, both factor pairs (SGD and ALS), the
# three baselines and the featured blending frame.
# `u` must still refer to the utils module here (never reuse `u` as a loop variable).
ids, preds = u.predict_on_all_models_and_features_xgb(model_xgb,[algoCC, algoBL,algoSVD, algoSVDpp,algoNMF,algoKNNMovie,algoKNNUser,algoSO],
                                          [user_sgd, movie_sgd],[user_als, movie_als],
                                          baseline_global,baseline_movie,baseline_user,df_featured_blending_trainset)



In [None]:

    
# Write the ids and their predicted ratings to the submission CSV via the utils helper.
u.create_csv_submission(ids, preds, "submissionBlendedXgbFull.csv")
