In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
!head -5 /kaggle/input/netflix-prize-data/combined_data_4.txt

13368:
2385003,4,2004-07-08
659432,3,2005-03-16
751812,2,2002-12-16
2625420,2,2004-05-25


Read reviews file

In [3]:
df1 = pd.read_csv('/kaggle/input/netflix-prize-data/combined_data_1.txt', header = None, names = ['UserID', 'Rating'], usecols = [0,1])
df1_null = df1.isnull()

Append new column with MovieID value

In [4]:
movie_index_list = df1_null[df1_null['Rating'] == True].index
l1 = len(movie_index_list)
movie_list = []
for i in tqdm(range(len(movie_index_list)-1)):
    movie_list.append(np.full((movie_index_list[i+1]-movie_index_list[i]), i+1))

# Append last MovieID
movie_list.append(np.full((len(df1) - movie_index_list[l1-1]), l1))
movie_list = np.concatenate(movie_list)
# Add list to dataframe
df1['MovieID'] = movie_list
# Discard NaN rows containing MovieID
df1 = df1[df1['Rating'].notna()]
df1['Rating'] = df1['Rating'].astype(np.uint16)
del movie_index_list, movie_list

100%|██████████| 4498/4498 [00:00<00:00, 28783.03it/s]


Remove users which have given less reviews

In [5]:
min_user_reviews = 60
df1_user = df1.groupby(['UserID'], sort=False)
df1_user = df1_user.agg('count')
user_index = df1_user[df1_user['Rating'] > min_user_reviews].index
df1 = df1[df1['UserID'].isin(user_index)]
del df1_user, user_index

Remove movies which have been reviewed less. They are not popular and less important to recommend.

In [6]:
min_movie_reviews = 2000
df1_movie = df1.groupby(['MovieID'], sort=False)
df1_movie = df1_movie.agg('count')
movie_index = df1_movie[df1_movie['Rating'] > min_movie_reviews].index
df1 = df1[df1['MovieID'].isin(movie_index)]

Generate pivot table

In [7]:
#df = pd.pivot(df1, values='Rating', index='UserID', columns='MovieID')

# Generate new sparse dataframe
#sparse_dtype = pd.SparseDtype(np.uint32, fill_value = np.nan)
#df_sparse = df.astype(sparse_dtype)

In [8]:
import surprise
from surprise import SVD, Reader, Dataset, accuracy
from surprise.model_selection import cross_validate, KFold, train_test_split, GridSearchCV

In [9]:
reader = Reader()
data = Dataset.load_from_df(df1[['UserID', 'MovieID', 'Rating']], reader)
#cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv = 3)

In [10]:
#trainset, testset = train_test_split(data, test_size=0.25)
#algo = SVD()
# Train the algorithm on the trainset, and predict ratings for the testset
#algo.fit(trainset)
#predictions = algo.test(testset)

# Then compute RMSE
#accuracy.rmse(predictions)

Grid Search Hyperparameters

In [11]:
#param_grid = {'n_epochs': [5, 10], 'lr_all': [0.001, 0.005],
#              'reg_all': [0.4, 0.6]}
#gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)

#gs.fit(data)

#View best score

# best RMSE score
#print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
#print(gs.best_params['rmse'])

In [12]:
# We can now use the algorithm that yields the best rmse:
algo = SVD()#gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f3a1ee14950>

Predict on test set

In [13]:
df_movies = pd.read_csv("/kaggle/input/netflix-prize-data/movie_titles.csv", header = None, names = ['MovieID', 'Year', 'Name'], encoding = "ISO-8859-1")
df_movies.set_index('MovieID', inplace = True)

df_movies.head()

Unnamed: 0_level_0,Year,Name
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW


Predict for user 

In [14]:
pred_user = '1000062'
df_temp = df1[df1['UserID'] == pred_user]
#df_temp = df_temp[df_temp['Rating'] == 5]
df_temp = df_temp.set_index('MovieID')
df_temp = df_temp.join(df_movies)

In [15]:
df_temp.head()

Unnamed: 0_level_0,UserID,Rating,Year,Name
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30,1000062,3,2003.0,Something's Gotta Give
138,1000062,5,1995.0,Star Trek: Voyager: Season 1
143,1000062,3,1997.0,The Game
197,1000062,3,2004.0,Taking Lives
199,1000062,3,1978.0,The Deer Hunter


In [16]:
temp_movies = df_movies.copy().reset_index()
temp_movies = temp_movies[temp_movies['MovieID'].isin(movie_index)]
#temp_movies['UserID'] = pred_user

to_pred = temp_movies[['MovieID']]

to_pred[['EstimatedScore']] = to_pred['MovieID'].apply(lambda x: algo.predict(pred_user, x).est)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[k] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


View Predicted Scores

In [17]:
to_pred = to_pred.set_index('MovieID')
to_pred = to_pred.join(df_movies)
to_pred = to_pred.sort_values('EstimatedScore', ascending = False)

In [18]:
to_pred.head(10)

Unnamed: 0_level_0,EstimatedScore,Year,Name
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1363,1.842183,1993.0,Leprechaun
1525,1.85533,2001.0,Monkeybone
749,1.923683,1990.0,Ernest Goes to Jail
3021,1.943274,2003.0,House of the Dead
659,1.945381,1972.0,The Last House on the Left
4127,1.950977,1994.0,The Flintstones
3567,1.968667,2004.0,Starship Troopers 2: Hero of the Federation
1100,1.979509,2000.0,Dr. T & the Women
362,2.029235,2000.0,The Flintstones in Viva Rock Vegas
2109,2.088123,1982.0,Grease 2
