In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/netflix-prize-data/qualifying.txt
/kaggle/input/netflix-prize-data/movie_titles.csv
/kaggle/input/netflix-prize-data/combined_data_4.txt
/kaggle/input/netflix-prize-data/combined_data_2.txt
/kaggle/input/netflix-prize-data/README
/kaggle/input/netflix-prize-data/combined_data_1.txt
/kaggle/input/netflix-prize-data/combined_data_3.txt
/kaggle/input/netflix-prize-data/probe.txt


In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
!head -5 /kaggle/input/netflix-prize-data/combined_data_4.txt

13368:
2385003,4,2004-07-08
659432,3,2005-03-16
751812,2,2002-12-16
2625420,2,2004-05-25


Read reviews file

In [4]:
df1 = pd.read_csv('/kaggle/input/netflix-prize-data/combined_data_1.txt', header = None, names = ['UserID', 'Rating'], usecols = [0,1])
df1_null = df1.isnull()

Append new column with MovieID value

In [5]:
movie_index_list = df1_null[df1_null['Rating'] == True].index
l1 = len(movie_index_list)
movie_list = []
for i in tqdm(range(len(movie_index_list)-1)):
    movie_list.append(np.full((movie_index_list[i+1]-movie_index_list[i]), i+1))

# Append last MovieID
movie_list.append(np.full((len(df1) - movie_index_list[l1-1]), l1))
movie_list = np.concatenate(movie_list)
# Add list to dataframe
df1['MovieID'] = movie_list
# Discard NaN rows containing MovieID
df1 = df1[df1['Rating'].notna()]
df1[['Rating', 'MovieID']] = df1[['Rating', 'MovieID']].astype(np.uint16)
del movie_index_list, movie_list

100%|██████████| 4498/4498 [00:00<00:00, 33041.92it/s]


Remove users which have given less reviews

In [6]:
min_user_reviews = 20
df1_user = df1.groupby(['UserID'], sort=False)
df1_user = df1_user.agg('count')
user_index = df1_user[df1_user['Rating'] > min_user_reviews].index
df1 = df1[df1['UserID'].isin(user_index)]
del df1_user, user_index

Remove movies which have been reviewed less. They are not popular and less important to recommend.

In [7]:
min_movie_reviews = 2000
df1_movie = df1.groupby(['MovieID'], sort=False)
df1_movie = df1_movie.agg('count')
movie_index = df1_movie[df1_movie['Rating'] > min_movie_reviews].index
df1 = df1[df1['MovieID'].isin(movie_index)]

Generate pivot table

In [8]:
#df = pd.pivot(df1, values='Rating', index='UserID', columns='MovieID')

# Generate new sparse dataframe
#sparse_dtype = pd.SparseDtype(np.uint32, fill_value = np.nan)
#df_sparse = df.astype(sparse_dtype)

In [9]:
import surprise
from surprise import SVD, Reader, Dataset, accuracy
from surprise.model_selection import cross_validate, KFold, train_test_split, GridSearchCV

In [10]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df1[['UserID', 'MovieID', 'Rating']][:10000], reader)
cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv = 3)

{'test_rmse': array([1.3216922 , 1.29390946, 1.30746086]),
 'test_mae': array([1.10287351, 1.07457285, 1.10260417]),
 'fit_time': (0.7636096477508545, 0.7737209796905518, 0.7735085487365723),
 'test_time': (0.03450465202331543, 0.03345465660095215, 0.03345012664794922)}

In [11]:
trainset, testset = train_test_split(data, test_size=.25)
algo = SVD()
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 1.3039


1.3039043541870476

In [12]:
kf = KFold(n_splits=3)

algo = surprise.prediction_algorithms.knns.KNNBasic()

for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.3082
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.3193
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.2926


Grid Search Hyperparameters

In [None]:
param_grid = {'n_epochs': [5, 10, 15], 'lr_all': [0.001, 0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

View best score

In [None]:
# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

Fit using best hyperparameters

In [None]:
# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

Predict on test set

In [None]:
predict = algo.test(testset)

In [None]:
df_movies = pd.read_csv("/kaggle/input/netflix-prize-data/movie_titles.csv", header = None, names = ['MovieID', 'Year', 'Name'], encoding = "ISO-8859-1")
df_movies.set_index('MovieID', inplace = True)

df_movies.head()

Predict for user 

In [None]:
pred_user = '1000062'
df_temp = df1[df1['UserID'] == pred_user]
#df_temp = df_temp[df_temp['Rating'] == 5]
df_temp = df_temp.set_index('MovieID')
df_temp = df_temp.join(df_movies)

In [None]:
df_temp.head()

In [None]:
temp_movies = df_movies.copy().reset_index()
temp_movies = temp_movies[temp_movies['MovieID'].isin(movie_index)]
temp_movies['UserID'] = pred_user

to_pred = temp_movies[['MovieID', 'UserID']]

to_pred['EstimatedScore'] = to_pred['MovieID'].apply(lambda x: algo.predict(pred_user, x).est)

View Predicted Scores

In [None]:
to_pred = to_pred.set_index('MovieID')
to_pred = to_pred.join(df_movies)

In [None]:
to_pred