In [1]:
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import SVD
from surprise import SVDpp
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [2]:
import os
path = os.getcwd()

ratings_df = pd.read_csv(os.path.join(path,'ratings.csv'))

movies_df = pd.read_csv(os.path.join(path,'movies.csv'))

tmdb_df = pd.read_csv(os.path.join(path,'tmdb_data_combine.csv'))

links_df = pd.read_csv(os.path.join(path,'links.csv'))

In [3]:
# Creating a master db
links_df = links_df.dropna(axis=0)
links_df['tmdbId'] = links_df['tmdbId'].astype('int')
master_df = ratings_df.merge(links_df , how='left' , on='movieId')
master_df = master_df.dropna(axis=0)
master_df['imdbId'] = master_df['imdbId'].astype('int')
master_df['tmdbId'] = master_df['tmdbId'].astype('int')
master_df = master_df.merge(tmdb_df, how='left' , left_on='tmdbId' , right_on='id')
master_df = master_df.dropna(axis=0)
master_df = master_df.merge(movies_df[['movieId','title','genres']].copy() , how='left' , on='movieId')

In [4]:
df = master_df[['userId','movieId','rating']].copy()

In [5]:
df.columns = ['userID', 'item', 'rating']

In [6]:
min_ratings = 5
filter_items = df['item'].value_counts() > min_ratings
filter_items = filter_items[filter_items].index.tolist()

# min_user_ratings = 5
# filter_users = df['userID'].value_counts() > min_user_ratings
# filter_users = filter_users[filter_users].index.tolist()

# df_new = df[(df['item'].isin(filter_items)) & (df['userID'].isin(filter_users))]
df_new = df[(df['item'].isin(filter_items))]
print('The original data frame shape:\t{}'.format(df.shape))
print('The new data frame shape:\t{}'.format(df_new.shape))

The original data frame shape:	(96014, 3)
The new data frame shape:	(86007, 3)


## Divide data into test and train and train the model

In [7]:
train_data=df_new.iloc[:int(df_new.shape[0]*0.80)]
test_data=df_new.iloc[int(df_new.shape[0]*0.80):]

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(train_data[['userID', 'item', 'rating']], reader)

In [8]:
algo = SVDpp(n_epochs = 20, lr_all = 0.005 , reg_all = 0.02)

algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x107465a00>

## Make predictions

In [9]:
test_set_business = []
for index , data in test_data.iterrows():
    test_set_business.append(list(data.values))

In [10]:
predictions = algo.test(test_set_business)

In [11]:
pred_test_data = [pred.est for pred in predictions]

In [12]:
test_data['predictions'] = pred_test_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predictions'] = pred_test_data


In [13]:
test_data.head()

Unnamed: 0,userID,item,rating,predictions
76227,500,539,1.0,3.071442
76228,500,543,4.0,2.862351
76229,500,588,4.0,3.072725
76230,500,594,3.0,3.252562
76231,500,595,4.0,3.328335


## Calculate business evaluation metircs

In [14]:
df_2 = test_data.dropna().groupby('userID').apply(lambda grp: grp.nlargest(5, 'predictions'))

In [15]:
df_2

Unnamed: 0_level_0,Unnamed: 1_level_0,userID,item,rating,predictions
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
500,76234,500,720,4.0,3.617129
500,76288,500,4306,5.0,3.567536
500,76241,500,1274,4.0,3.526710
500,76278,500,3083,1.0,3.493697
500,76235,500,838,3.0,3.476684
...,...,...,...,...,...
610,94816,610,318,3.0,4.323706
610,94842,610,750,4.5,4.278575
610,95533,610,58559,4.5,4.192699
610,94850,610,904,5.0,4.185872


In [16]:
df_2['revenue'] = df_2['rating'].apply(lambda x : 12 if x >= 4.5 else 8.5 if x<4.5 and x >= 4 else 2.5 if x<4 and x >= 3.5 else 0)

In [17]:
sum(df_2.revenue)

4619.5

In [18]:
random_revenue = sum(train_data['rating']\
                     .apply(lambda x : 12 if x >= 4.5 else 8.5 if x<4.5 and x >= 4 else 2.5 if x<4 and x >= 3.5 else 0))/len(train_data)

total_revenue_random = random_revenue*555

In [19]:
total_revenue_random

2993.983213429257

In [20]:
random_revenue

5.394564348521183

In [21]:
df_2['rating'].mean(),train_data['rating'].mean()

(4.151351351351352, 3.5534990189666447)

In [22]:
# % change in revenue 
(4530 - 3000)/3000 

0.51

In [23]:
# Profit calculation (rec sys)
sum(df_2.revenue) - 0.75*555 - 0.01*555

4197.7

In [24]:
# Profit calculation (no rec sys)
total_revenue_random - 0.75*555

2577.733213429257