# Recommender System

### Load Libraries

In [1]:
import os
from flask_sqlalchemy import SQLAlchemy
from flask import Flask, session

import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise import SVD
from surprise.model_selection import cross_validate





import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go

import warnings
warnings.filterwarnings('ignore')

from surprise import dump
import os


### Initialize Database

In [2]:

app = Flask(__name__)
app.config['SECRET_KEY'] = os.urandom(32)  # os.getenv('SECRET_KEY')
app.config["SESSION_PERMANENT"] = False
app.config["CACHE_TYPE"] = "null"
app.config["SESSION_TYPE"] = "null"
#basedir = os.path.abspath(os.path.dirname(__file__))

# Database
app.config['SQLALCHEMY_DATABASE_URI'] = 'postgresql://{}:{}@{}:{}/{}'.format(os.getenv('POSTGRES_USER'),
                                                                             os.getenv('POSTGRES_PW'),
                                                                             os.getenv('POSTGRES_URL'),
                                                                             os.getenv('POSTGRES_PORT'),
                                                                             os.getenv('POSTGRES_DB'))
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
#Session(app)
session = session

db = SQLAlchemy(app)

### Get User - Movie Matrix

In [17]:
users_query = """SELECT username FROM users"""
users = [g[0] for g in db.session.execute(users_query).all()]

likes_query = lambda username: """SELECT movie_id FROM movies_user_like WHERE username = '{}'""".format(username)
dislikes_query = lambda username: """SELECT movie_id FROM movies_user_dislike WHERE username = '{}'""".format(username)

user_movie_df_list = []

for user in users:

    
    user_likes = [l[0] for l in db.session.execute( likes_query(user) ).all()]
    user_dislikes = [d[0] for d in db.session.execute( dislikes_query(user) ).all()]

    for movie in user_likes:
        user_movie_df_list.append([user,movie,5])
    for movie in user_dislikes:
        user_movie_df_list.append([user,movie,1])

df = pd.DataFrame(user_movie_df_list, columns = ['userID', 'movieId', 'rating'])


In [18]:
df.tail()

Unnamed: 0,userID,movieId,rating
47826,okeke@gmail.com,tt6096780,1
47827,okeke@gmail.com,tt6473066,1
47828,okeke@gmail.com,tt5806876,1
47829,okeke@gmail.com,tt11316996,1
47830,okeke@gmail.com,,1


### Exploration

In [19]:
df.shape

(47831, 3)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47831 entries, 0 to 47830
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   userID   47831 non-null  object
 1   movieId  47831 non-null  object
 2   rating   47831 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [21]:
# Ratings Distribution


init_notebook_mode(connected=True)

data = df['rating'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / df.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )
# Create layout
layout = dict(title = 'Distribution Of {} ratings'.format(df.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
fig.show()

### Load Dataset

In [22]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['userID', 'movieId', 'rating']], reader)


### Basic algorithms

With the Surprise library, we will benchmark the following algorithms


#### NormalPredictor

* NormalPredictor algorithm predicts a random rating based on the distribution of the training set, which is assumed to be normal. This is one of the most basic algorithms that do not do much work.

#### BaselineOnly

* BasiclineOnly algorithm predicts the baseline estimate for given user and item.

### k-NN algorithms

#### KNNBasic

* KNNBasic is a basic collaborative filtering algorithm.

#### KNNWithMeans

* KNNWithMeans is basic collaborative filtering algorithm, taking into account the mean ratings of each user.

#### KNNWithZScore

* KNNWithZScore is a basic collaborative filtering algorithm, taking into account the z-score normalization of each user.

#### KNNBaseline

* KNNBaseline is a basic collaborative filtering algorithm taking into account a baseline rating.

### Matrix Factorization-based algorithms

#### SVD

* SVD algorithm is equivalent to Probabilistic Matrix Factorization (http://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf)

#### SVDpp

* The SVDpp algorithm is an extension of SVD that takes into account implicit ratings.

#### NMF

* NMF is a collaborative filtering algorithm based on Non-negative Matrix Factorization. It is very similar with SVD.

### Slope One

* Slope One is a straightforward implementation of the SlopeOne algorithm. (https://arxiv.org/abs/cs/0702144)

### Co-clustering

* Co-clustering is a collaborative filtering algorithm based on co-clustering (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.113.6458&rep=rep1&type=pdf)


We use rmse as our accuracy metric for the predictions.

In [23]:
benchmark = []
# Iterate over all algorithms

algorithms = [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), \
    BaselineOnly(), CoClustering()]
algorithms_str = ['SVD', 'SVDpp', 'SlopeOne', 'NMF', 'NormalPredictor', 'KNNBaseline', 'KNNBasic', \
    'KNNWithMeans', 'KNNWithZScore', 'BaselineOnly', 'CoClustering']

print ("Attempting: ", algorithms_str, '\n\n')

for i, algorithm in enumerate(algorithms):
    print("Starting: " ,algorithms_str[i])

    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    # results = cross_validate(algorithm, data, measures=['RMSE','MAE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series(algorithms_str[i], index=['Algorithm']))
    benchmark.append(tmp)

print ('\n\tDONE\n')

Attempting:  ['SVD', 'SVDpp', 'SlopeOne', 'NMF', 'NormalPredictor', 'KNNBaseline', 'KNNBasic', 'KNNWithMeans', 'KNNWithZScore', 'BaselineOnly', 'CoClustering'] 


Starting:  SVD
Starting:  SVDpp
Starting:  SlopeOne
Starting:  NMF
Starting:  NormalPredictor
Starting:  KNNBaseline
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Starting:  KNNBasic
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Starting:  KNNWithMeans
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.

In [24]:
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
surprise_results

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBasic,0.845678,0.580413,3.710496
BaselineOnly,0.851695,0.138176,0.234887
KNNBaseline,0.857741,0.763861,4.300828
SVD,0.867583,4.166059,0.145503
SVDpp,0.874066,14.907406,0.620369
KNNWithMeans,0.884155,0.656744,3.691303
SlopeOne,0.886085,0.070424,0.477862
KNNWithZScore,0.887601,0.712289,3.955232
CoClustering,0.933116,1.0686,0.116925
NMF,0.952955,3.461866,0.201812


In [26]:
print(dir(BaselineOnly))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'compute_baselines', 'compute_similarities', 'default_prediction', 'estimate', 'fit', 'get_neighbors', 'predict', 'test']


In [28]:
# param_grid = {
#     "n_epochs": [5, 10, 15, 20, 30, 40, 50, 100],
#     "lr_all": [0.001, 0.002, 0.005],
#     "reg_all": [0.02, 0.08, 0.4, 0.6]
# }

# smaller grid for testing
param_grid = {
    "n_epochs": [10, 20],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.02]
}
param_grid = {'bsl_options':{'method': ['als','sgd'],'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}}
gs = GridSearchCV(BaselineOnly, param_grid, measures=["rmse", "mae"], refit=True, cv=5)

gs.fit(data)

training_parameters = gs.best_params["rmse"]

print("BEST RMSE: \t", gs.best_score["rmse"])
print("BEST MAE: \t", gs.best_score["mae"])
print("BEST params: \t", gs.best_params["rmse"])

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

In [33]:
from datetime import datetime
print(training_parameters)
reader = Reader(rating_scale=(1, 5))

print("\n\n\t\t STARTING\n\n")
start = datetime.now()

print("> Loading data...")
data = Dataset.load_from_df(df[['userID', 'movieId', 'rating']], reader)
print("> OK")

print("> Creating trainset...")
trainset = data.build_full_trainset()
print("> OK")


startTraining = datetime.now()
print("> Training...")

#algo = SVD(n_epochs = training_parameters['n_epochs'], lr_all = training_parameters['lr_all'], reg_all = training_parameters['reg_all'])
algo = BaselineOnly(training_parameters)

algo.fit(trainset)

endTraining = datetime.now()
print("> OK \t\t It Took: ", (endTraining-startTraining).seconds, "seconds")

end = datetime.now()
print (">> DONE \t\t It Took", (end-start).seconds, "seconds" )

{'bsl_options': {'method': 'sgd', 'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}}


		 STARTING


> Loading data...
> OK
> Creating trainset...
> OK
> Training...
Estimating biases using als...
> OK 		 It Took:  0 seconds
>> DONE 		 It Took 0 seconds


In [36]:
## SAVING TRAINED MODEL
from surprise import dump
import os
model_filename = "./model.pickle"
print (">> Starting dump")
# Dump algorithm and reload it.
file_name = os.path.expanduser(model_filename)
dump.dump(file_name, algo=algo)
print (">> Dump done")
print(model_filename)

>> Starting dump
>> Dump done
./model.pickle


In [15]:
## LOAD SAVED MODEL
def load_model(model_filename):
    print (">> Loading dump")
    
    file_name = os.path.expanduser(model_filename)
    _, loaded_model = dump.load(file_name)
    print (">> Loaded dump")
    
    return loaded_model

In [16]:
# predicitng

def itemRating(userID,movieIDs,model_filename):
    loaded_model = load_model(model_filename)
    ratings = []
    for movieID in movieIDs:
        prediction = loaded_model.predict(userID, movieID, verbose=False)
        rating = prediction.est
        ratings.append(rating)
    ratingsSorted = [movieID for _, movieID in sorted(zip(ratings, movieIDs),reverse=True)]

    return ratingsSorted

model_filename = "./model.pickle"
num = 5
movie = list(set(df['movieId'].values.tolist()))

recommended_movies = itemRating("okeke@gmail.com", movie, model_filename)
print(recommended_movies)

>> Loading dump
>> Loaded dump
['tt9910206', 'tt9900782', 'tt9838780', 'tt9784954', 'tt9692046', 'tt9517188', 'tt9507276', 'tt9412726', 'tt9252468', 'tt9198440', 'tt9021234', 'tt9020536', 'tt9016540', 'tt8961674', 'tt8908288', 'tt8907974', 'tt8897986', 'tt8739752', 'tt8737152', 'tt8426594', 'tt8396238', 'tt8356942', 'tt8312728', 'tt8291224', 'tt8205656', 'tt8108274', 'tt8032912', 'tt8009578', 'tt7984766', 'tt7938336', 'tt7935892', 'tt7838252', 'tt7813294', 'tt7744192', 'tt7656570', 'tt7534102', 'tt7445206', 'tt7326186', 'tt7294534', 'tt7215232', 'tt7070638', 'tt7060460', 'tt6833964', 'tt6736084', 'tt6456318', 'tt6418918', 'tt6405208', 'tt6359554', 'tt6264938', 'tt6212984', 'tt6167894', 'tt6098734', 'tt6084202', 'tt6080746', 'tt6080504', 'tt6067750', 'tt5941336', 'tt5894410', 'tt5881528', 'tt5834426', 'tt5813916', 'tt5787290', 'tt5635026', 'tt5503472', 'tt5467928', 'tt5458088', 'tt5271442', 'tt5164308', 'tt5038806', 'tt4987556', 'tt4983780', 'tt4849438', 'tt4844140', 'tt4717534', 'tt465