# Database interactions

In [1]:
from sqlalchemy import create_engine
from sqlalchemy.exc import ResourceClosedError
from sqlalchemy.types import VARCHAR
from functools import partial

def DatabaseConnect(username, password, host, schema):
    conn_str = "mysql+pymysql://{username}:{password}@{host}/{schema}?charset=utf8&use_unicode=1"\
                             .format(username=username, password=password,schema=schema)
    engine = create_engine(conn_str, pool_recycle=1800)
    return engine

RecSysConnect = partial(DatabaseConnect, 'recsys', 'RecommenderSystems2017', 'localhost', 'recsys')
e = RecSysConnect()

KeyError: 'host'

In [2]:
import numpy as np
import pandas as pd
import scipy as sc

In [3]:
# df = pd.read_excel("c:/recsys/materials/alice.xlsx", index_col='Name').astype(float)
ex ={'Item1': {'Alice': 5.0,    'User1': 3.0, 'User2': 4.0, 'User3': 3.0, 'User4': 1.0},
     'Item2': {'Alice': 3.0,    'User1': 1.0, 'User2': 3.0, 'User3': 3.0, 'User4': 5.0},
     'Item3': {'Alice': 4.0,    'User1': 2.0, 'User2': 4.0, 'User3': 1.0, 'User4': 5.0},
     'Item4': {'Alice': 4.0,    'User1': 3.0, 'User2': 3.0, 'User3': 5.0, 'User4': 2.0},
     'Item5': {'Alice': np.nan, 'User1': 3.0, 'User2': 5.0, 'User3': 4.0, 'User4': 1.0},
}
alice = pd.DataFrame(ex)
ualice = alice.unstack(0).dropna().reset_index()
ualice.columns = ["itemId", "userId", "rating"]
ualice = ualice[["userId", "itemId", "rating"]]


# Surprise (scikit)
- Introductory [tutorial](https://medium.com/@m_n_malaeb/the-easy-guide-for-building-python-collaborative-filtering-recommendation-system-in-2017-d2736d2e92a8)
- Official [documentation](http://surprise.readthedocs.io/en/latest/index.html)
- Basic explanation of [collaborative filtering](https://medium.com/@cfpinela/recommender-systems-user-based-and-item-based-collaborative-filtering-5d5f375a127f)

In [4]:
from surprise import SVD, SVDpp, NMF, KNNBasic, KNNWithMeans, KNNWithZScore, SlopeOne
from surprise import Dataset, Reader
from surprise import evaluate, print_perf

In [5]:
sim_item = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
           }
sim_user = {'name': 'pearson',
               'user_based': True  # compute  similarities between items
           }

algos = [ KNNBasic(2, sim_options=sim_user),     KNNBasic(2, sim_options=sim_item), 
          KNNWithMeans(2, sim_options=sim_user), KNNWithMeans(2, sim_options=sim_item), 
          SlopeOne(), SVD(2), SVDpp(2), NMF(2)]

In [6]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(ualice, reader)
#data.df

In [127]:
trainset = data.build_full_trainset()
for algo in algos:
    algo.train(trainset)
    #print(algo.predict("Alice", "Item5"))
    for i in range(1, 5+1):
        print(i, algo.predict("Alice", "Item%d"%i, alice.loc["Alice","Item%d"%i]))
    print("-----")

Computing the pearson similarity matrix...
Done computing similarity matrix.
1 user: Alice      item: Item1      r_ui = 5.00   est = 4.08   {'actual_k': 2, 'was_impossible': False}
2 user: Alice      item: Item2      r_ui = 3.00   est = 2.08   {'actual_k': 2, 'was_impossible': False}
3 user: Alice      item: Item3      r_ui = 4.00   est = 3.08   {'actual_k': 2, 'was_impossible': False}
4 user: Alice      item: Item4      r_ui = 4.00   est = 3.54   {'actual_k': 2, 'was_impossible': False}
5 user: Alice      item: Item5      r_ui = nan   est = 3.91   {'actual_k': 2, 'was_impossible': False}
-----
Computing the cosine similarity matrix...
Done computing similarity matrix.
1 user: Alice      item: Item1      r_ui = 5.00   est = 4.51   {'actual_k': 2, 'was_impossible': False}
2 user: Alice      item: Item2      r_ui = 3.00   est = 3.49   {'actual_k': 2, 'was_impossible': False}
3 user: Alice      item: Item3      r_ui = 4.00   est = 3.51   {'actual_k': 2, 'was_impossible': False}
4 user: Al

# MovieLens

In [11]:
# Load the movielens-100k dataset (download it if needed),
# and split it into 3 folds for cross-validation.
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)

# We'll use the famous SVD algorithm.
algo = SVD()

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

print_perf(perf)


Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.9469
MAE:  0.7482
------------
Fold 2
RMSE: 0.9370
MAE:  0.7407
------------
Fold 3
RMSE: 0.9498
MAE:  0.7481
------------
------------
Mean RMSE: 0.9445
Mean MAE : 0.7457
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.9469  0.9370  0.9498  0.9445  
MAE     0.7482  0.7407  0.7481  0.7457  


In [12]:
data.raw_ratings

[('305', '184', 3.0, '886323937'),
 ('705', '28', 4.0, '883427640'),
 ('249', '478', 4.0, '879572911'),
 ('880', '546', 3.0, '880167410'),
 ('619', '809', 1.0, '885954238'),
 ('255', '834', 4.0, '883216358'),
 ('206', '361', 1.0, '888180082'),
 ('524', '191', 4.0, '884634707'),
 ('758', '890', 3.0, '880672552'),
 ('819', '302', 5.0, '884012512'),
 ('903', '276', 5.0, '891380461'),
 ('666', '134', 5.0, '880567695'),
 ('269', '96', 1.0, '891450755'),
 ('472', '685', 3.0, '875978740'),
 ('938', '151', 4.0, '891356679'),
 ('64', '8', 4.0, '889737968'),
 ('343', '257', 3.0, '876402941'),
 ('194', '81', 2.0, '879523576'),
 ('295', '79', 4.0, '879517600'),
 ('593', '155', 5.0, '875671579'),
 ('76', '324', 4.0, '875027206'),
 ('776', '769', 3.0, '892920446'),
 ('246', '284', 1.0, '884922475'),
 ('401', '64', 3.0, '891032757'),
 ('897', '195', 5.0, '879991137'),
 ('821', '427', 5.0, '874793649'),
 ('916', '650', 4.0, '880844711'),
 ('222', '2', 3.0, '878183837'),
 ('82', '222', 3.0, '876311365'