In [1]:
from sqlalchemy import create_engine
from sqlalchemy.exc import ResourceClosedError
from sqlalchemy.types import VARCHAR
from functools import partial
import pandas as pd
from scipy.spatial import distance
from sklearn.neighbors import NearestNeighbors
from IPython.display import display, HTML, Markdown, Latex
from sklearn.utils.extmath import randomized_svd
import numpy as np

## Reading the data from database into a pandas dataframe

In [2]:
#connection
def DatabaseConnect(username, password, schema):
    conn_str = "mysql+pymysql://{username}:{password}@localhost/{schema}?charset=utf8&use_unicode=1"\
                             .format(username=username, password=password,schema=schema)
    engine = create_engine(conn_str, pool_recycle=1800)
    return engine
RecSysConnect = partial(DatabaseConnect, 'root', 'mysql-password', 'recsys')
e = RecSysConnect()

In [3]:
#read ratings
sql_cmt = "select userId, movieId, rating from ml100k_ratings;"
raw_rating = pd.read_sql(sql_cmt, con=e)
rating = raw_rating.pivot(index="userId", columns="movieId", values="rating")
rating.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
6,4.0,,,,,,2.0,4.0,4.0,,...,,,,,,,,,,
7,,,,5.0,,,5.0,5.0,5.0,4.0,...,,,,,,,,,,
8,,,,,,,3.0,,,,...,,,,,,,,,,
9,,,,,,5.0,4.0,,,,...,,,,,,,,,,
10,4.0,,,4.0,,,4.0,,4.0,,...,,,,,,,,,,


In [4]:
#read holdout data
sql_cmt = "select userId from ml100k_ratings group by userId order by count(*) DESC limit 100;"
user_100 = pd.read_sql(sql_cmt, con=e)['userId']
sql_cmt = "select movieId from ml100k_ratings group by movieId order by count(*) DESC limit 10;"
movie_10 = pd.read_sql(sql_cmt, con=e)['movieId']
holdout = rating.loc[user_100,movie_10]

In [5]:
rating.loc[user_100,movie_10]=np.nan

## Calculation of similarities

In [6]:
import sklearn.metrics.pairwise 

def center(df):
    return df.sub( df.mean(axis=1), axis=0 )

def cosine(df, axis=0):
    dff = df.fillna(0)
    if axis == 0: # Columns
        return pd.DataFrame(sklearn.metrics.pairwise.cosine_similarity(dff.T), index=df.columns, columns=df.columns)
    else:
        return pd.DataFrame(sklearn.metrics.pairwise.cosine_similarity(dff),   index=df.index,   columns=df.index)

In [9]:
r_cent = center(rating)
userSim = cosine(r_cent,1) 
itemSim = cosine(r_cent)
display(userSim.head(),itemSim.head())

userId,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.026614,0.007809,0.038385,0.112732,0.103112,0.110214,0.166263,0.00121,-0.011662,...,0.026726,-0.027281,0.087616,-0.009101,0.032321,0.080344,0.060521,-0.001568,-0.035668,0.023752
2,0.026614,1.0,0.013658,-0.017016,0.03577,0.094503,0.027659,0.05564,0.027294,0.097846,...,0.012853,-0.028798,0.056659,0.197835,0.090009,0.032505,0.015053,-0.017344,0.012068,0.039173
3,0.007809,0.013658,1.0,-0.059638,0.016037,-0.017158,0.010407,0.041177,-0.010093,0.023856,...,0.001615,0.000658,-0.006888,0.036157,-0.018513,-0.00624,-0.023907,0.034414,-0.009187,0.001489
4,0.038385,-0.017016,-0.059638,1.0,0.007373,-0.053929,-0.012356,0.136046,0.016082,-0.013588,...,0.011895,0.002174,-0.028,-0.025021,0.022882,-0.00596,0.279818,0.258594,0.064504,-0.019222
5,0.112732,0.03577,0.016037,0.007373,1.0,0.038484,0.062234,0.140106,0.010195,0.014335,...,0.070014,-0.070821,0.024278,0.038672,0.093567,0.051782,0.02954,0.036234,0.043318,0.099324


movieId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,-0.023667,-0.042012,0.01212,0.007037,-0.002027,0.020884,0.100244,-0.046859,0.002856,...,0.066344,0.0,0.0,0.0,0.012261,0.0,0.0,0.0,0.0,0.0
2,-0.023667,1.0,0.031568,0.030879,0.013452,-0.0106,-0.053574,-0.007358,-0.10366,-0.020773,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003661,0.034941
3,-0.042012,0.031568,1.0,-0.118348,0.012564,0.054819,-0.087453,-0.1515,-0.066376,-0.049901,...,0.0,0.0,0.0,0.0,0.201111,0.0,0.0,0.0,0.0,0.031866
4,0.01212,0.030879,-0.118348,1.0,-0.149492,-0.022921,0.005652,0.111663,0.029485,0.009932,...,0.0,0.0,-0.114413,-0.114413,0.090004,0.0,0.0,0.0,0.002727,-0.048234
5,0.007037,0.013452,0.012564,-0.149492,1.0,-0.041295,-0.043651,0.012695,-0.061202,-0.033263,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043673


## One simple recommender  
### - randomized_svd

In [104]:
rating.columns = rating.columns.map(str)
means = rating.mean(axis=1)
cent  = rating.sub( means, axis=0 )
U_, Sigma, VT_ = randomized_svd(cent.to_sparse().to_coo().tocsc(), n_components=2, n_iter=5, random_state=1)

In [105]:
U = pd.DataFrame(U_, index=rating.index)
VT = pd.DataFrame(VT_, columns=rating.columns)

#display(U, Sigma, VT)
full = U.mul(Sigma).dot(VT)

In [106]:
full.columns=full.columns.map(int)
means = rating.mean(axis=1).loc[user_100]
pred_SVD = full.loc[user_100,movie_10].add(means,axis=0)

In [131]:
def svd_pred(rating,holdout):
    rating.columns = rating.columns.map(str)
    means = rating.mean(axis=1)
    cent  = rating.sub( means, axis=0 )
    U_, Sigma, VT_ = randomized_svd(cent.to_sparse().to_coo().tocsc(), n_components=2, n_iter=5, random_state=1)
    U = pd.DataFrame(U_, index=rating.index)
    VT = pd.DataFrame(VT_, columns=rating.columns)
    full = U.mul(Sigma).dot(VT)
    full.columns=full.columns.map(int)
    means = rating.mean(axis=1).loc[user_100]
    prediction = full.loc[user_100,movie_10].add(means,axis=0)
    y_true=[]
    y_predicted=[]
    for u in holdout.index:
        for m in holdout.columns:
            if (holdout.isna().loc[u,m]==False):
                y_true=y_true+[holdout.loc[u,m]]
                y_predicted=y_predicted+[prediction.loc[u,m]]
    return y_true, y_predicted

#### Use suprise package

In [18]:
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise import Reader,Dataset

In [19]:
reader = Reader(rating_scale=(1, 5))
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(raw_rating, reader)
cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=2, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 2 split(s).

                  Fold 1  Fold 2  Mean    Std     
RMSE (testset)    0.9566  0.9559  0.9562  0.0003  
MAE (testset)     0.7557  0.7557  0.7557  0.0000  
Fit time          3.47    3.84    3.65    0.19    
Test time         0.67    0.70    0.68    0.01    


{'fit_time': (3.4658191204071045, 3.8425168991088867),
 'test_mae': array([0.75568051, 0.75565186]),
 'test_rmse': array([0.95657211, 0.95588335]),
 'test_time': (0.6702389717102051, 0.6990277767181396)}

### - user-based 

In [119]:
def filterRatingsUU(ratings, user, item):
    df = ratings
    # Keep only user's items (not really necessary)
    #df = df[list(df.columns[df.loc[user].notnull()])+[item]]
    # Keep only users that rated item
    #df = df[df[item].notnull()]
    # Drop user's row (in case User has rated Item)
    df = df.drop(user, axis=0, errors='ignore')
    # Impute and return
    return df.fillna(0)
def predictRatingUU(df, user, item, K=3):
    #ratings, means = center(df)
    means = df.mean(axis=1)
    ratings = df.sub( means, axis=0 )
    trainingSet = filterRatingsUU(ratings, user, item)
    knn = NearestNeighbors(n_neighbors=K,  metric = 'cosine')
    knn = knn.fit(trainingSet)  

    # Find k nearest neighbors and their similarities
    dist, ind = knn.kneighbors(ratings.loc[[user]].fillna(0), return_distance=True)
    similarity = 1 - dist[0]
    nearest    = ind[0] 

    weighted_prediction   = means[user] + (trainingSet.iloc[nearest][item] * similarity).sum()/np.abs(similarity).sum()
    return weighted_prediction

In [132]:
def user_based(rating,holdout):
    rating.columns = rating.columns.map(int)
    y_true=[]
    y_predicted=[]
    for u in holdout.index:
        for i in holdout.columns:
            y_true=y_true+[holdout.loc[u,i]]
            y_predicted=y_predicted+[predictRatingUU(rating, u, i)]
    return y_true,y_predicted 

## Evaluation: MSE MAE

In [124]:
from sklearn.metrics import mean_squared_error
def evaluation(y_ture, y_predicted):
    result=pd.DataFrame([y_true,y_predicted])
    MAE=np.mean(np.abs(result[0] - result[1]))/755
    #RMSE=np.sqrt(mean_squared_error(y_true,y_predicted))
    RMSE=np.mean((result[0] - result[1])**2)/755
    display(MAE,RMSE)

In [134]:
y_true, y_pred = svd_pred(rating,holdout)
evaluation(y_true, y_pred)

0.00022829827827636906

7.87013568345712e-05

In [135]:
y_true, y_pred = user_based(rating,holdout)
evaluation(y_true, y_pred)

0.0004426518326595613

0.00014793518694243554