In [1]:
#Importing libraries
import sys
import random
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD

my_seed = 1337
random.seed(my_seed)
np.random.seed(my_seed)

In [2]:
#Read Database
data = pd.read_csv('rating_final.csv')

In [3]:
data.head()

Unnamed: 0,userID,placeID,rating,food_rating,service_rating
0,U1077,135085,2,2,2
1,U1077,135038,2,2,1
2,U1077,132825,2,2,2
3,U1077,135060,1,2,2
4,U1068,135104,1,1,2


In [4]:
ratings = data[['rating','food_rating','service_rating']].copy()

In [5]:
ratings.head()

Unnamed: 0,rating,food_rating,service_rating
0,2,2,2
1,2,2,1
2,2,2,2
3,1,2,2
4,1,1,2


In [6]:
ratings['rating'] = ratings.sum(axis=1)

In [7]:
ratings.head()

Unnamed: 0,rating,food_rating,service_rating
0,6,2,2
1,5,2,1
2,6,2,2
3,5,2,2
4,4,1,2


In [8]:
ratings=ratings.drop(ratings.columns[[1,2]], 1)

In [9]:
ratings.head()

Unnamed: 0,rating
0,6
1,5
2,6
3,5
4,4


In [10]:
ratings=ratings.div(9)

In [11]:
ratings.head()

Unnamed: 0,rating
0,0.666667
1,0.555556
2,0.666667
3,0.555556
4,0.444444


In [12]:
ratings=ratings.mul(3)

In [13]:
ratings.head()

Unnamed: 0,rating
0,2.0
1,1.666667
2,2.0
3,1.666667
4,1.333333


In [14]:
data = data.drop(data.columns[[3,4]], 1)

In [15]:
data.head()

Unnamed: 0,userID,placeID,rating
0,U1077,135085,2
1,U1077,135038,2
2,U1077,132825,2
3,U1077,135060,1
4,U1068,135104,1


In [16]:
frames = [data,ratings]

In [17]:
data.rating = ratings.rating.copy()

In [18]:
data.head()

Unnamed: 0,userID,placeID,rating
0,U1077,135085,2.0
1,U1077,135038,1.666667
2,U1077,132825,2.0
3,U1077,135060,1.666667
4,U1068,135104,1.333333


In [86]:
# Spliting data!

from surprise.model_selection import train_test_split
from surprise import SVD, BaselineOnly , accuracy
from surprise import Reader, Dataset
import numpy as np
from typing import *
from IPython.display import display, HTML, Markdown

In [20]:
reader = Reader(rating_scale=(1,3))

In [21]:
dataset = Dataset.load_from_df(data,reader)

In [22]:
trainset, testset = train_test_split(dataset, test_size=0.25)

In [23]:
trainset

<surprise.trainset.Trainset at 0x7fd52a6e47f0>

In [24]:
model = SVD(n_factors=100)


In [25]:
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd52e73b250>

In [59]:
model.qi.shape
#we have 130 unique items and each movie has 100 latent features


(130, 100)

In [69]:
model.qi[1]

array([ 6.86179169e-02,  7.15761829e-02, -5.43069306e-02,  2.25240979e-02,
       -1.99271475e-02, -8.74001224e-02, -7.21030028e-02, -4.69275038e-03,
        1.98983136e-04, -9.07557560e-03,  4.26799037e-02, -5.09199967e-03,
       -5.01680487e-02, -4.67221177e-02,  8.95926958e-02, -1.76939498e-01,
       -2.31433119e-01,  8.31486562e-02,  1.69492063e-03, -3.80489100e-02,
        6.06651638e-02, -3.51468730e-02,  4.09344058e-02, -6.69276158e-04,
       -2.42835526e-02,  1.10940378e-01,  1.63346709e-01,  2.60715480e-02,
       -1.06481570e-01,  1.88687665e-01, -2.80941190e-02, -1.29706504e-03,
       -3.94048617e-02,  1.04819262e-02, -3.51125019e-02, -2.96888053e-02,
       -8.77235602e-02,  2.37357972e-02,  1.24003388e-01, -1.02117152e-01,
        9.98469444e-02,  1.12304828e-02, -6.93050820e-02,  2.24765267e-02,
       -1.30043365e-01,  1.08932693e-01,  2.31397285e-01,  4.19802474e-03,
       -4.37614010e-02,  8.11110614e-02,  1.64240399e-01,  5.24171756e-02,
        1.48632891e-01,  

In [60]:
# est is the predicted rating here
a_user = "U1077"
a_product = "135060"
model.predict(a_user, a_product)


Prediction(uid='U1077', iid='135060', r_ui=None, est=1.2753553798521906, details={'was_impossible': False})

In [72]:
a_user = "U1075"
a_product = "135000"
model.predict(a_user, a_product)


Prediction(uid='U1075', iid='135000', r_ui=None, est=1.1507476323385044, details={'was_impossible': False})

In [92]:
predictions = model.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 0.6355


0.6355401434803425

In [70]:
# two products are similar if cosine distanse is btw two items are close to 0
from scipy.spatial.distance import cosine
cosine(model.qi[1], model.qi[2])



0.9468269898077357

In [73]:
cosine(model.qi[1], model.qi[3])


1.059063853106506

In [62]:
def get_vector_items(items: str, trained_model: SVD) -> np.array:
    item_row_idx = trained_model.trainset._raw2inner_id_items[items]
    return trained_model.qi[item_row_idx]


def cosine_distance(vector_a: np.array, vector_b: np.array) -> float:
    return cosine(vector_a, vector_b)

In [79]:
item_to_row_idx: Dict[Any, int] = model.trainset._raw2inner_id_items

display(item_to_row_idx)

{132925: 0,
 135047: 1,
 135041: 2,
 132834: 3,
 132921: 4,
 135063: 5,
 135075: 6,
 135080: 7,
 135044: 8,
 135106: 9,
 135042: 10,
 132951: 11,
 135060: 12,
 134983: 13,
 135108: 14,
 132733: 15,
 135028: 16,
 135085: 17,
 135082: 18,
 135039: 19,
 135032: 20,
 132830: 21,
 135025: 22,
 132955: 23,
 135045: 24,
 132862: 25,
 134992: 26,
 132825: 27,
 132872: 28,
 135058: 29,
 135027: 30,
 135072: 31,
 132723: 32,
 132773: 33,
 135104: 34,
 135054: 35,
 132958: 36,
 134999: 37,
 132870: 38,
 135086: 39,
 135043: 40,
 132768: 41,
 132875: 42,
 132846: 43,
 132630: 44,
 132766: 45,
 132856: 46,
 135062: 47,
 135051: 48,
 135046: 49,
 135016: 50,
 132754: 51,
 135013: 52,
 135065: 53,
 135064: 54,
 135052: 55,
 132954: 56,
 132608: 57,
 135071: 58,
 135059: 59,
 135079: 60,
 135076: 61,
 135049: 62,
 135069: 63,
 135088: 64,
 135038: 65,
 134996: 66,
 132663: 67,
 132922: 68,
 132755: 69,
 132572: 70,
 135026: 71,
 132584: 72,
 132561: 73,
 134986: 74,
 132594: 75,
 132740: 76,
 135030: 

In [81]:
users_to_row_idx: Dict[Any, int] = model.trainset._raw2inner_id_users

display(users_to_row_idx)

{'U1027': 0,
 'U1116': 1,
 'U1061': 2,
 'U1016': 3,
 'U1137': 4,
 'U1109': 5,
 'U1134': 6,
 'U1003': 7,
 'U1083': 8,
 'U1126': 9,
 'U1032': 10,
 'U1128': 11,
 'U1091': 12,
 'U1017': 13,
 'U1124': 14,
 'U1103': 15,
 'U1089': 16,
 'U1131': 17,
 'U1071': 18,
 'U1101': 19,
 'U1132': 20,
 'U1025': 21,
 'U1097': 22,
 'U1036': 23,
 'U1090': 24,
 'U1133': 25,
 'U1045': 26,
 'U1104': 27,
 'U1095': 28,
 'U1009': 29,
 'U1077': 30,
 'U1075': 31,
 'U1084': 32,
 'U1108': 33,
 'U1040': 34,
 'U1013': 35,
 'U1067': 36,
 'U1105': 37,
 'U1059': 38,
 'U1066': 39,
 'U1057': 40,
 'U1052': 41,
 'U1041': 42,
 'U1120': 43,
 'U1082': 44,
 'U1099': 45,
 'U1112': 46,
 'U1096': 47,
 'U1033': 48,
 'U1030': 49,
 'U1024': 50,
 'U1051': 51,
 'U1093': 52,
 'U1073': 53,
 'U1018': 54,
 'U1062': 55,
 'U1006': 56,
 'U1043': 57,
 'U1088': 58,
 'U1098': 59,
 'U1014': 60,
 'U1004': 61,
 'U1010': 62,
 'U1102': 63,
 'U1115': 64,
 'U1044': 65,
 'U1081': 66,
 'U1005': 67,
 'U1114': 68,
 'U1087': 69,
 'U1106': 70,
 'U1022': 71,
 '

{'U1027': 0,
 'U1116': 1,
 'U1061': 2,
 'U1016': 3,
 'U1137': 4,
 'U1109': 5,
 'U1134': 6,
 'U1003': 7,
 'U1083': 8,
 'U1126': 9,
 'U1032': 10,
 'U1128': 11,
 'U1091': 12,
 'U1017': 13,
 'U1124': 14,
 'U1103': 15,
 'U1089': 16,
 'U1131': 17,
 'U1071': 18,
 'U1101': 19,
 'U1132': 20,
 'U1025': 21,
 'U1097': 22,
 'U1036': 23,
 'U1090': 24,
 'U1133': 25,
 'U1045': 26,
 'U1104': 27,
 'U1095': 28,
 'U1009': 29,
 'U1077': 30,
 'U1075': 31,
 'U1084': 32,
 'U1108': 33,
 'U1040': 34,
 'U1013': 35,
 'U1067': 36,
 'U1105': 37,
 'U1059': 38,
 'U1066': 39,
 'U1057': 40,
 'U1052': 41,
 'U1041': 42,
 'U1120': 43,
 'U1082': 44,
 'U1099': 45,
 'U1112': 46,
 'U1096': 47,
 'U1033': 48,
 'U1030': 49,
 'U1024': 50,
 'U1051': 51,
 'U1093': 52,
 'U1073': 53,
 'U1018': 54,
 'U1062': 55,
 'U1006': 56,
 'U1043': 57,
 'U1088': 58,
 'U1098': 59,
 'U1014': 60,
 'U1004': 61,
 'U1010': 62,
 'U1102': 63,
 'U1115': 64,
 'U1044': 65,
 'U1081': 66,
 'U1005': 67,
 'U1114': 68,
 'U1087': 69,
 'U1106': 70,
 'U1022': 71,
 '

In [56]:
def get_top_similarities(items: str, model: SVD) -> pd.DataFrame:
    # Get the first items vector
    items_vector: np.array = get_vector_items(items, model)
    similarity_table = []
    for other_items in model.trainset._raw2inner_id_items.keys():
        other_items_vector = get_vector_items(items, model)
        similarity_score = cosine_distance(items, items_vector)
        similarity_table.append((similarity_score, other_items))
    return display(sorted(similarity_table))

In [57]:
get_top_similarities(135060, model)

[(0.0, 135060),
 (0.8044910286113957, 132834),
 (0.8177596948919518, 134976),
 (0.838351234708343, 132755),
 (0.8387313883295311, 132660),
 (0.845800368174341, 135109),
 (0.8500798289821629, 132870),
 (0.8518814260167586, 132885),
 (0.8664145668673461, 135021),
 (0.8727142425230296, 132667),
 (0.8758501817780345, 132583),
 (0.8811037281556295, 132654),
 (0.8840210546536923, 135011),
 (0.8840826584246629, 135013),
 (0.8867413145633973, 135057),
 (0.8881429367124906, 135028),
 (0.8881535816483963, 135039),
 (0.8937594008717191, 132740),
 (0.8953206245618573, 132921),
 (0.9051334077791663, 132715),
 (0.9083429166472277, 135016),
 (0.9155502226308431, 132856),
 (0.9201389893405143, 135001),
 (0.9209067396663944, 135082),
 (0.923051753056026, 132872),
 (0.9238519930207022, 135030),
 (0.924339997832078, 135018),
 (0.9254267989323037, 135058),
 (0.9278503044660449, 132851),
 (0.9287030604074378, 135055),
 (0.9294942566311101, 132875),
 (0.9350967010784798, 132825),
 (0.9361479803311601, 13266

In [58]:
get_top_similarities(132825, model)

[(0.0, 132825),
 (0.7163116032542982, 135016),
 (0.7432624748525993, 135048),
 (0.7445641791854728, 132866),
 (0.7906729816245338, 135043),
 (0.7916825390615803, 132872),
 (0.7955124374291032, 132609),
 (0.8219363000683576, 132630),
 (0.8223536057152223, 135069),
 (0.8290695740926013, 132667),
 (0.8471390869642705, 135046),
 (0.8512581236008621, 135019),
 (0.8513467581356862, 135042),
 (0.8536978082128449, 135027),
 (0.855916722691708, 132564),
 (0.8628533953011674, 135055),
 (0.8640022722959616, 135033),
 (0.8654189700415891, 135034),
 (0.8708175116075615, 132884),
 (0.8782102467840458, 135045),
 (0.8785266696218925, 135057),
 (0.8889764371886973, 135076),
 (0.8931613958894729, 135047),
 (0.8997974069438519, 132861),
 (0.9010478589209665, 132668),
 (0.9050775055780339, 135088),
 (0.9078393529761244, 132885),
 (0.9116739265344224, 135049),
 (0.9132360949914616, 134992),
 (0.9161057307866541, 132875),
 (0.916872179453601, 134975),
 (0.9176115839211965, 132560),
 (0.9188171290374326, 132

In [100]:
# n_factors = 20
for i in range(120):
    model = SVD(n_factors=i)
    predictions = model.fit(trainset).test(testset)
    print("n_factors = " ,i)
    accuracy.rmse(predictions)
    print("====")

n_factors =  0
RMSE: 0.6363
====
n_factors =  1
RMSE: 0.6369
====
n_factors =  2
RMSE: 0.6356
====
n_factors =  3
RMSE: 0.6378
====
n_factors =  4
RMSE: 0.6379
====
n_factors =  5
RMSE: 0.6357
====
n_factors =  6
RMSE: 0.6346
====
n_factors =  7
RMSE: 0.6385
====
n_factors =  8
RMSE: 0.6376
====
n_factors =  9
RMSE: 0.6386
====
n_factors =  10
RMSE: 0.6381
====
n_factors =  11
RMSE: 0.6384
====
n_factors =  12
RMSE: 0.6397
====
n_factors =  13
RMSE: 0.6383
====
n_factors =  14
RMSE: 0.6374
====
n_factors =  15
RMSE: 0.6369
====
n_factors =  16
RMSE: 0.6371
====
n_factors =  17
RMSE: 0.6398
====
n_factors =  18
RMSE: 0.6382
====
n_factors =  19
RMSE: 0.6375
====
n_factors =  20
RMSE: 0.6393
====
n_factors =  21
RMSE: 0.6387
====
n_factors =  22
RMSE: 0.6370
====
n_factors =  23
RMSE: 0.6373
====
n_factors =  24
RMSE: 0.6364
====
n_factors =  25
RMSE: 0.6392
====
n_factors =  26
RMSE: 0.6346
====
n_factors =  27
RMSE: 0.6362
====
n_factors =  28
RMSE: 0.6373
====
n_factors =  29
RMSE: 0.

In [75]:
model.qi[1]

array([-0.05504829,  0.03014757, -0.11811604, -0.09448624,  0.05522235,
        0.1190548 ,  0.24749589, -0.05601078,  0.09751312,  0.01569583,
       -0.0434483 , -0.03046998, -0.22581118,  0.10261138, -0.12114852,
        0.08452071,  0.05116443,  0.06393496,  0.05223614,  0.02711122])

In [76]:
model.qi.shape

(130, 20)

In [77]:
# est is the predicted rating here
a_user = "U1077"
a_product = "135060"
model.predict(a_user, a_product)


Prediction(uid='U1077', iid='135060', r_ui=None, est=1.3073874218658215, details={'was_impossible': False})

In [89]:
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i':5}
algo = BaselineOnly(bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 0.6440


0.6440490803784167

In [None]:
surprise.accuracy.rmse(predictions, verbose=True)