In [1]:
import pandas as pd
from tqdm import tqdm
import json
import numpy as np
import time
from copy import deepcopy
import matplotlib.pyplot as plt
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split
from surprise import SVD, SVDpp, accuracy
from surprise import KNNBasic
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import math

In [31]:
df_users = pd.read_csv('users_restaurants_illinois_reduced.csv')
df_business = pd.read_csv('businesses_restaurants_illinois_reduced.csv')
df_ratings = pd.read_csv('ratings_restaurants_illinois_reduced.csv')
df_users = df_users.drop(columns=['Unnamed: 0'])
df_business = df_business.drop(columns=['Unnamed: 0'])
df_ratings = df_ratings.drop(columns=['Unnamed: 0'])

In [32]:
df_ratings_cop = df_ratings.copy()
df_ratings_cop = df_ratings_cop.groupby(['user_id'])['date'].max()
df_ratings_cop = df_ratings_cop.to_frame()
df_ratings_cop = df_ratings_cop.reset_index()

In [33]:
df_ratings['index']=df_ratings.index

In [34]:
df_ratings_test = pd.merge(df_ratings, df_ratings_cop, how='right')
df_ratings_test = df_ratings_test.drop([2327,2328])
df_ratings_test = df_ratings_test.reset_index(drop=True)

In [35]:
df_ratings_train = df_ratings.drop(list(df_ratings_test['index']))

In [36]:
df_ratings_train = df_ratings_train.drop(columns=['date','index'])
df_ratings_test = df_ratings_test.drop(columns=['date','index'])
df_ratings_train = df_ratings_train.reset_index(drop=True)
df_ratings_test = df_ratings_test.reset_index(drop=True)

# Reference measure: Average Rating

We evaluate the RMSE and MAE when we predict the average rating of the train set for the entire test set. It's a reference measure.

In [39]:
average_train = df_ratings_train.rating.mean()
pred_average = np.repeat(average_train, df_ratings_test.shape[0])

In [40]:
mean_squared_error(df_ratings_test['rating'], pred_average)

1.9393005832255554

In [41]:
mean_absolute_error(df_ratings_test['rating'], pred_average)

1.2065156420765029

# Graphs Construction

In [42]:
import networkx as nx

## Users Graph

In [43]:
df_users_graph = df_users[['user_id','friends']]

In [44]:
users = []
for i in range(df_users_graph.shape[0]):
    friends = df_users_graph['friends'][i].split(', ')
    users.append([df_users_graph['user_id'][i], friends])

In [45]:
G_users = nx.Graph()

for i in range(len(users)):
    G_users.add_node(users[i][0])

nl = list(G_users.nodes())
for i in range(len(users)):
    user = users[i][0]
    friends = users[i][1]
    for friend in friends:
        if friend in nl:
            G_users.add_edge(user, friend)


## User-Restaurant Bipartite Graph

In [46]:
users_list = df_users['user_id'].tolist()
restaurants_list = df_business['business_id'].tolist()

links_list = []
for i in range(df_ratings_train.shape[0]):
    links_list.append((df_ratings_train['user_id'][i], df_ratings_train['business_id'][i], df_ratings_train['rating'][i]))
    
    

In [47]:
G_us_re = nx.Graph()

G_us_re.add_nodes_from(users_list, bipartite=0)
G_us_re.add_nodes_from(restaurants_list, bipartite=1)

G_us_re.add_weighted_edges_from(links_list)

In [48]:
nx.is_connected(G_us_re)

False

In [49]:
if nx.is_connected(G_us_re):
    restaurants_nodes, users_nodes = nx.bipartite.sets(G_us_re)
else:
    users_nodes = {n for n, d in G_us_re.nodes(data=True) if d['bipartite']==0}
    restaurants_nodes = set(G_us_re) - users_nodes

# Recommendations

## Rating prediction based on User-Restaurant Bipartite Graph

The library Surprise allow to build recommendation systems thanks to collaborative filtering, but it does not support large datasets and leads to kernel crashes. We implemented the methods by our own.

In [50]:
def similarity(u,v,other_nodes, G):
    p, a, b = 0, 0, 0
    for i in other_nodes:
        p += G.get_edge_data(u,i,default={'weight': 0.0})['weight']*G.get_edge_data(v,i,default={'weight': 0.0})['weight']
        a += G.get_edge_data(u,i,default={'weight': 0.0})['weight']**2
        b += G.get_edge_data(v,i,default={'weight': 0.0})['weight']**2
    sim = p/(np.sqrt(a)*np.sqrt(b))
    return sim    

### User Collaborative Filtering

We compute cosine similarities among users.

In [51]:
ar_similarities_user = np.zeros((len(users_list), len(users_list)))
for u in range(len(users_list)):
    for v in range(u, len(users_list)):
        ar_similarities_user[u][v] = similarity(users_list[u],users_list[v],restaurants_nodes,G_us_re)

  import sys


In [52]:
ar_similarities_user_f = ar_similarities_user + ar_similarities_user.transpose() - np.eye(len(users_list))
df_similarities_user = pd.DataFrame(ar_similarities_user_f, columns = users_list, index = users_list)

Remark: Some NaN because some users have never published a Rating before the train/test separation date

In [53]:
df_similarities_user_na = df_similarities_user.fillna(0.0)

In [64]:
df_similarities_user.to_csv('similarities_bipartite_users_withnan_bis.csv')

In [65]:
df_similarities_user_na.to_csv('similarities_bipartite_users_bis.csv')

In [56]:
def prediction_userCF(df_similarities,u,i):
    '''
    Returns the predicted rating by user u for restaurant i
    
    Arguments:
        similarities: similarities between users
        u: a user
        i: a restaurant
    '''
    n,d = 0,0
    pred = 0
    for v in users_nodes:
        rating_v_i = G_us_re.get_edge_data(v,i,default={'weight': 0.0})['weight']
        if rating_v_i != 0:
            n += df_similarities[u][v]*rating_v_i
            d += df_similarities[u][v]
    if d !=0:
        pred = n/d
    else:
        pred = average_train
    return pred
    

In [57]:
def make_pred_userCF(df_ratings_test,df_similarities):
    pred = []
    for j in range(df_ratings_test.shape[0]):
        userid = df_ratings_test['user_id'][j]
        businessid = df_ratings_test['business_id'][j]
        pred.append(prediction_userCF(df_similarities,userid,businessid))
    return pred

In [58]:
pred_userCF = make_pred_userCF(df_ratings_test,df_similarities_user_na)

In [59]:
#np.sum(df_similarities_user_na.loc[df_similarities_user_na.index == '4mjnkd8oJVCfBKN3i4rB-g'].values)

In [60]:
mean_squared_error(df_ratings_test['rating'], pred_userCF)

1.873694307714286

In [61]:
mean_absolute_error(df_ratings_test['rating'], pred_userCF)

1.0962555588837808

In [62]:
nb_changed_values = df_ratings_test.shape[0]-pred_userCF.count(average_train)
nb_changed_values

2723

### Item Collaborative Filtering

In [66]:
ar_similarities_rest = np.zeros((len(restaurants_list), len(restaurants_list)))
for u in range(len(restaurants_list)):
    for v in range(u, len(restaurants_list)):
        ar_similarities_rest[u][v] = similarity(restaurants_list[u],restaurants_list[v],users_nodes,G_us_re)

  import sys


In [67]:
ar_similarities_rest_f = ar_similarities_rest + ar_similarities_rest.transpose() - np.eye(len(restaurants_list))
df_similarities_rest = pd.DataFrame(ar_similarities_rest_f, columns = restaurants_list, index = restaurants_list)

In [68]:
df_similarities_rest_na = df_similarities_rest.fillna(0.0)

In [69]:
df_similarities_rest.to_csv('similarities_bipartite_rest_withnan_bis.csv')

In [70]:
df_similarities_rest_na.to_csv('similarities_bipartite_rest_bis.csv')

In [71]:
def prediction_itemCF(df_similarities,u,i):
    '''
    Returns the predicted rating by user u for restaurant i
    
    Arguments:
        similarities: similarities between users
        u: a user
        i: a restaurant
    '''
    n,d = 0,0
    pred = 0
    for j in restaurants_nodes:
        rating_u_j = G_us_re.get_edge_data(u,j,default={'weight': 0.0})['weight']
        if rating_u_j != 0:
            n += df_similarities[i][j]*rating_u_j
            d += df_similarities[i][j]
    if d != 0:
        pred = n/d
    else:
        pred = average_train
    return pred


In [72]:
def make_pred_itemCF(df_ratings_test,df_similarities):
    pred = []
    for j in range(df_ratings_test.shape[0]):
        userid = df_ratings_test['user_id'][j]
        businessid = df_ratings_test['business_id'][j]
        pred.append(prediction_itemCF(df_similarities,userid,businessid))
    return pred

In [73]:
pred_itemCF = make_pred_itemCF(df_ratings_test,df_similarities_rest_na)

In [74]:
mean_squared_error(df_ratings_test['rating'], pred_itemCF)

2.220415561943532

In [75]:
mean_absolute_error(df_ratings_test['rating'], pred_itemCF)

1.1288097657832012

In [76]:
nb_changed_values = df_ratings_test.shape[0]-pred_itemCF.count(average_train)
nb_changed_values

2723

### Latent Collaborative Filtering

In [77]:
reader = Reader(rating_scale = (0.0, 5.0))

train_data = Dataset.load_from_df(df_ratings_train[['user_id', 'business_id', 'rating']], reader)
test_data = Dataset.load_from_df(df_ratings_test[['user_id', 'business_id', 'rating']], reader)

sr_train = train_data.build_full_trainset()
sr_test_before = test_data.build_full_trainset()
sr_test = sr_test_before.build_testset()


In [78]:
algo_latent = SVD()
algo_latent.fit(sr_train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x13931c9d0>

In [79]:
pred_latent = algo_latent.test(sr_test)

In [80]:
accuracy.mse(pred_latent)

MSE: 1.5960


1.5959517846195412

In [81]:
accuracy.mae(pred_latent)

MAE:  1.0409


1.040877588931486

In [82]:
pred_latent_list = [pred_latent[i][3] for i in range(len(pred_latent))]

In [83]:
nb_changed_values = df_ratings_test.shape[0]-pred_latent_list.count(average_train)
nb_changed_values

3000

In [84]:
algo_latent2 = SVDpp()
algo_latent2.fit(sr_train)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x139e5fc10>

In [85]:
pred_latent2 = algo_latent2.test(sr_test)

In [86]:
accuracy.mse(pred_latent2)

MSE: 1.5748


1.5747960043659566

In [87]:
accuracy.mae(pred_latent2)

MAE:  1.0279


1.0278570789684343

In [153]:
pred_latent2_list = [pred_latent2[i][3] for i in range(len(pred_latent2))]

## Rating prediction based on User-User Graph

#### Jaccard similarity

In [88]:
df_similarities_user_uni = pd.DataFrame(columns = users_list, index = users_list)
for u in users_list:
    for v in users_list:
        jac = nx.jaccard_coefficient(G_users, [(u,v)])
        for a,b,c in jac:
            df_similarities_user_uni[u][v] = c

In [89]:
for u in users_list:
    df_similarities_user_uni[u][u] = 1.0

In [90]:
df_similarities_user_uni.to_csv('similarities_unipartite_jaccard_users_bis.csv')

In [91]:
pred_useruser = make_pred_userCF(df_ratings_test,df_similarities_user_uni)


In [92]:
mean_squared_error(df_ratings_test['rating'], pred_useruser)

1.9932756406260481

In [93]:
mean_absolute_error(df_ratings_test['rating'], pred_useruser)

1.1869644644833475

In [94]:
nb_changed_values = df_ratings_test.shape[0]-pred_useruser.count(average_train)
nb_changed_values

629

#### Other similarity measure: FriendTNS

In [95]:
def friendTNS(G_users,u,v):
    sim = 0
    edges = [e for e in G_users.edges()]
    if (u,v) in edges or (v,u) in edges:
        sim = 1/(G_users.degree(u)+G_users.degree(v)-1)
    else:
        sim = 0
    return sim 

In [96]:
ar_similarities_user_uniTNS = np.zeros((len(users_list), len(users_list)))
for u in range(len(users_list)):
    for v in range(u, len(users_list)):
        ar_similarities_user_uniTNS[u][v] = friendTNS(G_users, users_list[u], users_list[v])

In [97]:
ar_similarities_user_uniTNS_f = ar_similarities_user_uniTNS + ar_similarities_user_uniTNS.transpose() - np.eye(len(users_list))
df_similarities_user_uniTNS = pd.DataFrame(ar_similarities_user_uniTNS_f, columns = users_list, index = users_list)

In [98]:
for u in users_list:
    df_similarities_user_uniTNS[u][u] = 1.0

In [99]:
df_similarities_user_uniTNS.to_csv('similarities_unipartite_TNS_users_bis.csv')

In [100]:
pred_useruserTNS = make_pred_userCF(df_ratings_test,df_similarities_user_uniTNS)

In [101]:
mean_squared_error(df_ratings_test['rating'], pred_useruserTNS)

1.9713999069857184

In [102]:
mean_absolute_error(df_ratings_test['rating'], pred_useruserTNS)

1.1936409528162337

In [103]:
nb_changed_values = df_ratings_test.shape[0]-pred_useruserTNS.count(average_train)
nb_changed_values

267

Remark:

In [104]:
len([a for a in nx.isolates(G_users)])

1812

We have 1812 users without any friend. That's why the prediction using the unipartite graph is not really performant.

## Rating prediction based on Multi Graph

We will use our different similarity matrices in order to develop a better recommendation system based on our two users similarity matrices.

In [105]:
from sklearn.preprocessing import MinMaxScaler

In [106]:
df_1 = pd.read_csv('similarities_bipartite_users_bis.csv')
df_2 = pd.read_csv('similarities_unipartite_TNS_users_bis.csv')
df_1 = df_1.set_index('Unnamed: 0')
df_2 = df_2.set_index('Unnamed: 0')

In [107]:
#Normalization of our similarity matrices
ar_1 = df_1.values
std1 = ar_1.std(ddof=1)
mean1 = ar_1.mean()
ar_1 = (ar_1-mean1)/std1
df_sim_1 = pd.DataFrame(ar_1, columns = users_list, index = users_list)

scaler = MinMaxScaler()
df_sim_1_sc = pd.DataFrame(scaler.fit_transform(df_sim_1), columns=df_sim_1.columns, index=df_sim_1.index)


In [108]:
ar_2 = df_2.values
std2 = ar_2.std(ddof=1)
mean2 = ar_2.mean()
ar_2 = (ar_2-mean2)/std2
df_sim_2 = pd.DataFrame(ar_2, columns = users_list, index = users_list)

scaler = MinMaxScaler()
df_sim_2_sc = pd.DataFrame(scaler.fit_transform(df_sim_2), columns=df_sim_2.columns, index=df_sim_2.index)


In [109]:
def rating_u_i(u,i,G_us_re,G_users,df_sim_1_sc,df_sim_2_sc,average_train):
    A = (G_users.degree(u)*G_users.number_of_nodes())/nx.adjacency_matrix(G_users).count_nonzero()
    R = (G_us_re.degree(u)*G_users.number_of_nodes())/G_us_re.number_of_edges()
    if A+R !=0:
        r = R/(A+R)
    else:
        r=1/2
    similarities_1 = np.asarray(df_sim_1_sc[u])
    similarities_2 = np.asarray(df_sim_2_sc[u])
    ar_sim = r*similarities_1 + (1-r)*similarities_2
    
    n,d = 0,0
    pred = 0
    for v in users_list:
        index_v = users_list.index(v)
        rating_v_i = G_us_re.get_edge_data(v,i,default={'weight': 0.0})['weight']
        if rating_v_i != 0:
            n += ar_sim[index_v]*rating_v_i
            d += ar_sim[index_v]
    if d !=0:
        pred = n/d
    else:
        pred = average_train
    return pred
    

In [110]:
def rating_predictions(df_ratings_test,G_us_re,G_users,df_sim_1_sc,df_sim_2_sc,average_train):
    pred = []
    for z in range(df_ratings_test.shape[0]):
        userid = df_ratings_test['user_id'][z]
        businessid = df_ratings_test['business_id'][z]
        pred.append(rating_u_i(userid,businessid,G_us_re,G_users,df_sim_1_sc,df_sim_2_sc,average_train))
    return pred

In [111]:
pred_multi = rating_predictions(df_ratings_test,G_us_re,G_users,df_sim_1_sc,df_sim_2_sc,average_train)


In [112]:
mean_squared_error(df_ratings_test['rating'], pred_multi)

1.8698799201133873

In [113]:
mean_absolute_error(df_ratings_test['rating'], pred_multi)

1.0943437483232776

In [114]:
nb_changed_values = df_ratings_test.shape[0]-pred_multi.count(average_train)
nb_changed_values

2727

In [115]:
df_ratings_test_pred = df_ratings_test[['user_id','business_id','rating']]
df_ratings_test_pred['predicted'] = pred_multi

In [132]:
df_ratings_test_pred.head()

Unnamed: 0,user_id,business_id,rating,predicted
0,6X0i-oGUbh5DZdTHzFuKfg,dHkbBWmXXjaO_-9BgQyEPg,1.0,2.568443
1,TPFWtsUi_GwNm6BX6vGv5g,F1qPjasn0R6-j8sa6iYNmA,4.0,2.940811
2,-o-EaM-C3PROpIcwtI_AFw,QxbVPV2xndVP-nT1IRKF1w,2.0,4.0
3,CHkneOHkRH9Yf-LY-XlTIg,eJtmOfqwGj5sYKQrB01oWQ,1.0,3.20436
4,oJl-C8UECsibhHS2dB8yzQ,PBmfdx-tC2D54FI3HtcKww,2.0,3.224926


In [141]:
df_ratings_test_pred_x = df_ratings_test_pred.copy()
df_ratings_test_pred_x['pred_round']= round(df_ratings_test_pred_x['predicted'])


In [143]:
mean_squared_error(df_ratings_test_pred_x['rating'], df_ratings_test_pred_x['pred_round'])

1.9683333333333333

In [144]:
mean_absolute_error(df_ratings_test_pred_x['rating'], df_ratings_test_pred_x['pred_round'])

1.0736666666666668

In [142]:
df_ratings_test_pred_x.loc[df_ratings_test_pred_x['pred_round']==df_ratings_test_pred_x['rating']]

Unnamed: 0,user_id,business_id,rating,predicted,pred_round
6,ZgWVyPXX6bpmLDV1xvOl-w,kNtToQSP_Y5U8tznLXuCaw,4.0,4.007034,4.0
7,D-Yf58g7ZqYucgja1TUdVQ,9A1C1f0m4nQltQrOOTl-Kw,3.0,3.000000,3.0
9,iQt3ya8qaVJ347rJi5jSmA,i_t8WTwztuHweRqQ89hmuQ,4.0,3.614830,4.0
12,Yk3-qONLlCxzxc5bS-ImSQ,6ockUmdOBYBApEsUCJkBzA,5.0,4.779984,5.0
15,DPldvSGto59lJadlRxpmrw,9A1C1f0m4nQltQrOOTl-Kw,4.0,3.539027,4.0
...,...,...,...,...,...
2965,aQ1ztNVjHnGFKKnVSATkTQ,rXIDrS8Rz8r09vH4gjHfGQ,4.0,4.320066,4.0
2970,ZdzoROWb3lpr4qNOAoxtzw,0tU2xdfTSfz-BwD4VhtVAQ,4.0,3.624949,4.0
2980,dV4wf3PTE70HdaANCfcoZA,joCHRYAw1a5m-1ZhJ7uAGQ,5.0,4.626765,5.0
2988,_J5b7ykbzdxa5tmuhb74gQ,zp3rZNu-5qonTL3ByPYOUg,4.0,3.624949,4.0


In [138]:
df_ratings_test_pred_x.loc[(df_ratings_test_pred_x['pred_round']==df_ratings_test_pred_x['rating']-1)]

Unnamed: 0,user_id,business_id,rating,predicted,pred_round
1,TPFWtsUi_GwNm6BX6vGv5g,F1qPjasn0R6-j8sa6iYNmA,4.0,2.940811,3.0
5,JbRnDXp6ylbCw3jURKcB3g,hvPPw19VdGRsOEkxgyD7tg,5.0,4.251669,4.0
14,Rdp3C5KacNRbufVLL_XhMA,jeTfL2kCyBtmFGSrSQHqVw,5.0,4.153662,4.0
20,L_sVLn8BBDiVSNVlQOfCYg,L2c-qKZWumCmOCR-dqBLrg,5.0,4.173412,4.0
22,t9EWoO3fay-4_yJtCGdp3w,kNtToQSP_Y5U8tznLXuCaw,5.0,4.111889,4.0
...,...,...,...,...,...
2990,DMF6Gle5GC35h4lcl67x-Q,l4x5FFzfiCFrGN0S-uZbBQ,5.0,3.624949,4.0
2994,N4PnVYepGliRXeeITxN0iw,6jsuYyjUM45WAX-6SxOQgQ,5.0,3.624949,4.0
2995,1LtJ5w7YaxaN0equ3Oq6pw,dIUHCuiAlzkxfgCEOtky8w,5.0,3.624949,4.0
2996,EeOcpXst4ihchMBvNkAoUA,dIUHCuiAlzkxfgCEOtky8w,5.0,3.624949,4.0


In [139]:
df_ratings_test_pred_x.loc[(df_ratings_test_pred_x['pred_round']==df_ratings_test_pred_x['rating']+1)]

Unnamed: 0,user_id,business_id,rating,predicted,pred_round
4,oJl-C8UECsibhHS2dB8yzQ,PBmfdx-tC2D54FI3HtcKww,2.0,3.224926,3.0
8,sMEJUkjyTWQrdUaSBLR-sQ,9A1C1f0m4nQltQrOOTl-Kw,1.0,2.093914,2.0
11,oWjsGEPgt5qMymvmYoD0VQ,oWQwUw8xfQgi1nVNKgS6vA,3.0,3.793692,4.0
13,x6Fr3uJgECqvCFIe1d_sXQ,9A1C1f0m4nQltQrOOTl-Kw,2.0,3.044250,3.0
16,fwvJx2dzm3W9b5ajBYIPHw,KedpWqBphmRopwBd7trzFw,1.0,1.594412,2.0
...,...,...,...,...,...
2937,44ZwQqCtNIl79OaVn-sT7A,SUktrYdNQD8k2vvkM4OpfA,3.0,3.893496,4.0
2938,4zEBlnX60GYJceJ5fEXRtg,LvPbZ_5odnjE_oj5BNHivQ,3.0,4.247043,4.0
2949,DHiaN7x2UtHXRIcwwMqh0Q,4OABFHxKDYxJh9A2BhoRkg,4.0,5.000000,5.0
2972,utxFQ-yv3CXxc6KN1SvGJQ,ldKrUirpZZ6JDnNwOwLpvw,1.0,1.769401,2.0


In [140]:
1002+825+393

2220

## Rating prediction based on Multi Graph: Other approach

There are too many ratings that are not predicted (set to average_train).

In [117]:
average_users = []
for u in users_list:
    avg = df_ratings_train.loc[df_ratings_train['user_id']==u].rating.mean()
    if math.isnan(avg):
        average_users.append(average_train)
    else:
        average_users.append(avg)


In [118]:
def rating_u_i_imp(u,i,G_us_re,G_users,df_sim_1_sc,df_sim_2_sc,average_users):
    A = (G_users.degree(u)*G_users.number_of_nodes())/nx.adjacency_matrix(G_users).count_nonzero()
    R = (G_us_re.degree(u)*G_users.number_of_nodes())/G_us_re.number_of_edges()
    if A+R !=0:
        r = R/(A+R)
    else:
        r=1/2
    similarities_1 = np.asarray(df_sim_1_sc[u])
    similarities_2 = np.asarray(df_sim_2_sc[u])
    ar_sim = r*similarities_1 + (1-r)*similarities_2
    
    index_u = users_list.index(u)
    avg_u = average_users[index_u]
    
    n,d = 0,0
    pred = avg_u
    for v in users_list:
        index_v = users_list.index(v)
        avg_v = average_users[index_v]
        rating_v_i = G_us_re.get_edge_data(v,i,default={'weight': 0.0})['weight']
        if rating_v_i != 0:
            n += ar_sim[index_v]*abs(rating_v_i - avg_v)
            d += ar_sim[index_v]
    if d !=0:
        pred += n/d
    return pred
    

In [119]:
def rating_predictions_imp(df_ratings_test,G_us_re,G_users,df_sim_1_sc,df_sim_2_sc,average_users):
    pred = []
    for z in range(df_ratings_test.shape[0]):
        userid = df_ratings_test['user_id'][z]
        businessid = df_ratings_test['business_id'][z]
        pred.append(rating_u_i_imp(userid,businessid,G_us_re,G_users,df_sim_1_sc,df_sim_2_sc,average_users))
    return pred

In [120]:
pred_multi_imp = rating_predictions_imp(df_ratings_test,G_us_re,G_users,df_sim_1_sc,df_sim_2_sc,average_users)


In [121]:
mean_squared_error(df_ratings_test['rating'], pred_multi_imp)

2.8655603797249047

In [122]:
mean_absolute_error(df_ratings_test['rating'], pred_multi_imp)

1.2924552661184818

In [123]:
nb_changed_values = df_ratings_test.shape[0]-pred_multi_imp.count(average_train)
nb_changed_values

2871

In [124]:
df_ratings_test_pred_2 = df_ratings_test[['user_id','business_id','rating']]
df_ratings_test_pred_2['predicted'] = pred_multi_imp

In [125]:
df_ratings_test_pred_2

Unnamed: 0,user_id,business_id,rating,predicted
0,6X0i-oGUbh5DZdTHzFuKfg,dHkbBWmXXjaO_-9BgQyEPg,1.0,3.971954
1,TPFWtsUi_GwNm6BX6vGv5g,F1qPjasn0R6-j8sa6iYNmA,4.0,3.404085
2,-o-EaM-C3PROpIcwtI_AFw,QxbVPV2xndVP-nT1IRKF1w,2.0,3.502398
3,CHkneOHkRH9Yf-LY-XlTIg,eJtmOfqwGj5sYKQrB01oWQ,1.0,3.669230
4,oJl-C8UECsibhHS2dB8yzQ,PBmfdx-tC2D54FI3HtcKww,2.0,4.307118
...,...,...,...,...
2995,1LtJ5w7YaxaN0equ3Oq6pw,dIUHCuiAlzkxfgCEOtky8w,5.0,3.624949
2996,EeOcpXst4ihchMBvNkAoUA,dIUHCuiAlzkxfgCEOtky8w,5.0,3.624949
2997,Qu_MI93Sxl02KXNj-ylxAQ,E8b-loa-89wsnJpu9ff_Gg,4.0,3.624949
2998,36TpSKvNT8nKVMUe4OFncw,RAJUQvRLopFmIklzqaUYeA,3.0,3.624949


In [145]:
df_ratings_test_pred_2_x = df_ratings_test_pred_2.copy()
df_ratings_test_pred_2_x['pred_round']= round(df_ratings_test_pred_2_x['predicted'])

In [146]:
mean_squared_error(df_ratings_test_pred_2_x['rating'], df_ratings_test_pred_2_x['pred_round'])

2.9433333333333334

In [147]:
mean_absolute_error(df_ratings_test_pred_2_x['rating'], df_ratings_test_pred_2_x['pred_round'])

1.26

# Essai combinaison Multi-SVD

In [148]:
df_comp = df_ratings_test[['user_id','business_id','rating']]
df_comp['pred_SVD'] = pred_latent_list
df_comp['pred_multi_1'] = pred_multi
df_comp['pred_multi_2'] = pred_multi_imp
df_comp.loc[700,:]

user_id         MG3kdFrljYCcwkmzDDtR8g
business_id     BKNPoWf3bQoxFWf-Kjt3wQ
rating                               1
pred_SVD                       3.39629
pred_multi_1                         4
pred_multi_2                       5.5
Name: 700, dtype: object

In [149]:
def rating_prediction_multi_SVD(pred_svd,pred_mult):
    pred = []
    for z in range(df_ratings_test.shape[0]):
        if pred_mult[z] == average_train:
            pred.append(pred_svd[z])
        else:
            pred.append(pred_mult[z])
    return pred

In [154]:
pred_test1 = rating_prediction_multi_SVD(pred_latent2_list,pred_multi)
pred_test2 = rating_prediction_multi_SVD(pred_latent2_list,pred_multi_imp)

In [155]:
mean_squared_error(df_ratings_test['rating'], pred_test1)

1.849715641001648

In [156]:
mean_absolute_error(df_ratings_test['rating'], pred_test1)

1.0843950414275425

In [525]:
mean_squared_error(df_ratings_test['rating'], pred_test2)

2.410275579296073

In [526]:
mean_absolute_error(df_ratings_test['rating'], pred_test2)

1.2076380203543702

In [527]:
df_test = df_ratings_test[['user_id','business_id','rating']]
df_test['pred_test_1'] = pred_test1
df_test['pred_test_2'] = pred_test2

In [528]:
df_test

Unnamed: 0,user_id,business_id,rating,pred_test_1,pred_test_2
0,6X0i-oGUbh5DZdTHzFuKfg,u8C8pRvaHXg3PgDrsUHJHQ,5.0,3.779700,3.618027
1,6X0i-oGUbh5DZdTHzFuKfg,dHkbBWmXXjaO_-9BgQyEPg,1.0,3.238342,2.714286
2,iQt3ya8qaVJ347rJi5jSmA,dIZcPB3CtNjMn4O_p8QFxw,4.0,4.087520,4.450527
3,iQt3ya8qaVJ347rJi5jSmA,i_t8WTwztuHweRqQ89hmuQ,4.0,3.439996,4.433910
4,iQt3ya8qaVJ347rJi5jSmA,f_CDR6H4QL1K3SeaBe7r3g,3.0,3.540818,4.764313
...,...,...,...,...,...
3929,VO42jNnadblgtODxPYVDRA,k6m3Msok7bto6biv5guEzg,2.0,3.104620,3.104620
3930,nlHtklaFE5gfKCd6K-jeuQ,KGpsB2dsdkxl8SGVUlJbZw,1.0,3.191568,3.191568
3931,1LtJ5w7YaxaN0equ3Oq6pw,dIUHCuiAlzkxfgCEOtky8w,5.0,4.330800,4.330800
3932,EeOcpXst4ihchMBvNkAoUA,dIUHCuiAlzkxfgCEOtky8w,5.0,4.330800,4.330800
