In [3]:
import pandas as pd
import json
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

## Loading data

In [4]:
df_users = pd.read_csv('users_restaurants_illinois_reduced.csv')
df_business = pd.read_csv('businesses_restaurants_illinois_reduced.csv')
df_ratings = pd.read_csv('ratings_restaurants_illinois_reduced.csv')
df_users = df_users.drop(columns=['Unnamed: 0'])
df_business = df_business.drop(columns=['Unnamed: 0'])
df_ratings = df_ratings.drop(columns=['Unnamed: 0'])

## Preprocessing

#### Users data preprocessing

In [5]:
df_users.head(2)

Unnamed: 0,user_id,user_name,user_review_count,user_yelp_since,friends,useful_reviews,funny_reviews,cool_reviews,n_fans,years_elite,average_stars
0,4mjnkd8oJVCfBKN3i4rB-g,Tricia,1247,2008-11-27 02:24:47,"RO3vznPVVw5NWcgfP3k17A, azPMZoWkxdyjpnFul3aBQw...",3967,2631,2491,141,2009201020112012201320142015201620172018,3.42
1,ZdYHJ-hctaaDeH1CpP7-EA,Colleen,266,2009-08-08 16:46:46,"ouODopBKF3AqfCkuQEnrDg, QowYhZNL0T3UOi1FLG5VYQ...",491,103,152,11,20102011201220132014,4.02


In [6]:
df_users_p=df_users.copy()
#reviews
df_users_p["useful_reviews"]=df_users["useful_reviews"]/df_users["user_review_count"]
df_users_p["funny_reviews"]=df_users["funny_reviews"]/df_users["user_review_count"]
df_users_p["cool_reviews"]=df_users["cool_reviews"]/df_users["user_review_count"]
#elite->binary encoding
df_users_p["years_elite"]=df_users["years_elite"].apply(lambda x: 0 if pd.isna(x) else 1)
#days since registration
df_users_p["user_yelp_since"]=df_users["user_yelp_since"].apply(lambda x: x.split(" ")[0])
df_users_p["user_yelp_since"]=pd.to_datetime(df_users_p["user_yelp_since"])
df_users_p["today"]=["2020-01-01"]*len(df_users)
df_users_p["today"]=pd.to_datetime(df_users_p["today"])
df_users_p["user_yelp_since"]=(df_users_p["today"]-df_users_p["user_yelp_since"]).dt.days
#remove useless columns
df_users_p.drop(columns=["user_name","friends","today"],inplace=True)

In [7]:
df_users_p.head(2)

Unnamed: 0,user_id,user_review_count,user_yelp_since,useful_reviews,funny_reviews,cool_reviews,n_fans,years_elite,average_stars
0,4mjnkd8oJVCfBKN3i4rB-g,1247,4052,3.181235,2.109864,1.997594,141,1,3.42
1,ZdYHJ-hctaaDeH1CpP7-EA,266,3798,1.845865,0.387218,0.571429,11,1,4.02


#### Business data preprocessing

In [8]:
df_business.head(2)

Unnamed: 0,business_id,business_name,business_address,business_city,business_state,business_latitude,business_longitude,stars,review_counts,is_open,categories
0,pQeaRpvuhoEqudo3uymHIQ,The Empanadas House,404 E Green St,Champaign,IL,40.110446,-88.233073,4.5,5,1,"Ethnic Food, Food Trucks, Specialty Food, Impo..."
1,-LfTBo0oa_uD454ScEW2XA,Merry Ann's Diner,1 E Main St,Champaign,IL,40.118133,-88.2429,3.0,47,0,"Restaurants, Diners, Sandwiches, Breakfast & B..."


In [9]:
df_business_p=df_business.copy()

#encode categories
from sklearn.preprocessing import MultiLabelBinarizer
one_hot = MultiLabelBinarizer()

df_business["categories"]=df_business["categories"].apply(lambda x:x.split(","))

cat_col=one_hot.fit_transform(df_business["categories"])
cat_col=pd.DataFrame(cat_col)
cat_col.columns=list(one_hot.classes_)

#encode city
df_business_p["business_city"]=df_business["business_city"].apply(lambda x: x if x in ["Champaign","Urbana","Rantoul","Mahomet","Savoy","Monticello"] else "Other")
df_business_p=pd.get_dummies(df_business_p, columns=["business_city"])

#remove useless columns
df_business_p.drop(columns=["business_name","business_address","business_latitude","business_longitude","categories","business_state"],inplace=True)

In [10]:
df_business_p.head(2)

Unnamed: 0,business_id,stars,review_counts,is_open,business_city_Champaign,business_city_Mahomet,business_city_Monticello,business_city_Other,business_city_Rantoul,business_city_Savoy,business_city_Urbana
0,pQeaRpvuhoEqudo3uymHIQ,4.5,5,1,1,0,0,0,0,0,0
1,-LfTBo0oa_uD454ScEW2XA,3.0,47,0,1,0,0,0,0,0,0


#### Ratings data preprocessing

In [11]:
df_ratings.head(2)

Unnamed: 0,user_id,business_id,rating,date
0,6X0i-oGUbh5DZdTHzFuKfg,9A1C1f0m4nQltQrOOTl-Kw,1.0,2013-12-07 02:26:13
1,6X0i-oGUbh5DZdTHzFuKfg,u8C8pRvaHXg3PgDrsUHJHQ,5.0,2018-08-02 00:59:40


In [12]:
df_ratings_cop = df_ratings.copy()
df_ratings_cop = df_ratings_cop.groupby(['user_id'])['date'].max()
df_ratings_cop = df_ratings_cop.to_frame()
df_ratings_cop = df_ratings_cop.reset_index()


In [13]:
df_ratings_cop.head()

Unnamed: 0,user_id,date
0,-0e6xyw_4zyg-2YtqSlS_g,2017-09-06 03:02:05
1,-5PGdb8Cdp3GNZkiqyO8hQ,2019-03-14 00:31:06
2,-5RiprVYwmC33pb6sX7PGg,2014-02-27 05:13:07
3,-6v_LFbfmSbIx2ZSnVtixQ,2011-09-25 01:46:24
4,-B2cmf8vkUq0lUOUs63lhg,2019-08-16 14:26:06


In [14]:
#days since registration
df_ratings_p = df_ratings.copy()
df_ratings_p["date"]=pd.to_datetime(df_ratings_p["date"])
df_ratings_p["today"]=["2020-01-01"]*len(df_ratings)
df_ratings_p["today"]=pd.to_datetime(df_ratings_p["today"])
df_ratings_p["days_since_rating"]=(df_ratings_p["today"]-df_ratings_p["date"]).dt.days

#removing useless columns
df_ratings_p.drop(columns=["today"],inplace=True)

In [15]:
df_ratings_p.head(2)

Unnamed: 0,user_id,business_id,rating,date,days_since_rating
0,6X0i-oGUbh5DZdTHzFuKfg,9A1C1f0m4nQltQrOOTl-Kw,1.0,2013-12-07 02:26:13,2215
1,6X0i-oGUbh5DZdTHzFuKfg,u8C8pRvaHXg3PgDrsUHJHQ,5.0,2018-08-02 00:59:40,516


#### Merging dataframes

In [16]:
all_df=pd.merge(df_ratings_p,df_users_p,how="left",on='user_id')
all_df=pd.merge(all_df,df_business_p,how="left",on="business_id")
all_df.drop(columns=["business_id"],inplace=True);

In [17]:
all_df.head(2)

Unnamed: 0,user_id,rating,date,days_since_rating,user_review_count,user_yelp_since,useful_reviews,funny_reviews,cool_reviews,n_fans,...,stars,review_counts,is_open,business_city_Champaign,business_city_Mahomet,business_city_Monticello,business_city_Other,business_city_Rantoul,business_city_Savoy,business_city_Urbana
0,6X0i-oGUbh5DZdTHzFuKfg,1.0,2013-12-07 02:26:13,2215,22,3091,3.363636,0.636364,0.272727,2,...,2.5,42,0,1,0,0,0,0,0,0
1,6X0i-oGUbh5DZdTHzFuKfg,5.0,2018-08-02 00:59:40,516,22,3091,3.363636,0.636364,0.272727,2,...,3.5,440,1,1,0,0,0,0,0,0


## Model training

#### Split training et test sets

In [18]:
df_ratings_cop = all_df.copy()
df_ratings_cop = df_ratings_cop.groupby(['user_id'])['date'].max()
df_ratings_cop = df_ratings_cop.to_frame()
df_ratings_cop = df_ratings_cop.reset_index()

all_df['index']=all_df.index
all_df_test = pd.merge(all_df, df_ratings_cop, how='right')

all_df_test.drop([2327,2328],inplace=True)
all_df_test=all_df_test.reset_index(drop=True)
all_df_train = all_df.drop(list(all_df_test['index']))

all_df_train.drop(columns=["date","index","user_id"],inplace=True)
all_df_test.drop(columns=["date","index","user_id"],inplace=True)
all_df_train=all_df_train.reset_index(drop=True)

In [19]:
x_train,y_train=all_df_train.drop(columns=["rating"]),pd.DataFrame(all_df_train["rating"])
x_test,y_test=all_df_train.drop(columns=["rating"]),pd.DataFrame(all_df_train["rating"])

#### Cross-validation to determine best model

In [22]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

dtr = DecisionTreeRegressor(random_state=13,criterion="mse",max_depth=20)
scores = cross_val_score(dtr, x_train, y_train,cv=10,scoring="neg_mean_squared_error")
mse_scores = -scores
print("MSE de la régression avec arbre de décision: ",round(mse_scores.mean(),2))
print("Ecart type: ",round(mse_scores.std(),2))

MSE de la régression avec arbre de décision:  2.31
Ecart type:  0.24


In [23]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
scores = cross_val_score(linreg, x_train, y_train,cv=10,scoring="neg_mean_squared_error")
mse_scores = -scores
print("MSE de la régression linéaire: ",round(mse_scores.mean(),2))
print("Ecart type: ",round(mse_scores.std(),2))

MSE de la régression linéaire:  1.17
Ecart type:  0.1


In [24]:
from sklearn.ensemble import RandomForestRegressor
rdmforestr = RandomForestRegressor(random_state=13,criterion="mse",max_depth=15,n_estimators=100)
scores = cross_val_score(rdmforestr, x_train, y_train,cv=10,scoring="neg_mean_squared_error")
mse_scores = -scores
print("MSE de la régression random forest: ",round(mse_scores.mean(),2))
print("Ecart type: ",round(mse_scores.std(),2))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


MSE de la régression random forest:  1.21
Ecart type:  0.11


In [27]:
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor(boosting_type="gbdt",
                      max_depth = 20,
                      colsample_bytree=.9,
                      n_estimators = 150,
                      #min_child_samples=10,
                      #subsample=.9,
                      subsample_freq=1,
                      reg_alpha=0, 
                      reg_lambda=1,
                      n_jobs=-1,
                      tree_learner='voting',
                      max_bin = 100,
                      min_gain_to_split = .5)
scores = cross_val_score(lgbm, x_train, y_train,cv=10,scoring="neg_mean_squared_error")
mse_scores = -scores
print("MSE de la régression LGBM: ",round(mse_scores.mean(),2))
print("Ecart type: ",round(mse_scores.std(),2))

MSE de la régression LGBM:  1.18
Ecart type:  0.1


In [34]:
from sklearn.model_selection import GridSearchCV

grid = {'max_depth':[10,15,20,25],
        'n_estimators':[75,100,125]}

rfr = RandomForestRegressor(criterion="mse")

gridsearchrf = GridSearchCV(rfr, param_grid = grid, scoring='neg_mean_squared_error', n_jobs=-1, cv=10)
gridsearchrf.fit(x_train, y_train)

  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [10, 15, 20, 25],
 

In [36]:
gridsearchrf.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=75,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [None]:
from sklearn.model_selection import GridSearchCV

grid = {'max_depth':[8,10,12,14,16,18],
        'reg_lambda':[0.5, 1, 3, 5, 10],
        'n_estimators':[50,100,150]}

lgbm = LGBMRegressor(boosting_type="gbdt",
                      #max_depth = 15,
                      colsample_bytree=.9,
                      #n_estimators = 100,
                      #min_child_samples=10,
                      #subsample=.9,
                      subsample_freq=1,
                      reg_alpha=0, 
                      #reg_lambda=1,
                      n_jobs=-1,
                      tree_learner='voting',
                      max_bin = 100,
                      min_gain_to_split = .5)

gridsearch = GridSearchCV(lgbm, param_grid = grid, scoring='neg_mean_squared_error', n_jobs=-1, cv=10)
gridsearch.fit(x_train, y_train);

#### Performance evaluation

lgbm not tuned

In [32]:
lgbm.fit(x_train,y_train)
y_pred=lgbm.predict(x_test)
print("MSE test set: ", round(mean_squared_error(y_test, y_pred),2))

MSE test set:  0.98


Random forest regressor not tuned

In [33]:
rdmforestr.fit(x_train,y_train)
y_pred=rdmforestr.predict(x_test)
print("MSE test set: ", round(mean_squared_error(y_test, y_pred),2))

  """Entry point for launching an IPython kernel.


MSE test set:  0.48


Random forest regressor tuned

In [37]:
rf=gridsearchrf.best_estimator_
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)
print("MSE test set: ", round(mean_squared_error(y_test, y_pred),2))

  


MSE test set:  0.9
