In [1]:
import csv
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col

# Analysis for TripAdvisor

In [2]:
# Import the TripAdvisor file for regression analysis
df = pd.read_csv('reg.csv')
df.head()

Unnamed: 0,restaurant_name,link,price_range,category,michelin,ranking,avg_rating,food_rating,service_rating,value_rating,...,authentity_score,review,word_count,culture_word_count,culture_score,is_cul_neighborhood,is_match_culture,catergory_num,price_class_2.5,price_class_4.0
0,Bryn Mawr Breakfast Club,https://www.tripadvisor.com/Restaurant_Review-...,[],"['american', 'cafe', 'vegetarian friendly']",0,363,4,,,,...,59.025974,"{'for': 93, 'pair': 2, 'of': 160, 'year': 6, '...",9631,2,0.000208,0,0,3,1,0
1,Tre Kronor,https://www.tripadvisor.com/Restaurant_Review-...,[],"['european', 'swedish', 'scandinavian']",0,19,4,4.5,4.5,4.5,...,65.488372,"{'being': 11, 'swedish': 115, 'came': 12, 'to'...",11486,146,0.012711,0,0,3,1,0
2,Via Veneto Ristorante,https://www.tripadvisor.com/Restaurant_Review-...,"[30, 30]","['italian', 'vegetarian friendly']",0,195,4,4.5,5.0,4.5,...,66.22,"{'via': 31, 'veneto': 31, 'ristorante': 6, 'ha...",3934,38,0.009659,0,0,2,1,0
3,Hoanh Long Vietnamese & Chinese Restaurant,https://www.tripadvisor.com/Restaurant_Review-...,[],"['chinese', 'asian', 'vietnamese']",0,53,4,4.5,4.0,4.5,...,63.533333,"{'choose': 2, 'the': 112, 'hot': 5, 'pot': 3, ...",1818,19,0.010451,0,0,3,0,0
4,Martino's,https://www.tripadvisor.com/Restaurant_Review-...,[],"['italian', 'pizza', 'vegetarian friendly']",0,1,4,,,,...,68.3125,"{'we': 51, 'like': 15, 'to': 50, 'come': 3, 'm...",2378,16,0.006728,0,0,3,1,0


In [3]:
# Drop missing values (673 rows)
df= df.dropna(subset=['avg_rating', 'authentity_score', 
                    'culture_score', 
                     'michelin', 'review_count','is_cul_neighborhood',
                     'is_match_culture', 'catergory_num'])
df = df.reset_index(drop=True)

# Change the type of variables to a suitable type (e.g., object to integer)
df['avg_rating'] = pd.to_numeric(df['avg_rating'],errors='coerce')
df['review_count'] = pd.to_numeric(df['review_count'],errors='coerce')
df['food_rating'] = pd.to_numeric(df['atmosphere_rating'],errors='coerce')

df.head()

Unnamed: 0,restaurant_name,link,price_range,category,michelin,ranking,avg_rating,food_rating,service_rating,value_rating,...,authentity_score,review,word_count,culture_word_count,culture_score,is_cul_neighborhood,is_match_culture,catergory_num,price_class_2.5,price_class_4.0
0,Bryn Mawr Breakfast Club,https://www.tripadvisor.com/Restaurant_Review-...,[],"['american', 'cafe', 'vegetarian friendly']",0,363,4,,,,...,59.025974,"{'for': 93, 'pair': 2, 'of': 160, 'year': 6, '...",9631,2,0.000208,0,0,3,1,0
1,Tre Kronor,https://www.tripadvisor.com/Restaurant_Review-...,[],"['european', 'swedish', 'scandinavian']",0,19,4,4.5,4.5,4.5,...,65.488372,"{'being': 11, 'swedish': 115, 'came': 12, 'to'...",11486,146,0.012711,0,0,3,1,0
2,Via Veneto Ristorante,https://www.tripadvisor.com/Restaurant_Review-...,"[30, 30]","['italian', 'vegetarian friendly']",0,195,4,4.5,5.0,4.5,...,66.22,"{'via': 31, 'veneto': 31, 'ristorante': 6, 'ha...",3934,38,0.009659,0,0,2,1,0
3,Hoanh Long Vietnamese & Chinese Restaurant,https://www.tripadvisor.com/Restaurant_Review-...,[],"['chinese', 'asian', 'vietnamese']",0,53,4,4.5,4.0,4.5,...,63.533333,"{'choose': 2, 'the': 112, 'hot': 5, 'pot': 3, ...",1818,19,0.010451,0,0,3,0,0
4,Martino's,https://www.tripadvisor.com/Restaurant_Review-...,[],"['italian', 'pizza', 'vegetarian friendly']",0,1,4,,,,...,68.3125,"{'we': 51, 'like': 15, 'to': 50, 'come': 3, 'm...",2378,16,0.006728,0,0,3,1,0


In [4]:
# Create a correlation matrix
columns = ['ranking', 'service_rating', 'value_rating','atmosphere_rating',
          'rate5_count', 'rate4_count', 'rate3_count', 'rate2_count','rate1_count',
          'word_count', 'culture_word_count', 'culture_score', 'catergory_num']
df.drop(columns,axis=1, inplace=True)
corrMatrix = df.corr()
corrMatrix

Unnamed: 0,michelin,avg_rating,food_rating,review_count,authentity_score,is_cul_neighborhood,is_match_culture,price_class_2.5,price_class_4.0
michelin,1.0,0.138762,-0.061368,0.228438,-0.050176,0.066565,0.033986,0.021464,0.304375
avg_rating,0.138762,1.0,0.343172,0.058361,0.019879,0.082853,0.027978,-0.049562,0.095332
food_rating,-0.061368,0.343172,1.0,-0.035681,0.170849,0.188969,0.029576,-0.122581,-0.115729
review_count,0.228438,0.058361,-0.035681,1.0,0.039408,-0.136425,-0.022129,0.081172,0.190715
authentity_score,-0.050176,0.019879,0.170849,0.039408,1.0,0.149874,0.234223,-0.114925,-0.080133
is_cul_neighborhood,0.066565,0.082853,0.188969,-0.136425,0.149874,1.0,0.34346,-0.033478,-0.082351
is_match_culture,0.033986,0.027978,0.029576,-0.022129,0.234223,0.34346,1.0,-0.055513,-0.067482
price_class_2.5,0.021464,-0.049562,-0.122581,0.081172,-0.114925,-0.033478,-0.055513,1.0,-0.351646
price_class_4.0,0.304375,0.095332,-0.115729,0.190715,-0.080133,-0.082351,-0.067482,-0.351646,1.0


# Regression (1) - DV: avg_rating

In [37]:
# Model 1: baseline model- authenticity score and avg_rating

X = df[['authentity_score']] 
y = df['avg_rating']
X = sm.add_constant(X)

model1 = sm.OLS(y, X).fit()

print_model= model1.summary()
print(print_model)


                            OLS Regression Results                            
Dep. Variable:             avg_rating   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.2653
Date:                Tue, 08 Mar 2022   Prob (F-statistic):              0.607
Time:                        18:03:31   Log-Likelihood:                -214.10
No. Observations:                 673   AIC:                             432.2
Df Residuals:                     671   BIC:                             441.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                3.8955      0.111  

  x = pd.concat(x[::order], 1)


In [38]:
# Model 2: authenticity score + control variables

X = df[['authentity_score', 'price_class_2.5', 'price_class_4.0', 'michelin', 
        'review_count']] 
y = df['avg_rating']
X = sm.add_constant(X)

model2 = sm.OLS(y, X).fit()

print_model = model2.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:             avg_rating   R-squared:                       0.025
Model:                            OLS   Adj. R-squared:                  0.017
Method:                 Least Squares   F-statistic:                     3.383
Date:                Tue, 08 Mar 2022   Prob (F-statistic):            0.00502
Time:                        18:03:31   Log-Likelihood:                -205.80
No. Observations:                 673   AIC:                             423.6
Df Residuals:                     667   BIC:                             450.7
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                3.8808      0.118  

  x = pd.concat(x[::order], 1)


In [39]:
# Model 3: cultural neighborhood added

X = df[['authentity_score', 
        'price_class_2.5', 'price_class_4.0', 'michelin', 'review_count',
       'is_cul_neighborhood']] 
y = df['avg_rating']
X = sm.add_constant(X)

model3 = sm.OLS(y, X).fit()

print_model = model3.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:             avg_rating   R-squared:                       0.031
Model:                            OLS   Adj. R-squared:                  0.022
Method:                 Least Squares   F-statistic:                     3.551
Date:                Tue, 08 Mar 2022   Prob (F-statistic):            0.00181
Time:                        18:03:32   Log-Likelihood:                -203.63
No. Observations:                 673   AIC:                             421.3
Df Residuals:                     666   BIC:                             452.8
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   3.8876    

  x = pd.concat(x[::order], 1)


In [40]:
# Model 4: is_match_culture added

X = df[['authentity_score', 
        'price_class_2.5', 'price_class_4.0', 'michelin', 'review_count',
       'is_cul_neighborhood', 'is_match_culture']] 
y = df['avg_rating']
X = sm.add_constant(X)

model4 = sm.OLS(y, X).fit()

print_model = model4.summary()
print(print_model)


                            OLS Regression Results                            
Dep. Variable:             avg_rating   R-squared:                       0.031
Model:                            OLS   Adj. R-squared:                  0.021
Method:                 Least Squares   F-statistic:                     3.041
Date:                Tue, 08 Mar 2022   Prob (F-statistic):            0.00372
Time:                        18:03:32   Log-Likelihood:                -203.63
No. Observations:                 673   AIC:                             423.3
Df Residuals:                     665   BIC:                             459.3
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   3.8852    

  x = pd.concat(x[::order], 1)


In [42]:
# Create a regression table
dfoutput = summary_col([model1,model2,model3,model4],stars=True, regressor_order=['authentity_score', 'michelin',
                                                                                 'price_class_2.5', 'price_class_4.0',
                                                                                 'review_count', 'is_cul_neighborhood',
                                                                                 'is_match_culture'])
print(dfoutput)


                    avg_rating I avg_rating II avg_rating III avg_rating IIII
-----------------------------------------------------------------------------
authentity_score    0.0009       0.0011        0.0006         0.0006         
                    (0.0018)     (0.0018)      (0.0018)       (0.0019)       
michelin                         0.1427***     0.1294***      0.1296***      
                                 (0.0480)      (0.0483)       (0.0484)       
price_class_2.5                  -0.0269       -0.0244        -0.0246        
                                 (0.0313)      (0.0312)       (0.0313)       
price_class_4.0                  0.0677        0.0805         0.0800         
                                 (0.0704)      (0.0705)       (0.0707)       
review_count                     0.0001        0.0001         0.0001         
                                 (0.0001)      (0.0001)       (0.0001)       
is_cul_neighborhood                            0.0547**       0

Do authentic restaurants get higher ratings?

No, the results show no significant correlations between authenticity score and restaurant average ratings.

As shown in models 3 and 4, when the restaurant is located in a cultural neighborhood, there is a significant positive relationship with higher average ratings. Given that it is located in a cultural neighborhood, serving culturally match cuisine has little explanatory power to the average ratings.


# Regression (2) - DV: food_rating

In [43]:
# Drop missing values in food_rating
df= df.dropna(subset=['food_rating'])
df = df.reset_index(drop=True) #349 rows left
df

Unnamed: 0,restaurant_name,link,price_range,category,michelin,avg_rating,food_rating,review_count,cultural_neighborhood,authentity_score,review,is_cul_neighborhood,is_match_culture,price_class_2.5,price_class_4.0
0,Tre Kronor,https://www.tripadvisor.com/Restaurant_Review-...,[],"['european', 'swedish', 'scandinavian']",0,4,4.5,142,,65.488372,"{'being': 11, 'swedish': 115, 'came': 12, 'to'...",0,0,1,0
1,Via Veneto Ristorante,https://www.tripadvisor.com/Restaurant_Review-...,"[30, 30]","['italian', 'vegetarian friendly']",0,4,4.5,47,,66.220000,"{'via': 31, 'veneto': 31, 'ristorante': 6, 'ha...",0,0,1,0
2,Hoanh Long Vietnamese & Chinese Restaurant,https://www.tripadvisor.com/Restaurant_Review-...,[],"['chinese', 'asian', 'vietnamese']",0,4,4.5,22,,63.533333,"{'choose': 2, 'the': 112, 'hot': 5, 'pot': 3, ...",0,0,0,0
3,Mee Mah,https://www.tripadvisor.com/Restaurant_Review-...,[],"['chinese', 'asian', 'cantonese']",0,4,4.0,22,,58.625000,"{'few': 1, 'evenings': 1, 'ago': 2, 'we': 24, ...",0,0,1,0
4,Midori Japanese Restaurant,https://www.tripadvisor.com/Restaurant_Review-...,[],"['japanese', 'sushi', 'asian']",0,4,4.0,22,,63.388889,"{'wonderful': 2, 'tempura': 15, 'but': 25, 'of...",0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,Dusek's Board & Beer,https://www.tripadvisor.com/Restaurant_Review-...,[],"['american', 'bar', 'vegetarian friendly']",1,4,4.0,177,mexican,58.444444,"{'everything': 16, 'was': 265, 'delicious': 33...",1,0,1,0
345,Honky Tonk BBQ,https://www.tripadvisor.com/Restaurant_Review-...,[],"['american', 'bar', 'barbecue']",0,4,4.0,96,mexican,66.600000,"{'firstly': 2, 'the': 269, 'cornbread': 3, 'mu...",1,0,1,0
346,Taqueria El Milagro,https://www.tripadvisor.com/Restaurant_Review-...,[],"['quick bites', 'mexican', 'latin']",0,4,4.5,34,mexican,71.250000,"{'ve': 3, 'been': 6, 'going': 3, 'to': 36, 'th...",1,1,0,0
347,Simone's,https://www.tripadvisor.com/Restaurant_Review-...,"[5, 15]","['bar', 'international']",0,4,4.0,20,mexican,55.181818,"{'went': 4, 'for': 17, 'lunch': 2, 'on': 17, '...",1,0,0,0


In [44]:
# Model 1: baseline model - authenticity score and food_rating

X = df[['authentity_score']] 
y = df['food_rating']
X = sm.add_constant(X)

model1 = sm.OLS(y, X).fit()

print_model = model1.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:            food_rating   R-squared:                       0.029
Model:                            OLS   Adj. R-squared:                  0.026
Method:                 Least Squares   F-statistic:                     10.43
Date:                Tue, 08 Mar 2022   Prob (F-statistic):            0.00136
Time:                        18:05:17   Log-Likelihood:                -110.65
No. Observations:                 349   AIC:                             225.3
Df Residuals:                     347   BIC:                             233.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                3.5342      0.181  

  x = pd.concat(x[::order], 1)


In [45]:
# Model 2: baseline model + control variables

X = df[['authentity_score', 'price_class_2.5', 'price_class_4.0', 'michelin', 
        'review_count']] 
y = df['food_rating']
X = sm.add_constant(X)

model2 = sm.OLS(y, X).fit()

print_model = model2.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:            food_rating   R-squared:                       0.068
Model:                            OLS   Adj. R-squared:                  0.054
Method:                 Least Squares   F-statistic:                     5.006
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           0.000193
Time:                        18:05:26   Log-Likelihood:                -103.53
No. Observations:                 349   AIC:                             219.1
Df Residuals:                     343   BIC:                             242.2
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                3.7816      0.192  

  x = pd.concat(x[::order], 1)


In [46]:
# Model 3: cultural neighborhood added

X = df[['authentity_score', 'price_class_2.5', 'price_class_4.0',
        'michelin', 
        'review_count', 'is_cul_neighborhood']] 
y = df['food_rating']
X = sm.add_constant(X)

model3 = sm.OLS(y, X).fit()

print_model = model3.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:            food_rating   R-squared:                       0.089
Model:                            OLS   Adj. R-squared:                  0.073
Method:                 Least Squares   F-statistic:                     5.546
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           1.67e-05
Time:                        18:05:26   Log-Likelihood:                -99.618
No. Observations:                 349   AIC:                             213.2
Df Residuals:                     342   BIC:                             240.2
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   3.8411    

  x = pd.concat(x[::order], 1)


In [47]:
# Model 4: is_match_culture added

X = df[['authentity_score', 'price_class_2.5', 'price_class_4.0',
        'michelin', 
        'review_count', 'is_cul_neighborhood', 'is_match_culture']] 
y = df['food_rating']
X = sm.add_constant(X)

model4 = sm.OLS(y, X).fit()

print_model = model4.summary()
print(print_model)


                            OLS Regression Results                            
Dep. Variable:            food_rating   R-squared:                       0.093
Model:                            OLS   Adj. R-squared:                  0.074
Method:                 Least Squares   F-statistic:                     4.969
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           2.27e-05
Time:                        18:05:27   Log-Likelihood:                -98.873
No. Observations:                 349   AIC:                             213.7
Df Residuals:                     341   BIC:                             244.6
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   3.8099    

  x = pd.concat(x[::order], 1)


In [49]:
# Create a regression table
dfoutput = summary_col([model1,model2,model3,model4],stars=True, regressor_order=['authentity_score', 'michelin',
                                                                                 'price_class_2.5', 'price_class_4.0',
                                                                                 'review_count', 'is_cul_neighborhood',
                                                                                 'is_match_culture'])
print(dfoutput)


                    food_rating I food_rating II food_rating III food_rating IIII
---------------------------------------------------------------------------------
authentity_score    0.0094***     0.0076***      0.0057*         0.0062**        
                    (0.0029)      (0.0029)       (0.0030)        (0.0030)        
michelin                          0.0011         -0.0294         -0.0286         
                                  (0.0685)       (0.0687)        (0.0687)        
price_class_2.5                   -0.1568***     -0.1508***      -0.1519***      
                                  (0.0495)       (0.0491)        (0.0491)        
price_class_4.0                   -0.3129***     -0.2879***      -0.2941***      
                                  (0.1044)       (0.1038)        (0.1038)        
review_count                      0.0000         0.0001          0.0001          
                                  (0.0001)       (0.0001)        (0.0001)        
is_cul_neighbor

Do authentic restaurants get higher food ratings?

Yes, all models show that authenticity score is significantly correlated with food rating. Similar to previous results, locating in a cultural neighborhood is positively associated with food rating. Serving culturally match cuisine does not increase food rating. 


# Regression (3) - DV: authentity_score
## avg_rating is controlled

In [5]:
# Import the TripAdvisor file
df = pd.read_csv('reg.csv')

In [6]:
# Drop missing values (673 rows)
df= df.dropna(subset=['avg_rating', 'authentity_score',  
                     'michelin', 'review_count','is_cul_neighborhood',
                     'is_match_culture'])

df = df.reset_index(drop=True)

# Change the type of variables to a suitable type (e.g., object to integer)
df['avg_rating'] = pd.to_numeric(df['avg_rating'],errors='coerce')
df['review_count'] = pd.to_numeric(df['review_count'],errors='coerce')

df.head()

Unnamed: 0,restaurant_name,link,price_range,category,michelin,ranking,avg_rating,food_rating,service_rating,value_rating,...,authentity_score,review,word_count,culture_word_count,culture_score,is_cul_neighborhood,is_match_culture,catergory_num,price_class_2.5,price_class_4.0
0,Bryn Mawr Breakfast Club,https://www.tripadvisor.com/Restaurant_Review-...,[],"['american', 'cafe', 'vegetarian friendly']",0,363,4,,,,...,59.025974,"{'for': 93, 'pair': 2, 'of': 160, 'year': 6, '...",9631,2,0.000208,0,0,3,1,0
1,Tre Kronor,https://www.tripadvisor.com/Restaurant_Review-...,[],"['european', 'swedish', 'scandinavian']",0,19,4,4.5,4.5,4.5,...,65.488372,"{'being': 11, 'swedish': 115, 'came': 12, 'to'...",11486,146,0.012711,0,0,3,1,0
2,Via Veneto Ristorante,https://www.tripadvisor.com/Restaurant_Review-...,"[30, 30]","['italian', 'vegetarian friendly']",0,195,4,4.5,5.0,4.5,...,66.22,"{'via': 31, 'veneto': 31, 'ristorante': 6, 'ha...",3934,38,0.009659,0,0,2,1,0
3,Hoanh Long Vietnamese & Chinese Restaurant,https://www.tripadvisor.com/Restaurant_Review-...,[],"['chinese', 'asian', 'vietnamese']",0,53,4,4.5,4.0,4.5,...,63.533333,"{'choose': 2, 'the': 112, 'hot': 5, 'pot': 3, ...",1818,19,0.010451,0,0,3,0,0
4,Martino's,https://www.tripadvisor.com/Restaurant_Review-...,[],"['italian', 'pizza', 'vegetarian friendly']",0,1,4,,,,...,68.3125,"{'we': 51, 'like': 15, 'to': 50, 'come': 3, 'm...",2378,16,0.006728,0,0,3,1,0


In [52]:
# Model 1: add control variables

X = df[['price_class_2.5', 'price_class_4.0', 'michelin', 
        'review_count', 'avg_rating']] 
y = df['authentity_score']
X = sm.add_constant(X)

model1 = sm.OLS(y, X).fit()

print_model = model1.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:       authentity_score   R-squared:                       0.037
Model:                            OLS   Adj. R-squared:                  0.030
Method:                 Least Squares   F-statistic:                     5.173
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           0.000115
Time:                        18:06:35   Log-Likelihood:                -2261.7
No. Observations:                 673   AIC:                             4535.
Df Residuals:                     667   BIC:                             4562.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              60.9781      3.295     

  x = pd.concat(x[::order], 1)


In [53]:
# Model 2: is_cul_neighborhood included

X = df[['price_class_2.5', 'price_class_4.0', 'michelin', 
        'review_count', 'avg_rating',
       'is_cul_neighborhood']] 
y = df['authentity_score']
X = sm.add_constant(X)

model2 = sm.OLS(y, X).fit()

print_model = model2.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:       authentity_score   R-squared:                       0.059
Model:                            OLS   Adj. R-squared:                  0.050
Method:                 Least Squares   F-statistic:                     6.924
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           3.76e-07
Time:                        18:06:38   Log-Likelihood:                -2254.1
No. Observations:                 673   AIC:                             4522.
Df Residuals:                     666   BIC:                             4554.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  60.8652    

  x = pd.concat(x[::order], 1)


In [54]:
# Model 3: is_match_culture included

X = df[['price_class_2.5', 'price_class_4.0', 'michelin', 
        'review_count', 'avg_rating',
       'is_cul_neighborhood', 'is_match_culture']] 
y = df['authentity_score']
X = sm.add_constant(X)

model3 = sm.OLS(y, X).fit()

print_model = model3.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:       authentity_score   R-squared:                       0.091
Model:                            OLS   Adj. R-squared:                  0.081
Method:                 Least Squares   F-statistic:                     9.513
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           2.74e-11
Time:                        18:06:39   Log-Likelihood:                -2242.3
No. Observations:                 673   AIC:                             4501.
Df Residuals:                     665   BIC:                             4537.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  60.6759    

  x = pd.concat(x[::order], 1)


In [56]:
# Create a regression table
dfoutput = summary_col([model1,model2,model3],stars=True, regressor_order=['avg_rating', 'michelin',
                                                                                 'price_class_2.5', 'price_class_4.0',
                                                                                 'review_count', 'is_cul_neighborhood',
                                                                                 'is_match_culture'])
print(dfoutput)


                    authentity_score I authentity_score II authentity_score III
-------------------------------------------------------------------------------
avg_rating          0.5120             0.2491              0.2587              
                    (0.8212)           (0.8155)            (0.8020)            
michelin            -0.5713            -1.0444             -1.1875             
                    (1.0250)           (1.0215)            (1.0050)            
price_class_2.5     -2.7527***         -2.6002***          -2.3648***          
                    (0.6551)           (0.6494)            (0.6405)            
price_class_4.0     -5.1917***         -4.5597***          -4.0170***          
                    (1.4820)           (1.4755)            (1.4553)            
review_count        0.0055**           0.0069***           0.0064**            
                    (0.0025)           (0.0025)            (0.0025)            
is_cul_neighborhood                    

The results show that whether the restaurant is located in a cultural neighborhood and whether the culture matches with restaurant cuisine type are positively correlated with authenticity score.
Intuitively, people generally perceive that Chinese cuisines in Chinatown are more authentic than other Chinese restaurants in other neighborhoods.

# Analysis for Zomato 

# Regression (1) - DV: authentity_score

In [37]:
# Import the Zomato file for regression analysis
df = pd.read_csv('zomato_combined_data.csv') # 500 rows

# Create a dummy variable: is_cul_neighborhood
df['is_cul_neighborhood'] = df['cultural_neighborhood']!='None'
df["is_cul_neighborhood"] = df["is_cul_neighborhood"].astype(int)

# Create a dummy variable: is_match_culture
df["category"] = df["category"].str.lower()
df['is_match_culture'] = df.apply(lambda x: x.cultural_neighborhood in x.category, axis=1)
df["is_match_culture"] = df["is_match_culture"].astype(int)

In [39]:
# Export the Zomato file 
df.to_csv('z_reg.csv', index=False)

In [9]:
# Import the Zomato file 
df = pd.read_csv('z_reg.csv')
df.head()

Unnamed: 0,trip_res_name,restaurant_name_x,link,avg_rating,category,avgerage_cost,review_count,location,cultural_neighborhood,review,word_count,culture_word_count,culture_score,authentity_score,is_cul_neighborhood,is_match_culture
0,Chez Joel,Chez Joel,https://www.zomato.com/chicago/chez-joel-unive...,3.9,['french'],40.0,33,1119 W. Taylor Street 60607,italian,"{'we': 4, 'dine': 1, 'at': 1, 'chez': 2, 'joel...",263.0,2,0.007605,71.666667,1,0
1,The Rosebud,The Rosebud,https://www.zomato.com/chicago/the-rosebud-uni...,4.1,['italian'],37.5,83,"1500 W. Taylor Street, Chicago 60607",italian,"{'we': 23, 'went': 2, 'there': 7, 'for': 12, '...",1124.0,18,0.016014,55.727273,1,1
2,Tufanos Vernon Park Tap,Tufano's Vernon Park Tap,https://www.zomato.com/chicago/tufanos-vernon-...,4.2,['italian'],20.0,119,"1073 W Vernon Park Plaza, Chicago 60607",italian,"{'this': 8, 'is': 22, 'the': 47, 'kind': 1, 'o...",722.0,14,0.019391,64.75,1,1
3,Sweet Maple Cafe,Sweet Maple Cafe,https://www.zomato.com/chicago/sweet-maple-caf...,4.4,['american'],15.0,108,"1339 W. Taylor Street, Chicago 60607",italian,"{'the': 28, 'staff': 4, 'and': 17, 'manager': ...",509.0,0,0.0,0.0,1,0
4,Mario's Italian Lemonade,Mario's Italian Lemonade,https://www.zomato.com/chicago/marios-italian-...,4.0,['desserts'],2.5,42,1068 W. Taylor Street 60607,italian,"{'september': 2, 'really': 1, 'snuck': 1, 'up'...",433.0,0,0.0,64.571429,1,0


In [59]:
# Model 1: baseline model - authentity_score and avg_rating

X = df[['authentity_score']] 
y = df['avg_rating']
X = sm.add_constant(X)

model1 = sm.OLS(y, X).fit()

print_model = model1.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:             avg_rating   R-squared:                       0.056
Model:                            OLS   Adj. R-squared:                  0.054
Method:                 Least Squares   F-statistic:                     29.41
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           9.15e-08
Time:                        18:07:23   Log-Likelihood:                -317.79
No. Observations:                 501   AIC:                             639.6
Df Residuals:                     499   BIC:                             648.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                3.4795      0.044  

  x = pd.concat(x[::order], 1)


In [60]:
# Model 2: baseline model + control variables

X = df[['authentity_score', 'avgerage_cost', 
        'review_count']] 
y = df['avg_rating']
X = sm.add_constant(X)

model2 = sm.OLS(y, X).fit()

print_model = model2.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:             avg_rating   R-squared:                       0.306
Model:                            OLS   Adj. R-squared:                  0.302
Method:                 Least Squares   F-statistic:                     72.94
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           4.18e-39
Time:                        18:07:23   Log-Likelihood:                -240.74
No. Observations:                 501   AIC:                             489.5
Df Residuals:                     497   BIC:                             506.3
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                3.3789      0.042  

  x = pd.concat(x[::order], 1)


In [61]:
# Model 3: cultural neighborhood added

X = df[['authentity_score', 
        'avgerage_cost', 'review_count',
       'is_cul_neighborhood']] 
y = df['avg_rating']
X = sm.add_constant(X)

model3 = sm.OLS(y, X).fit()

print_model = model3.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:             avg_rating   R-squared:                       0.309
Model:                            OLS   Adj. R-squared:                  0.303
Method:                 Least Squares   F-statistic:                     55.45
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           1.20e-38
Time:                        18:07:24   Log-Likelihood:                -239.54
No. Observations:                 501   AIC:                             489.1
Df Residuals:                     496   BIC:                             510.2
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   3.3502    

  x = pd.concat(x[::order], 1)


In [62]:
# Model 4: is_match_culture added

X = df[['authentity_score', 
        'avgerage_cost', 'review_count',
       'is_cul_neighborhood', 'is_match_culture']] 
y = df['avg_rating']
X = sm.add_constant(X)

model4 = sm.OLS(y, X).fit()

print_model = model4.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:             avg_rating   R-squared:                       0.309
Model:                            OLS   Adj. R-squared:                  0.302
Method:                 Least Squares   F-statistic:                     44.27
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           9.53e-38
Time:                        18:07:31   Log-Likelihood:                -239.54
No. Observations:                 501   AIC:                             491.1
Df Residuals:                     495   BIC:                             516.4
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   3.3502    

  x = pd.concat(x[::order], 1)


In [63]:
# Create a regression table 
dfoutput = summary_col([model1,model2,model3, model4],stars=True, regressor_order=['authentity_score', 'avgerage_cost',
                                                                                 'review_count', 'is_cul_neighborhood',
                                                                                 'is_match_culture'])
print(dfoutput)


                    avg_rating I avg_rating II avg_rating III avg_rating IIII
-----------------------------------------------------------------------------
authentity_score    0.0043***    0.0025***     0.0025***      0.0025***      
                    (0.0008)     (0.0007)      (0.0007)       (0.0007)       
avgerage_cost                    0.0018**      0.0020**       0.0020**       
                                 (0.0009)      (0.0009)       (0.0009)       
review_count                     0.0031***     0.0031***      0.0031***      
                                 (0.0002)      (0.0002)       (0.0002)       
is_cul_neighborhood                            0.0545         0.0542         
                                               (0.0353)       (0.0371)       
is_match_culture                                              0.0018         
                                                              (0.0674)       
const               3.4795***    3.3789***     3.3502***      3

Do authentic restaurants get higher ratings?
Yes, as displayed in all models, restaurants with higher authenticity scores tend to have higher average ratings. However, the location of the restaurant and the cusine type served have less explanatory power to the restaurant average ratings. 

# Regression (2) - DV : authentity_score

In [64]:
# Model 1: control variables included

X = df[['avgerage_cost',  
        'review_count', 'avg_rating']] 
y = df['authentity_score']
X = sm.add_constant(X)

model1 = sm.OLS(y, X).fit()

print_model = model1.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:       authentity_score   R-squared:                       0.062
Model:                            OLS   Adj. R-squared:                  0.056
Method:                 Least Squares   F-statistic:                     10.92
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           5.89e-07
Time:                        18:08:26   Log-Likelihood:                -2320.9
No. Observations:                 501   AIC:                             4650.
Df Residuals:                     497   BIC:                             4667.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             9.6901      9.988      0.970

  x = pd.concat(x[::order], 1)


In [65]:
# Model 2: is_cul_neighborhood included

X = df[['avgerage_cost',  
        'review_count', 'avg_rating', 'is_cul_neighborhood']] 
y = df['authentity_score']
X = sm.add_constant(X)

model2 = sm.OLS(y, X).fit()

print_model = model2.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:       authentity_score   R-squared:                       0.062
Model:                            OLS   Adj. R-squared:                  0.054
Method:                 Least Squares   F-statistic:                     8.176
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           2.17e-06
Time:                        18:08:29   Log-Likelihood:                -2320.9
No. Observations:                 501   AIC:                             4652.
Df Residuals:                     496   BIC:                             4673.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   9.6349    

  x = pd.concat(x[::order], 1)


In [66]:
# Model 3: is_match_culture included

X = df[['avgerage_cost',  
        'review_count', 'avg_rating', 'is_cul_neighborhood',
       'is_match_culture']] 
y = df['authentity_score']
X = sm.add_constant(X)

model3 = sm.OLS(y, X).fit()

print_model = model3.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:       authentity_score   R-squared:                       0.062
Model:                            OLS   Adj. R-squared:                  0.053
Method:                 Least Squares   F-statistic:                     6.549
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           6.50e-06
Time:                        18:08:59   Log-Likelihood:                -2320.8
No. Observations:                 501   AIC:                             4654.
Df Residuals:                     495   BIC:                             4679.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   9.6251    

  x = pd.concat(x[::order], 1)


In [68]:
# Create a regression table
dfoutput = summary_col([model1,model2,model3],stars=True, regressor_order=['avg_rating', 'avgerage_cost',
                                                                                 'review_count', 'is_cul_neighborhood',
                                                                                 'is_match_culture'])
print(dfoutput)


                    authentity_score I authentity_score II authentity_score III
-------------------------------------------------------------------------------
avg_rating          10.2251***         10.2039***          10.2008***          
                    (2.8139)           (2.8238)            (2.8264)            
avgerage_cost       -0.0036            -0.0029             -0.0021             
                    (0.0588)           (0.0593)            (0.0594)            
review_count        0.0323*            0.0324*             0.0325*             
                    (0.0180)           (0.0180)            (0.0180)            
is_cul_neighborhood                    0.2390              0.0101              
                                       (2.2545)            (2.3703)            
is_match_culture                                           1.3555              
                                                           (4.2964)            
const               9.6901             

Restaurants with higher average rating tend to have higher authenticity score.

# Combine TripAdvisor & Zomato

In [69]:
# Import TripAdvisor data 
trip_df = pd.read_csv('reg.csv')

In [70]:
# Import Zomato data 
df = pd.read_csv('z_reg.csv')

In [71]:
# Combine TripAdvisor and Zomato by name 
com_df = pd.merge(trip_df,df[['trip_res_name','word_count', 'culture_word_count', 'culture_score', 'authentity_score', 'review_count', 'avg_rating']],left_on = 'restaurant_name', right_on='trip_res_name', how='left')
com_df.head()

Unnamed: 0,restaurant_name,link,price_range,category,michelin,ranking,avg_rating_x,food_rating,service_rating,value_rating,...,catergory_num,price_class_2.5,price_class_4.0,trip_res_name,word_count_y,culture_word_count_y,culture_score_y,authentity_score_y,review_count_y,avg_rating_y
0,Bryn Mawr Breakfast Club,https://www.tripadvisor.com/Restaurant_Review-...,[],"['american', 'cafe', 'vegetarian friendly']",0,363,4,,,,...,3,1,0,Bryn Mawr Breakfast Club,283.0,0.0,0.0,0.0,8.0,3.4
1,Tre Kronor,https://www.tripadvisor.com/Restaurant_Review-...,[],"['european', 'swedish', 'scandinavian']",0,19,4,4.5,4.5,4.5,...,3,1,0,Tre Kronor,1143.0,17.0,0.014873,64.428571,115.0,4.5
2,Via Veneto Ristorante,https://www.tripadvisor.com/Restaurant_Review-...,"[30, 30]","['italian', 'vegetarian friendly']",0,195,4,4.5,5.0,4.5,...,2,1,0,Via Veneto Ristorante,113.0,3.0,0.026549,91.5,15.0,3.3
3,Hoanh Long Vietnamese & Chinese Restaurant,https://www.tripadvisor.com/Restaurant_Review-...,[],"['chinese', 'asian', 'vietnamese']",0,53,4,4.5,4.0,4.5,...,3,0,0,Hoanh Long Vietnamese & Chinese Restaurant,462.0,5.0,0.010823,70.285714,15.0,3.9
4,Martino's,https://www.tripadvisor.com/Restaurant_Review-...,[],"['italian', 'pizza', 'vegetarian friendly']",0,1,4,,,,...,3,1,0,Martino's,14.0,0.0,0.0,0.0,6.0,3.8


In [72]:
# Create combined_culture_score 
com_df['combined_culture_score'] = (com_df['culture_word_count_x'] + com_df['culture_word_count_y'])/ \
                                    (com_df['word_count_x'] + com_df['word_count_y'])
com_df['combined_culture_score'] = com_df['combined_culture_score'].fillna(com_df['culture_score_x'])

In [73]:
# Create combined_authentity_score
com_df['combined_authentity_score'] = com_df['authentity_score_x'] * com_df['word_count_x'] / (com_df['word_count_x'] + com_df['word_count_y']) +\
                                       com_df['authentity_score_y'] * com_df['word_count_y'] / (com_df['word_count_x'] + com_df['word_count_y'])
com_df['combined_authentity_score'] = com_df['combined_authentity_score'].fillna(com_df['authentity_score_x'])


In [74]:
# Create combined_review_count
com_df['review_count_x'] = pd.to_numeric(com_df['review_count_x'], errors='coerce')
com_df['combined_review_count'] = com_df['review_count_x'] + com_df['review_count_y']
com_df['combined_review_count'] = com_df['combined_review_count'].fillna(com_df['review_count_x'])


In [75]:
# Create combined_review_count
com_df['combined_avg_rating']= (com_df['avg_rating_x'] * com_df['review_count_x'] / (com_df['review_count_x'] + com_df['review_count_y']) +\
                                 com_df['avg_rating_y'] * com_df['review_count_y'] / (com_df['word_count_x'] + com_df['word_count_y']))
com_df['combined_avg_rating'] = com_df['combined_avg_rating'].fillna(com_df['avg_rating_x'])             
                                

In [76]:
# Drop missing values (672 rows)
com_df= com_df.dropna(subset=['combined_avg_rating', 'combined_authentity_score', 
                    'combined_culture_score', 
                     'michelin', 'combined_review_count','is_cul_neighborhood',
                     'is_match_culture', 'catergory_num'])

com_df = com_df.reset_index(drop=True)

# Change the type of variables to a suitable type (e.g., object to integer)
com_df['ranking'] = pd.to_numeric(com_df['ranking'],errors='coerce')
com_df['food_rating'] = pd.to_numeric(com_df['atmosphere_rating'],errors='coerce')
com_df



Unnamed: 0,restaurant_name,link,price_range,category,michelin,ranking,avg_rating_x,food_rating,service_rating,value_rating,...,word_count_y,culture_word_count_y,culture_score_y,authentity_score_y,review_count_y,avg_rating_y,combined_culture_score,combined_authentity_score,combined_review_count,combined_avg_rating
0,Bryn Mawr Breakfast Club,https://www.tripadvisor.com/Restaurant_Review-...,[],"['american', 'cafe', 'vegetarian friendly']",0,363,4,,,,...,283.0,0.0,0.000000,0.000000,8.0,3.4,0.000202,57.341049,75.0,3.576077
1,Tre Kronor,https://www.tripadvisor.com/Restaurant_Review-...,[],"['european', 'swedish', 'scandinavian']",0,19,4,4.5,4.5,4.5,...,1143.0,17.0,0.014873,64.428571,115.0,4.5,0.012907,65.392454,257.0,2.251094
2,Via Veneto Ristorante,https://www.tripadvisor.com/Restaurant_Review-...,"[30, 30]","['italian', 'vegetarian friendly']",0,195,4,4.5,5.0,4.5,...,113.0,3.0,0.026549,91.500000,15.0,3.3,0.010131,66.925866,62.0,3.044489
3,Hoanh Long Vietnamese & Chinese Restaurant,https://www.tripadvisor.com/Restaurant_Review-...,[],"['chinese', 'asian', 'vietnamese']",0,53,4,4.5,4.0,4.5,...,462.0,5.0,0.010823,70.285714,15.0,3.9,0.010526,64.901579,37.0,2.404036
4,Martino's,https://www.tripadvisor.com/Restaurant_Review-...,[],"['italian', 'pizza', 'vegetarian friendly']",0,1,4,,,,...,14.0,0.0,0.000000,0.000000,6.0,3.8,0.006689,67.912678,26.0,3.086455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
668,Simone's,https://www.tripadvisor.com/Restaurant_Review-...,"[5, 15]","['bar', 'international']",0,1,4,4.0,4.5,4.0,...,244.0,0.0,0.000000,56.000000,46.0,3.8,0.000000,55.312556,66.0,1.326594
669,Los Comales of 18th Street,https://www.tripadvisor.com/Restaurant_Review-...,[],"['mexican', 'spanish']",0,114,4,,,,...,271.0,3.0,0.011070,0.000000,19.0,3.8,0.010791,66.817986,40.0,2.151942
670,Taqueria Casa Del Pueblo,https://www.tripadvisor.com/Restaurant_Review-...,[],"['mexican', 'latin', 'spanish']",0,131,4,,,,...,198.0,0.0,0.000000,0.000000,14.0,2.9,0.009836,76.955107,33.0,2.313270
671,May St. Cafe,https://www.tripadvisor.com/Restaurant_Review-...,"[20, 40]","['latin', 'spanish']",0,119,4,4.0,4.0,4.0,...,176.0,0.0,0.000000,55.333333,25.0,3.4,0.002976,60.575964,44.0,1.763412


# Regression (1) - DV : combined_avg_rating

In [77]:
# Model 1: Baseline model - combined_authentity_score and combined_avg_rating
X = com_df[['combined_authentity_score']] 
y = com_df['combined_avg_rating']
X = sm.add_constant(X)

model1 = sm.OLS(y, X).fit()

print_model = model1.summary()
print(print_model)

                             OLS Regression Results                            
Dep. Variable:     combined_avg_rating   R-squared:                       0.000
Model:                             OLS   Adj. R-squared:                 -0.001
Method:                  Least Squares   F-statistic:                    0.1398
Date:                 Tue, 08 Mar 2022   Prob (F-statistic):              0.709
Time:                         18:10:25   Log-Likelihood:                -881.04
No. Observations:                  673   AIC:                             1766.
Df Residuals:                      671   BIC:                             1775.
Df Model:                            1                                         
Covariance Type:             nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const       

  x = pd.concat(x[::order], 1)


In [78]:
# Model 2: authenticity score + control variables

X = com_df[['combined_authentity_score', 'price_class_2.5','price_class_4.0', 'michelin', 
        'combined_review_count']] 
y = com_df['combined_avg_rating']
X = sm.add_constant(X)

model2 = sm.OLS(y, X).fit()

print_model = model2.summary()
print(print_model)


                             OLS Regression Results                            
Dep. Variable:     combined_avg_rating   R-squared:                       0.086
Model:                             OLS   Adj. R-squared:                  0.079
Method:                  Least Squares   F-statistic:                     12.55
Date:                 Tue, 08 Mar 2022   Prob (F-statistic):           1.15e-11
Time:                         18:10:29   Log-Likelihood:                -850.86
No. Observations:                  673   AIC:                             1714.
Df Residuals:                      667   BIC:                             1741.
Df Model:                            5                                         
Covariance Type:             nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const       

  x = pd.concat(x[::order], 1)


In [80]:
# Model 3: cultural neighborhood added

X = com_df[['combined_authentity_score', 
        'price_class_2.5','price_class_4.0', 'michelin', 'combined_review_count',
       'is_cul_neighborhood']] 
y = com_df['combined_avg_rating']
X = sm.add_constant(X)

model3 = sm.OLS(y, X).fit()

print_model = model3.summary()
print(print_model)

                             OLS Regression Results                            
Dep. Variable:     combined_avg_rating   R-squared:                       0.099
Model:                             OLS   Adj. R-squared:                  0.091
Method:                  Least Squares   F-statistic:                     12.16
Date:                 Tue, 08 Mar 2022   Prob (F-statistic):           5.34e-13
Time:                         18:10:30   Log-Likelihood:                -846.14
No. Observations:                  673   AIC:                             1706.
Df Residuals:                      666   BIC:                             1738.
Df Model:                            6                                         
Covariance Type:             nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const       

  x = pd.concat(x[::order], 1)


In [81]:
# Model 4: is_match_culture added

X = com_df[['combined_authentity_score', 
        'price_class_2.5','price_class_4.0', 'michelin', 'combined_review_count',
       'is_cul_neighborhood', 'is_match_culture']] 
y = com_df['combined_avg_rating']
X = sm.add_constant(X)

model4 = sm.OLS(y, X).fit()

print_model = model4.summary()
print(print_model)

                             OLS Regression Results                            
Dep. Variable:     combined_avg_rating   R-squared:                       0.103
Model:                             OLS   Adj. R-squared:                  0.094
Method:                  Least Squares   F-statistic:                     10.94
Date:                 Tue, 08 Mar 2022   Prob (F-statistic):           4.09e-13
Time:                         18:10:30   Log-Likelihood:                -844.44
No. Observations:                  673   AIC:                             1705.
Df Residuals:                      665   BIC:                             1741.
Df Model:                            7                                         
Covariance Type:             nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const       

  x = pd.concat(x[::order], 1)


In [84]:
# Create a regression table
dfoutput = summary_col([model1,model2,model3,model4],stars=True, regressor_order=['combined_authentity_score', 'michelin',
                                                                                 'price_class_2.5', 'price_class_4.0',
                                                                                 'combined_review_count', 'is_cul_neighborhood',
                                                                                 'is_match_culture'])
print(dfoutput)


                          combined_avg_rating I combined_avg_rating II combined_avg_rating III combined_avg_rating IIII
-----------------------------------------------------------------------------------------------------------------------
combined_authentity_score -0.0018               0.0036                 0.0055                  0.0039                  
                          (0.0049)              (0.0048)               (0.0048)                (0.0049)                
michelin                                        0.5375***              0.5842***               0.5757***               
                                                (0.1247)               (0.1249)                (0.1247)                
price_class_2.5                                 0.2558***              0.2437***               0.2502***               
                                                (0.0812)               (0.0808)                (0.0808)                
price_class_4.0                        

Results are similar to those produced from TripAdvisor data

# Regression (2) - DV : food_rating

In [85]:
# Drop missing value in food_rating
com_df= com_df.dropna(subset=['food_rating'])
com_df = com_df.reset_index(drop=True) #349 rows left
com_df

Unnamed: 0,restaurant_name,link,price_range,category,michelin,ranking,avg_rating_x,food_rating,service_rating,value_rating,...,word_count_y,culture_word_count_y,culture_score_y,authentity_score_y,review_count_y,avg_rating_y,combined_culture_score,combined_authentity_score,combined_review_count,combined_avg_rating
0,Tre Kronor,https://www.tripadvisor.com/Restaurant_Review-...,[],"['european', 'swedish', 'scandinavian']",0,19,4,4.5,4.5,4.5,...,1143.0,17.0,0.014873,64.428571,115.0,4.5,0.012907,65.392454,257.0,2.251094
1,Via Veneto Ristorante,https://www.tripadvisor.com/Restaurant_Review-...,"[30, 30]","['italian', 'vegetarian friendly']",0,195,4,4.5,5.0,4.5,...,113.0,3.0,0.026549,91.500000,15.0,3.3,0.010131,66.925866,62.0,3.044489
2,Hoanh Long Vietnamese & Chinese Restaurant,https://www.tripadvisor.com/Restaurant_Review-...,[],"['chinese', 'asian', 'vietnamese']",0,53,4,4.5,4.0,4.5,...,462.0,5.0,0.010823,70.285714,15.0,3.9,0.010526,64.901579,37.0,2.404036
3,Mee Mah,https://www.tripadvisor.com/Restaurant_Review-...,[],"['chinese', 'asian', 'cantonese']",0,1,4,4.0,4.5,4.0,...,13.0,0.0,0.000000,0.000000,6.0,3.6,0.009428,58.145977,28.0,3.156434
4,Midori Japanese Restaurant,https://www.tripadvisor.com/Restaurant_Review-...,[],"['japanese', 'sushi', 'asian']",0,91,4,4.0,4.0,4.0,...,137.0,0.0,0.000000,67.750000,17.0,2.9,0.005575,63.645095,39.0,2.277551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,Dusek's Board & Beer,https://www.tripadvisor.com/Restaurant_Review-...,[],"['american', 'bar', 'vegetarian friendly']",1,178,4,4.0,4.5,4.0,...,1428.0,0.0,0.000000,54.888889,40.0,3.9,0.000293,58.072342,217.0,3.274106
345,Honky Tonk BBQ,https://www.tripadvisor.com/Restaurant_Review-...,[],"['american', 'bar', 'barbecue']",0,363,4,4.0,4.5,4.0,...,2120.0,0.0,0.000000,56.333333,100.0,4.1,0.000307,63.257144,196.0,2.022154
346,Taqueria El Milagro,https://www.tripadvisor.com/Restaurant_Review-...,[],"['quick bites', 'mexican', 'latin']",0,244,4,4.5,4.5,4.5,...,81.0,0.0,0.000000,0.000000,17.0,3.2,0.007001,67.882876,51.0,2.698405
347,Simone's,https://www.tripadvisor.com/Restaurant_Review-...,"[5, 15]","['bar', 'international']",0,1,4,4.0,4.5,4.0,...,244.0,0.0,0.000000,56.000000,46.0,3.8,0.000000,55.312556,66.0,1.326594


In [86]:
# Model 1: baseline model - combined_authentity_score and food_rating

X = com_df[['combined_authentity_score']] 
y = com_df['food_rating']
X = sm.add_constant(X)

model1 = sm.OLS(y, X).fit()

print_model = model1.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:            food_rating   R-squared:                       0.026
Model:                            OLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     9.174
Date:                Tue, 08 Mar 2022   Prob (F-statistic):            0.00264
Time:                        18:13:17   Log-Likelihood:                -111.27
No. Observations:                 349   AIC:                             226.5
Df Residuals:                     347   BIC:                             234.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 

  x = pd.concat(x[::order], 1)


In [87]:
# Model 2: control variables included

X = com_df[['combined_authentity_score', 'price_class_2.5','price_class_4.0', 'michelin', 
        'combined_review_count']] 
y = com_df['food_rating']
X = sm.add_constant(X)

model2 = sm.OLS(y, X).fit()

print_model = model2.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:            food_rating   R-squared:                       0.068
Model:                            OLS   Adj. R-squared:                  0.054
Method:                 Least Squares   F-statistic:                     5.009
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           0.000192
Time:                        18:13:18   Log-Likelihood:                -103.52
No. Observations:                 349   AIC:                             219.0
Df Residuals:                     343   BIC:                             242.2
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 

  x = pd.concat(x[::order], 1)


In [88]:
# Model 3: cultural neighborhood added

X = com_df[['combined_authentity_score',
        'price_class_2.5','price_class_4.0', 'michelin', 'combined_review_count',
       'is_cul_neighborhood']] 
y = com_df['food_rating']
X = sm.add_constant(X)

model3 = sm.OLS(y, X).fit()

print_model = model3.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:            food_rating   R-squared:                       0.091
Model:                            OLS   Adj. R-squared:                  0.075
Method:                 Least Squares   F-statistic:                     5.697
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           1.16e-05
Time:                        18:13:18   Log-Likelihood:                -99.199
No. Observations:                 349   AIC:                             212.4
Df Residuals:                     342   BIC:                             239.4
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 

  x = pd.concat(x[::order], 1)


In [89]:
# Model 4: is_match_culture added

X = com_df[['combined_authentity_score', 
        'price_class_2.5','price_class_4.0', 'michelin', 'combined_review_count',
       'is_cul_neighborhood', 'is_match_culture']] 
y = com_df['food_rating']
X = sm.add_constant(X)

model4 = sm.OLS(y, X).fit()

print_model = model4.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:            food_rating   R-squared:                       0.095
Model:                            OLS   Adj. R-squared:                  0.076
Method:                 Least Squares   F-statistic:                     5.084
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           1.65e-05
Time:                        18:13:19   Log-Likelihood:                -98.499
No. Observations:                 349   AIC:                             213.0
Df Residuals:                     341   BIC:                             243.8
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 

  x = pd.concat(x[::order], 1)


In [91]:
# Create a regression table
dfoutput = summary_col([model1,model2,model3,model4],stars=True, regressor_order=['combined_authentity_score', 'michelin',
                                                                                 'price_class_2.5', 'price_class_4.0',
                                                                                 'combined_review_count', 'is_cul_neighborhood',
                                                                                 'is_match_culture'])
print(dfoutput)


                          food_rating I food_rating II food_rating III food_rating IIII
---------------------------------------------------------------------------------------
combined_authentity_score 0.0087***     0.0072**       0.0055*         0.0060**        
                          (0.0029)      (0.0028)       (0.0029)        (0.0029)        
michelin                                -0.0092        -0.0376         -0.0367         
                                        (0.0682)       (0.0682)        (0.0681)        
price_class_2.5                         -0.1622***     -0.1535***      -0.1547***      
                                        (0.0492)       (0.0488)        (0.0488)        
price_class_4.0                         -0.3243***     -0.2911***      -0.2968***      
                                        (0.1032)       (0.1027)        (0.1027)        
combined_review_count                   0.0001         0.0001          0.0001          
                               

Results are similar to those produced from TripAdvisor data

# Overview of the results

First, the authenticity score is positively correlated to a higher food rating. 
Authenticity score is not correlated to the average rating in TripAdvisor, but in Zomato.
One of the possible reasons is that, in TripAdvisor, the average rating is the average score for food, atmosphere, value, and service, so higher authenticity score may only be correlated to food, but not service or atmosphere. 
But in Zomato, users are only allowed to rate the average rating. It is possible that users are rating food when giving a score to the restaurant. TripAdvisor’s results may provide support to this point. Thus, it is likely that authenticity score is associated with a higher food rating but not the other ratings.

Second, when the restaurant is located in a cultural neighbourhood, it tends to have a higher food rating and a higher authenticity score (even when average rating is controlled). Also, restaurants serving culturally matched cuisine tend to have a higher authenticity score. We can see that geo-cultural features are important to the restaurant food rating and authenticity score.