In [36]:
import pandas as pd
import numpy as np
import csv
import os, glob
import re
from collections import Counter
pd.set_option('max_rows', None) 
pd.set_option('max_columns', None)

## In this file, we combine all scraping data (one for restaurant information and the other for reviews), calculate some values for necessary features, and add them into the combined restaurant dataset 

### Create cultural_neighborhood

In [38]:
cultural_neighborhood = {'z_restaurants_Bridgeview.csv':"arabic",'z_restaurants_HumboldtPark.csv':"puerto rican",
                         'z_restaurants_chinatown.csv':"chinese",'z_restaurants_lincolnsquare.csv': "german",
                         "z_restaurants_littleitaly.csv":"italian",'z_restaurants_pilsen.csv':"mexican",
                         'z_restaurants_westtown.csv':"polish"}

In [39]:
def add_neighborhood(rest_folder):
    """
    In each review file, add a column of 'cultural_neighborhood' which labels 
    the representive culture or 'None'otherwise
    
    input: rest_folder- a list of files with restaurant information
    return: rest_files- the same list of files with a new column of cultural_neghborhood.
    """
    
    os.chdir(rest_folder)
    rest_files = glob.glob('*.{}'.format('csv'))
    for file in rest_files:
        df = pd.read_csv(file)
        if file in cultural_neighborhood.keys():
            df['cultural_neighborhood'] = cultural_neighborhood[file]
            df.to_csv(file, index=False)
        else:
            df['cultural_neighborhood'] = "None"
            df.to_csv(file,index=False)
    return rest_files

In [41]:
# Create all_restaurant.csv in rest_folder (547 rest)
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
combined_csv.to_csv( "all_restaurant.csv", index=False, encoding='utf-8-sig')


### Calculate autheticity score

In [None]:
# This table comes from "Authenticity and Consumer Value Ratings- Empirical Tests from the Restaurant Domain" (Kovács et al. (2014))
auth_table = {"authentic":95, "genuine":92,"real":88, "skilled":83,"faithful":81,
              "legitimate":81,"original":80,"traditional":79, "pure":78, "historical":77,
              "sincere":77,"master chef":75,"craftsmanship":74,"honest":74,"integrity":74,
              "quintessential":74,"expert":73,"iconic":73,"inspiring":73,"unique":72,
              "wholesome":72,"professional":70,"skillful":70,"truthful":68,
              "unmistakable":68,"artisan":67,"unpretentious":67,"heartful":66,
              "delicious":65,"virtuous":64,"normal":63,"creative":62,"interesting":62,
              "orthodox":62,"artful":60,"special":60,"righteous":58,"substantial":57,
              "authoritative":56,"typical":56,"awesome":55,"moral":55,"eccentric":54,
              "ethical":54,"fresh":53,"old-fashioned":53, "usual":53, "decent":51,
              "unusual":51,"caring":49, "ambitious":48, "replica":46,"offbeat":43,
             "atypical":41,"unassuming":37,"invented":36,"new":36,"unconventional":36,
             "peculiar":35,"outlandish":32,"assumed":30,"idiosyncratic":30,"quirkly":29,
             "extroverted":28,"modern":27,"unorthodox":27,"pretentious":17,"artificial":14,
             "bogus":13,"forgery":13,"fake":12,"hoax":11,"cheat":10,"dishonest":10,
             "feigned":10,"ersatz":9,"faked":9,"limitation":9,"quack":9,"unreal":8,
             "humbug":7,"impostor":7, "sham":7, "unauthentic":7,"deceptive":6,"inauthentic":6,
             "false":6,"phony":5,"scam":4}

In [44]:
def check_authentic_score(csv):
    """
    Calculate authentic_score for each restaruant
    
    input: csv - csv file with reviews in each neghborhood
    return: a dictionary with restaurant name as keys and 
        authenticity score as values.
    """

    extension = 'csv'
    all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
    combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
    combined_csv.to_csv("all_restaurant_review.csv", index=False, encoding='utf-8-sig')
    reviews_df = pd.read_csv(csv)
    reviews_df = reviews_df.dropna()
    reviews_df = reviews_df.reset_index(drop=True)
    
    res = []
    [res.append(x) for x in reviews_df['restaurant_name'].tolist() if x not in res]
    dic = {}
    for re in res:
        wc = 0
        total = 0
        data = reviews_df.loc[reviews_df["restaurant_name"] == re]
        for ind in data.index:
            txt =data.at[ind, "review"].split()
            txt = (map(lambda x: x.lower(), txt))
            for wrd in txt:
                if wrd in auth_table.keys():
                    wc += 1
                    total += auth_table[wrd]
        if wc == 0:
            dic[re] = 0
        else:
            score = total/wc
            dic[re] = score
    return dic

In [None]:
# test
dic_test = check_authentic_score('all_restaurant_review.csv')
dic_test

### Create culture score

In [47]:
#list of words that represent nationalities and regions
cultural_dict = {
    'American' : ['American', 'America'],
    'Argentinean' : ['Argentinean', 'Argentina'], 
    'Asian' : ['Asian', 'Asia'], 
    'Austrian' : ['Austrian', 'Austria'],
    'Belgian' : ['Belgian', 'Belgium'], 
    'British' : ['British', 'Britan'],
    'Cajun' : ['Cajun'], 
    'Cantonese' : ['Cantonese'], 
    'Caribbean' : ['Caribbean'], 
    'Central American' : ['America'], 
    'Chinese' : ['Chinese', 'China'], 
    'Creole' : ['Creole'], 
    'Croatian' : ['Croatian', 'Croatia'],
    'Dutch' : ['Dutch', 'Netherlands'], 
    'Eastern European' : ['Europe'], 
    'Egyptian' : ['Egyptian', 'Egypt'], 
    'European' : ['European', 'Europe'], 
    'Filipino' : ['Filipino', 'Philippines'], 
    'French' : ['French'],
    'German' : ['German', 'Germany'], 
    'Greek' : ['Greek', 'Greece'], 
    'Halal' : ['Halal'], 
    'Hawaiian' : ['Hawaiian', 'Hawaii'], 
    'Hong Kong' : ['HongKong', 'Hong'], 
    'Hunan' : ['Hunan'], 
    'Indian' : ['Indian', 'India'], 
    'Irish' : ['Irish', 'Ireland'], 
    'Israeli' : ['Israeli', 'Isarel'], 
    'Italian' : ['Italian', 'Italy'], 
    'Jamaican' : ['Jamaican', 'Jamaica'], 
    'Japanese' : ['Japanese', 'Japan'], 
    'Korean' : ['Korean', 'Korea'], 
    'Kosher' : ['Kosher'],
    'Latin' : ['Latin', 'Latino'], 
    'Lazio' : ['Lazio'], 
    'Lebanese' : ['Lebanese', 'Lebanon'], 
    'Mediterranean' : ['Mediterranean'], 
    'Mexican' : ['Mexican', 'Mexico'], 
    'Middle Eastern' : [], 
    'Nepali' : ['Nepali', 'Nepal'], 
    'Persian' : ['Persian'], 
    'Peruvian' : ['Peruvian', 'Peru'], 
    'Polish' : ['Polish', 'Poland'], 
    'Polynesian' : ['Polynesian', 'Polynesia'], 
    'Portuguese' : ['Portuguese', 'Portugal'], 
    'Romana' : ['Romana', 'Roman'], 
    'Russian' : ['Russian', 'Russia'], 
    'Scandinavian' : ['Scandinavian', 'Scandinavia'], 
    'Shanghai' : [], 
    'South American' : ['American', 'America'], 
    'Southwestern' : ['Southwestern', 'Southwest'], 
    'Spanish' : ['Spanish', 'Spain', 'Hispanic'], 
    'Swedish' : ['Swedish', 'Sweden'], 
    'Szechuan' : ['Szechuan'],
    'Thai' : ['Thai', 'Thailand'], 
    'Turkish' : ['Turkish', 'Turkey'], 
    'Tuscan' : ['Tuscan', 'Tuscany'], 
    'Ukrainian' : ['Ukrainian', 'Ukraine'], 
    'Vietnamese' : ['Vietnamese', 'Vietnam']}

In [48]:
cultural_dict = dict((k, [w.lower() for w in v]) for k,v in cultural_dict.items())

In [49]:
def review_words(restaurant_all):
    """
    Create a dictionary of each word and calculate word counts for reviews in
    each restaurant
    
    input: 
        restaurants_all- a combined dataset for all restaurant information
    return: 
        restaurants- a combined dataset for all restaurant information with
            a new column of word counts.
    """
    reviews = pd.read_csv('all_restaurant_review.csv')
    reviews = reviews.dropna()
    reviews = reviews.reset_index(drop=True)
    review_cat = reviews.groupby(['restaurant_name'])['review'].apply(','.join).reset_index()
    review_cat['review'] = review_cat['review']\
                    .apply(lambda x: re.findall(r"[a-z]\w+", x.lower()))
    review_cat['word_count'] = review_cat['review'].apply(lambda x: len(x))
    review_cat['review'] = review_cat['review'].apply(lambda x: dict(Counter(x)))
    restaurants = restaurants_all.merge(review_cat, how='left', left_on=['trip_res_name'], right_on=['restaurant_name'])
    restaurants = restaurants.dropna()
    return restaurants.reset_index(drop=True)


In [51]:
def culture_score(restaurants_all):
    """
    Count words connecting with nationalities of region and culculate culutral score with frequency of
    nationalities-related word given all word counts of each restaurant review.
    
    input: restaurants_all- a combined dataset for all restaurant information(with word count column)
    return: restaurants_all- a combined dataset for all restaurant information with new columns of
        count of cultural words(nationalities-related, words in cultural_dict) and cultural score.
    """
    
    culture_word_col = []
    restaurants_all["category"] = restaurants_all["category"].apply(lambda x: re.findall(r'\w+\s?\w+', x))
    for i in range(restaurants_all.shape[0]):
        cultural_words = []
        culture_score = 0
        for category in restaurants_all['category'][i]:
            if category in cultural_dict.keys():
                cultural_words.extend(cultural_dict[category])
        for word in cultural_words:
            if word in restaurants_all['review'][i].keys():
                culture_score += restaurants_all['review'][i][word]
        culture_word_col.append(culture_score)
    restaurants_all['culture_word_count'] = culture_word_col
    restaurants_all['culture_score'] = culture_word_col/restaurants_all['word_count']
    return restaurants_all


In [52]:
pre_final = culture_score(restaurants_all)

In [53]:
pre_final

Unnamed: 0,trip_res_name,restaurant_name_x,link,avg_rating,category,avgerage_cost,review_count,location,cultural_neighborhood,restaurant_name_y,review,word_count,culture_word_count,culture_score
0,Chez Joel,Chez Joel,https://www.zomato.com/chicago/chez-joel-unive...,3.9,[French],40.0,33,1119 W. Taylor Street 60607,italian,Chez Joel,"{'we': 4, 'dine': 1, 'at': 1, 'chez': 2, 'joel...",263.0,2,0.007605
1,The Rosebud,The Rosebud,https://www.zomato.com/chicago/the-rosebud-uni...,4.1,[Italian],37.5,83,"1500 W. Taylor Street, Chicago 60607",italian,The Rosebud,"{'we': 23, 'went': 2, 'there': 7, 'for': 12, '...",1124.0,18,0.016014
2,Tufanos Vernon Park Tap,Tufano's Vernon Park Tap,https://www.zomato.com/chicago/tufanos-vernon-...,4.2,[Italian],20.0,119,"1073 W Vernon Park Plaza, Chicago 60607",italian,Tufanos Vernon Park Tap,"{'this': 8, 'is': 22, 'the': 47, 'kind': 1, 'o...",722.0,14,0.019391
3,Sweet Maple Cafe,Sweet Maple Cafe,https://www.zomato.com/chicago/sweet-maple-caf...,4.4,[American],15.0,108,"1339 W. Taylor Street, Chicago 60607",italian,Sweet Maple Cafe,"{'the': 28, 'staff': 4, 'and': 17, 'manager': ...",509.0,0,0.0
4,Mario's Italian Lemonade,Mario's Italian Lemonade,https://www.zomato.com/chicago/marios-italian-...,4.0,[Desserts],2.5,42,1068 W. Taylor Street 60607,italian,Mario's Italian Lemonade,"{'september': 2, 'really': 1, 'snuck': 1, 'up'...",433.0,0,0.0
5,Tuscany on Taylor,Tuscany,https://www.zomato.com/chicago/tuscany-restaur...,3.9,"[Italian, Pizza]",35.0,38,"550 S. Milwaukee Avenue, Wheeling 60090",italian,Tuscany on Taylor,"{'if': 1, 'you': 1, 'are': 2, 'visiting': 1, '...",260.0,5,0.019231
6,Stax Cafe,Stax Cafe,https://www.zomato.com/chicago/stax-cafe-unive...,3.9,[American],15.0,42,"1401 W. Taylor Street, Chicago 60607",italian,Stax Cafe,"{'this': 13, 'has': 2, 'been': 2, 'on': 12, 'm...",1366.0,1,0.000732
7,Jim's Original Hot Dog,Jim's Original,https://www.zomato.com/chicago/jims-original-u...,4.2,"[Burger, Sandwich]",7.5,59,"1250 S Union Avenue, Chicago 60607",italian,Jim's Original Hot Dog,"{'best': 6, 'polish': 10, 'ever': 2, 'love': 2...",857.0,0,0.0
8,Al's 1 Italian Beef,Al's Italian Beef,https://www.zomato.com/chicago/als-beef-lakeview,3.7,"[Fast Food, Italian, Sandwich]",10.0,10,3420 N. Clark Street 60657,italian,Al's 1 Italian Beef,"{'would': 1, 'recommend': 1, 'al': 4, 'and': 1...",255.0,2,0.007843
9,Pompei,Pompei Little Italy Streeterville,https://www.zomato.com/chicago/pompei-little-i...,3.3,"[Italian, Pizza]",5.0,25,"212 East Ohio Street, Near North Side, River E...",italian,Pompei,"{'three': 2, 'of': 5, 'us': 3, 'had': 1, 'diff...",285.0,3,0.010526


In [54]:
# Merge authentity_score to each restuarant 
pre_final['authentity_score']= pre_final['trip_res_name'].map(dic_test)
pre_final = pre_final.drop(columns=['restaurant_name_y'])

In [55]:
pre_final

Unnamed: 0,trip_res_name,restaurant_name_x,link,avg_rating,category,avgerage_cost,review_count,location,cultural_neighborhood,review,word_count,culture_word_count,culture_score,authentity_score
0,Chez Joel,Chez Joel,https://www.zomato.com/chicago/chez-joel-unive...,3.9,[French],40.0,33,1119 W. Taylor Street 60607,italian,"{'we': 4, 'dine': 1, 'at': 1, 'chez': 2, 'joel...",263.0,2,0.007605,71.666667
1,The Rosebud,The Rosebud,https://www.zomato.com/chicago/the-rosebud-uni...,4.1,[Italian],37.5,83,"1500 W. Taylor Street, Chicago 60607",italian,"{'we': 23, 'went': 2, 'there': 7, 'for': 12, '...",1124.0,18,0.016014,55.727273
2,Tufanos Vernon Park Tap,Tufano's Vernon Park Tap,https://www.zomato.com/chicago/tufanos-vernon-...,4.2,[Italian],20.0,119,"1073 W Vernon Park Plaza, Chicago 60607",italian,"{'this': 8, 'is': 22, 'the': 47, 'kind': 1, 'o...",722.0,14,0.019391,64.75
3,Sweet Maple Cafe,Sweet Maple Cafe,https://www.zomato.com/chicago/sweet-maple-caf...,4.4,[American],15.0,108,"1339 W. Taylor Street, Chicago 60607",italian,"{'the': 28, 'staff': 4, 'and': 17, 'manager': ...",509.0,0,0.0,0.0
4,Mario's Italian Lemonade,Mario's Italian Lemonade,https://www.zomato.com/chicago/marios-italian-...,4.0,[Desserts],2.5,42,1068 W. Taylor Street 60607,italian,"{'september': 2, 'really': 1, 'snuck': 1, 'up'...",433.0,0,0.0,64.571429
5,Tuscany on Taylor,Tuscany,https://www.zomato.com/chicago/tuscany-restaur...,3.9,"[Italian, Pizza]",35.0,38,"550 S. Milwaukee Avenue, Wheeling 60090",italian,"{'if': 1, 'you': 1, 'are': 2, 'visiting': 1, '...",260.0,5,0.019231,65.0
6,Stax Cafe,Stax Cafe,https://www.zomato.com/chicago/stax-cafe-unive...,3.9,[American],15.0,42,"1401 W. Taylor Street, Chicago 60607",italian,"{'this': 13, 'has': 2, 'been': 2, 'on': 12, 'm...",1366.0,1,0.000732,62.333333
7,Jim's Original Hot Dog,Jim's Original,https://www.zomato.com/chicago/jims-original-u...,4.2,"[Burger, Sandwich]",7.5,59,"1250 S Union Avenue, Chicago 60607",italian,"{'best': 6, 'polish': 10, 'ever': 2, 'love': 2...",857.0,0,0.0,78.6
8,Al's 1 Italian Beef,Al's Italian Beef,https://www.zomato.com/chicago/als-beef-lakeview,3.7,"[Fast Food, Italian, Sandwich]",10.0,10,3420 N. Clark Street 60657,italian,"{'would': 1, 'recommend': 1, 'al': 4, 'and': 1...",255.0,2,0.007843,0.0
9,Pompei,Pompei Little Italy Streeterville,https://www.zomato.com/chicago/pompei-little-i...,3.3,"[Italian, Pizza]",5.0,25,"212 East Ohio Street, Near North Side, River E...",italian,"{'three': 2, 'of': 5, 'us': 3, 'had': 1, 'diff...",285.0,3,0.010526,0.0


In [57]:
# Create a data file for regression
final = pre_final.drop_duplicates(subset=['trip_res_name'], keep='first')
final = final.dropna()
final = final.reset_index(drop=True)
final
final.to_csv('zomato_combined_data.csv', index=False)
