### Imports 

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, StandardScaler


import matplotlib.pyplot as plt
%matplotlib inline

### Read data for all businesses

In [2]:
# Load the Yelp business table; the path is relative to the notebook's working directory.
businesses = pd.read_csv('data/yelp_business.csv')
# Preview the first rows; `categories`, `city` and `is_open` are the columns later cells filter on.
businesses.head()

Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",,"""4855 E Warner Rd, Ste B9""",Ahwatukee,AZ,85044,33.33069,-111.978599,4.0,22,1,Dentists;General Dentistry;Health & Medical;Or...
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",,"""3101 Washington Rd""",McMurray,PA,15317,40.291685,-80.1049,3.0,11,1,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""",,"""6025 N 27th Ave, Ste 1""",Phoenix,AZ,85017,33.524903,-112.11531,1.5,18,1,Departments of Motor Vehicles;Public Services ...
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""",,"""5000 Arizona Mills Cr, Ste 435""",Tempe,AZ,85282,33.383147,-111.964725,3.0,9,0,Sporting Goods;Shopping
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",,"""581 Howe Ave""",Cuyahoga Falls,OH,44221,41.119535,-81.47569,3.5,116,1,American (New);Nightlife;Bars;Sandwiches;Ameri...


### Filter restaurants

In [3]:
# Keep only businesses whose `categories` string mentions "Restaurants".
# - na=False: a missing `categories` value counts as "not a restaurant"; without it
#   the mask contains NaN and boolean indexing raises.
# - regex=False: the keyword is a plain substring, not a regular expression.
restaurants = businesses[
    businesses['categories'].str.contains("Restaurants", na=False, regex=False)
]
restaurants.head(50)

Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",,"""581 Howe Ave""",Cuyahoga Falls,OH,44221,41.119535,-81.47569,3.5,116,1,American (New);Nightlife;Bars;Sandwiches;Ameri...
5,o9eMRCWt5PkpLDE0gOPtcQ,"""Messina""",,"""Richterstr. 11""",Stuttgart,BW,70567,48.7272,9.14795,4.0,5,1,Italian;Restaurants
10,XOSRcvtaKc_Q5H1SAzN20A,"""East Coast Coffee""",,"""737 West Pike St""",Houston,PA,15342,40.241548,-80.212815,4.5,3,0,Breakfast & Brunch;Gluten-Free;Coffee & Tea;Fo...
14,fNMVV_ZX7CJSDWQGdOM8Nw,"""Showmars Government Center""",Uptown,"""600 E 4th St""",Charlotte,NC,28202,35.221647,-80.839345,3.5,7,1,Restaurants;American (Traditional)
15,l09JfMeQ6ynYs5MCJtrcmQ,"""Alize Catering""",Yonge and Eglinton,"""2459 Yonge St""",Toronto,ON,M4P 2H6,43.711399,-79.399339,3.0,12,0,Italian;French;Restaurants
19,Gu-xs3NIQTj3Mj2xYoN2aw,"""Maxim Bakery & Restaurant""",,"""9665 Bayview Avenue, Unit 1-4""",Richmond Hill,ON,L4C 9V4,43.867565,-79.412662,3.5,34,1,French;Food;Bakeries;Restaurants
25,1K4qrnfyzKzGgJPBEcJaNQ,"""Chula Taberna Mexicana""",Leslieville,"""1058 Gerrard Street E""",Toronto,ON,M4M 3A6,43.669256,-79.335902,3.5,39,1,Tiki Bars;Nightlife;Mexican;Restaurants;Bars
28,Dj0S-Oe4ytRJzMGUPgYUkw,"""Panera Bread""",,"""38295 Chestnut Ridge Rd""",Elyria,OH,44035,41.343078,-82.06714,2.0,4,1,Soup;Salad;Sandwiches;Restaurants
29,gAy4LYpsScrj8POnCW6btQ,"""Toast Cafe""",,"""2429 Hwy 160 W""",Fort Mill,SC,29708,35.047287,-80.990559,3.5,6,0,Food;American (Traditional);Coffee & Tea;Resta...
30,nbhBRhZtdaZmMMeb2i02pg,"""Sunnyside Grill""",,"""2777 Steeles Avenue W""",Toronto,ON,M3J 3K5,43.781816,-79.490433,5.0,3,1,Restaurants;Breakfast & Brunch


### Read other attributes of all businesses (deprecated as of now)

In [None]:
# business_attr = pd.read_csv('data/yelp_business_attributes.csv')

### Merge filtered restaurants with above attributes (currently disabled: the restaurants table is passed through unchanged)

In [4]:
# The attribute merge below is commented out, so `restaurant_attr` is simply another
# name for `restaurants` (same object, no copy) and later cells keep using this name.
# restaurant_attr = restaurants.merge(business_attr, left_on='business_id', right_on='business_id', how = 'inner')
restaurant_attr = restaurants

### Read checkin data and create weekend flag

In [5]:
# Load the check-in table; this cell reads its `weekday` column.
business_checkin = pd.read_csv('data/yelp_checkin.csv')

# 1 when the check-in weekday is Fri, Sat or Sun, otherwise 0.
# NOTE(review): Friday is counted as part of the weekend here; confirm that is intended.
weekend_days = ['Fri', 'Sat', 'Sun']
business_checkin['is_weekend'] = (
    business_checkin['weekday'].isin(weekend_days).astype('int64')
)

### Function to create a categorical variable (Morning, Afternoon, Evening, Night, Late Night) from the check-in time

In [6]:
def create_time_of_day(x):
    """Map a time string to a coarse time-of-day label.

    Parameters
    ----------
    x : str
        Text whose part before the first ':' parses as an integer hour
        (for example '14:00').

    Returns
    -------
    str
        'Morning' for hours 6-11, 'Afternoon' for 12-15, 'Evening' for 16-20,
        'Night' for 21-24 and 0-2, and 'Late Night' for every other hour.

    Raises
    ------
    ValueError
        If the text before the first ':' is not an integer.
    """
    hour_of_day = int(x.split(':', 1)[0])

    # Inclusive hour ranges written as half-open `range` objects; hour 20 belongs
    # to 'Evening' only, so the 'Night' range starts at 21.
    buckets = (
        (range(6, 12), 'Morning'),
        (range(12, 16), 'Afternoon'),
        (range(16, 21), 'Evening'),
        (range(21, 25), 'Night'),
        (range(0, 3), 'Night'),
    )
    for hours, label in buckets:
        if hour_of_day in hours:
            return label
    return 'Late Night'
# Label every check-in row with its time-of-day bucket, derived from the `hour` column.
business_checkin['time_of_day'] = business_checkin['hour'].apply(create_time_of_day)

### Feature Engineering to capture different checkin patterns 

In [7]:
#Average checkins on weekdays and weekends

# Mean check-ins per business, split by the weekend flag (0 -> weekday, 1 -> weekend).
# - aggfunc='mean' (string) avoids the callable form that recent pandas warns about.
# - reindex + rename label the columns by flag value rather than by position, so a
#   flag value that never occurs yields an all-NaN column instead of a length error.
feature_1 = (
    pd.pivot_table(business_checkin, values='checkins', index=['business_id'],
                   columns=['is_weekend'], aggfunc='mean')
    .reindex(columns=[0, 1])
    .rename(columns={0: 'avg_checkins_weekday', 1: 'avg_checkins_weekend'})
    .rename_axis(columns=None)
    .reset_index()
)
# feature_1

In [8]:
#Average checkins on different day of the week

# Mean check-ins per business for each weekday label.
# Columns are named from the weekday value itself rather than by position, so a
# missing or differently sorted label can never be silently mislabelled.
weekday_labels = ['Fri', 'Mon', 'Sat', 'Sun', 'Thu', 'Tue', 'Wed']
weekday_means = pd.pivot_table(business_checkin, values='checkins', index=['business_id'],
                               columns=['weekday'], aggfunc='mean')

# Fail loudly on a label this cell does not know how to name.
unexpected_weekdays = set(weekday_means.columns) - set(weekday_labels)
assert not unexpected_weekdays, 'unexpected weekday labels: %s' % sorted(unexpected_weekdays)

feature_2 = (
    weekday_means
    .reindex(columns=weekday_labels)   # fixed column set/order; absent days become NaN
    .add_prefix('avg_checkins_')
    .rename_axis(columns=None)
    .reset_index()
)
# feature_2

In [9]:
#Average checkins on different time of the day

# Mean check-ins per business for each time-of-day bucket.
# Output names are keyed by bucket label (not by position); 'Late Night' maps to
# 'Avg_checkins_Late_Night', the spelling later cells select.
time_of_day_names = {
    'Afternoon': 'Avg_checkins_Afternoon',
    'Evening': 'Avg_checkins_Evening',
    'Late Night': 'Avg_checkins_Late_Night',
    'Morning': 'Avg_checkins_Morning',
    'Night': 'Avg_checkins_Night',
}
time_of_day_means = pd.pivot_table(business_checkin, values='checkins', index=['business_id'],
                                   columns=['time_of_day'], aggfunc='mean')

# Fail loudly on a bucket label this cell does not know how to name.
unexpected_buckets = set(time_of_day_means.columns) - set(time_of_day_names)
assert not unexpected_buckets, 'unexpected time_of_day labels: %s' % sorted(unexpected_buckets)

feature_3 = (
    time_of_day_means
    .reindex(columns=list(time_of_day_names))   # absent buckets become NaN columns
    .rename(columns=time_of_day_names)
    .rename_axis(columns=None)
    .reset_index()
)
# feature_3

In [10]:
#Average checkins on weekdays vs. weekends broken by different times of the day

# Mean check-ins per business for every (is_weekend, time_of_day) combination.
# aggfunc='mean' (string) avoids the callable form that recent pandas warns about.
feature_4 = pd.pivot_table(business_checkin, values='checkins', index=['business_id'],
                           columns=['is_weekend', 'time_of_day'], aggfunc='mean').reset_index()

# Flatten the two-level column index to '<is_weekend>_<time_of_day>' (e.g. '0_Evening').
# The key column has an empty second level after reset_index, so it stays
# 'business_id' directly and needs no follow-up rename.
feature_4.columns = [
    (str(top) + '_' + sub) if sub else str(top)
    for top, sub in feature_4.columns
]

# feature_4

### Merge all features and attributes

In [11]:
# Combine the four feature tables on business_id. Inner joins keep only businesses
# present in every table.
feature_1_2 = feature_1.merge(feature_2, on='business_id', how='inner')
feature_1_2_3 = feature_1_2.merge(feature_3, on='business_id', how='inner')
feature_1_2_3_4 = feature_1_2_3.merge(feature_4, on='business_id', how='inner')

# Attach the combined features to the restaurant rows; restaurants without a
# matching feature row are dropped by the inner join.
restaurant_attr_all = restaurant_attr.merge(feature_1_2_3_4, on='business_id', how='inner')

In [13]:
# Allow up to 200 columns to be displayed so the wide merged frame is not truncated.
pd.set_option('display.max_columns', 200)
restaurant_attr_all.head(5)

Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories,avg_checkins_weekday,avg_checkins_weekend,avg_checkins_Fri,avg_checkins_Mon,avg_checkins_Sat,avg_checkins_Sun,avg_checkins_Thu,avg_checkins_Tue,avg_checkins_Wed,Avg_checkins_Afternoon,Avg_checkins_Evening,Avg_checkins_Late_Night,Avg_checkins_Morning,Avg_checkins_Night,0_Afternoon,0_Evening,0_Late Night,0_Morning,0_Night,1_Afternoon,1_Evening,1_Late Night,1_Morning,1_Night
0,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",,"""581 Howe Ave""",Cuyahoga Falls,OH,44221,41.119535,-81.47569,3.5,116,1,American (New);Nightlife;Bars;Sandwiches;Ameri...,2.666667,3.878049,3.5,2.7,3.785714,4.266667,3.125,2.444444,2.5,1.4,3.038462,2.25,1.0,4.025641,1.25,2.090909,1.0,1.0,3.47619,2.0,3.733333,2.666667,1.0,4.666667
1,o9eMRCWt5PkpLDE0gOPtcQ,"""Messina""",,"""Richterstr. 11""",Stuttgart,BW,70567,48.7272,9.14795,4.0,5,1,Italian;Restaurants,,1.0,,,1.0,,,,,,1.0,,,,,,,,,,1.0,,,
2,XOSRcvtaKc_Q5H1SAzN20A,"""East Coast Coffee""",,"""737 West Pike St""",Houston,PA,15342,40.241548,-80.212815,4.5,3,0,Breakfast & Brunch;Gluten-Free;Coffee & Tea;Fo...,1.0,2.0,2.0,,,,,,1.0,1.0,1.5,,,,1.0,1.0,,,,,2.0,,,
3,fNMVV_ZX7CJSDWQGdOM8Nw,"""Showmars Government Center""",Uptown,"""600 E 4th St""",Charlotte,NC,28202,35.221647,-80.839345,3.5,7,1,Restaurants;American (Traditional),1.25,2.0,2.0,1.333333,,,1.0,1.25,1.5,1.2,1.6,,,,1.25,1.25,,,,1.0,3.0,,,
4,l09JfMeQ6ynYs5MCJtrcmQ,"""Alize Catering""",Yonge and Eglinton,"""2459 Yonge St""",Toronto,ON,M4P 2H6,43.711399,-79.399339,3.0,12,0,Italian;French;Restaurants,1.0,1.0,,,1.0,,1.0,,1.0,,,1.0,,1.0,,,1.0,,1.0,,,,,1.0


### Look at the distribution of open and closed restaurants 

In [14]:
# Row count per `is_open` value (the later filter `is_open == 1` selects open restaurants).
restaurant_attr_all['is_open'].value_counts()

1    39070
0    13017
Name: is_open, dtype: int64

### Look at Top 5 cities with most open restaurants

In [15]:
# Count open restaurants per city and show the five largest counts.
open_restaurants = restaurant_attr_all[restaurant_attr_all['is_open'] == 1]
open_count_by_city = (
    open_restaurants
    .groupby('city')['business_id']
    .count()
    .reset_index()
)
open_count_by_city.sort_values('business_id', ascending=False).head(5)



Unnamed: 0,city,business_id
653,Toronto,4832
289,Las Vegas,3947
476,Phoenix,2534
384,Montréal,2524
92,Charlotte,1838


### We will focus on one city for now: filter open restaurants in Toronto

In [16]:
# City under analysis; it also determines the output file name below.
city = 'Toronto'

# Keep rows that are both open and located in the chosen city.
is_open_mask = restaurant_attr_all['is_open'] == 1
in_city_mask = restaurant_attr_all['city'] == city
restaurants_in_city = restaurant_attr_all[is_open_mask & in_city_mask]

# Persist the subset without the index column.
city_csv_path = './data/restaurants_in_' + city + '.csv'
restaurants_in_city.to_csv(city_csv_path, index = False)

In [17]:
# Read back the file written for `city` in the previous cell. The path is built from
# `city` so the two cells cannot drift apart; the variable keeps its historical name
# because later cells refer to it.
restaurants_in_Toronto = pd.read_csv('./data/restaurants_in_' + city + '.csv')


### Look at distribution of count of reviews 

In [18]:
# Summary statistics of `review_count`, shown before a cut-off is chosen in the next cell.
restaurants_in_Toronto['review_count'].describe()

count    4832.000000
mean       52.530629
std        85.135641
min         3.000000
25%         9.000000
50%        24.000000
75%        60.000000
max      1494.000000
Name: review_count, dtype: float64

In [20]:
# 70th percentile of review_count, and how many restaurants lie strictly above it.
review_counts = restaurants_in_Toronto['review_count']
percentile_value = np.percentile(review_counts, 70)
n_above_percentile = len(review_counts[review_counts > percentile_value])

print('Count of restaurants having reviews more than', percentile_value,':', n_above_percentile)

Count of restaurants having reviews more than 50.0 : 1428


### Identify Top 30 popular cuisines

In [21]:
from collections import Counter

# One list of category names per restaurant (`categories` is ';'-separated).
all_categories = [entry.split(';') for entry in restaurants_in_Toronto['categories']]

# Concatenate the per-restaurant lists into a single list of category names.
all_categories_flat_list = []
for categories_of_one_restaurant in all_categories:
    all_categories_flat_list.extend(categories_of_one_restaurant)

# Tally each category name and print the 30 most frequent.
category_counts = Counter(all_categories_flat_list)
print(category_counts.most_common(30))


[('Restaurants', 4832), ('Food', 1086), ('Nightlife', 683), ('Bars', 667), ('Sandwiches', 465), ('Breakfast & Brunch', 427), ('Canadian (New)', 413), ('Cafes', 401), ('Chinese', 388), ('Coffee & Tea', 374), ('Pizza', 369), ('Italian', 352), ('Fast Food', 346), ('Japanese', 325), ('Burgers', 293), ('American (Traditional)', 241), ('Sushi Bars', 236), ('Middle Eastern', 189), ('Indian', 180), ('Mediterranean', 179), ('Asian Fusion', 176), ('Pubs', 176), ('Mexican', 175), ('Thai', 171), ('Korean', 167), ('Seafood', 160), ('Bakeries', 158), ('Specialty Food', 157), ('Event Planning & Services', 156), ('Salad', 151)]


### Create flag for different cuisines 

In [22]:

def create_restaurant_cuisine_flag(df, cuisine):
    """Return the businesses whose `categories` text contains `cuisine`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have 'business_id' and 'categories' columns. A 'cuisine_flag'
        column is optional: when present its values are carried over as the
        flag, otherwise the flag is the constant 1.
    cuisine : str
        Keyword searched for as a literal substring of `categories`.

    Returns
    -------
    pandas.DataFrame
        Columns ['business_id', cuisine], one row per matching business,
        keeping the original row index.

    Notes
    -----
    Rows with a missing `categories` value never match.
    NOTE(review): substring matching means 'Bars' also matches 'Sushi Bars'
    and 'American' also matches 'Latin American'. Confirm whether whole
    category names should be compared instead.
    """
    # regex=False: treat the keyword literally, so text such as 'American (New)' works.
    # na=False: a missing categories value means "no match" instead of a NaN mask.
    has_cuisine = df['categories'].str.contains(cuisine, regex=False, na=False)

    flagged = df.loc[has_cuisine, ['business_id']].copy()
    if 'cuisine_flag' in df.columns:
        # Backward compatible with callers that pre-create the flag column.
        flagged[cuisine] = df.loc[has_cuisine, 'cuisine_flag']
    else:
        flagged[cuisine] = 1
    return flagged
    
# Constant column that create_restaurant_cuisine_flag selects and renames per keyword.
restaurants_in_Toronto['cuisine_flag'] = 1

# One flag table per keyword, unpacked in the same order as the keyword tuple.
(restaurants_in_Toronto_w_Nightlife,
 restaurants_in_Toronto_w_Bar,
 restaurants_in_Toronto_w_Canadian,
 restaurants_in_Toronto_w_Chinese,
 restaurants_in_Toronto_w_Italian,
 restaurants_in_Toronto_w_Japanese,
 restaurants_in_Toronto_w_American,
 restaurants_in_Toronto_w_Indian,
 restaurants_in_Toronto_w_Mexican,
 restaurants_in_Toronto_w_Thai) = [
    create_restaurant_cuisine_flag(restaurants_in_Toronto, cuisine_keyword)
    for cuisine_keyword in ('Nightlife', 'Bars', 'Canadian', 'Chinese', 'Italian',
                            'Japanese', 'American', 'Indian', 'Mexican', 'Thai')
]


### Merging cuisine flags 

In [23]:
# Left-join each cuisine flag table onto the restaurant rows, one at a time.
# Restaurants without a given flag get NaN in that flag's column.
df_1 = restaurants_in_Toronto.merge(restaurants_in_Toronto_w_Nightlife, on='business_id', how='left')
df_2 = df_1.merge(restaurants_in_Toronto_w_Bar, on='business_id', how='left')
df_3 = df_2.merge(restaurants_in_Toronto_w_Canadian, on='business_id', how='left')
df_4 = df_3.merge(restaurants_in_Toronto_w_Chinese, on='business_id', how='left')
df_5 = df_4.merge(restaurants_in_Toronto_w_Italian, on='business_id', how='left')
df_6 = df_5.merge(restaurants_in_Toronto_w_Japanese, on='business_id', how='left')
df_7 = df_6.merge(restaurants_in_Toronto_w_American, on='business_id', how='left')
df_8 = df_7.merge(restaurants_in_Toronto_w_Indian, on='business_id', how='left')
df_9 = df_8.merge(restaurants_in_Toronto_w_Mexican, on='business_id', how='left')
df_10 = df_9.merge(restaurants_in_Toronto_w_Thai, on='business_id', how='left')

### Final columns and filters

In [24]:
# Cuisine flag columns, listed once and reused for both the column selection and
# the per-restaurant flag total.
relevant_cuisines = ['Nightlife', 'Bars', 'Canadian', 'Chinese', 'Italian', 'Japanese','American', 'Indian', 
                'Mexican', 'Thai']

# Identity, rating and check-in feature columns, followed by the cuisine flags.
relevant_cols = [
    'business_id', 'name', 'stars', 'review_count',
    'avg_checkins_weekday', 'avg_checkins_weekend',
    'avg_checkins_Fri', 'avg_checkins_Mon', 'avg_checkins_Sat', 'avg_checkins_Sun',
    'avg_checkins_Thu', 'avg_checkins_Tue', 'avg_checkins_Wed',
    'Avg_checkins_Afternoon', 'Avg_checkins_Evening', 'Avg_checkins_Late_Night',
    'Avg_checkins_Morning', 'Avg_checkins_Night',
    '0_Afternoon', '0_Evening', '0_Late Night', '0_Morning', '0_Night',
    '1_Afternoon', '1_Evening', '1_Late Night', '1_Morning', '1_Night',
] + relevant_cuisines

# Select the modelling columns and replace every missing value with 0
# (missing averages and missing cuisine flags alike).
X = df_10.loc[:, relevant_cols].fillna(0)

# Number of tracked cuisine flags set for each restaurant.
X['cuisines'] = X.loc[:, relevant_cuisines].sum(axis = 1)

# Keep restaurants with at least one tracked cuisine and more than 50 reviews.
has_tracked_cuisine = X['cuisines'] > 0
has_enough_reviews = X['review_count'] > 50
AD = X[has_tracked_cuisine & has_enough_reviews]
AD.shape

(936, 39)

### Look at statistics and write to file

In [25]:
# Summary statistics of the filtered frame; the index is reset first, discarding the old labels.
AD.reset_index(drop=True).describe()

Unnamed: 0,stars,review_count,avg_checkins_weekday,avg_checkins_weekend,avg_checkins_Fri,avg_checkins_Mon,avg_checkins_Sat,avg_checkins_Sun,avg_checkins_Thu,avg_checkins_Tue,avg_checkins_Wed,Avg_checkins_Afternoon,Avg_checkins_Evening,Avg_checkins_Late_Night,Avg_checkins_Morning,Avg_checkins_Night,0_Afternoon,0_Evening,0_Late Night,0_Morning,0_Night,1_Afternoon,1_Evening,1_Late Night,1_Morning,1_Night,Nightlife,Bars,Canadian,Chinese,Italian,Japanese,American,Indian,Mexican,Thai,cuisines
count,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0
mean,3.581731,143.508547,2.662704,3.728247,3.250028,2.545456,4.007101,3.833019,2.852299,2.553925,2.68089,1.569071,2.776539,1.654802,0.765161,3.931636,1.149658,2.12091,1.196448,0.486392,3.356948,1.675383,3.381395,1.866085,0.650225,4.612714,0.301282,0.430556,0.195513,0.108974,0.139957,0.180556,0.17094,0.055556,0.070513,0.059829,1.713675
std,0.500265,132.259646,1.602847,2.537196,2.201589,1.657061,2.90622,2.904646,1.861916,1.639898,1.755881,1.496662,2.281397,1.217453,0.733113,3.302671,1.022988,1.704251,0.932491,0.751446,2.819276,2.315971,3.303212,1.779914,0.815805,4.070748,0.45906,0.495419,0.396807,0.311774,0.347128,0.384855,0.376658,0.229184,0.256146,0.237297,0.902323
min,1.5,51.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,3.5,69.75,1.6875,2.153846,1.875,1.5,2.166667,2.0,1.696875,1.545455,1.6,1.0,1.543706,1.0,0.0,2.071429,1.0,1.230769,1.0,0.0,1.807143,1.0,1.6,1.0,0.0,2.272727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,3.5,100.0,2.179144,2.886752,2.666667,2.095455,3.1,3.0,2.3,2.111111,2.2,1.2,2.111111,1.333333,1.0,2.84123,1.0,1.677083,1.0,0.0,2.460606,1.0,2.428571,1.4,0.0,3.310096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,4.0,162.25,3.080833,4.40119,3.846154,3.0,4.839286,4.595023,3.339286,3.0,3.102778,1.75,3.075556,2.0,1.0,4.489865,1.5,2.375,1.5,1.0,3.79375,2.0,3.8,2.258929,1.0,5.555556,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,5.0,1494.0,16.151515,22.474576,22.176471,14.583333,26.7,28.65,16.294118,18.571429,16.25,19.75,23.714286,10.095238,9.8,31.02381,11.875,18.3,8.090909,15.333333,27.416667,30.25,36.133333,13.222222,6.0,37.555556,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0


In [26]:
# Write the filtered frame to CSV without the index column.
AD.to_csv('./data/clusteringAD.csv', index = False)