In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import json
from fancyimpute import KNN    

# Name of the colour palette passed to seaborn by the city scatter plot further down.
palette = "rainbow_r"

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:
# Load the restaurant table; the CSV's 'Unnamed: 0' column becomes the index.
df = pd.read_csv('TA_restaurants_curated.csv', index_col='Unnamed: 0')
# Preview the first rows.
df.head()

In [None]:
# Keep only the columns used in this analysis. The explicit .copy() makes `df`
# an independent frame, so the assignments below (and in later cells) do not
# trigger pandas' SettingWithCopyWarning.
df = df[['Name', 'City', 'Cuisine Style', 'Ranking', 'Rating', 'Price Range']].copy()
# The dataset spells this city 'Oporto'; rename it to 'Porto'
# (presumably to match the spelling in the geolocation table — TODO confirm).
df.loc[df['City'] == 'Oporto', 'City'] = 'Porto'
df.info()

In [None]:
# City coordinates come from worldcities.csv.
geoloc = pd.read_csv('worldcities.csv', index_col=None)

# Two extra cities are added by hand, then appended to the table.
extra_rows = [
    {'city': 'Zurich', 'lat': 47.3769, 'lng': 8.5417, 'country': 'Switzerland'},
    {'city': 'Krakow', 'lat': 50.0647, 'lng': 19.9450, 'country': 'Poland'},
]
more_cities = pd.DataFrame(extra_rows)
geoloc = pd.concat([geoloc, more_cities], sort=True)

In [None]:
# Attach latitude, longitude and country to every restaurant via its city name.
cities = list(np.unique(df['City'].values))

# geoloc may list the same city name more than once; keep the first occurrence,
# which is the row the previous per-city loop picked with `.values[0]`.
city_lookup = geoloc.drop_duplicates(subset='city', keep='first').set_index('city')

# One vectorised lookup per target column instead of filtering geoloc three
# times for every city.
for target_col, source_col in [('Latitude', 'lat'),
                               ('Longitude', 'lng'),
                               ('Country', 'country')]:
    df[target_col] = df['City'].map(city_lookup[source_col])

# Cities with no match keep NaN in the three new columns. Report them instead of
# swallowing the failure silently, as the old bare `except: pass` did.
missing_cities = sorted(set(cities) - set(city_lookup.index))
if missing_cities:
    print('No geolocation found for: {}'.format(', '.join(missing_cities)))

df.head()

# Data analysis

In [None]:
# Heatmap of missing values: one row per column of df, one position per record.
fig, ax = plt.subplots(figsize=(18, 11))
missing_mask = df.isnull().T
sns.heatmap(missing_mask, ax=ax)

In [None]:
# Correlation matrix of the numeric columns. They are selected explicitly because
# pandas >= 2.0 no longer drops string columns inside DataFrame.corr() and raises
# instead; select_dtypes gives the same result on older pandas as well.
df_corr = df.select_dtypes(include=[np.number]).corr()
fig, ax = plt.subplots(figsize=(18, 11))
sns.heatmap(df_corr, ax=ax, cmap='Purples')

In [None]:
# Discard records in which every column is missing.
df = df.dropna(how='all', axis=0)
df.info()

In [None]:
# Discard restaurants whose 'Cuisine Style' is missing.
df = df.dropna(subset=['Cuisine Style'], axis=0)
df.info()

In [None]:
# Fill gaps in 'Rating' by interpolating between neighbouring rows, then cast to int.
# NOTE(review): astype(int) discards any fractional part of a rating — confirm
# that this truncation is intended.
df['Rating'] = df['Rating'].interpolate().astype(int)

# Keep only non-negative ratings. The .copy() makes the filtered frame
# independent, so later column assignments do not raise SettingWithCopyWarning.
df = df[df['Rating'] >= 0].copy()

# Show the distinct ratings that remain. The previous `x != np.nan` filter was a
# no-op (NaN never compares equal to anything, so the test was always True) and
# the column is already integer-typed here, so the values are listed directly.
sorted(df['Rating'].unique())

In [None]:
# NOTE(review): `mydata` is not defined in any cell visible in this notebook, so
# this cell looks like it fails on a fresh "Restart & Run All"; `knnOutput` is
# also not used by the cells shown here. Confirm whether this cell should be
# wired to `df` or removed.
knnOutput = KNN(k=5).complete(mydata)

In [None]:
# Restaurants with no 'Ranking' get a value one past the largest ranking observed.
df['Ranking'] = df['Ranking'].fillna(df['Ranking'].max()+1)

In [None]:
# Binarizing 'Cuisine Style' column
# Each cell holds the text form of a list, e.g. "['A', 'B']". First collect every
# distinct style name that appears anywhere in the column.
styles = set()
for raw_styles in df['Cuisine Style'].values:
    # Skip non-text entries (e.g. NaN) explicitly rather than hiding every
    # possible error behind a bare `except: pass`.
    if not isinstance(raw_styles, str):
        continue
    styles.update(s.replace("'", "") for s in raw_styles.strip('][').split(', '))

styles = sorted(styles)

# One 0/1 indicator column per style, alongside the restaurant's name and city.
df_cs = df[['Name', 'City']].copy()
for style in styles:
    # Match the quoted name so one style cannot match inside a longer one.
    # astype(int) turns the boolean mask into 0/1 in one step, instead of
    # writing integers into a boolean column through two masked assignments.
    has_style = df['Cuisine Style'].str.contains("'{}'".format(style), na=False, regex=False)
    df_cs[style] = has_style.astype(int)

In [None]:
# Number of cuisine styles per restaurant, stored on both frames.
style_counts = df_cs[styles].sum(axis=1)
df['Number of styles'] = style_counts
df_cs['Number of styles'] = style_counts
# Remove the raw 'Cuisine Style' text column from df.
df.drop(columns=['Cuisine Style'], inplace=True)

In [None]:
# Cleaning Price Range
# Ordinal code -> label as it appears in the raw data.
prange_dict = {1: '$', 2: '$$ - $$$', 3: '$$$$'}

# Lookup from raw label to code. The codes also map to themselves, so re-running
# this cell on an already converted column leaves it unchanged.
label_to_code = {label: code for code, label in prange_dict.items()}
label_to_code.update({code: code for code in prange_dict})

# Vectorised conversion. Missing values (and any label not listed above) become
# 0, and the column ends up with an explicit integer dtype instead of relying on
# fillna() to downcast an object column implicitly.
df['Price Range'] = df['Price Range'].map(label_to_code).fillna(0).astype(int)

In [None]:
# Cluster restaurants with k-means on every column except the identifiers and 'Rating'.
features = df.drop(['Name', 'Country', 'City', 'Rating'], axis=1)
# If this cell is re-run, a previous 'Cluster ID' must not become a feature.
features = features.drop(['Cluster ID'], axis=1, errors='ignore')

# A fixed random_state makes the cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(features.values)
df['Cluster ID'] = kmeans.labels_

In [None]:
# Summarise columns, dtypes and non-null counts after cleaning and clustering.
df.info()

In [None]:
# Correlation matrix of the numeric columns after cleaning. They are selected
# explicitly because pandas >= 2.0 raises in DataFrame.corr() when string columns
# (such as 'Name' or 'City') are present; the result is the same on older pandas.
df_corr = df.select_dtypes(include=[np.number]).corr()
fig, ax = plt.subplots(figsize=(18, 11))
sns.heatmap(df_corr, ax=ax, cmap='Purples')

In [None]:
# Pairwise plots of df with histograms on the diagonal, coloured by 'Cluster ID'.
# The `city_features` list previously built in this cell was never passed to the
# plot, so it has been removed.
sns.pairplot(df,
             diag_kind='hist',
             hue='Cluster ID')

# Analysis by city

In [None]:
# Aggregate the restaurant table to one row per city.
by_city = df.groupby('City')

# Mean of every numeric column. numeric_only=True is explicit because pandas >= 2.0
# raises on string columns such as 'Name' and 'Country' instead of dropping them.
df_city = by_city.mean(numeric_only=True)

# Number of restaurants per city.
df_city['Restaurant Count'] = by_city['Name'].count()
# Number of distinct styles offered in the city: a style counts once if at least
# one restaurant there has its indicator set.
df_city['Number of styles'] = df_cs.groupby('City')[styles].max().sum(axis=1)
# Median price level; selecting the column first avoids aggregating string columns.
df_city['Price Range'] = by_city['Price Range'].median()
# NOTE(review): this is the median of the per-restaurant cluster labels, stored as
# text. A median of labels can fall between two clusters — confirm this is the
# intended city-level summary.
df_city['Cluster ID'] = by_city['Cluster ID'].median().astype(str)

# Keep the city name both as the (unnamed) index and as a regular column.
df_city.index.name = None
df_city['City'] = df_city.index

In [None]:
# Summarise the columns and dtypes of the per-city table.
df_city.info()

In [None]:
# Pairwise plots of the per-city features, coloured by cluster; the cluster id
# and the city name are not plotted as features.
excluded_columns = ['Cluster ID', 'City']
city_features = [column for column in df_city.columns if column not in excluded_columns]
sns.pairplot(
    df_city,
    hue='Cluster ID',
    vars=city_features,
    diag_kind='hist',
)

In [None]:
# Correlation matrix of the numeric per-city columns. df_city also holds text
# columns ('Cluster ID' is stored as str, plus 'City'), which make
# DataFrame.corr() raise on pandas >= 2.0, so the numeric ones are selected first.
df_city_corr = df_city.select_dtypes(include=[np.number]).corr()
fig, ax = plt.subplots(figsize=(18, 11))
sns.heatmap(df_city_corr, ax=ax, cmap='Purples')

In [None]:
# Min-max scale three per-city metrics to [0, 1] so they fit on one bar chart.
df_city_normalized = df_city.copy()
for metric in ['Restaurant Count', 'Number of styles', 'Rating']:
    lowest = df_city[metric].min()
    highest = df_city[metric].max()
    df_city_normalized[metric] = (df_city[metric] - lowest)/(highest - lowest)

# Horizontal bars, one group per city.
ax = df_city_normalized.plot.barh(
    x='City',
    y=['Rating','Restaurant Count', 'Number of styles'],
    figsize=(18, 18),
)
ax.set_xlim([-0.005, 1.1])
# Assigning the return value keeps the list of tick labels out of the cell output.
ax = ax.set_yticklabels(df_city_normalized.index)

In [None]:
# Joint density of the number of styles offered in a city against its mean ranking.
fig, ax = plt.subplots(figsize=(12, 12))
styles_per_city = df_city['Number of styles']
city_ranking = df_city['Ranking']
sns.kdeplot(styles_per_city, city_ranking, shade=True, alpha=0.9, ax=ax, cmap='rainbow')

In [None]:
# Joint density of each city's 'Cluster ID' against its 'Ranking'.
# NOTE(review): 'Cluster ID' is cast to str when df_city is built, so this call
# depends on seaborn converting those strings to numbers — confirm that it works
# with the installed seaborn version.
fig, ax = plt.subplots(figsize=(12, 12))
sns.kdeplot(df_city['Cluster ID'], df_city['Ranking'], shade=True, alpha=0.9, ax=ax, cmap='rainbow')

In [None]:
# Cities drawn at their coordinates: colour shows the price level, marker size
# shows the number of styles.
fig, ax = plt.subplots(figsize=(12, 12))
sns.scatterplot(
    data=df_city,
    x="Longitude",
    y="Latitude",
    hue="Price Range",
    size="Number of styles",
    sizes=(40,200),
    palette=palette,
    ax=ax,
)

In [None]:
# Cities drawn at their coordinates: colour and marker shape show the cluster,
# marker size shows the number of styles.
fig, ax = plt.subplots(figsize=(12, 12))
sns.scatterplot(
    data=df_city,
    x="Longitude",
    y="Latitude",
    hue="Cluster ID",
    style="Cluster ID",
    size="Number of styles",
    sizes=(5,200),
    palette="rainbow_r",
    edgecolor=None,
    ax=ax,
)