# What is the best place to open a moderately priced, fast casual asian dessert spot in NYC?
### Considerations
* Where are all bakeries primarily aggregated?
* Using review count as a measure of success, does their price point affect the amount of reviews they recieve?
* Does the number/type of categories these businesses put on their profile affect their review_count?
* Integrate census data, how does median income affect these parameters?
    * Business 'success' using review_count (see if I can find out information for how long they've been open, take the average rev/year)
    * How income, price, success and location relate to eachother
  

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import re
import seaborn as sns

In [2]:
# Reading and inspection
df1 = pd.read_csv('Resources/bake_nyc.csv')
df2 = pd.read_csv('Resources/cake_nyc.csv')
df3 = pd.read_csv('Resources/desserts_nyc.csv')
df4 = pd.read_csv('Resources/donut_nyc.csv')
df5 = pd.read_csv('Resources/patisserie_nyc.csv')
df6 = pd.read_csv('Resources/viennoiserie_nyc.csv')
master_df = pd.concat([df1, df2, df3, df4, df5, df6]).drop_duplicates()
# Check status
master_df['is_closed'].describe()


count      3131
unique        1
top       False
freq       3131
Name: is_closed, dtype: object

In [6]:
# Create sublist
master_df_sub = master_df[['name', 'review_count', 'categories', 'rating', 'coordinates', 'price']]
master_df_sub.head()

Unnamed: 0,name,review_count,categories,rating,coordinates,price
0,Michaeli Bakery,158,"[{'alias': 'bakeries', 'title': 'Bakeries'}, {...",5.0,"{'latitude': 40.714349, 'longitude': -73.992054}",$$
1,Mei Lai Wah Bakery,1735,"[{'alias': 'bakeries', 'title': 'Bakeries'}, {...",4.0,"{'latitude': 40.7155758, 'longitude': -73.9979...",$
2,Dominique Ansel Bakery,5384,"[{'alias': 'bakeries', 'title': 'Bakeries'}, {...",4.0,"{'latitude': 40.72516, 'longitude': -74.00296}",$$
3,Angelina Bakery,609,"[{'alias': 'bakeries', 'title': 'Bakeries'}, {...",4.5,"{'latitude': 40.75498, 'longitude': -73.99166}",$$
4,L'Appartement 4F,181,"[{'alias': 'bakeries', 'title': 'Bakeries'}]",4.0,"{'latitude': 40.69521417249751, 'longitude': -...",$$


In [14]:
# Cleaning function
chars_to_remove = ['[', '{', '\'', 'alias', 'title', 'latitude', 'longitude', "}", ":", "}", "]"]
col_list = ['categories', 'coordinates']
def string_cleaner(df, col_list, char_list):
    for item in char_list:
        for col in col_list:
            df.loc[:, col] = df[col].str.replace(item, "", case=False)
    return df

string_cleaner(string_cleaner(master_df_sub, col_list, chars_to_remove), col_list, chars_to_remove)

Unnamed: 0,name,review_count,categories,rating,coordinates,price
0,Michaeli Bakery,158,"bakeries, Bakeries, desserts, Desserts",5.0,"40.714349, -73.992054",$$
1,Mei Lai Wah Bakery,1735,"bakeries, Bakeries, dimsum, Dim Sum, nood...",4.0,"40.7155758, -73.9979585",$
2,Dominique Ansel Bakery,5384,"bakeries, Bakeries, desserts, Desserts",4.0,"40.72516, -74.00296",$$
3,Angelina Bakery,609,"bakeries, Bakeries, desserts, Desserts, d...",4.5,"40.75498, -73.99166",$$
4,L'Appartement 4F,181,"bakeries, Bakeries",4.0,"40.69521417249751, -73.99478573089799",$$
...,...,...,...,...,...,...
89,Oren's Coffee,170,"coffee, Coffee & Tea",3.5,"40.8054989, -73.965405",$
90,French Toast Bakery,83,"bakeries, Bakeries, coffee, Coffee & Tea, ...",3.5,"40.746355, -73.895131",$
93,Perrine,118,"newamerican, American (New), french, French",4.0,"40.7653271962541, -73.97199900859833",$$$
96,Gaby Brasserie Francaise,106,"french, French, modern_european, Modern Eu...",3.0,"40.75613, -73.98154",$$$


In [15]:
# Cleaning duplicate aliases by split and delete
def remove_alias(df, column):
    new_df = df[column].str.split(pat=', ', expand=True)
    new_df = new_df.drop(labels=range(0, 8, 2), axis=1)
    df = df.drop(column, axis=1)
    df = pd.concat([df, new_df], axis=1)
    return df

master_df_sub = remove_alias(master_df_sub, 'categories')

In [16]:
# Split Coordinates & rename columns
master_df_sub[['coord_lat', 'coord_long']] = master_df_sub['coordinates'].str.split(pat=', ', expand=True)
master_df_sub = master_df_sub.drop('coordinates', axis=1)
master_df_sub.columns = ['name', 'review_count', 'rating', 'price', 'cat1', 'cat2', 'cat3','cat4', 'coord_lat', 'coord_long']
master_df_sub.head()

Unnamed: 0,name,review_count,rating,price,cat1,cat2,cat3,cat4,coord_lat,coord_long
0,Michaeli Bakery,158,5.0,$$,Bakeries,Desserts,,,40.714349,-73.992054
1,Mei Lai Wah Bakery,1735,4.0,$,Bakeries,Dim Sum,Noodles,,40.7155758,-73.9979585
2,Dominique Ansel Bakery,5384,4.0,$$,Bakeries,Desserts,,,40.72516,-74.00296
3,Angelina Bakery,609,4.5,$$,Bakeries,Desserts,Donuts,,40.75498,-73.99166
4,L'Appartement 4F,181,4.0,$$,Bakeries,,,,40.69521417249751,-73.99478573089799


In [None]:
# Manipulation & Statistics
# Convert price col to int

In [None]:
# Visualization