In [1]:
import numpy as np
import pandas as pd

import json
import requests
import googlemaps
from bs4 import BeautifulSoup
import query
import time

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# load api keys

# Read both key files eagerly and close them straight away. `key` and
# `goog_key` now hold the list of lines instead of an open file handle, so the
# later `list(key)` / `list(goog_key)` calls see the same content and can be
# re-run safely (an open file is exhausted after its first iteration).
with open('APIkeys.txt', 'r') as key_file:
    key = key_file.readlines()
with open('google_places_api_key.txt', 'r') as goog_key_file:
    goog_key = goog_key_file.readlines()

In [3]:
# Lines of APIkeys.txt as a list; keys[0] is parsed in the next cell.
# NOTE: if `key` is still an open file handle, iterating it here consumes it,
# so running this cell a second time would produce an empty list.
keys = list(key)

In [4]:
# save keys as strings

# NYT key: third whitespace-separated token on the first line of APIkeys.txt.
api_key = keys[0].split()[2]
# Google key: whatever follows the 11-character prefix on the first line.
# strip() removes the line ending if there is one; the previous blind [:-1]
# cut off the last character of the key when the file had no trailing newline
# and left a stray '\r' behind for CRLF files.
goog_api_key = list(goog_key)[0][11:].strip()

In [5]:
# instantiate googlemaps object

# Client authenticated with the Google key parsed above; the cells below use
# it for the `find_place` look-ups.
gmaps = googlemaps.Client(key=goog_api_key)

In [None]:
# Query NYTimes API for 800 reviews by Pete Wells from 2012-2021
# Returns three parallel collections: review urls, restaurant headlines, dates.
urls, restaurants_100, dates = query.review_url_names(
    api_key,
    'Pete Wells',
    news_desk = 'Dining',
    type_of_material = 'Review',
    begin_date = '20120101',
    end_date = '20210401',
    n_pages_min = 0,
    n_pages_max = 80,
)

In [None]:
# create a dataframe with the returned items
# (one row per review, indexed by the returned dates)
df = pd.DataFrame(
    {'urls': urls, 'restaurants': restaurants_100},
    index = dates,
)

In [None]:
# Preview the first rows of the freshly built frame (urls / restaurants).
df.head()

In [None]:
# Remove irrelevant words from the restaurants column
# NOTE(review): every phrase is lower-case, so query.remove_stopwords
# presumably matches case-insensitively or on lower-cased text -- confirm.
stopwords = ['review', 'pete wells', 'restaurant', 'on the lower east side']

# The cleaned values overwrite the original `restaurants` column.
df['restaurants'] = query.remove_stopwords(df['restaurants'], stopwords)

In [None]:
# Reload the saved data
# NOTE(review): no cell shown in this notebook writes 'restaurants.csv';
# it has to exist already for this cell to run.

# 'Unnamed: 0.1' becomes the index and the leftover 'Unnamed: 0' column is
# discarded.
df = (
    pd.read_csv('restaurants.csv')
    .set_index('Unnamed: 0.1')
    .drop(columns = 'Unnamed: 0')
)

In [None]:
# Drop non-review rows including articles from before Wells was the main Dining critic, roundups and briefs

# Markers handed to query.drop_rows, which is applied to the `restaurants` column.
non_review_markers = ['unde', 'rest', 'brief', 'hungry city']
df = query.drop_rows(non_review_markers, df, 'restaurants')

In [None]:
# Get star ratings for each review if applicable

# One value per review url. The next cell treats string values as star
# glyphs and every non-string value as "no rating".
df['stars'] = query.get_stars(df.urls)

In [None]:
# Convert stars from a star string to the number of stars

def _star_count(value):
    """Return the star count encoded by one raw `stars` cell."""
    if isinstance(value, str):
        # The rating is the length of the star string.
        return len(value)
    if isinstance(value, (int, np.integer)) and not isinstance(value, bool):
        # Already converted: keep the count so that re-running this cell
        # does not reset every rating to 0.
        return int(value)
    # No rating available (None, NaN, ...).
    return 0

df['stars'] = df.stars.apply(_star_count)

In [6]:
# Place attributes requested from the Google Places look-ups.
fields = [
    'name',
    'business_status',
    'place_id',
    'formatted_address',
]

In [24]:
def get_rest_info(rest_names, fields, googlemap_object, rest_location= 'NYC'):
    """Look up restaurants with a Places "find place" text query.

    Parameters
    ----------
    rest_names : iterable of str
        Restaurant names to search for. Repeated names are queried once.
    fields : list of str
        Place attributes to request; they become the columns of the result.
    googlemap_object : object
        Client exposing ``find_place(query, input_type=..., fields=...)``.
    rest_location : str, default 'NYC'
        Location hint appended to every name, separated by a single space.

    Returns
    -------
    pandas.DataFrame
        One row per distinct name, in first-seen order, with one column per
        entry of ``fields``. Values come from the first candidate returned;
        a field that is absent, or a search without candidates, yields NaN.
    """
    location = rest_location.strip()

    # Query each distinct name once, remembering first-seen order.
    responses = {}
    for rest_name in rest_names:
        if rest_name in responses:
            continue
        search_text = (rest_name + ' ' + location).strip()
        responses[rest_name] = googlemap_object.find_place(
            search_text, input_type = 'textquery', fields = fields)

    # Collect plain records and build the frame once. DataFrame.append returned
    # a new frame that was being discarded (and no longer exists in pandas 2).
    records = []
    for response in responses.values():
        candidates = response.get('candidates') or []
        best = candidates[0] if candidates else {}
        records.append({field: best.get(field, np.nan) for field in fields})

    return pd.DataFrame(records, columns = list(fields))
    

In [None]:
# One "find place" text query per restaurant name (suffixed with ' NYC');
# raw responses are stored keyed by the restaurant name.
dct = {}
for i in df.restaurants:
    dct[i] = gmaps.find_place(
        i + ' NYC',
        input_type='textquery',
        fields = ['name', 'business_status', 'place_id', 'formatted_address'],
    )

In [None]:
# Parallel lists, one entry per key of `dct`; they are assembled into `df1`
# further down.
df1_name = []
business_status = []
formatted_address = []
name = []
place_id = []

for i in dct:
    # Read all four values before appending anything. Previously a KeyError
    # raised part-way through left some lists one element longer than the
    # others, because the except branch appended to every list again.
    try:
        top = dct[i]['candidates'][0]
        row = (top['business_status'], top['formatted_address'],
               top['name'], top['place_id'])
    except (IndexError, KeyError):
        # No candidate, or a candidate missing a field: the whole row is NaN.
        row = (np.nan, np.nan, np.nan, np.nan)

    df1_name.append(i)
    business_status.append(row[0])
    formatted_address.append(row[1])
    name.append(row[2])
    place_id.append(row[3])


In [None]:
def gmaps_dataframe(list_of_restaurants, fields, gmaps_obj):
    """Query Places for each restaurant and return the results as a DataFrame.

    Parameters
    ----------
    list_of_restaurants : iterable of str
        Restaurant names; ' NYC' is appended to each for the text query.
        Repeated names are queried once.
    fields : list of str
        Place attributes to request from ``find_place``.
    gmaps_obj : object
        Client exposing ``find_place(query, input_type=..., fields=...)``.
        This client is used for every call (the function used to ignore the
        argument and fall back on the notebook-level ``gmaps``).

    Returns
    -------
    pandas.DataFrame
        One row per distinct name, in first-seen order. The first column,
        'restaurants', holds the searched name; the remaining columns are
        ``fields``, filled from the first candidate. A missing field, or a
        search without candidates, yields NaN.
    """
    field_names = list(fields)
    records = []
    # dict.fromkeys de-duplicates while preserving first-seen order.
    for rest_name in dict.fromkeys(list_of_restaurants):
        response = gmaps_obj.find_place(
            rest_name + ' NYC', input_type='textquery', fields = fields)
        candidates = response.get('candidates') or []
        best = candidates[0] if candidates else {}
        record = {'restaurants': rest_name}
        for field in field_names:
            record[field] = best.get(field, np.nan)
        records.append(record)

    return pd.DataFrame(records, columns = ['restaurants'] + field_names)
    

In [None]:
# Assemble the parallel result lists into one frame; `restaurants` is the
# key the merge in the next cell joins on.
df1 = pd.DataFrame(
    {
        'status': business_status,
        'address': formatted_address,
        'restaurants': df1_name,
        'place_id': place_id,
        'name': name,
    }
)

In [None]:
# Attach the Places columns to the review rows; how='left' keeps every review
# even when no place was found.
# rename_axis('index') makes sure reset_index() produces a column literally
# called 'index' whatever the index of `df` is currently named (the reload
# cell names it 'Unnamed: 0.1'), so set_index('index') can restore it.
maindf = (
    df.rename_axis('index')
    .reset_index()
    .merge(df1, how = 'left', on = 'restaurants')
    .set_index('index')
)

In [None]:
# Persist the merged table. The index (named 'index' by the merge cell) is
# written as a regular column, which the reload cell below turns back into
# the index.
maindf.to_csv('merged.csv')

In [None]:
# Reload the merged table and restore the saved 'index' column as the index.
maindf = pd.read_csv('merged.csv').set_index('index')

In [None]:
# Zip code = second-to-last space-separated token of the text before 'United',
# minus its final character (the trailing comma in '..., NY 10002, United States').
# The vectorised .str accessors propagate NaN for missing addresses and for
# addresses with too few tokens, where the former lambda raised
# AttributeError / IndexError.
maindf['zipcode'] = (
    maindf.address
    .str.split('United').str[0]
    .str.split(' ').str[-2]
    .str[0:-1]
)

In [None]:
# Keep rows whose zip code starts with '1' and is purely numeric.
# na=False treats a missing zip code as "no match" instead of putting NaN in
# the boolean mask, which pandas refuses to index with.
maindf = maindf[maindf.zipcode.str.startswith('1', na = False)]
maindf = maindf[maindf.zipcode.str.isnumeric()]
# NOTE(review): '11010' is excluded explicitly -- presumably a zip code outside
# the five boroughs; confirm.
maindf = maindf[maindf.zipcode != '11010']

In [None]:
def borough_column(address_column):
    """Derive a borough label for each formatted address.

    The label is based on the last space-separated word of the text that
    precedes ', NY' in the address. Known words are translated through a
    lookup table; any other word is returned unchanged.

    Parameters
    ----------
    address_column : iterable
        Address strings. Non-string entries (e.g. NaN) are tolerated.

    Returns
    -------
    list
        One label per input entry, in order; NaN for non-string entries.
    """
    word_to_borough = {
        'York': 'Manhattan',
        'States': 'Manhattan',
        'Island': 'Staten Island',
    }
    for word in ('Maspeth', 'Ridgewood', 'Astoria', 'Flushing',
                 'City', 'Hill', 'Broadway', 'Point', 'Park'):
        word_to_borough[word] = 'Queens'

    borough = []
    for address in address_column:
        if not isinstance(address, str):
            # Missing address: keep the row aligned instead of raising.
            borough.append(np.nan)
            continue
        last_word = address.split(', NY')[0].split(' ')[-1]
        borough.append(word_to_borough.get(last_word, last_word))

    return borough

In [None]:
# NOTE(review): this calls the `borough_column` from the `query` module, not
# the function of the same name defined in this notebook -- confirm the two
# implementations are in sync.
maindf['borough'] = query.borough_column(maindf.address)

In [None]:
# The raw headline column is no longer needed.
maindf = maindf.drop(columns = 'restaurants')

In [None]:
# Discard every row that still has a missing value in any column.
maindf = maindf.dropna()

In [None]:
# Column dtypes and non-null counts of the cleaned table.
maindf.info()

In [None]:
# How many rows carry each star rating.
maindf.stars.value_counts()

In [None]:
# How many rows carry each Places business status.
maindf.status.value_counts()

In [None]:
# Star distribution of the rows whose status is CLOSED_PERMANENTLY.
# The counts are computed once and reused for bar positions and heights.
closed_star_counts = maindf.loc[maindf['status'] == 'CLOSED_PERMANENTLY', 'stars'].value_counts()
plt.bar(x = closed_star_counts.index, height = closed_star_counts)
plt.title('Permanently Closed Restaurants by NYTimes Stars')
plt.xlabel('Stars')
plt.ylabel('Number of restaurants')
plt.xticks([0,1,2,3,4]);

In [None]:
# Star distribution of the rows whose status is CLOSED_TEMPORARILY.
# The counts are computed once and reused for bar positions and heights.
temp_closed_star_counts = maindf.loc[maindf['status'] == 'CLOSED_TEMPORARILY', 'stars'].value_counts()
plt.bar(x = temp_closed_star_counts.index, height = temp_closed_star_counts)
plt.title('Temporarily Closed Restaurants by NYTimes Stars')
plt.xlabel('Stars')
plt.ylabel('Number of restaurants')
plt.xticks([0,1,2,3,4]);

In [None]:
# Star distribution of the rows whose status is OPERATIONAL.
# The counts are computed once and reused for bar positions and heights.
open_star_counts = maindf.loc[maindf['status'] == 'OPERATIONAL', 'stars'].value_counts()
plt.bar(x = open_star_counts.index, height = open_star_counts)
plt.title('Currently Operating Restaurants by NYTimes Stars')
plt.xlabel('Stars')
plt.ylabel('Number of restaurants')
plt.xticks([0,1,2,3,4])
plt.ylim(0,160);

In [None]:
# Star distribution over every remaining row, regardless of status.
# The counts are computed once and reused for bar positions and heights.
total_star_counts = maindf['stars'].value_counts()
plt.bar(x = total_star_counts.index, height = total_star_counts)
plt.title('Total Star Counts from Pete Wells Reviews')
plt.xlabel('Stars')
plt.ylabel('Number of reviews');

In [None]:
# Star counts per borough: star ratings as rows, one column (bar series) per borough.
stars_by_borough = (
    maindf.groupby(['borough'])['stars']
    .value_counts()
    .unstack(level=0)
)
stars_by_borough.plot(kind='bar', subplots=False);

In [None]:
# Status counts per borough: statuses as rows, one column (bar series) per borough.
status_by_borough = (
    maindf.groupby('borough')['status']
    .value_counts()
    .unstack(level= 0)
)
status_by_borough.plot(kind = 'barh', subplots = False);