In [1]:
import json
import numpy as np
import pandas as pd
import requests
import time
import googlemaps
from bs4 import BeautifulSoup

In [3]:
# load api keys

# Read both key files eagerly inside `with` blocks so the handles are closed
# straight away. `key` and `goog_key` now hold the files' lines as lists of
# str, so cells that do list(key) / list(goog_key) give the same result and
# can be re-run without hitting an exhausted file iterator.
with open('APIkeys.txt', 'r') as key_file:
    key = key_file.readlines()
with open('google_places_api_key.txt', 'r') as goog_key_file:
    goog_key = goog_key_file.readlines()

In [4]:
keys = list(key)

In [5]:
# save keys as strings

# NYTimes key: third whitespace-separated token on the first line of the file.
api_key = keys[0].split()[2]
# Google key: everything from index 11 onward on the first line. The line
# ending is stripped explicitly instead of slicing off the last character,
# which would truncate the key if the file has no trailing newline.
goog_api_key = list(goog_key)[0][11:].rstrip('\r\n')

In [6]:
# instantiate googlemaps object

gmaps = googlemaps.Client(key=goog_api_key)

In [7]:
# create a url constructor for querying the NYTimes API based on possible input fields

def _nytimes_params(api_key, query, filters, begin_date, end_date, n_page):
    '''Assemble the query-string parameters for one Article Search request.'''
    params = {'q': query}
    # Only filters that were actually supplied take part in the fq expression;
    # several filters are combined with AND.
    active = [(field, value) for field, value in filters if value]
    if active:
        params['fq'] = ' AND '.join(f'{field}:({value})' for field, value in active)
    # Dates are independent of the filters: each one is sent whenever it is set.
    if begin_date:
        params['begin_date'] = begin_date
    if end_date:
        params['end_date'] = end_date
    params['page'] = n_page
    params['api-key'] = api_key
    return params


def nytimes_query(api_key, query, 
                  glocations = None, headline = None, news_desk = None, 
                  organizations = None, persons = None, byline = None, 
                  subject = None, news_type = None, type_of_material = None, 
                  begin_date = None, end_date = None, n_page = 0, timeout = 30):
    '''
    Perform a query from the NYTimes Article Search API and return the list of
    result documents (the 'docs' entry of the JSON response).
    ----
    Arguments:
    api_key -- a user-specific key registered at developer.nytimes.com
    query -- the search terms used for the query

    filters:
        each of the following categories is set at None by default. If values are
        added, the queries will be filtered based on the parameter. Multiple
        filters are combined with AND.

        glocations -- geographic locations
        headline -- literal headline
        news_desk -- by news desk using the following values:
            Adventure Sports, Arts & Leisure, Arts, Automobiles, Blogs, Books,
            Booming, Business Day, Business, Cars, Circuits, Classifieds,
            Connecticut, Crosswords & Games, Culture, DealBook, Dining,
            Editorial, Education, Energy, Entrepreneurs, Environment, Escapes,
            Fashion & Style, Fashion, Favorites, Financial, Flight, Food,
            Foreign, Generations, Giving, Global Home, Health & Fitness, Health,
            Home & Garden, Home, Jobs, Key, Letters, Long Island, Magazine,
            Market Place, Media, Men's Health, Metro, Metropolitan, Movies,
            Museums, National, Nesting, Obits, Obituaries, Obituary, OpEd,
            Opinion, Outlook, Personal Investing, Personal Tech, Play, Politics,
            Regionals, Retail, Retirement, Science, Small Business, Society,
            Sports, Style, Sunday Business, Sunday Review, Sunday Styles,
            T Magazine, T Style, Technology, Teens, Television, The Arts,
            The Business of Green, The City Desk, The City, The Marathon,
            The Millennium, The Natural World, The Upshot, The Weekend,
            The Year in Pictures, Theater, Then & Now, Thursday Styles,
            Times Topics, Travel, U.S., Universal, Upshot, UrbanEye, Vacation,
            Washington, Wealth, Weather, Week in Review, Week, Weekend,
            Westchester, Wireless Living, Women's Health, Working, Workplace,
            World, Your Money
        organizations -- filter by organizations
        persons -- filter by persons referenced
        byline -- filter by writer or several other parameters- see developer.nytimes.com
        subject -- by subject
        news_type -- filter on the API's news_type field
        type_of_material -- document type- sample values below:
            Addendum, An Analysis, An Appraisal, Archives, Article, Banner,
            Biography, Birth Notice, Blog, Brief, Caption, Chronology, Column,
            Correction, Economic Analysis, Editorial, Editorial Cartoon,
            Editors' Note, First Chapter, Front Page, Glossary,
            Interactive Feature, Interactive Graphic, Interview, Letter, List,
            Marriage Announcement, Military Analysis, News, News Analysis,
            Newsletter, Obituary, Obituary (Obit), Op-Ed, Paid Death Notice,
            Postscript, Premium, Question, Quote, Recipe, Review, Schedule,
            SectionFront, Series, Slideshow, Special Report, Statistics,
            Summary, Text, Video, Web Log

    date params:
        begin_date -- beginning date of filter (e.g. '20120101')
        end_date -- end date of filter (e.g. '20210401')
        Either date may be given on its own, with or without filters.
    n_page -- the selected page of returns- for large queries this will select sections to return
    timeout -- seconds to wait for the HTTP response before giving up

    Returns:
    The list found at ['response']['docs'] of the decoded JSON response.

    Raises:
    requests.HTTPError if the API answers with an error status.
    '''
    # Same order in which the original code added filters to the fq expression.
    filters = [
        ('glocations', glocations),
        ('headline', headline),
        ('news_desk', news_desk),
        ('organizations', organizations),
        ('persons', persons),
        ('subject', subject),
        ('byline', byline),
        ('news_type', news_type),
        ('type_of_material', type_of_material),
    ]
    params = _nytimes_params(api_key, query, filters, begin_date, end_date, n_page)

    # Let requests build and percent-encode the query string instead of
    # concatenating raw text into the URL.
    r = requests.get('https://api.nytimes.com/svc/search/v2/articlesearch.json',
                     params = params, timeout = timeout)
    # Surface HTTP errors directly rather than as a KeyError on 'response'.
    r.raise_for_status()
    # Return the documents only, skipping the response metadata.
    return r.json()['response']['docs']

In [8]:
# create a function to run n queries using the constructor function above

def _review_name_from_url(web_url):
    '''Derive a rough restaurant name from the slug of a review URL.'''
    parts = web_url.split('dining/')
    if len(parts) > 1:
        slug = parts[1]
    else:
        # URL outside the dining section: fall back to the last path segment
        # instead of raising IndexError.
        slug = web_url.rstrip('/').rsplit('/', 1)[-1]
    # Only drop the extension when it is really there.
    if slug.endswith('.html'):
        slug = slug[:-len('.html')]
    return slug.replace('-', ' ').replace('reviews/', '')


def review_url_names(api_key, query, n_pages_min, n_pages_max, pause = 6, **kwargs):
    '''
    Run nytimes_query over a range of result pages and collect, per article,
    its URL, a name derived from the URL slug, and its publication date.
    ----
    Arguments:
    api_key -- NYTimes API key, passed through to nytimes_query
    query -- the search terms, passed through to nytimes_query
    n_pages_min -- first page index to request (inclusive)
    n_pages_max -- page index to stop at (exclusive)
    pause -- seconds to sleep after each page request
    **kwargs -- any filter/date arguments accepted by nytimes_query

    Returns:
    (urls, names, dates) -- three lists of equal length, one entry per article.
    '''
    urls = []
    names = []
    dates = []
    for page in range(n_pages_min, n_pages_max):
        docs = nytimes_query(api_key, query, n_page = page, **kwargs)
        # An empty page means the results are exhausted; later pages would be
        # empty too, so stop instead of sleeping through them.
        if not docs:
            break
        for doc in docs:
            urls.append(doc['web_url'])
            names.append(_review_name_from_url(doc['web_url']))
            dates.append(doc['pub_date'])
        time.sleep(pause)
    return urls, names, dates

In [9]:
# Query Pete Wells dining reviews between 2012-01-01 and 2021-04-01,
# requesting result pages 0 through 79.
urls, restaurants_100, dates = review_url_names(
    api_key, 'Pete Wells',
    news_desk = 'Dining', type_of_material = 'Review',
    begin_date = '20120101', end_date = '20210401',
    n_pages_min = 0, n_pages_max = 80,
)

In [79]:
# attempting to iterate through kwargs in order to shorten the if statements in the main function

# def test_function(glocations = None, headline = None, news_desk = None, 
#                   organizations = None, persons = None, byline = None, 
#                   subject = None, news_type = None, type_of_material = None, 
#                   begin_date = None, end_date = None):
#     for i in [glocations, headline, news_desk, 
#               organizations, persons, byline, 
#               subject, news_type, type_of_material]:
#         if i:
#             print(i)


In [12]:
df = pd.DataFrame(index = dates)

In [13]:
df['urls'] = urls

In [14]:
df['restaurants'] = restaurants_100

In [15]:
df.head()

Unnamed: 0,urls,restaurants
2021-03-08T18:31:52+0000,https://www.nytimes.com/2021/03/08/dining/whea...,wheated pizza washington squares
2021-03-01T18:03:31+0000,https://www.nytimes.com/2021/03/01/dining/winn...,winner restaurant review
2021-02-12T18:36:55+0000,https://www.nytimes.com/2021/02/12/dining/fala...,falansai review vietnamese food
2020-12-01T17:44:20+0000,https://www.nytimes.com/2020/12/01/dining/ayat...,ayat review palestinian food
2020-11-23T18:00:37+0000,https://www.nytimes.com/2020/11/23/dining/bila...,bilao review


In [16]:
# Phrases the URL slugs carry alongside the restaurant name; removed in this order.
_SLUG_NOISE = (' review', 'pete wells', 'restaurant', 'on the lower east side')


def _drop_slug_noise(slug_name):
    '''Return slug_name with every phrase in _SLUG_NOISE removed.'''
    for phrase in _SLUG_NOISE:
        slug_name = slug_name.replace(phrase, '')
    return slug_name


df['restaurants'] = df['restaurants'].apply(_drop_slug_noise)

In [17]:
df.head()

Unnamed: 0,urls,restaurants
2021-03-08T18:31:52+0000,https://www.nytimes.com/2021/03/08/dining/whea...,wheated pizza washington squares
2021-03-01T18:03:31+0000,https://www.nytimes.com/2021/03/01/dining/winn...,winner
2021-02-12T18:36:55+0000,https://www.nytimes.com/2021/02/12/dining/fala...,falansai vietnamese food
2020-12-01T17:44:20+0000,https://www.nytimes.com/2020/12/01/dining/ayat...,ayat palestinian food
2020-11-23T18:00:37+0000,https://www.nytimes.com/2020/11/23/dining/bila...,bilao


In [58]:
# df.to_csv('restaurants.csv')

In [9]:
df = pd.read_csv('restaurants.csv')

In [186]:
df.urls[30]

'https://www.nytimes.com/2012/03/07/dining/reviews/pok-pok-wing-on-the-lower-east-side.html'

In [19]:
page = requests.get('https://www.nytimes.com/2019/03/05/dining/madame-vo-bbq-review.html')
soup = BeautifulSoup(page.content, 'html.parser')
l = soup.find(class_="css-z4hz5")

In [21]:
l.text

'★'

In [21]:
# Scrape the star rating from every review page.
# The element with this CSS class holds the rating as a run of '★' characters.
STAR_CSS_CLASS = "css-z4hz5"

stars = []
for review_url in df.urls:
    # A timeout keeps one unresponsive page from hanging the whole loop.
    page = requests.get(review_url, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    rating_tag = soup.find(class_=STAR_CSS_CLASS)
    # np.nan marks pages without a rating element (the np.NaN alias was
    # removed in NumPy 2.0).
    stars.append(rating_tag.text if rating_tag is not None else np.nan)
    

In [22]:
df['stars'] = stars

In [23]:
def _star_count(scraped):
    '''Length of the scraped star string, or 0 for anything that is not a str.'''
    if type(scraped) is str:
        return len(scraped)
    return 0


# Replace the scraped star strings with their numeric count.
df['stars'] = df['stars'].apply(_star_count)

In [24]:
df[0:50]

Unnamed: 0,urls,restaurants,stars
2021-03-08T18:31:52+0000,https://www.nytimes.com/2021/03/08/dining/whea...,wheated pizza washington squares,0
2021-03-01T18:03:31+0000,https://www.nytimes.com/2021/03/01/dining/winn...,winner,0
2021-02-12T18:36:55+0000,https://www.nytimes.com/2021/02/12/dining/fala...,falansai vietnamese food,0
2020-12-01T17:44:20+0000,https://www.nytimes.com/2020/12/01/dining/ayat...,ayat palestinian food,0
2020-11-23T18:00:37+0000,https://www.nytimes.com/2020/11/23/dining/bila...,bilao,0
2020-11-10T17:12:25+0000,https://www.nytimes.com/2020/11/10/dining/silv...,silver apricot,0
2020-10-27T16:30:08+0000,https://www.nytimes.com/2020/10/27/dining/moky...,mokyo,0
2020-10-20T16:13:52+0000,https://www.nytimes.com/2020/10/20/dining/pata...,pata paplean queens,0
2020-10-06T15:54:22+0000,https://www.nytimes.com/2020/10/06/dining/koko...,kokomo brooklyn,0
2020-09-21T19:30:55+0000,https://www.nytimes.com/2020/09/21/dining/rang...,rangoon burmese food nyc,0


In [34]:
# Drop rows whose slug-derived name contains any of these fragments,
# applying the filters one after another.
for _fragment in ('unde', 'rest', 'brief', 'hungry city'):
    df = df[~df.restaurants.str.contains(_fragment)]

In [183]:
gmaps.find_place('Contra NYC', input_type= 'textquery', fields = ['business_status', 'place_id', 'formatted_address'])

{'candidates': [{'business_status': 'OPERATIONAL',
   'formatted_address': '138 Orchard St, New York, NY 10002, United States',
   'place_id': 'ChIJT1joy4ZZwokRQOu1I_pG85Q'}],
 'status': 'OK'}

In [10]:
df.drop(columns = 'Unnamed: 0', inplace = True)

In [11]:
df.set_index('Unnamed: 0.1', inplace = True)

In [12]:
df.head()

Unnamed: 0_level_0,urls,restaurants,stars
Unnamed: 0.1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-03-01T18:03:31+0000,https://www.nytimes.com/2021/03/01/dining/winn...,winner,0
2021-02-12T18:36:55+0000,https://www.nytimes.com/2021/02/12/dining/fala...,falansai vietnamese food,0
2020-12-01T17:44:20+0000,https://www.nytimes.com/2020/12/01/dining/ayat...,ayat palestinian food,0
2020-11-23T18:00:37+0000,https://www.nytimes.com/2020/11/23/dining/bila...,bilao,0
2020-11-10T17:12:25+0000,https://www.nytimes.com/2020/11/10/dining/silv...,silver apricot,0


In [16]:
gmaps.find_place(df['restaurants'][290] + 'NYC', input_type= 'textquery', fields = ['business_status', 'place_id', 'formatted_address'])

{'candidates': [{'business_status': 'CLOSED_TEMPORARILY',
   'formatted_address': '81-19, Broadway, NY 11373, United States',
   'place_id': 'ChIJdcuSjVJfwokRZ0w4TUXCA_g'}],
 'status': 'OK'}

In [203]:
df['restaurants'][290]

'thai cook '

In [None]:
def get_rest_info(rest_names = None, rest_location = 'NYC', fields = None, googlemap_object = None):
    '''
    Look up each restaurant name with the Places "find place" text query.
    ----
    Arguments:
    rest_names -- iterable of restaurant names to look up (None means no names)
    rest_location -- text appended to every name to narrow the search
    fields -- list of Places fields to request (None requests no specific fields)
    googlemap_object -- client object exposing find_place (e.g. googlemaps.Client);
        required, although it keeps its position as the last parameter

    Returns:
    dict mapping each name, exactly as given, to the raw find_place response.

    Raises:
    ValueError if googlemap_object is not supplied.
    '''
    if googlemap_object is None:
        raise ValueError('googlemap_object is required')
    dct = {}
    for rest_name in (rest_names or []):
        # Join name and location with a single space, whatever stray
        # whitespace the name carries.
        search_text = ' '.join(f'{rest_name} {rest_location}'.split())
        dct[rest_name] = googlemap_object.find_place(
            search_text, input_type = 'textquery', fields = fields or None
        )
    return dct
    

In [192]:
fields = ['business_status', 'place_id', 'formatted_address']
columns = {i : dct[i] for i in fields}

KeyError: 'business_status'

In [316]:
dct['winner ']

{'candidates': [{'business_status': 'OPERATIONAL',
   'formatted_address': '367 7th Ave, Brooklyn, NY 11215, United States',
   'name': 'WINNER',
   'place_id': 'ChIJg8UZJyNbwokR0-LkfR8MPJQ'}],
 'status': 'OK'}

In [188]:
# Query Places once per restaurant name (with ' NYC' appended) and keep
# the raw response keyed by that name.
_place_fields = ['name', 'business_status', 'place_id', 'formatted_address']
dct = {
    rest_name: gmaps.find_place(rest_name + ' NYC', input_type='textquery', fields=_place_fields)
    for rest_name in list(df.restaurants)
}

In [333]:
# Flatten the Places responses in `dct` into parallel lists, one entry per
# restaurant name, using the first candidate of each response.
df1_name = []
business_status = []
formatted_address = []
name = []
place_id = []

for rest_name, result in dct.items():
    try:
        top = result['candidates'][0]
        # Read all four fields before appending anything, so a missing field
        # cannot leave the lists with different lengths.
        row = (top['business_status'], top['formatted_address'],
               top['name'], top['place_id'])
    except (IndexError, KeyError):
        # No candidate, or an incomplete one: record the whole row as missing.
        row = (np.nan, np.nan, np.nan, np.nan)
    df1_name.append(rest_name)
    business_status.append(row[0])
    formatted_address.append(row[1])
    name.append(row[2])
    place_id.append(row[3])


In [None]:
def gmaps_dataframe(list_of_restaurants, fields, gmaps_obj):
    '''
    Look up each restaurant with a Places "find place" text query and tabulate
    the requested fields of the first candidate.
    ----
    Arguments:
    list_of_restaurants -- iterable of restaurant names; ' NYC' is appended to
        each name to form the search text
    fields -- list of Places fields to request; each becomes a column
    gmaps_obj -- client object exposing find_place (e.g. googlemaps.Client)

    Returns:
    pandas DataFrame with one row per name: a 'restaurants' column holding the
    name as given, followed by one column per entry in fields. A field is NaN
    when the lookup returned no candidate or the candidate lacks that field.
    '''
    records = []
    for rest_name in list_of_restaurants:
        # Use the client that was passed in rather than a notebook global.
        result = gmaps_obj.find_place(rest_name + ' NYC', input_type='textquery', fields = fields)
        candidates = result.get('candidates') or []
        top = candidates[0] if candidates else {}
        record = {'restaurants': rest_name}
        for field in fields:
            record[field] = top.get(field, np.nan)
        records.append(record)
    # Explicit columns keep the layout stable even when there are no records.
    return pd.DataFrame(records, columns = ['restaurants', *fields])
    

<class 'pandas.core.frame.DataFrame'>
Index: 422 entries, 2021-03-08T18:31:52+0000 to 2012-03-27T21:06:49+0000
Data columns (total 3 columns):
urls           422 non-null object
restaurants    422 non-null object
stars          422 non-null int64
dtypes: int64(1), object(2)
memory usage: 33.2+ KB


In [334]:
df1 = pd.DataFrame({'status': business_status, 'address':formatted_address, 'restaurants':df1_name, 'place_id': place_id, 'name' :name})

In [368]:
maindf = df.reset_index().merge(df1, how = 'left', on= 'restaurants').set_index('index')

In [370]:
maindf.to_csv('merged.csv')

In [369]:
maindf.head()

Unnamed: 0_level_0,urls,restaurants,stars,status,address,place_id,name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-03-08T18:31:52+0000,https://www.nytimes.com/2021/03/08/dining/whea...,wheated pizza washington squares,0,OPERATIONAL,"905 Church Ave, Brooklyn, NY 11218, United States",ChIJIYCk0DpbwokRZaTcJmohk1Y,Wheated
2021-03-01T18:03:31+0000,https://www.nytimes.com/2021/03/01/dining/winn...,winner,0,OPERATIONAL,"367 7th Ave, Brooklyn, NY 11215, United States",ChIJg8UZJyNbwokR0-LkfR8MPJQ,WINNER
2021-02-12T18:36:55+0000,https://www.nytimes.com/2021/02/12/dining/fala...,falansai vietnamese food,0,OPERATIONAL,"112 Harrison Pl, Brooklyn, NY 11237, United St...",ChIJc11gFQJcwokRcOejfRbBPpo,Falansai
2020-12-01T17:44:20+0000,https://www.nytimes.com/2020/12/01/dining/ayat...,ayat palestinian food,0,OPERATIONAL,"8504 3rd Ave, Brooklyn, NY 11209, United States",ChIJBSKRV89PwokRcofsyKk26CM,Ayat
2020-11-23T18:00:37+0000,https://www.nytimes.com/2020/11/23/dining/bila...,bilao,0,OPERATIONAL,"1437 1st Avenue Store 1, New York, NY 10021, U...",ChIJierqy1ZZwokR9eCcG1n0hXU,BILAO


In [348]:
maindf.dropna(inplace = True)

In [339]:
import matplotlib.pyplot as plt
import seaborn as sns

In [349]:
maindf.stars.value_counts()

2    156
0    100
1     94
3     43
4      2
Name: stars, dtype: int64

In [350]:
maindf.status.value_counts()

OPERATIONAL           246
CLOSED_PERMANENTLY     95
CLOSED_TEMPORARILY     54
Name: status, dtype: int64