In [1]:
import json
import numpy as np
import pandas as pd
import requests
import time

In [2]:
from bs4 import BeautifulSoup

In [34]:
# Read both key files eagerly and close them immediately. `key` and
# `goog_key` are now lists of lines rather than open file handles, so the
# later `list(key)` / `list(goog_key)` calls give the same result and can be
# re-run safely (an open file is exhausted after its first iteration).
with open('APIkeys.txt', 'r') as key_file:
    key = key_file.readlines()
with open('google_places_api_key.txt', 'r') as goog_key_file:
    goog_key = goog_key_file.readlines()

In [35]:
# Materialise the lines of the NYTimes key file as a list.
keys = [line for line in key]

In [36]:
# The NYTimes key is the third whitespace-separated token on the first line.
api_key = keys[0].split()[2]
# The Google key follows an 11-character prefix on the first line. Strip only
# the line terminator instead of blindly dropping the last character, so a
# file without a trailing newline does not lose the final character of the key.
goog_api_key = list(goog_key)[0][11:].rstrip('\r\n')

In [188]:
# Sanity check: URL of the first hit on result page 2.
# NOTE: nytimes_query must already be defined in the kernel when this runs.
nytimes_query(
    api_key,
    'Pete Wells',
    news_desk='Dining',
    type_of_material='Review',
    n_page=2,
)[0]['web_url']

'https://www.nytimes.com/2012/05/09/dining/reviews/frej-brooklyn-restaurant-review.html'

In [5]:
def nytimes_query(api_key, query,
                  glocations = None, headline = None, news_desk = None,
                  organizations = None, persons = None, byline = None,
                  subject = None, news_type = None, type_of_material = None,
                  begin_date = None, end_date = None, n_page = 0):
    '''
    Query the NYTimes Article Search API and return the matching documents.
    ----
    Arguments:
    api_key -- a user-specific key registered at developer.nytimes.com
    query -- the search terms used for the query

    filters:
        each of the following is None by default. Every value that is supplied
        is added to the `fq` filter as `field:(value)`; several filters are
        combined with AND.

        glocations -- geographic locations
        headline -- literal headline
        news_desk -- by news desk using the following values:
            Adventure Sports, Arts & Leisure, Arts, Automobiles, Blogs, Books,
            Booming, Business Day, Business, Cars, Circuits, Classifieds,
            Connecticut, Crosswords & Games, Culture, DealBook, Dining,
            Editorial, Education, Energy, Entrepreneurs, Environment, Escapes,
            Fashion & Style, Fashion, Favorites, Financial, Flight, Food,
            Foreign, Generations, Giving, Global Home, Health & Fitness,
            Health, Home & Garden, Home, Jobs, Key, Letters, Long Island,
            Magazine, Market Place, Media, Men's Health, Metro, Metropolitan,
            Movies, Museums, National, Nesting, Obits, Obituaries, Obituary,
            OpEd, Opinion, Outlook, Personal Investing, Personal Tech, Play,
            Politics, Regionals, Retail, Retirement, Science, Small Business,
            Society, Sports, Style, Sunday Business, Sunday Review,
            Sunday Styles, T Magazine, T Style, Technology, Teens, Television,
            The Arts, The Business of Green, The City Desk, The City,
            The Marathon, The Millennium, The Natural World, The Upshot,
            The Weekend, The Year in Pictures, Theater, Then & Now,
            Thursday Styles, Times Topics, Travel, U.S., Universal, Upshot,
            UrbanEye, Vacation, Washington, Wealth, Weather, Week in Review,
            Week, Weekend, Westchester, Wireless Living, Women's Health,
            Working, Workplace, World, Your Money
        organizations -- filter by organizations
        persons -- filter by persons referenced
        byline -- filter by writer or several other parameters- see developer.nytimes.com
        subject -- by subject
        news_type -- filter by news type
        type_of_material -- document type- sample values below:
            Addendum, An Analysis, An Appraisal, Archives, Article, Banner,
            Biography, Birth Notice, Blog, Brief, Caption, Chronology, Column,
            Correction, Economic Analysis, Editorial, Editorial Cartoon,
            Editors' Note, First Chapter, Front Page, Glossary,
            Interactive Feature, Interactive Graphic, Interview, Letter, List,
            Marriage Announcement, Military Analysis, News, News Analysis,
            Newsletter, Obituary, Obituary (Obit), Op-Ed, Paid Death Notice,
            Postscript, Premium, Question, Quote, Recipe, Review, Schedule,
            SectionFront, Series, Slideshow, Special Report, Statistics,
            Summary, Text, Video, Web Log

    date params:
        begin_date -- beginning date of filter
        end_date -- end date of filter
        Each date is sent whenever it is given, with or without filters.
    n_page -- the selected page of returns- for large queries this will select sections to return

    Returns:
    The value stored under ['response']['docs'] in the parsed JSON reply
    (the metadata around it is dropped).

    Raises:
    requests.HTTPError -- if the API answers with an error status code
    '''
    # Candidate filters, in the order they are joined into the `fq` expression.
    filter_values = {
        'glocations': glocations,
        'headline': headline,
        'news_desk': news_desk,
        'organizations': organizations,
        'persons': persons,
        'subject': subject,
        'byline': byline,
        'news_type': news_type,
        'type_of_material': type_of_material,
    }
    fq_terms = [f'{field}:({value})'
                for field, value in filter_values.items() if value]

    # Hand the parameters to requests as a dict so that spaces, '&' and other
    # reserved characters in the query or filter values are URL-encoded
    # instead of corrupting the query string.
    params = {'q': query}
    if fq_terms:
        params['fq'] = ' AND '.join(fq_terms)
    # Dates are independent of the filters: send whichever ones were given.
    if begin_date:
        params['begin_date'] = begin_date
    if end_date:
        params['end_date'] = end_date
    params['page'] = n_page
    params['api-key'] = api_key

    r = requests.get('https://api.nytimes.com/svc/search/v2/articlesearch.json',
                     params = params)
    # Fail with the HTTP error itself rather than a KeyError on 'response'.
    r.raise_for_status()
    return r.json()['response']['docs']

In [92]:
def _review_slug(web_url):
    '''Turn a review URL into a space-separated name taken from its path.'''
    pieces = web_url.split('dining/')
    if len(pieces) > 1:
        # Same segment the notebook always used for dining-section URLs.
        slug = pieces[1]
    else:
        # URLs outside the dining section: fall back to the last path segment
        # instead of raising IndexError and losing every page fetched so far.
        slug = web_url.rstrip('/').rsplit('/', 1)[-1]
    # Only drop the extension when it is really there.
    if slug.endswith('.html'):
        slug = slug[:-5]
    return slug.replace('-', ' ').replace('reviews/', '')


def review_url_names(api_key, query, n_pages_min, n_pages_max, pause_seconds = 6, **kwargs):
    '''
    Collect URLs, URL-derived names and publication dates over several result pages.
    ----
    Arguments:
    api_key -- key passed through to nytimes_query
    query -- search terms passed through to nytimes_query
    n_pages_min -- first result page to request (inclusive)
    n_pages_max -- page at which to stop (exclusive)
    pause_seconds -- seconds to wait between two page requests (default 6);
        a falsy value disables the wait
    **kwargs -- any further keyword arguments are forwarded to nytimes_query

    Returns:
    (urls, names, dates) -- three parallel lists holding each document's
    'web_url', the name derived from that URL, and its 'pub_date'
    '''
    urls = []
    names = []
    dates = []
    for page in range(n_pages_min, n_pages_max):
        docs = nytimes_query(api_key, query, n_page = page, **kwargs)
        for doc in docs:
            web_url = doc['web_url']
            urls.append(web_url)
            names.append(_review_slug(web_url))
            dates.append(doc['pub_date'])
        # Pause between requests only; waiting after the last page is wasted time.
        if pause_seconds and page < n_pages_max - 1:
            time.sleep(pause_seconds)
    return urls, names, dates

In [93]:
# Pull result pages 0-39 of Pete Wells' dining reviews.
urls, restaurants_100, dates = review_url_names(
    api_key,
    'Pete Wells',
    n_pages_min=0,
    n_pages_max=40,
    news_desk='Dining',
    type_of_material='Review',
)

In [79]:
# Experiment: loop over the filter arguments instead of writing one `if` per
# argument, as a way to shorten the main query function.

def test_function(glocations = None, headline = None, news_desk = None,
                  organizations = None, persons = None, byline = None,
                  subject = None, news_type = None, type_of_material = None,
                  begin_date = None, end_date = None):
    '''Print every filter argument that was given a truthy value.

    begin_date and end_date are accepted but not inspected.
    '''
    candidates = (glocations, headline, news_desk,
                  organizations, persons, byline,
                  subject, news_type, type_of_material)
    supplied = [value for value in candidates if value]
    for value in supplied:
        print(value)


In [None]:
Re-run the function with glocations set to New York City? Filter out national reviews, "29 and under" pieces, and briefs. What about Critic's Notebook pieces?

Try using the Google Maps API or web scraping to check whether restaurants have closed.

In [99]:
# Start an empty frame indexed by the publication-date strings.
df = pd.DataFrame(index=pd.Index(dates))

In [100]:
# Attach the review URLs as a column.
df = df.assign(urls=urls)

In [102]:
# Attach the URL-derived restaurant names as a column.
df = df.assign(restaurants=restaurants_100)

In [103]:
# Preview the first five rows before cleaning the names.
df.head(n=5)

Unnamed: 0,urls,restaurants
2021-03-01T18:03:31+0000,https://www.nytimes.com/2021/03/01/dining/winn...,winner restaurant review
2021-02-12T18:36:55+0000,https://www.nytimes.com/2021/02/12/dining/fala...,falansai review vietnamese food
2020-12-01T17:44:20+0000,https://www.nytimes.com/2020/12/01/dining/ayat...,ayat review palestinian food
2020-11-23T18:00:37+0000,https://www.nytimes.com/2020/11/23/dining/bila...,bilao review
2020-11-10T17:12:25+0000,https://www.nytimes.com/2020/11/10/dining/silv...,silver apricot review


In [112]:
# Phrases that come from the URL slug but are not part of the restaurant name.
_NOISE_PHRASES = (' review', 'pete wells', 'restaurant', 'on the lower east side')


def _clean_restaurant_name(name):
    '''Remove the noise phrases from a name and tidy the whitespace left behind.'''
    for phrase in _NOISE_PHRASES:
        name = name.replace(phrase, '')
    # Removing a phrase from the middle or end leaves doubled or trailing
    # spaces (e.g. 'winner restaurant review' -> 'winner '); normalise them.
    return ' '.join(name.split())


df['restaurants'] = df['restaurants'].apply(_clean_restaurant_name)

In [113]:
# Preview the first five rows after cleaning the names.
df.head(n=5)

Unnamed: 0,urls,restaurants
2021-03-01T18:03:31+0000,https://www.nytimes.com/2021/03/01/dining/winn...,winner
2021-02-12T18:36:55+0000,https://www.nytimes.com/2021/02/12/dining/fala...,falansai vietnamese food
2020-12-01T17:44:20+0000,https://www.nytimes.com/2020/12/01/dining/ayat...,ayat palestinian food
2020-11-23T18:00:37+0000,https://www.nytimes.com/2020/11/23/dining/bila...,bilao
2020-11-10T17:12:25+0000,https://www.nytimes.com/2020/11/10/dining/silv...,silver apricot


In [114]:
# Inspect the last five rows.
df.tail(n=5)

Unnamed: 0,urls,restaurants
2012-08-14T20:49:59+0000,https://www.nytimes.com/2012/08/15/dining/revi...,biang nyc
2012-07-10T20:22:27+0000,https://www.nytimes.com/2012/07/11/dining/revi...,almayass in manhattan
2014-12-09T17:51:00+0000,https://www.nytimes.com/2014/12/10/dining/rest...,dirty french
2014-08-26T18:39:02+0000,https://www.nytimes.com/2014/08/27/dining/rest...,batard in tribeca
2009-10-06T20:55:17+0000,https://www.nytimes.com/2009/10/07/dining/revi...,07rest


In [58]:
# Persist the frame; the date index is written as the (unnamed) first column.
df.to_csv('restaurants.csv', index=True)

In [4]:
# Reload the saved reviews. The former date index comes back as an ordinary
# column named 'Unnamed: 0', and rows get a default integer index.
df = pd.read_csv(filepath_or_buffer='restaurants.csv')

In [18]:
# Look at the URL stored in the row labelled 30.
df.loc[30, 'urls']

'https://www.nytimes.com/2012/03/07/dining/reviews/pok-pok-wing-on-the-lower-east-side.html'

In [19]:
# Fetch one review page and locate the element carrying the star rating.
page = requests.get(
    'https://www.nytimes.com/2019/03/05/dining/madame-vo-bbq-review.html'
)
soup = BeautifulSoup(markup=page.content, features='html.parser')
l = soup.find(attrs={'class': 'css-z4hz5'})

In [21]:
# Text content of the star element found above.
l.get_text()

'★'

In [24]:
# Scrape the star element from every review page; reviews without one get NaN.
stars = []
for review_url in df.urls:
    page = requests.get(review_url)
    soup = BeautifulSoup(page.content, 'html.parser')
    star_tag = soup.find(class_="css-z4hz5")
    # Use np.nan: the capitalised alias np.NaN was removed in NumPy 2.0.
    stars.append(star_tag.text if star_tag else np.nan)
    

In [35]:
# Attach the scraped star strings as a column.
df = df.assign(stars=stars)

In [49]:
def _star_count(value):
    '''Convert a scraped star string to a count; missing values count as 0.'''
    if isinstance(value, str):
        # One character per star.
        return len(value)
    if pd.isna(value):
        return 0
    # Already numeric (the cell was run before): keep the count instead of
    # resetting it to 0, so re-running this cell is harmless.
    return int(value)


df['stars'] = df['stars'].map(_star_count)

In [51]:
# Show the first fifty rows by position.
df.iloc[:50]

Unnamed: 0.1,Unnamed: 0,urls,restaurants,stars
0,2021-03-01T18:03:31+0000,https://www.nytimes.com/2021/03/01/dining/winn...,winner,0
1,2021-02-12T18:36:55+0000,https://www.nytimes.com/2021/02/12/dining/fala...,falansai vietnamese food,0
2,2020-12-01T17:44:20+0000,https://www.nytimes.com/2020/12/01/dining/ayat...,ayat palestinian food,0
3,2020-11-23T18:00:37+0000,https://www.nytimes.com/2020/11/23/dining/bila...,bilao,0
4,2020-11-10T17:12:25+0000,https://www.nytimes.com/2020/11/10/dining/silv...,silver apricot,0
5,2020-10-27T16:30:08+0000,https://www.nytimes.com/2020/10/27/dining/moky...,mokyo,0
6,2020-10-20T16:13:52+0000,https://www.nytimes.com/2020/10/20/dining/pata...,pata paplean queens,0
7,2020-10-06T15:54:22+0000,https://www.nytimes.com/2020/10/06/dining/koko...,kokomo brooklyn,0
8,2020-09-21T19:30:55+0000,https://www.nytimes.com/2020/09/21/dining/rang...,rangoon burmese food nyc,0
9,2019-10-29T14:55:49+0000,https://www.nytimes.com/2019/10/29/dining/pete...,peter luger,0


In [57]:
# Names that still contain 'brief', i.e. dining briefs rather than reviews.
df['restaurants'][df['restaurants'].map(lambda name: 'brief' in name)].tolist()

['17brief 001',
 '05brief 001',
 '17dinbriefs 001',
 '23dinbriefs',
 '31brief 002',
 '19dinbriefs 2',
 '30brief 001',
 '17brief 001',
 '01dinbriefs 2',
 '26brief']

In [70]:
# Names that contain '29'.
df['restaurants'][df['restaurants'].map(lambda name: '29' in name)].tolist()

['29cheap']