## NYTimes API
- This notebook contains the code for retrieving and parsing New York Times archive headlines over a period of time and exporting them as a CSV file.
- The code is adapted and modified from [Brienna Herold](https://brienna.medium.com/)'s amazing [article](https://towardsdatascience.com/collecting-data-from-the-new-york-times-over-any-period-of-time-3e365504004).

In [1]:
import requests
from pprint import pprint as pp
import os
import json
import time
import datetime
import dateutil
import pandas as pd
from dateutil.relativedelta import relativedelta

In [7]:
# Optional proxy configuration, only needed when running behind a proxy.
# Left commented out; send_request() currently passes proxies=None.
# proxies = {
#    'http': os.environ["http_proxy"],
#    'https': os.environ["https_proxy"],
# }

# NYT Developer API key, read from the NYT_dev_API environment variable
# (raises KeyError if the variable is not set).
API = os.environ["NYT_dev_API"]

In [108]:
# modified from https://gist.github.com/brienna/bbb381e84649a55ce1c8647665943e3b

def send_request(date):
    '''Sends a request to the NYT Archive API for given year-month.

    Parameters
    ----------
    date : sequence of str
        date[0] is the year and date[1] is the month, both as strings
        (e.g. ['2021', '6']).

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a non-success status code (bad key, rate
        limit, ...). The message contains a redacted URL so the API key does
        not end up in notebook output.
    requests.Timeout
        If the server stops responding.
    '''
    base_url = 'https://api.nytimes.com/svc/archive/v1'
    # date[0] -> year | date[1] -> month
    endpoint = base_url + '/' + date[0] + '/' + date[1] + '.json'
    # Never print the real key: notebook outputs get saved and shared.
    redacted_url = endpoint + '?api-key=$API_KEY'
    print(f"Retrieving response from {redacted_url}")

    reply = requests.get(
        endpoint,
        params={'api-key': API},
        proxies=None,  # set proxy if needed
        timeout=60,    # without a timeout a stalled connection blocks forever
    )
    if not reply.ok:
        # Fail here with a clear message instead of letting an error payload
        # reach the parser. raise_for_status() is avoided on purpose because
        # its message embeds the full URL, including the API key.
        raise requests.HTTPError(
            f"NYT Archive API returned HTTP {reply.status_code} for {redacted_url}",
            response=reply,
        )
    response = reply.json()

    # we need to make sure we don't exceed the limit of 10 requests/min
    #   (note that there's also 4000 requests / day limit)
    print("Sleep for 20 secs...")
    time.sleep(20)
    print("Resuming parsing...")
    return response

def is_valid(article, date):
    '''An article is only worth checking if it is in range, and has a headline.

    Parameters
    ----------
    article : dict
        One article record from the Archive API response.
    date : datetime.date
        Publication date of the article.

    Returns
    -------
    bool
        True when `date` lies strictly between the module-level `start` and
        `end` dates and the article has a dict headline with a 'main' entry.
    '''
    # `start` and `end` are globals defined in the date-range cell; both
    # bounds are exclusive.
    is_in_range = start < date < end
    # .get() so that a record without any 'headline' key is rejected instead
    # of raising KeyError.
    headline = article.get('headline')
    has_headline = isinstance(headline, dict) and 'main' in headline
    return is_in_range and has_headline

def parse_response(response):
    '''Parses and returns response as pandas data frame.

    Parameters
    ----------
    response : dict
        Decoded JSON from the Archive API; the articles are read from
        response['response']['docs'].

    Returns
    -------
    pandas.DataFrame
        One row per article accepted by is_valid(), with the columns
        headline, date, doc_type, material_type, section and keywords
        (keywords is the list of 'subject' keyword values). Optional fields
        that an article does not carry are stored as None.
    '''
    # Imported explicitly: a bare `import dateutil` does not guarantee that
    # the `dateutil.parser` submodule has been loaded.
    import dateutil.parser

    print("Parsing response...")
    data = {'headline': [],
        'date': [],
        'doc_type': [],
        'material_type': [],
        'section': [],
        'keywords': []}

    for article in response['response']['docs']:
        date = dateutil.parser.parse(article['pub_date']).date()
        # Skip anything outside our date range or without a usable headline.
        if not is_valid(article, date):
            continue
        data['date'].append(date)
        data['headline'].append(article['headline']['main'])
        # The field is called 'section_name'; look up the same key that is
        # read (the old check tested for a different key, 'section').
        data['section'].append(article.get('section_name'))
        data['doc_type'].append(article.get('document_type'))
        data['material_type'].append(article.get('type_of_material'))
        # Keep only subject keywords; tolerate a missing/empty keyword list.
        subjects = [keyword['value']
                    for keyword in (article.get('keywords') or [])
                    if keyword.get('name') == 'subject']
        data['keywords'].append(subjects)
    return pd.DataFrame(data)

def get_data(dates, resume_from=None):
    '''Sends and parses request/response to/from NYT Archive API for given dates.

    The accumulated headlines are written to
    "headlines/<first year>-<first month>_<last year>-<last month>_NYtimes_headlines.csv"
    after every month, so an interrupted run keeps what was already fetched.

    Parameters
    ----------
    dates : list of [year, month] string pairs
        Months to download, in order (e.g. [['2021', '6'], ['2021', '7']]).
    resume_from : str, optional
        Path of a CSV written by an earlier run. When given, its rows are
        placed in front of the newly downloaded ones. By default nothing is
        preloaded, so the function works on a fresh checkout.

    Returns
    -------
    None
    '''
    if not dates:
        # Nothing to fetch; avoids an IndexError on dates[0] below.
        print('No dates given, nothing to collect.')
        return

    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    os.makedirs('headlines', exist_ok=True)
    out_path = ('headlines/' + dates[0][0] + '-' + dates[0][1] + '_'
                + dates[-1][0] + '-' + dates[-1][1] + '_NYtimes_headlines.csv')

    frames = []
    if resume_from is not None:
        frames.append(pd.read_csv(resume_from))

    for date in dates:
        response = send_request(date)
        frames.append(parse_response(response))
        print("Concatenating headlines...")
        df_headlines = pd.concat(frames)
        print(f"Headlines retrieved for {date[0]}/{date[1]}.")
        # Checkpoint after every month: each request costs a 20 s pause, so
        # losing progress on a late failure would be expensive.
        print('Saving current data to "' + out_path + '"...')
        df_headlines.to_csv(out_path, index=False)
        print()
    print('Number of articles collected: ' + str(len(df_headlines)))

In [92]:
# Collection window. `start` and `end` are also read as globals by is_valid(),
# which keeps only articles dated strictly between the two.
end = datetime.date(2022,8,1)
start = end - relativedelta(years=3)

# One [year, month] pair of strings per month start in the window, month
# without zero padding (e.g. ['2021', '6']). Built from .year/.month rather
# than strftime("%-m"), because the "-" flag is platform-specific and is not
# supported on Windows.
months_in_range = [[str(month.year), str(month.month)]
                   for month in pd.date_range(start, end, freq='MS')]

In [107]:
# Long-running: send_request() sleeps 20 seconds after every monthly request.
get_data(months_in_range)

Date range: ['2021', '6'] to ['2022', '8']
Retrieving response from https://api.nytimes.com/svc/archive/v1/2021/6.json?api-key=$API_KEY
Sleep for 20 secs...
Resuming parsing...
Parsing response...
Concatenating headlines...
Headlines retrieved for 2021/6.

Saving current data to "headlines/2021-6_2022-8_NYtimes_headlines.csv"...
Retrieving response from https://api.nytimes.com/svc/archive/v1/2021/7.json?api-key=$API_KEY
Sleep for 20 secs...
Resuming parsing...
Parsing response...
Concatenating headlines...
Headlines retrieved for 2021/7.

Saving current data to "headlines/2021-6_2022-8_NYtimes_headlines.csv"...
Retrieving response from https://api.nytimes.com/svc/archive/v1/2021/8.json?api-key=$API_KEY
Sleep for 20 secs...
Resuming parsing...
Parsing response...
Concatenating headlines...
Headlines retrieved for 2021/8.

Saving current data to "headlines/2021-6_2022-8_NYtimes_headlines.csv"...
Retrieving response from https://api.nytimes.com/svc/archive/v1/2021/9.json?api-key=$API_KEY
