## Fetch Data - New York Times - Article Search API - Sports

In [None]:
import requests
import os
import json
import time
from datetime import datetime

#### Confirm key

In [None]:
# The NYT API key is read from the environment so it is never stored in the notebook.
nyt_api_key = os.getenv('NYT_API_KEY')
articlesearch_url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'
# Confirm that the key is available without echoing the secret itself into
# the cell output, which would be saved (and possibly committed) with the notebook.
print('Value of the environment variable for api-key: ', '<set>' if nyt_api_key else '<NOT SET>')

### Define all the functions needed to carry out the download process dynamically

In [None]:
def createdir(filename):
    """Create the directory that will contain ``filename`` if it does not exist.

    Args:
        filename: Path of a file that is about to be written. Only the
            directory part of the path is created; the file is not touched.
    """
    dirname = os.path.dirname(filename)
    # A bare file name has no directory part, and os.makedirs('') raises.
    if dirname:
        # exist_ok=True removes the gap between an exists() check and the
        # makedirs() call, so a directory created in between is not an error.
        os.makedirs(dirname, exist_ok=True)

In [None]:
def saveDocs(folderName, page, docs):
    """Write every document in ``docs`` to its own JSON file.

    Each file is stored at
    ``../Data/NYT/Sports/<folderName>/<month>/<_id>.json``, where ``<month>``
    is the abbreviated month name (``%b``) of the document's ``pub_date``.

    Args:
        folderName: Sub-folder under ``../Data/NYT/Sports/``.
        page: Page number the documents came from (accepted but not used).
        docs: Iterable of document dicts carrying ``pub_date`` and ``_id``.
    """
    for article in docs:
        # Only the leading "YYYY-MM-DDTHH:MM:SS" part of pub_date is parsed;
        # whatever follows the first 19 characters is ignored.
        published = datetime.strptime(article['pub_date'][0:19], "%Y-%m-%dT%H:%M:%S")
        target = '/'.join([
            '../Data/NYT/Sports',
            folderName,
            published.strftime("%b"),
            article['_id'] + '.json',
        ])
        createdir(target)
        with open(target, 'w') as out:
            json.dump(article, out)

In [None]:
# This is the primary function that starts the whole process
def downloadData(url, reqparams, destFolder):
    """Download all result pages for one query into ``destFolder``.

    The first page is fetched on its own because its response tells how
    many further pages there are; those are then fetched in a second step.
    """
    pages_left = downloadFirstPage(url, reqparams, destFolder)
    downloadRemainingPages(url, reqparams, pages_left, destFolder)

In [None]:
# First request
def downloadFirstPage(url, reqparams, destFolder):
    """Fetch the first result page, save its docs and count the pages left.

    Args:
        url: Search endpoint to query.
        reqparams: Query parameters sent with the request.
        destFolder: Sub-folder name handed on to ``saveDocs``.

    Returns:
        The number of further pages (10 hits per page) still to download
        after this one. 0 is returned when the request does not succeed or
        when there are no further hits.
    """
    r = requests.get(url, params=reqparams)
    if r.status_code == 200:
        jsonObj = json.loads(r.text)
        # save docs from current request
        saveDocs(destFolder, page=0, docs=jsonObj['response']['docs'])
        # get total hits to prepare for dynamically getting remaining docs
        hits = jsonObj['response']['meta']['hits']
        # Pages hold 10 hits and page 0 is already done, i.e. ceil(hits/10) - 1.
        # Floored at 0 so that a query with zero hits does not report -1.
        remaining_pages = max(0, (hits - 1) // 10)
    else:
        print('Error downloading first page', r.text)
        # Without a usable response there is nothing to page through. Setting
        # this explicitly avoids an UnboundLocalError on the lines below.
        remaining_pages = 0
    print('Remaining pages:', remaining_pages)
    return remaining_pages
    

In [None]:
# Download documents from remaining_pages
def downloadRemainingPages(url, reqparams, remaining_pages, destFolder,
                           pause_every=5, pause_seconds=2):
    """Download and save pages 1..``remaining_pages`` of a query.

    Args:
        url: Search endpoint to query.
        reqparams: Base query parameters. The dict is copied, so the caller's
            object is not modified.
        remaining_pages: Number of pages to fetch after page 0. Values of 0
            or less make this a no-op.
        destFolder: Sub-folder name handed on to ``saveDocs``.
        pause_every: Pause once per this many requests. A falsy value
            disables pausing.
        pause_seconds: Length of each pause in seconds.

    A page that does not come back with status 200 is reported and skipped;
    the remaining pages are still attempted.
    """
    # Work on a copy so the caller's dict is not left with a stale 'page' entry.
    page_params = dict(reqparams)
    for page_number in range(1, remaining_pages + 1):
        # page_number requests have been sent so far (page 0 included), so this
        # pauses after every `pause_every`-th request. The previous test,
        # `page_number % 5`, was inverted and slept on every other request.
        # NOTE(review): confirm 5 requests / 2 s against the current API limits.
        if pause_every and page_number % pause_every == 0:
            time.sleep(pause_seconds)
        page_params['page'] = page_number
        r = requests.get(url, params=page_params)
        if r.status_code == 200:
            jsonObj = json.loads(r.text)
            # save docs from current request
            saveDocs(destFolder, page=page_number, docs=jsonObj['response']['docs'])
        else:
            print('Error downloading page:', page_number, r.text)




In [None]:
# Request Map for Celtics
# Search for "Boston Celtics" articles published during calendar year 2016.
reqparams = {
    'api-key': nyt_api_key,
    'q': 'Boston Celtics',
    'begin_date': '20160101',
    'end_date': '20161231',
}
downloadData(articlesearch_url, reqparams, 'Boston Celtics')

In [None]:
# Request Map for Patriots
# Search for "New England Patriots" articles published during calendar year 2016.
reqparams = {
    'api-key': nyt_api_key,
    'q': 'New England Patriots',
    'begin_date': '20160101',
    'end_date': '20161231',
}
downloadData(articlesearch_url, reqparams, 'New England Patriots')