## Fetch Data - New York Times - Article Search API - Sports

In [1]:
import requests
import os
import json
import time
from datetime import datetime

#### Confirm key

In [2]:
# Read the NYT API key from the environment so it never has to be hard-coded.
nyt_api_key = os.getenv('NYT_API_KEY')
articlesearch_url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'
# Confirm that a key was found WITHOUT echoing the secret itself: cell output
# is stored inside the notebook file and travels with it when it is shared.
print('Value of the environment variable for api-key: ', '<set>' if nyt_api_key else '<missing>')

Value of the environment variable for api-key:  [REDACTED]


### Define all the functions needed to carry out the download process dynamically

In [3]:
def createdir(filename):
    """Create the parent directory of ``filename`` if it does not exist yet.

    Args:
        filename: Path of a file that is about to be written. Only its
            directory part (``os.path.dirname``) is created; the file itself
            is not touched.

    A bare file name with no directory part is a no-op, because there is
    nothing to create and ``os.makedirs('')`` would raise.
    """
    dirname = os.path.dirname(filename)
    if dirname:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test and makes repeated calls harmless.
        os.makedirs(dirname, exist_ok=True)

In [4]:
def saveDocs(folderName, page, docs, base_dir='../Data/NYT/Sports/'):
    """Write each document to its own JSON file, grouped by publication month.

    Files are written to ``<base_dir>/<folderName>/<Mon>/<_id>.json`` where
    ``<Mon>`` is the abbreviated month name (``%b``) of the document's
    ``pub_date``.

    Args:
        folderName: Sub-folder under ``base_dir`` for this batch of documents.
        page: Page number the documents came from. Currently unused; kept so
            existing callers that pass it keep working.
        docs: Iterable of dicts. Each must provide ``'pub_date'`` (a string
            whose first 19 characters match ``%Y-%m-%dT%H:%M:%S``) and
            ``'_id'`` (a string used as the file name).
        base_dir: Root output directory. Defaults to the location the
            notebook has always written to.

    Raises:
        KeyError: If a document lacks ``'pub_date'`` or ``'_id'``.
        ValueError: If ``pub_date`` does not start with the expected timestamp.
    """
    # os.path.join(base_dir, '') appends a separator only when one is missing,
    # so the default root stays byte-identical to the original hard-coded path.
    root = os.path.join(base_dir, '')
    for doc in docs:
        # Only the leading "YYYY-MM-DDTHH:MM:SS" part is parsed; anything
        # after it (e.g. a timezone suffix) is ignored.
        pub_datetime = datetime.strptime(doc['pub_date'][0:19], "%Y-%m-%dT%H:%M:%S")
        month_name = pub_datetime.strftime("%b")

        finalfilepath = root + folderName + '/' + month_name + '/' + doc['_id'] + '.json'
        # exist_ok makes directory creation race-free and repeatable.
        os.makedirs(os.path.dirname(finalfilepath), exist_ok=True)
        with open(finalfilepath, 'w', encoding='utf-8') as open_file:
            json.dump(doc, open_file)

In [5]:
# This is the primary function that starts the whole process
def downloadData(url, reqparams, destFolder):
    """Download every result page for one query into ``destFolder``.

    Args:
        url: Search endpoint to query.
        reqparams: Query parameters sent with each request.
        destFolder: Folder name the downloaded documents are saved under.
    """
    # The first request saves page 0 and reports how many further pages exist.
    pages_left = downloadFirstPage(url, reqparams, destFolder)
    downloadRemainingPages(url, reqparams, pages_left, destFolder)

In [6]:
# First request
def downloadFirstPage(url, reqparams, destFolder):
    """Fetch page 0 of the results, save it, and count the pages still to fetch.

    Args:
        url: Endpoint passed to ``requests.get``.
        reqparams: Query parameters for the request (not modified here).
        destFolder: Folder name forwarded to ``saveDocs``.

    Returns:
        int: Number of pages left after page 0, computed from the response's
        ``hits`` total at 10 documents per page. Returns 0 when there are no
        hits or when the request did not come back with HTTP 200.
    """
    # Safe default: a failed request must not leave this name unbound, which
    # previously surfaced as an UnboundLocalError instead of the real error.
    remaining_pages = 0
    r = requests.get(url, params=reqparams)
    if r.status_code == 200:
        jsonObj = json.loads(r.text)
        # save docs from current request
        saveDocs(destFolder, page=0, docs=jsonObj['response']['docs'])
        # get total hits to prepare for dynamically getting remaining docs
        hits = jsonObj['response']['meta']['hits']
        # ceil(hits / 10) - 1 in pure integer arithmetic, clamped so that
        # zero hits yields 0 rather than -1.
        remaining_pages = max(0, (hits - 1) // 10)
    else:
        print('Error downloading first page', r.text)
    print('Remaining pages:', remaining_pages)
    return remaining_pages
    

In [7]:
# Download documents from remaining_pages
def downloadRemainingPages(url, reqparams, remaining_pages, destFolder, delay=1):
    """Download and save result pages ``1 .. remaining_pages``.

    Args:
        url: Endpoint passed to ``requests.get``.
        reqparams: Base query parameters. The dict is copied, so the caller's
            object is not modified (no ``'page'`` key is left behind).
        remaining_pages: Index of the last page to fetch; 0 fetches nothing.
        destFolder: Folder name forwarded to ``saveDocs``.
        delay: Seconds to pause before each request to respect the API rate
            limit. Values <= 0 disable the pause.

    Returns:
        list[int]: Page numbers whose request did not return HTTP 200, so the
        caller can retry them. Empty when every page succeeded.
    """
    page_params = dict(reqparams)  # work on a copy; never mutate the caller's dict
    failed_pages = []
    for page_number in range(1, remaining_pages + 1):
        # Throttle every request. The old ``if page_number % 5:`` test skipped
        # the pause on exactly every fifth page, so those requests went out
        # back-to-back with the previous one.
        if delay > 0:
            time.sleep(delay)
        page_params['page'] = page_number
        r = requests.get(url, params=page_params)
        if r.status_code == 200:
            jsonObj = json.loads(r.text)
            # save docs from current request
            saveDocs(destFolder, page=page_number, docs=jsonObj['response']['docs'])
        else:
            print('Error downloading page:', page_number, r.text)
            failed_pages.append(page_number)
    return failed_pages




In [8]:
# Request Map for Celtics
# Search all of 2016 for articles mentioning the Boston Celtics.
reqparams = {
    'api-key': nyt_api_key,
    'q': 'Boston Celtics',
    'begin_date': '20160101',
    'end_date': '20161231',
}
downloadData(url=articlesearch_url, reqparams=reqparams, destFolder='Boston Celtics')

Remaining pages: 48
Requesting info for page: 1
Got results for page: 1.0
Requesting info for page: 2
Got results for page: 2.0
Requesting info for page: 3
Got results for page: 3.0
Requesting info for page: 4
Got results for page: 4.0
Requesting info for page: 5
Got results for page: 5.0
Requesting info for page: 6
Got results for page: 6.0
Requesting info for page: 7
Got results for page: 7.0
Requesting info for page: 8
Got results for page: 8.0
Requesting info for page: 9
Got results for page: 9.0
Requesting info for page: 10
Got results for page: 10.0
Requesting info for page: 11
Got results for page: 11.0
Requesting info for page: 12
Got results for page: 12.0
Requesting info for page: 13
Got results for page: 13.0
Requesting info for page: 14
Got results for page: 14.0
Requesting info for page: 15
Requesting info for page: 16
Got results for page: 16.0
Requesting info for page: 17
Got results for page: 17.0
Requesting info for page: 18
Got results for page: 18.0
Requesting info f

In [9]:
# Request Map for Patriots
# Search all of 2016 for articles mentioning the New England Patriots.
reqparams = {
    'api-key': nyt_api_key,
    'q': 'New England Patriots',
    'begin_date': '20160101',
    'end_date': '20161231',
}
downloadData(url=articlesearch_url, reqparams=reqparams, destFolder='New England Patriots')

Remaining pages: 85
Requesting info for page: 1
Got results for page: 1.0
Requesting info for page: 2
Got results for page: 2.0
Requesting info for page: 3
Got results for page: 3.0
Requesting info for page: 4
Got results for page: 4.0
Requesting info for page: 5
Requesting info for page: 6
Got results for page: 6.0
Requesting info for page: 7
Got results for page: 7.0
Requesting info for page: 8
Got results for page: 8.0
Requesting info for page: 9
Got results for page: 9.0
Requesting info for page: 10
Got results for page: 10.0
Requesting info for page: 11
Got results for page: 11.0
Requesting info for page: 12
Got results for page: 12.0
Requesting info for page: 13
Got results for page: 13.0
Requesting info for page: 14
Got results for page: 14.0
Requesting info for page: 15
Requesting info for page: 16
Got results for page: 16.0
Requesting info for page: 17
Got results for page: 17.0
Requesting info for page: 18
Got results for page: 18.0
Requesting info for page: 19
Got results fo