# Fetch stories by a certain author. 

This uses the [NYT Article Search API](https://developer.nytimes.com/docs/articlesearch-product/1/routes/articlesearch.json/get).

Load dependencies.

In [29]:
import os
import time
import requests
import dateutil
import pandas as pd

# Usernames and passwords
# Credentials are kept out of the notebook in an external config.ini; the NYT
# API key is looked up later as configs['NYT']['ACCESS_KEY'].
import configparser
configs = configparser.ConfigParser()
# read() returns the list of files it parsed, which is what this cell displays.
# NOTE(review): a wrong path is not reported here; it would only surface later
# as a failed configs['NYT'] lookup — confirm the relative path is correct.
configs.read('../../config.ini')

['../../config.ini']

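Define helper functions for sending requests to the API, parsing its responses, and paging through all results of a query.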

In [22]:
def send_request(query, page):
    '''Sends one paginated request to the NYT Article Search API.

    Args:
        query: Filter query sent as the `fq` parameter, e.g.
            'byline:"Jane Doe"'. It is inserted into the URL verbatim.
        page: Index of the results page to fetch (callers start at 0).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    '''
    base_url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'
    # The query is concatenated as-is (exactly as before) because callers may
    # append extra text to it; requests percent-encodes unsafe characters.
    url = base_url + '?fq=' + query
    # The key and page number go through `params` so requests encodes them.
    params = {'api-key': configs['NYT']['ACCESS_KEY'], 'page': page}
    try:
        response = requests.get(url, params=params, timeout=30)
        # Fail here with a clear HTTP error instead of returning an error
        # payload that later breaks on response['response'].
        response.raise_for_status()
        return response.json()
    finally:
        # Pause after every call, successful or not, to space out requests
        # (presumably to stay under the API rate limit).
        time.sleep(6)

def parse_response(response, data):
    '''Appends every article in one API response to the column lists in `data`.

    Args:
        response: Decoded JSON from the Article Search API; articles are read
            from response['response']['docs'].
        data: Dict of lists keyed by 'id', 'date', 'headline', 'section',
            'news_desk', 'doc_type', 'material_type', 'keywords', 'url' and
            'byline'. It is mutated in place; each list gains exactly one
            value per article (None when an optional field is absent).

    Returns:
        None. Results are delivered through `data`.

    Raises:
        KeyError: If an article lacks '_id', 'pub_date', 'headline' or
            'document_type'.
    '''
    # Import the submodule explicitly: a bare `import dateutil` is not
    # guaranteed to make `dateutil.parser` available.
    from dateutil import parser as date_parser

    for article in response['response']['docs']:
        byline = article.get('byline')
        # Build the whole row first so a malformed article cannot leave the
        # column lists with different lengths.
        row = {
            'id': article['_id'],
            'date': date_parser.parse(article['pub_date']).date(),
            'headline': article['headline']['main'],
            'section': article.get('section_name'),
            'news_desk': article.get('news_desk'),
            'doc_type': article['document_type'],
            'material_type': article.get('type_of_material'),
            # Only keywords tagged as subjects are kept.
            'keywords': [keyword['value']
                         for keyword in (article.get('keywords') or [])
                         if keyword['name'] == 'subject'],
            # Previously nothing was appended when 'web_url' was missing,
            # which shifted every later URL onto the wrong article.
            'url': article.get('web_url'),
            # Tolerate a byline that is missing, null, or not a dict.
            'byline': byline.get('original') if isinstance(byline, dict) else None,
        }
        for column, value in row.items():
            data[column].append(value)

def send_query(query, data, date=None):
    '''Pages through all results of a query, collecting articles into `data`.

    Args:
        query: Filter query string passed on to send_request.
        data: Dict of column lists that parse_response appends to.
        date: Optional string appended to `query` to restrict the search to
            one date interval.

    Returns:
        True once every page has been processed. False if a date-restricted
        query still has 2,000 or more hits (it cannot be split further), or
        if any of the per-interval sub-queries returned False.
    '''
    # If date is provided, append to query string
    query_str = query + date if date else query

    page_num = 0
    while True:
        print('Querying string: ' + query_str + '\n')
        response = send_request(query_str, page_num)
        meta = response['response']['meta']
        offset = meta['offset']
        hits = meta['hits']

        # offset == hits means the previous page already held the last result
        # (or there are no results at all), so stop without parsing an empty
        # page or spending another request.
        if offset >= hits:
            print('Done processing results.\n')
            return True

        # If we have 2,000 hits or more, we will need to break down our query into date intervals
        if hits >= 2000:
            print('We have over 2,000 hits.\n')
            if date:
                # Already restricted to one interval: splitting again would
                # re-issue the same sub-queries and recurse without end.
                print('Still over 2,000 hits for this date interval; giving up on it.\n')
                return False
            # Send the same query again, once for each date interval.
            # NOTE(review): q_dates is not defined in the visible notebook —
            # confirm it exists before relying on this branch.
            outcomes = [send_query(query, data, interval) for interval in q_dates]
            return all(outcomes)

        print('Processing results ' + str(offset) + '—' + str(min((offset + 10), hits)) + '/' + str(hits) + '...')
        parse_response(response, data)
        page_num += 1

Get articles written by the given author.

In [83]:
# Author whose bylined articles we want to collect
author = 'Shuaib Almosawa'
query = 'byline:"{}"'.format(author)

# One empty list per output column, in the order the columns were originally listed
data = {
    column: []
    for column in (
        'headline',
        'date',
        'doc_type',
        'material_type',
        'news_desk',
        'section',
        'keywords',
        'url',
        'id',
        'byline',
    )
}

# Page through every result for the author; `data` is filled in place
success = send_query(query, data)

Querying string: byline:"Shuaib Almosawa"

Processing results 0—10/126...
Querying string: byline:"Shuaib Almosawa"

Processing results 10—20/126...
Querying string: byline:"Shuaib Almosawa"

Processing results 20—30/126...
Querying string: byline:"Shuaib Almosawa"

Processing results 30—40/126...
Querying string: byline:"Shuaib Almosawa"

Processing results 40—50/126...
Querying string: byline:"Shuaib Almosawa"

Processing results 50—60/126...
Querying string: byline:"Shuaib Almosawa"

Processing results 60—70/126...
Querying string: byline:"Shuaib Almosawa"

Processing results 70—80/126...
Querying string: byline:"Shuaib Almosawa"

Processing results 80—90/126...
Querying string: byline:"Shuaib Almosawa"

Processing results 90—100/126...
Querying string: byline:"Shuaib Almosawa"

Processing results 100—110/126...
Querying string: byline:"Shuaib Almosawa"

Processing results 110—120/126...
Querying string: byline:"Shuaib Almosawa"

Processing results 120—126/126...
Querying string: by

Save first csv, with all results.

In [91]:
if success:
    # to_csv does not create missing directories, so make sure data/ exists.
    os.makedirs('data', exist_ok=True)
    all_results_name = author.replace(' ', '_') + '_1.csv'

    data_df = pd.DataFrame(data)
    # The parsed dates are plain date objects; convert the column to datetime.
    data_df['date'] = pd.to_datetime(data_df['date'])
    data_df.to_csv('data/' + all_results_name, index=False)
    print('Saved as ' + all_results_name + '.\n')
else:
    print('Error.')

Saved as Shuaib_Almosawa_1.csv.



Save a second CSV, filtered to only the results whose byline contains the author. For some reason the API returned a few articles that do not list the author in their bylines; they appear to be pieces that link to the author's articles.

In [92]:
# Keep only the rows whose byline mentions the author. na=False treats a
# missing byline as "no match" (a NaN in the mask would make the boolean
# indexing fail); regex=False matches the name literally.
has_author = data_df['byline'].str.contains(author, na=False, regex=False)
data_df = data_df[has_author].reset_index(drop=True)

filtered_name = author.replace(' ', '_') + '_2.csv'
data_df.to_csv('data/' + filtered_name, index=False)
print('Saved as ' + filtered_name + '.\n')

Saved as Shuaib_Almosawa_2.csv.

