# Task 1 - Data Collection 

In this task, we collect New York Times article data through the NYT Archive API. Each request to the API is made for one year and month, so we query it once for every month in the date range we want to cover.

In [4]:
import os
import json
import time
import requests
import datetime
import dateutil
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt


In [8]:
# The collection window ends today and reaches back one year (years=1);
# change the relativedelta argument to fetch a longer or shorter history.
end = datetime.date.today()
start = end - relativedelta(years=1)
# One [year, month] pair of strings (month without a leading zero) for every
# month whose first day lies between start and end; freq='MS' makes
# date_range yield month-start timestamps. The pairs are built from the
# timestamp attributes instead of strftime("%-m") because the "-" flag is a
# platform-specific strftime extension that is not available everywhere
# (for example on Windows).
months_in_range = [
    [str(month_start.year), str(month_start.month)]
    for month_start in pd.date_range(start, end, freq='MS')
]

def sendRequest(date):
    """Request one month of articles from the NYT Archive API.

    Args:
        date: Two-element sequence of strings ``[year, month]``, for example
            ``['2022', '3']``.

    Returns:
        The decoded JSON body of the response, or ``None`` when the server
        answers with a non-success HTTP status (the caller can then skip
        the month and retry it on a later run).

    Raises:
        Any exception raised by ``requests.get`` (connection errors,
        timeouts) or by decoding the body as JSON propagates to the caller.
    """
    # NOTE(review): a key committed to source control should be treated as
    # leaked. Prefer setting NYT_API_KEY in the environment; the literal is
    # kept only as a backward-compatible fallback and should be rotated.
    api_key = os.environ.get('NYT_API_KEY', 'kzIdsTwX2ed0OKy0ZUSiv90i0eHTn4Ud')

    # base_url already ends with '/', so the year is appended directly;
    # adding another separator would put '//' into the request path.
    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    url = base_url + date[0] + '/' + date[1] + '.json'

    # The timeout keeps a stalled connection from blocking the run forever.
    reply = requests.get(url, params={'api-key': api_key}, timeout=60)

    # Pause after every call, presumably to stay under the API rate limit.
    time.sleep(6)

    if not reply.ok:
        # An error status carries no article data, so report it instead of
        # handing an error payload to the parser.
        print('Request for ' + url + ' failed with HTTP status ' + str(reply.status_code))
        return None
    return reply.json()


def isValid(article, date):
    """Decide whether an article should be kept.

    An article is kept when ``date`` lies strictly between the module-level
    ``start`` and ``end`` dates and the article has a headline mapping that
    contains a ``'main'`` entry.

    Args:
        article: Mapping describing one article (one element of the
            response's ``docs`` list).
        date: Publication date to compare against ``start`` and ``end``.

    Returns:
        True when both conditions hold, otherwise False.
    """
    # Strict comparison on both ends: the boundary dates are excluded.
    is_in_range = start < date < end
    # .get() makes an article with no 'headline' key invalid instead of
    # raising KeyError; isinstance also accepts dict subclasses.
    headline = article.get('headline')
    has_headline = isinstance(headline, dict) and 'main' in headline
    return is_in_range and has_headline

def parseResponse(response):
    """Turn one Archive API response into a DataFrame of valid articles.

    Every entry of ``response['response']['docs']`` is checked with
    ``isValid``; the entries that pass contribute one row each.

    Args:
        response: Decoded JSON body with the article list under
            ``response['response']['docs']``.

    Returns:
        pandas.DataFrame with the columns ``headline``, ``date``, ``id``,
        ``doc_type``, ``material_type``, ``sectionname``, ``keywords``,
        ``printpage``, ``wordcount`` and ``newsdesk``. Metadata fields an
        article does not provide are stored as ``None``; ``keywords`` is a
        list holding the values of the keywords whose name is ``'subject'``.

    Raises:
        KeyError: If ``response`` lacks the ``'response'``/``'docs'`` keys or
            an article has no ``'pub_date'``.
    """
    # `import dateutil` by itself does not guarantee that the `parser`
    # submodule is loaded, so load it explicitly before using it.
    import dateutil.parser

    # Column name -> list of values, one entry per accepted article.
    data = {
        'headline': [],
        'date': [],
        'id': [],
        'doc_type': [],
        'material_type': [],
        'sectionname': [],
        'keywords': [],
        'printpage': [],
        'wordcount': [],
        'newsdesk': [],
    }

    for article in response['response']['docs']:
        # pub_date is the publication timestamp; only its date part is kept.
        date = dateutil.parser.parse(article['pub_date']).date()
        if not isValid(article, date):
            continue  # outside the date window or without a usable headline

        data['date'].append(date)
        data['headline'].append(article['headline']['main'])
        # .get() records None for any field the article does not carry, so
        # one incomplete record cannot abort the whole month.
        data['id'].append(article.get('_id'))
        data['doc_type'].append(article.get('document_type'))
        data['material_type'].append(article.get('type_of_material'))
        data['printpage'].append(article.get('print_page'))
        data['wordcount'].append(article.get('word_count'))
        data['sectionname'].append(article.get('section_name'))
        data['newsdesk'].append(article.get('news_desk'))
        # Keep only the keyword values tagged as subjects.
        subjects = [
            keyword['value']
            for keyword in (article.get('keywords') or [])
            if keyword['name'] == 'subject'
        ]
        data['keywords'].append(subjects)

    return pd.DataFrame(data)

def getData(dates):
    """Fetch, parse and store one CSV file per requested month.

    For every ``[year, month]`` pair the articles are requested with
    ``sendRequest``, converted with ``parseResponse`` and written to
    ``headlines/<year>-<month>.csv``. Months whose CSV file already exists
    are skipped, so an interrupted run can simply be started again.

    Args:
        dates: Sequence of ``[year, month]`` string pairs. An empty sequence
            is accepted and collects nothing.

    Returns:
        None. Progress and the number of newly collected articles are
        printed.
    """
    total = 0  # articles collected during this call (skipped months excluded)
    if len(dates) == 0:
        # Nothing to fetch; dates[0] below would raise IndexError.
        print('Number of articles collected: ' + str(total))
        return

    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    # Directory that holds one CSV file per month.
    if not os.path.exists('headlines'):
        os.mkdir('headlines')

    for date in dates:
        print('Working on ' + str(date) + '...')
        csv_path = 'headlines/' + date[0] + '-' + date[1] + '.csv'  # File naming format.
        if os.path.exists(csv_path):
            continue  # this month is already on disk
        response = sendRequest(date)
        if response is None:
            continue  # no data came back for this month
        df = parseResponse(response)
        total += len(df)
        df.to_csv(csv_path, index=False)
        print('Saving ' + csv_path + '...')

    print('Number of articles collected: ' + str(total))
        
         



In [6]:
# Run the collection for every [year, month] pair in months_in_range.
getData(months_in_range)

Date range: ['2021', '12'] to ['2022', '11']
Working on ['2021', '12']...
Saving headlines/2021-12.csv...
Working on ['2022', '1']...
Saving headlines/2022-1.csv...
Working on ['2022', '2']...
Saving headlines/2022-2.csv...
Working on ['2022', '3']...
Saving headlines/2022-3.csv...
Working on ['2022', '4']...
Saving headlines/2022-4.csv...
Working on ['2022', '5']...
Saving headlines/2022-5.csv...
Working on ['2022', '6']...
Saving headlines/2022-6.csv...
Working on ['2022', '7']...
Saving headlines/2022-7.csv...
Working on ['2022', '8']...
Saving headlines/2022-8.csv...
Working on ['2022', '9']...
Saving headlines/2022-9.csv...
Working on ['2022', '10']...
Saving headlines/2022-10.csv...
Working on ['2022', '11']...
Saving headlines/2022-11.csv...
Number of articles collected: 44673


Merge all the monthly CSV files into a single file.

In [7]:
import os
import glob
import pandas as pd
# NOTE(review): machine-specific absolute path - presumably the 'headlines'
# directory that holds the monthly CSV files; adjust it before running this
# on another machine.
os.chdir("/Users/aditya/Desktop/Sem 1/Data Science/Assignment 1/headlines")

extension = 'csv'
output_name = "combined_csv.csv"
# The merged file is written into this same directory, so it must be left out
# of the inputs; otherwise running this step again would fold the previous
# merge result back in and duplicate every row. Sorting makes the row order
# of the merged file independent of the order glob returns the names in.
all_filenames = sorted(
    name for name in glob.glob('*.{}'.format(extension)) if name != output_name
)

combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
# Export the merged table; 'utf-8-sig' writes a byte-order mark at the start
# of the file.
combined_csv.to_csv(output_name, index=False, encoding='utf-8-sig')