# Collecting NYT Data


**Table of Contents**  

1. [Collecting Data](#sec1)
2. [Combining Data](#sec2)

<a id="sec1"></a>

## 1. Collecting NYT data from:

2020 - March, April, May ==> Covid <br>
2021 - January, February, March ==> Insurrection <br>
2022 - February, March, April ==> Ukraine War <br>
2023 - October, November, December ==> Palestine War <br>
2024 - January, February, March ==> Elections <br>

In [None]:
# NYT developer API key handed to every Archive API request in this notebook.
# NOTE(review): this is a live credential committed in plain text. Consider
# loading it from an environment variable or an untracked config file, and
# rotating this key.
myAPIkey = "G2G9F7zgkQ6gQYd6WWjrwwzLHAtG192b"
import pandas as pd

import requests, json
import string

def getNYTArticles(year, month, apiKey):
    """Request all articles of one month from the NYT Archive API.

    Nothing is written to disk here; the decoded response is returned to the
    caller.

    Args:
        year: Year to request, e.g. 2020.
        month: Month number to request, e.g. 3.
        apiKey: NYT developer API key, sent as the ``api-key`` query parameter.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    # create URL
    URL = f"https://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={apiKey}"

    # send the request to get the data; without a timeout a stalled
    # connection would block the notebook indefinitely
    data = requests.get(URL, timeout=60)

    # fail loudly on error statuses instead of handing an error payload
    # (or an undecodable body) to the caller as if it were article data
    data.raise_for_status()

    if data.status_code == 200:
        print(f'Successfully got the data for month {month}.')

    return data.json()  # get response as JSON

In [None]:
def write_allMonths(year, list_month, apiKey):
    """Download the NYT archive for several months of one year.

    getNYTArticles is called once per entry of ``list_month``, in order.
    Despite the name, nothing is written to disk here.

    Args:
        year: Year passed through to getNYTArticles.
        list_month: Iterable of months to download.
        apiKey: API key passed through to getNYTArticles.

    Returns:
        A dict mapping each month in ``list_month`` to the JSON returned for it.
    """
    return {
        month: getNYTArticles(year, month, apiKey)
        for month in list_month
    }
#write_allMonths(2022, [1,2,3,4,5,6,7,8,9,10,11,12], myAPIkey)

In [None]:
# Download March-May 2020 (the Covid window) and save the raw API responses
# to NYT_2020.json. json.dump turns the integer month keys into strings, so
# the file is read back with keys such as '3'.
dataJson20 = write_allMonths(2020, [3, 4, 5], myAPIkey)
with open("NYT_2020.json", 'w') as fout:
    json.dump(dataJson20, fout)

In [None]:
# Download January-March 2021 (the Insurrection window) and save the raw API
# responses to NYT_2021.json. json.dump turns the integer month keys into
# strings, so the file is read back with keys such as '1'.
dataJson21 = write_allMonths(2021, [1, 2, 3], myAPIkey)
with open("NYT_2021.json", 'w') as fout:
    json.dump(dataJson21, fout)

In [None]:
# Download February-April 2022 (the Ukraine War window) and save the raw API
# responses to NYT_2022.json. json.dump turns the integer month keys into
# strings, so the file is read back with keys such as '2'.
dataJson22 = write_allMonths(2022, [2, 3, 4], myAPIkey)
with open("NYT_2022.json", 'w') as fout:
    json.dump(dataJson22, fout)

In [None]:
# Download October-December 2023 (the Palestine War window) and save the raw
# API responses to NYT_2023.json. json.dump turns the integer month keys into
# strings, so the file is read back with keys such as '10'.
dataJson23 = write_allMonths(2023, [10, 11, 12], myAPIkey)
with open("NYT_2023.json", 'w') as fout:
    json.dump(dataJson23, fout)

In [None]:
# Download January-March 2024 (the Elections window) and save the raw API
# responses to NYT_2024.json. json.dump turns the integer month keys into
# strings, so the file is read back with keys such as '1'.
dataJson24 = write_allMonths(2024, [1, 2, 3], myAPIkey)
with open("NYT_2024.json", 'w') as fout:
    json.dump(dataJson24, fout)

<a id="sec2"></a>

## 2. Combine Data

In [17]:
import json
# Load the saved 2020 responses to inspect the structure of one month.
with open("NYT_2020.json", 'r') as file:
    data = json.load(file)

# Top-level keys are the month numbers as strings; the article records for a
# month sit under ['response']['docs']. Left as the last expression so the
# notebook displays it.
data['3']['response']['docs']

# Fields of interest in each article record (with sample values):
#   abstract, headline, keywords,
#   'pub_date': '2020-03-06T19:23:09+0000',
#   'document_type': 'article',
#   'news_desk': 'Climate',
#   'section_name': 'Climate',

[{'abstract': 'Joseph R. Biden Jr. drew on his decades-long relationships and leveraged his close bond with black voters to wrap up a state long considered his stronghold.',
  'web_url': 'https://www.nytimes.com/2020/02/29/us/politics/joe-biden-south-carolina-primary.html',
  'snippet': 'Joseph R. Biden Jr. drew on his decades-long relationships and leveraged his close bond with black voters to wrap up a state long considered his stronghold.',
  'lead_paragraph': 'COLUMBIA, S.C. — Joseph R. Biden Jr. scored a decisive victory in the South Carolina primary on Saturday, reviving his listing campaign and establishing himself as the leading contender to slow Senator Bernie Sanders as the turbulent Democratic race turns to a slew of coast-to-coast contests on Tuesday.',
  'print_section': 'A',
  'print_page': '1',
  'source': 'The New York Times',
  'multimedia': [],
  'headline': {'main': 'Winning South Carolina, Biden Makes Case Against Sanders: ‘Win Big or Lose’',
   'kicker': None,
   '

In [27]:
import os
import json
import pandas as pd

# Folder that holds the per-year NYT_*.json files to combine.
directory_path = r"C:\Users\ashle\OneDrive\School\CS315_Final\NYT_data"

# Output columns, in the order they appear in the combined table.
columns = ['Title', 'Abstract', 'Date', 'DocType', 'NewsDesk', 'SectionName', 'Keywords']

# Collect one plain dict per article and build the DataFrame once at the end.
# Concatenating a one-row frame per article re-copies the whole table each
# time, which is quadratic in the number of articles.
rows = []

# os.listdir returns names in arbitrary order; sorting keeps the row order
# reproducible across runs and platforms.
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith('.json'):
        continue

    file_path = os.path.join(directory_path, filename)
    # Keep the file open only while loading it.
    with open(file_path, 'r') as file:
        data = json.load(file)

    # Each file maps a month key to the raw API response for that month.
    for month, month_payload in data.items():
        data_parse = month_payload['response']['docs']
        print(f'There are {len(data_parse)} articles in month {month}')
        for article in data_parse:
            rows.append({
                'Title': article['headline']['main'],
                'Abstract': article["abstract"],
                'Date': article["pub_date"],
                'DocType': article['document_type'],
                'NewsDesk': article['news_desk'],
                'SectionName': article['section_name'],
                # Keep only the keyword text of every keyword entry.
                'Keywords': [word['value'] for word in article['keywords']],
            })

# Passing `columns` keeps the expected header even when no article was found.
df = pd.DataFrame(rows, columns=columns)

print(df)

There are 4883 articles in month 3
There are 5019 articles in month 4
There are 4347 articles in month 5
There are 7001 articles in month 1
There are 4260 articles in month 2
There are 4786 articles in month 3
There are 4059 articles in month 2
There are 4310 articles in month 3
There are 3934 articles in month 4
There are 3906 articles in month 10
There are 3734 articles in month 11
There are 3525 articles in month 12
There are 3785 articles in month 1
There are 3791 articles in month 2
There are 4242 articles in month 3
                                                   Title  \
0      Winning South Carolina, Biden Makes Case Again...   
1                     At CPAC, Trump Takes Aim at Rivals   
2           The Islanders Are Saying Goodbye to Brooklyn   
3      Trump Moves to Calm Fears as First U.S. Death ...   
4      Mother and Daughter Attacked for Speaking Span...   
...                                                  ...   
65577      Beyoncé’s Country Is America: Every Bit o

In [28]:
# Write the combined article table to a CSV file in the working directory,
# leaving out the DataFrame's integer index.
csv_file_path = 'NYT_Data.csv'
df.to_csv(path_or_buf=csv_file_path, index=False)