In [2]:
import requests
import json
import time
import csv
from tqdm import tqdm

def fetch_articles_for_month(year, month, api_key, csv_writer):
    base_url = "https://api.nytimes.com/svc/archive/v1/"
    url = f"{base_url}{year}/{month}.json?api-key={api_key}"
    response = requests.get(url)
    if response.status_code == 200:
        print(f"Fetching articles for {year}-{month}")
        json_data = response.json()
        articles = json_data["response"]["docs"]
        for idx, article in tqdm(enumerate(articles, start=1)):
                pub_date = article["pub_date"]
                formatted_pub_date = pub_date[:10].replace("|", "-")
                headline = article["headline"]["main"].replace("|", "-")
                kicker = article["headline"]["kicker"].replace("|", "-") if article["headline"]["kicker"] and "kicker" in article["headline"] else None
                content_kicker = article["headline"]["content_kicker"].replace("|", "-") if article["headline"]["content_kicker"] and "content_kicker" in article["headline"] else None
                print_headline = article["headline"]["print_headline"].replace("|", "-") if article["headline"]["print_headline"] and "print_headline" in article["headline"] else None
                document_type = article["document_type"].replace("|", "-")
                csv_writer.writerow([f"{idx}|{formatted_pub_date}|{headline}|{kicker}|{content_kicker}|{print_headline}|{document_type}"])
    else:
        print(f"Error: Unable to fetch articles for {year}-{month}.")

def fetch_articles_for_range(api_key):
    current_year = time.localtime().tm_year
    current_month = time.localtime().tm_mon
    with open('NYT_headlines_dataset_2.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter='|')  # Setting delimiter to "|"
        writer.writerow(['ID', 'Publication Date', 'Headline', 'Kicker', 'Content Kicker', 'Print Headline', 'Document Type'])
        for year in tqdm(range(2020, current_year + 1)):
            start_month = 1 if year != 2020 else 1  # NYT Archive API data starts from April 2016
            end_month = 12 if year != current_year else current_month
            for month in range(start_month, end_month + 1):
                fetch_articles_for_month(year, month, api_key, writer)
                time.sleep(16)  # Delay between requests

api_key = "Nan"
fetch_articles_for_range(api_key)


  0%|          | 0/5 [00:00<?, ?it/s]

Fetching articles for 2020-1


4480it [00:00, 191577.36it/s]


Fetching articles for 2020-2


4240it [00:00, 182948.23it/s]


Fetching articles for 2020-3


4883it [00:00, 192653.36it/s]


Fetching articles for 2020-4


5019it [00:00, 172973.43it/s]


Fetching articles for 2020-5


4347it [00:00, 194220.46it/s]


Fetching articles for 2020-6


4492it [00:00, 203517.26it/s]


Fetching articles for 2020-7


4459it [00:00, 200432.98it/s]


Fetching articles for 2020-8


4439it [00:00, 191155.19it/s]


Fetching articles for 2020-9


4609it [00:00, 215110.46it/s]


Fetching articles for 2020-10


5257it [00:00, 218168.88it/s]


Fetching articles for 2020-11


5114it [00:00, 210894.63it/s]


Fetching articles for 2020-12


4154it [00:00, 202141.00it/s]
 20%|██        | 1/5 [06:33<26:13, 393.35s/it]

Fetching articles for 2021-1


7001it [00:00, 241988.38it/s]


Fetching articles for 2021-2


4260it [00:00, 229603.38it/s]


Fetching articles for 2021-3


4786it [00:00, 228908.92it/s]


Fetching articles for 2021-4


4592it [00:00, 222155.83it/s]


Fetching articles for 2021-5


4265it [00:00, 220412.85it/s]


Fetching articles for 2021-6


4260it [00:00, 232114.83it/s]


Fetching articles for 2021-7


4467it [00:00, 195733.02it/s]


Fetching articles for 2021-8


4223it [00:00, 216598.34it/s]


Fetching articles for 2021-9


4482it [00:00, 217182.36it/s]


Fetching articles for 2021-10


4254it [00:00, 231724.69it/s]


Fetching articles for 2021-11


4006it [00:00, 220521.06it/s]


Fetching articles for 2021-12


3933it [00:00, 227474.15it/s]
 40%|████      | 2/5 [13:24<20:11, 403.75s/it]

Fetching articles for 2022-1


3799it [00:00, 217323.53it/s]


Fetching articles for 2022-2


4059it [00:00, 205820.88it/s]


Fetching articles for 2022-3


4310it [00:00, 192581.69it/s]


Fetching articles for 2022-4


3934it [00:00, 205727.72it/s]


Fetching articles for 2022-5


4179it [00:00, 190422.35it/s]


Fetching articles for 2022-6


4374it [00:00, 133590.76it/s]


Fetching articles for 2022-7


3835it [00:00, 45117.50it/s]


Fetching articles for 2022-8


3983it [00:00, 127452.11it/s]


Fetching articles for 2022-9


4087it [00:00, 105364.18it/s]


Fetching articles for 2022-10


3857it [00:00, 123305.47it/s]


Fetching articles for 2022-11


4742it [00:00, 129083.15it/s]


Fetching articles for 2022-12


3548it [00:00, 127411.35it/s]
 60%|██████    | 3/5 [19:49<13:10, 395.20s/it]

Fetching articles for 2023-1


3423it [00:00, 126800.40it/s]


Fetching articles for 2023-2


3260it [00:00, 156430.47it/s]


Fetching articles for 2023-3


5769it [00:00, 181735.11it/s]


Fetching articles for 2023-4


3718it [00:00, 168045.15it/s]


Fetching articles for 2023-5


3881it [00:00, 157848.18it/s]


Fetching articles for 2023-6


3788it [00:00, 164225.78it/s]


Fetching articles for 2023-7


3548it [00:00, 162944.45it/s]


Fetching articles for 2023-8


3668it [00:00, 162193.55it/s]


Fetching articles for 2023-9


3823it [00:00, 159225.70it/s]


Fetching articles for 2023-10


3906it [00:00, 163121.57it/s]


Fetching articles for 2023-11


3734it [00:00, 164129.14it/s]


Fetching articles for 2023-12


3525it [00:00, 161681.03it/s]
 80%|████████  | 4/5 [26:16<06:32, 392.11s/it]

Fetching articles for 2024-1


3785it [00:00, 161827.51it/s]


Fetching articles for 2024-2


3791it [00:00, 161235.96it/s]


Fetching articles for 2024-3


4242it [00:00, 163286.97it/s]


Fetching articles for 2024-4


3954it [00:00, 166086.93it/s]


Fetching articles for 2024-5


1586it [00:00, 156132.14it/s]
100%|██████████| 5/5 [28:55<00:00, 347.03s/it]
