In [None]:
import requests
import pandas as pd
import json
from pandas.io.json import json_normalize
import time
import sys
import os

In [None]:
# Representative sample of news sources used for the first collection pass.
# A wider net is cast in a different version later in this notebook.

# National outlets confirmed to be available in the API.
national_news_list = [
    'huffpost.com',
    'politico.com',
    'time.com',
    'msnbc.com',
    'cbsnews.com',
    'latimes.com',
    'washingtonpost.com',
    'cnn.com',
    'breitbart.com',
    'usatoday.com',
    'foxnews.com',
    'wsj.com',
    'nbcnews.com',
    'bloomberg.com',
    'npr.org',
]

# Regional outlets: keep only the rows flagged as present in the API.
regional_news_df = pd.read_csv('regional_news/top_regional_newspapers_clean.csv', index_col=0)
in_api_mask = regional_news_df['in_api_flag'] == 1
regional_domains = regional_news_df.loc[in_api_mask, 'url'].tolist()

# Merge the regional domains into the national list, preserving order and
# skipping anything already present (this also drops repeated regional entries).
for regional_domain in regional_domains:
    if regional_domain in national_news_list:
        continue
    national_news_list.append(regional_domain)

# `the_list` is the same list object as `national_news_list`, not a copy.
the_list = national_news_list

# Paid API Key version

In [None]:
# API loop: for every domain in `the_list`, page through the NewsAPI
# /v2/everything endpoint and write that domain's matching articles to one CSV.

# Prefer a key supplied through the environment so it never has to be saved in
# the notebook; the placeholder below is only the fallback.
news_api_key = os.environ.get('NEWS_API_KEY', '######################')


# build the endpoint query (the query string is already URL-encoded)
query = 'covid%20OR%20coronavirus'
year = "2020"
month = 12
startday = 1
endday = 30

# Zero-pad month and day so the timestamps are valid ISO 8601 (2020-12-01, not 2020-12-1).
full_start = "{}-{:02d}-{:02d}".format(year, month, startday)
full_end = "{}-{:02d}-{:02d}".format(year, month, endday)

# Output folder / file names keep the original un-padded form so existing
# downstream readers still find the files.
dir_name = "news_dfs_{month}.{start}_{month}.{end}".format(month = month, start = startday, end = endday)

# makes the local folder if it doesn't exist
os.makedirs(dir_name, exist_ok = True)

article_columns = ['title', 'description', 'url', 'publishedAt', 'content', 'source']

total_calls = 0
total_items = 0
# loops through each URL
for item in the_list:
    print(item)
    domain = item
    # Collect plain dicts and build the DataFrame once per domain: row-by-row
    # DataFrame.append is quadratic and no longer exists in pandas >= 2.0.
    records = []
    i = 1
    while True:
        # The API key travels in a request header, so it is never part of the
        # URL that gets printed into the cell output.
        endpoint1 = (
            "https://newsapi.org/v2/everything?q={query}&domains={domain}"
            "&from={start}T00:00:00&to={end}T23:59:59"
            "&language=en&pageSize=100&page={page_num}"
        ).format(query = query, domain = domain, start = full_start, end = full_end, page_num = i)
        print(endpoint1)

        news1 = requests.get(endpoint1, headers = {'X-Api-Key': news_api_key}, timeout = 30)
        formatted_news1 = news1.json()

        # for checking purposes
        total_calls = total_calls + 1

        # Fail with the API's own message instead of an opaque KeyError on 'articles'.
        if formatted_news1.get('status') != 'ok':
            raise RuntimeError("NewsAPI error for {}: {} - {}".format(
                domain, formatted_news1.get('code'), formatted_news1.get('message')))

        articles = formatted_news1['articles']

        # breaks loop if there are no more articles
        if not articles:
            print("no more articles for {}".format(domain))
            break

        for article in articles:
            records.append({
                'title': article['title'],
                'description': article['description'],
                'url': article['url'],
                'publishedAt': article['publishedAt'],
                'content': article['content'],
                'source': article['source'].get('name'),
            })

        time.sleep(1)

        # increments page number
        i = i+1

    superdata = pd.DataFrame(records, columns = article_columns)
    print("Writing {}".format(item))
    superdata.to_csv(dir_name + "/news_{domain}_{month}.{start}_{month}.{end}.csv".format(domain = domain, month = month, start = startday, end = endday), encoding = "utf-8")

    # for check at end
    total_items = total_items + 1

print("total_calls: {}".format(total_calls))
print("total_domains: {}".format(total_items))

In [None]:
# This is the widest possible net, querying for any mention of covid/coronavirus over all articles
# available in the API. The only thing that needs to be changed is the month and date range.
# This cell writes out 4 chunks (p1..p4) for each day.


def fetch_fullset_window(from_timestamp, to_timestamp, api_key, query,
                         excluded_sources=(), max_total_results=9900, pause_seconds=.2):
    """Page through /v2/everything for one time window and collect the articles.

    Parameters
    ----------
    from_timestamp, to_timestamp : str
        Window bounds placed verbatim into the `from` / `to` query parameters.
    api_key : str
        NewsAPI key, sent in the `X-Api-Key` header so it never appears in the
        printed URL.
    query : str
        Already URL-encoded search expression for the `q` parameter.
    excluded_sources : container of str
        Articles whose source name is in this container are skipped.
    max_total_results : int
        If the API reports at least this many total results, the window is
        abandoned with a "redo dates" message.
        # presumably close to the API's cap on retrievable results per query - confirm
    pause_seconds : float
        Sleep between successive page requests.

    Returns
    -------
    (pandas.DataFrame, int)
        The collected articles (possibly empty) and the number of counted API
        calls. A request that trips the `max_total_results` guard is not
        counted, matching the original bookkeeping.

    Raises
    ------
    RuntimeError
        If the API response status is not 'ok'.
    """
    columns = ['title', 'description', 'url', 'publishedAt', 'content', 'source']
    # Accumulate dicts and build the frame once: row-by-row DataFrame.append is
    # quadratic and no longer exists in pandas >= 2.0.
    records = []
    calls = 0
    page_num = 1
    while True:
        endpoint = (
            "https://newsapi.org/v2/everything?q={query}&from={start}&to={end}"
            "&language=en&pageSize=100&page={page_num}"
        ).format(query=query, start=from_timestamp, end=to_timestamp, page_num=page_num)
        print(endpoint)

        payload = requests.get(endpoint, headers={'X-Api-Key': api_key}, timeout=30).json()
        # Surface the API's own error instead of a KeyError on 'totalResults'.
        if payload.get('status') != 'ok':
            raise RuntimeError("NewsAPI error: {} - {}".format(
                payload.get('code'), payload.get('message')))

        print(payload['totalResults'])
        if payload['totalResults'] >= max_total_results:
            print("total results greater than {}, redo dates".format(max_total_results))
            break

        articles = payload['articles']

        # for checking purposes
        calls += 1

        # breaks loop if there are no more articles
        if not articles:
            print("no more articles")
            break

        for article in articles:
            source = article['source'].get('name')
            if source in excluded_sources:
                continue
            records.append({
                'title': article['title'],
                'description': article['description'],
                'url': article['url'],
                'publishedAt': article['publishedAt'],
                'content': article['content'],
                'source': source,
            })

        time.sleep(pause_seconds)

        # increments page number
        page_num += 1

    return pd.DataFrame(records, columns=columns), calls


month = 12
year = "2020"

# Prefer a key supplied through the environment; the placeholder is the fallback.
news_api_key = os.environ.get('NEWS_API_KEY', '######################')

# build the endpoint query (already URL-encoded)
query = 'covid%20OR%20coronavirus'

dir_name = "fullset"

# makes the local folder if it doesn't exist
os.makedirs(dir_name, exist_ok=True)

# Each day is split into four windows so that a single query stays under the
# total-results guard; the label becomes the file-name suffix.
time_windows = [
    ("p1", "T00:00:00", "T11:29:59"),
    ("p2", "T11:30:00", "T15:59:59"),
    ("p3", "T16:00:00", "T20:59:59"),
    ("p4", "T21:00:00", "T23:59:59"),
]

# NOTE(review): `list_of_domains` is not defined in any cell visible here, so on a
# fresh kernel the original raised NameError at the first article. Fall back to
# "exclude nothing" when it is absent. It is compared against the article's
# source *name*, so confirm it holds names rather than domains.
excluded_sources = globals().get('list_of_domains', [])

# Days 1..30 inclusive, as in the original run.
for day in range(1, 31):
    startday = endday = day

    # Zero-pad so the timestamps are valid ISO 8601 (2020-12-01, not 2020-12-1).
    full_start = "{}-{:02d}-{:02d}".format(year, month, startday)
    full_end = "{}-{:02d}-{:02d}".format(year, month, endday)

    # Call counter is per day and printed cumulatively after each chunk.
    total_calls = 0

    for part_label, window_open, window_close in time_windows:
        superdata, calls = fetch_fullset_window(
            full_start + window_open,
            full_end + window_close,
            news_api_key,
            query,
            excluded_sources=excluded_sources,
        )
        total_calls = total_calls + calls

        print("Writing")
        # File names keep the original un-padded month/day form.
        superdata.to_csv(
            dir_name + "/news_{month}.{start}_{month}.{end}_fullset_{part}.csv".format(
                month=month, start=startday, end=endday, part=part_label),
            encoding="utf-8")

        # for check at end
        print("total_calls: {}".format(total_calls))