# Retrieving headlines

### Set up

You may need to install the libraries `beautifulsoup4` and `newspaper3k`.

The `GNews` library needs to be installed from the GitHub source. Here is a [StackOverflow forum] I referenced, in case it is helpful.

In [None]:
# Install the scraping dependencies with the interpreter that is running this
# kernel (sys.executable), so the packages land in the notebook's own environment.
import sys
!{sys.executable} -m pip install beautifulsoup4
!{sys.executable} -m pip install newspaper3k
# GNews is installed straight from its GitHub repository.
!{sys.executable} -m pip install git+https://github.com/ranahaani/GNews.git

In [None]:
from gnews import GNews
import datetime as dt
import pandas as pd
import numpy as np

### Function

In [None]:
# Build the 24 half-month windows (two per calendar month) used to query news.
# Each month is split into "1st -> mid-month" and "mid-month -> last day".
# Day counts are those of a non-leap year.

first_day = np.full(12, 1, dtype = int)

middle_day = np.full(12, 15)
middle_day[1] = 14 # feb is split on the 14th

# last calendar day of jan..dec
last_day = np.array([31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31])

start_days = []
end_days = []

for i in range(12):
    # first half of the month, then second half
    start_days.extend([first_day[i], middle_day[i]])
    end_days.extend([middle_day[i], last_day[i]])

# month number (1-12) of each window: 1, 1, 2, 2, ..., 12, 12
months = np.repeat(np.arange(1, 13), 2)

# uncomment to inspect the generated windows
# print(start_days)
# print(end_days)
# print(months)

In [None]:
def get_headlines(year, keyword):
    """
    Collect news headlines for one keyword over one calendar year.

    The year is queried window by window, using the module-level `months`,
    `start_days` and `end_days` sequences (one GNews query per window).
    Windows that return no news are reported with a printed message and
    skipped. In leap years the second February window is extended to the
    29th, since the shared tables only go up to the 28th.

    year: int 
    keyword: str, the company name

    returns: DataFrame with the columns "date", "title" and "publisher";
             it has those columns and no rows if no window returned news.
    """
    import calendar  # local import: only needed for the leap-year check

    columns = ["date", "title", "publisher"]

    # Per-window results are gathered here and concatenated once at the end,
    # instead of growing a DataFrame inside the loop.
    frames = []

    for month, start_day, end_day in zip(months, start_days, end_days):

        # The day tables describe a non-leap year; without this adjustment
        # Feb 29 would never be part of any requested window.
        if month == 2 and end_day == 28 and calendar.isleap(year):
            end_day = 29

        # int() turns the numpy integers from the tables into plain ints.
        start = dt.datetime(year, int(month), int(start_day))
        end = dt.datetime(year, int(month), int(end_day))

        gnews = GNews(language = "en",
                      start_date = start,
                      end_date = end)

        news_df = pd.DataFrame(gnews.get_news(keyword))

        if news_df.empty:
            print(f"No news between {start} and {end} for {keyword}.\n")
            continue

        # Assumes each result carries 'published date', 'title' and
        # 'publisher' fields - TODO confirm against the installed GNews version.
        news_df['date'] = pd.to_datetime(news_df['published date'])

        frames.append(news_df[columns].copy())

    if not frames:
        # Nothing found all year: still hand back the expected columns.
        return pd.DataFrame(columns = columns)

    return pd.concat(frames, ignore_index = True)

### Retrieve data

Only run one cell at a time!

When running these cells, you will see messages about having no news for certain time periods. That's fine; don't re-run the cell. Keep the output as it is so we have a record of when the headlines were missing. Just commit and push what you have from that one run.

One day later, it can be helpful to duplicate the cell, change the `range(2018, 2023+1)` to start from the first year that has missing headlines, and run the code again.

In [None]:
# wafer: apple, amazon

In [None]:
# apple

import os

company = "Apple"

# Create the output folder up front, so a missing directory cannot make
# to_csv fail after a whole year of headlines has already been fetched.
os.makedirs("headlines", exist_ok = True)

for year in range(2018, 2023+1):

    headlines_df = get_headlines(year, company)

    # one CSV per year, e.g. headlines/2018_Apple_headlines.csv
    file = f"headlines/{year}_{company}_headlines.csv"
    headlines_df.to_csv(file, index = False)

In [None]:
# amazon

import os

company = "Amazon"

# Create the output folder up front, so a missing directory cannot make
# to_csv fail after a whole year of headlines has already been fetched.
os.makedirs("headlines", exist_ok = True)

for year in range(2018, 2023+1):

    headlines_df = get_headlines(year, company)

    # one CSV per year, e.g. headlines/2018_Amazon_headlines.csv
    file = f"headlines/{year}_{company}_headlines.csv"
    headlines_df.to_csv(file, index = False)

In [None]:
# cindy: nvidia, microsoft

In [None]:
# nvidia

import os

company = "Nvidia"

# Create the output folder up front, so a missing directory cannot make
# to_csv fail after a whole year of headlines has already been fetched.
os.makedirs("headlines", exist_ok = True)

for year in range(2018, 2023+1):

    headlines_df = get_headlines(year, company)

    # one CSV per year, e.g. headlines/2018_Nvidia_headlines.csv
    file = f"headlines/{year}_{company}_headlines.csv"
    headlines_df.to_csv(file, index = False)

In [None]:
# microsoft

import os

company = "Microsoft"

# Create the output folder up front, so a missing directory cannot make
# to_csv fail after a whole year of headlines has already been fetched.
os.makedirs("headlines", exist_ok = True)

for year in range(2018, 2023+1):

    headlines_df = get_headlines(year, company)

    # one CSV per year, e.g. headlines/2018_Microsoft_headlines.csv
    file = f"headlines/{year}_{company}_headlines.csv"
    headlines_df.to_csv(file, index = False)