# Retrieving headlines

### Set up

You may need to install the libraries `beautifulsoup4` and `newspaper3k`.

The `GNews` library needs to be installed  from the Github source. Here is a [StackOverflow forum] I referenced, in case it is helpful.

In [1]:
import sys
!{sys.executable} -m pip install beautifulsoup4
!{sys.executable} -m pip install newspaper3k
!{sys.executable} -m pip install git+https://github.com/ranahaani/GNews.git

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.1/211.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.10-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.1/81.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)

In [1]:
from gnews import GNews
import datetime as dt
import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'gnews'

### Function

In [3]:
# set up the 2 week periods

first_day = np.ones(12, dtype = int)
middle_day = np.repeat(15, 12)
middle_day[1] = 14 # feb
last_day = np.tile([31, 30], 6)
last_day[7:12] = last_day[0:5]
last_day[1] = 28 # feb

start_days = []
end_days = []

for i in range(12):
    
    start_days.append(first_day[i])
    end_days.append(middle_day[i])
    
    start_days.append(middle_day[i])
    end_days.append(last_day[i])

months = np.repeat(range(12), 2) + 1

# print(start_days)
# print(end_days)
# print(months)

In [4]:
def get_headlines(year, keyword):
    """
    year: int 
    keyword: str, the company name
    """
    
    headlines_df = pd.DataFrame(columns = ["date", "title", "publisher"])
    
    for two_week_period in range(24):
    
        month = months[two_week_period]
        start_day = start_days[two_week_period]
        end_day = end_days[two_week_period]

        start = dt.datetime(year, month, start_day)
        end = dt.datetime(year, month, end_day)

        gnews = GNews(language = "en",
                      start_date = start, 
                      end_date = end)

        news_df = pd.DataFrame(gnews.get_news(keyword))

        if news_df.shape == (0, 0):
            print(f"No news between {start} and {end} for {keyword}.\n")
            continue

        news_df['date'] = pd.to_datetime(news_df['published date'])

        headlines_df = pd.concat([headlines_df, news_df[['date', 'title', 'publisher']].copy()],
                                 ignore_index = True)
    
    return headlines_df

### Retrieve data

Only run one cell at a time!

When running these cells, you will get errors about having no news for certain time periods. That's fine, don't re-run the cell. Keep it the way it is so we have records about when the headlines were missing. Just commit and push what you have from that one run.

One day later, it can be helpful to duplicate the cell, change the `range(2018, 2023+1)` to start from whichever year there is missing headlines, and run the code again.

In [None]:
# wafer: apple, amazon

In [None]:
# apple

company = "Apple"

for year in range(2018, 2023+1):

    headlines_df = get_headlines(year, company)

    file = "headlines/" + str(year) + "_" + company + "_headlines.csv"
    headlines_df.to_csv(file, index = False)

In [None]:
# amazon

company = "Amazon"

for year in range(2018, 2023+1):

    headlines_df = get_headlines(year, company)

    file = "headlines/" + str(year) + "_" + company + "_headlines.csv"
    headlines_df.to_csv(file, index = False)

In [None]:
# cindy: nvidia, microsoft

In [78]:
# nvidia

company = "Google"

for year in range(2018, 2023+1):

    headlines_df = get_headlines(year, company)

    file = "headlines/" + str(year) + "_" + company + "_headlines.csv"
    headlines_df.to_csv(file, index = False)

No news between 2023-07-15 00:00:00 and 2023-07-31 00:00:00 for Google.

No news between 2023-08-01 00:00:00 and 2023-08-15 00:00:00 for Google.

No news between 2023-08-15 00:00:00 and 2023-08-31 00:00:00 for Google.

No news between 2023-09-01 00:00:00 and 2023-09-15 00:00:00 for Google.

No news between 2023-09-15 00:00:00 and 2023-09-30 00:00:00 for Google.

No news between 2023-10-01 00:00:00 and 2023-10-15 00:00:00 for Google.

No news between 2023-10-15 00:00:00 and 2023-10-31 00:00:00 for Google.

No news between 2023-11-01 00:00:00 and 2023-11-15 00:00:00 for Google.

No news between 2023-11-15 00:00:00 and 2023-11-30 00:00:00 for Google.

No news between 2023-12-01 00:00:00 and 2023-12-15 00:00:00 for Google.

No news between 2023-12-15 00:00:00 and 2023-12-31 00:00:00 for Google.



In [6]:
# microsoft

company = "Microsoft"

for year in range(2018, 2023+1):

    headlines_df = get_headlines(year, company)

    file = "headlines/" + str(year) + "_" + company + "_headlines.csv"
    headlines_df.to_csv(file, index = False)

No news between 2023-07-15 00:00:00 and 2023-07-31 00:00:00 for Microsoft.

No news between 2023-08-01 00:00:00 and 2023-08-15 00:00:00 for Microsoft.

No news between 2023-08-15 00:00:00 and 2023-08-31 00:00:00 for Microsoft.

No news between 2023-09-01 00:00:00 and 2023-09-15 00:00:00 for Microsoft.

No news between 2023-09-15 00:00:00 and 2023-09-30 00:00:00 for Microsoft.

No news between 2023-10-01 00:00:00 and 2023-10-15 00:00:00 for Microsoft.

No news between 2023-10-15 00:00:00 and 2023-10-31 00:00:00 for Microsoft.

No news between 2023-11-01 00:00:00 and 2023-11-15 00:00:00 for Microsoft.

No news between 2023-11-15 00:00:00 and 2023-11-30 00:00:00 for Microsoft.

No news between 2023-12-01 00:00:00 and 2023-12-15 00:00:00 for Microsoft.

No news between 2023-12-15 00:00:00 and 2023-12-31 00:00:00 for Microsoft.



In [81]:
import warnings
warnings.filterwarnings("ignore")

AAPL = pd.DataFrame()
MSFT = pd.DataFrame()
AMZN = pd.DataFrame()
GOOG = pd.DataFrame()
NVDA = pd.DataFrame()
for year in range(2018, 2023+1):
    AMZN = AMZN.append([pd.read_csv('headlines/' + str(year) + '_Amazon_headlines.csv')],
              ignore_index = True)
    AAPL = AAPL.append([pd.read_csv('headlines/' + str(year) + '_Apple_headlines.csv')],
              ignore_index = True)
    GOOG = GOOG.append([pd.read_csv('headlines/' + str(year) + '_Google_headlines.csv')],
              ignore_index = True)
    MSFT = MSFT.append([pd.read_csv('headlines/' + str(year) + '_Microsoft_headlines.csv')],
              ignore_index = True)
    NVDA = NVDA.append([pd.read_csv('headlines/' + str(year) + '_Nvidia_headlines.csv')],
              ignore_index = True)

In [82]:
AMZN.insert(1,'ticker', 'AMZN')
AAPL.insert(1, 'ticker', 'AAPL')
GOOG.insert(1, 'ticker', 'GOOG')
MSFT.insert(1, 'ticker', 'MSFT')
NVDA.insert(1, 'ticker', 'NVDA')

In [109]:
headlines = pd.concat([AMZN, AAPL, GOOG, MSFT, NVDA], ignore_index = True)
headlines['date'] = pd.to_datetime(headlines['date']).dt.tz_convert(None)
headlines.sort_values(by = ['date','ticker'], ignore_index = True, inplace = True)

In [110]:
headlines['publisher'] = headlines['publisher'].apply(lambda x: ' '.join(x.split()[3:]).strip("'").strip("}")[:-1])
headlines

Unnamed: 0,date,ticker,title,publisher
0,2018-01-01 08:00:00,AAPL,3 tips to maximize Apple's free Pages word pro...,TechRepublic
1,2018-01-01 08:00:00,AAPL,The 20 best iOS games of 2017 - Macworld,Macworld
2,2018-01-01 08:00:00,AAPL,How to Switch Apple Watch Home Screen from Gri...,Wccftech
3,2018-01-01 08:00:00,AMZN,"Will Ferrell, Molly Shannon Tease Trump and Ti...",Hollywood Reporter
4,2018-01-01 08:00:00,AMZN,The Limits of Amazon - WSJ - The Wall Street J...,The Wall Street Journal
...,...,...,...,...
65704,2023-07-11 19:49:00,GOOG,Google's Searchbot Could Put Me Out of a Job -...,The Atlantic
65705,2023-07-11 20:08:55,GOOG,Google On Fixing Discovered Currently Not Inde...,Search Engine Journal
65706,2023-07-11 20:16:08,GOOG,"Google’s head of AR software quits, citing “un...",Ars Technica
65707,2023-07-11 23:30:19,GOOG,Google quietly ditched plans for an AI-powered...,CNBC


In [115]:
headlines.to_csv('headlines.csv', index = False)

# Add sentiment analysis to headlines dataset

In [5]:
import datetime as dt
import pandas as pd
import numpy as np

from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/amberlee/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [45]:
headlines = pd.read_csv('headlines.csv')
headlines.head()

Unnamed: 0,date,ticker,title,publisher
0,2018-01-01 08:00:00,AAPL,3 tips to maximize Apple's free Pages word pro...,TechRepublic
1,2018-01-01 08:00:00,AAPL,The 20 best iOS games of 2017 - Macworld,Macworld
2,2018-01-01 08:00:00,AAPL,How to Switch Apple Watch Home Screen from Gri...,Wccftech
3,2018-01-01 08:00:00,AMZN,"Will Ferrell, Molly Shannon Tease Trump and Ti...",Hollywood Reporter
4,2018-01-01 08:00:00,AMZN,The Limits of Amazon - WSJ - The Wall Street J...,The Wall Street Journal


### Cleaning

**Remove publisher tag -- this a rough solution that works for most tags**

In [46]:
headlines['title'][4].split(' - ')

['The Limits of Amazon', 'WSJ', 'The Wall Street Journal']

In [47]:
headlines['publisher'][4]

'The Wall Street Journal'

In [48]:
def remove_source_tag(headline):
    """
    after each headline, there is a " - [source name]" like " - New York Times". 
    
    this function removes the final occurence of " - ..."
    it doesn't perfectly remove the source tags, ie " - WSJ - The Wall Street Journal" becomes "WSJ"
    
    headline: string
    """
    
    split_title = headline.split(' - ')
    n_splits = len(split_title) - 1
    
    return(' '.join(split_title[0:n_splits]))
    

In [51]:
def clean_headlines(df, start_date='2018-01-01', end_date='2023-07-01'):
    
    df['date'] = pd.to_datetime(df['date'])
    df['date'] = df['date'].apply(lambda ts: ts.replace(hour=0, minute=0, second=0))

    # date range
    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)
    
    df = df.loc[(df['date'] >= start) & (df['date'] <= end)]
    
    # contains company names
    df = df.loc[pd.Series(
        df['title']
    ).str.contains("|".join(['Apple', 'Amazon', 'Google', 
                             'Microsoft','Nvidia']), case = False)]
    
    # remove duplicates
    df.drop_duplicates()
    
    df['title'] = df['title'].apply(lambda headline : remove_source_tag(headline))
    
    return df.reset_index(drop=True)

In [52]:
headlines = clean_headlines(headlines)

### Add scores!

In [53]:
def add_sentiment_scores(df):
    
    compound = []
    neg = []
    neu = []
    pos = []
    
    sid = SentimentIntensityAnalyzer()

    for headline in df['title']:
        ss = sid.polarity_scores(headline)

        compound.append(ss['compound'])
        neg.append(ss['neg'])
        neu.append(ss['neu'])
        pos.append(ss['pos'])
        
    df['compound'] = compound
    df['neg'] = neg
    df['neu'] = neu
    df['pos'] = pos
    
    return df

In [67]:
def wrangle_day(df):
    
    
    # number of headlines per day
    volume = df.groupby(['date', 'ticker']).count().iloc[:, 1]
    
    # take avg
    df = df.groupby(['date', 'ticker']).mean(numeric_only = True)
    df['volume'] = volume
    
    return df.reset_index().sort_values('date')
    

In [59]:
headlines = add_sentiment_scores(headlines)

In [56]:
headlines

Unnamed: 0,date,ticker,title,publisher,compound,neg,neu,pos
0,2018-01-01,AAPL,3 tips to maximize Apple's free Pages word pro...,TechRepublic,0.5106,0.000,0.708,0.292
1,2018-01-01,AAPL,How to Switch Apple Watch Home Screen from Gri...,Wccftech,0.0000,0.000,1.000,0.000
2,2018-01-01,AMZN,"Will Ferrell, Molly Shannon Tease Trump and Ti...",Hollywood Reporter,-0.3182,0.141,0.859,0.000
3,2018-01-01,AMZN,The Limits of Amazon WSJ,The Wall Street Journal,0.1779,0.000,0.702,0.298
4,2018-01-01,AMZN,Amazon Mistakenly Sends AWS Budget Emails Fore...,BleepingComputer,0.4215,0.167,0.455,0.379
...,...,...,...,...,...,...,...,...
56959,2023-07-01,AAPL,"New Apple Watch Ultra, 30-inch iMac make us sa...",Cult of Mac,0.0000,0.000,1.000,0.000
56960,2023-07-01,MSFT,"A quick look back at Microsoft Bob, which was ...",Neowin,-0.6249,0.215,0.785,0.000
56961,2023-07-01,AAPL,The Best Apple AirPods Models to Buy in 2023,IGN,0.6369,0.000,0.656,0.344
56962,2023-07-01,AAPL,How Google Beat Apple To AR And Still Failed,SlashGear,-0.5106,0.292,0.708,0.000


In [68]:
day_headlines = wrangle_day(headlines)
day_headlines

Unnamed: 0,date,ticker,compound,neg,neu,pos,volume
0,2018-01-01,AAPL,0.255300,0.000000,0.854000,0.146000,2
1,2018-01-01,AMZN,0.070300,0.077000,0.754000,0.169250,4
2,2018-01-01,GOOG,0.249680,0.000000,0.843000,0.157000,5
3,2018-01-01,MSFT,0.484700,0.081500,0.594500,0.324000,2
4,2018-01-02,AAPL,0.064093,0.093143,0.815571,0.091286,14
...,...,...,...,...,...,...,...
9263,2023-07-01,GOOG,-0.061820,0.071200,0.888800,0.040000,5
9264,2023-07-01,MSFT,-0.129689,0.067333,0.932667,0.000000,9
9261,2023-07-01,AAPL,0.112262,0.022462,0.897308,0.080231,13
9262,2023-07-01,AMZN,0.253340,0.040400,0.700600,0.258900,10


In [69]:
day_headlines.to_csv('daily_sentiment.csv', index=False)