In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import os
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt

In [None]:
# Parse every saved page under 'datasets' and keep the element whose id is
# 'news-table', keyed by the file name it came from.
html_tables = {}
for table_name in os.listdir('datasets'):
    table_path = os.path.join('datasets', table_name)
    # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints);
    # open() would raise on those, so only regular files are parsed.
    if not os.path.isfile(table_path):
        continue
    # BeautifulSoup reads the whole file while constructing the tree, so the
    # handle can be closed as soon as the constructor returns.
    with open(table_path, 'r') as table_file:
        html = BeautifulSoup(table_file)
    # find() returns None when the page has no element with this id.
    html_table = html.find(id='news-table')
    html_tables[table_name] = html_table

tsla = html_tables['tsla_22sep.html']
# Get all the table rows tagged in HTML with <tr> into 'tsla_tr'
tsla_tr = tsla.find_all("tr")

# Preview the first four rows so the structure of one row is visible before
# the full parse.
for i, table_row in enumerate(tsla_tr):
    # Read the text of the element 'a' into 'link_text'
    link_text = table_row.a.get_text()
    # Read the text of the element 'td' into 'data_text'
    data_text = table_row.td.get_text()
    # Show what was extracted; without this the loop has no visible effect.
    print(f'Row {i + 1}:')
    print(link_text)
    print(data_text)
    if i == 3:
        break

In [None]:
# Flatten every news table into [ticker, date, time, headline] records.
parsed_news = []
for file_name, news_table in html_tables.items():
    # A page without a 'news-table' element is stored as None; nothing to parse.
    if news_table is None:
        continue
    # The ticker is the part of the file name before the first underscore.
    # It is the same for every row of the table, so compute it once.
    ticker = file_name[:file_name.index("_")]
    # Reset per table: a row that carries only a time reuses the date of the
    # row before it, and that date must never leak in from the previous file.
    date = None
    for x in news_table.find_all('tr'):
        # Skip rows that have no link or no cell; they hold no headline.
        if x.a is None or x.td is None:
            continue
        headline = x.a.get_text()
        # Split the text in the td tag into a list
        date_scrape = x.td.text.split()
        if not date_scrape:
            continue
        if len(date_scrape) == 1:
            # Only a time is present; keep the date seen on an earlier row.
            time = date_scrape[0]
        else:
            date = date_scrape[0]
            time = date_scrape[1]
        parsed_news.append([ticker, date, time, headline])

In [2]:
# Extra word -> valence entries merged into the analyzer's lexicon before
# any headline is scored.
new_words = dict(crushes=10, beats=5, misses=-5, trouble=-10, falls=-100)
vader = SentimentIntensityAnalyzer()
vader.lexicon.update(new_words)

# One row per parsed headline.
columns = ['ticker', 'date', 'time', 'headline']
scored_news = pd.DataFrame(parsed_news, columns=columns)

# Score each headline; every result is a dict, expanded below into one
# column per key (the keys are taken from the first result).
scores = scored_news['headline'].apply(vader.polarity_scores)
scores_df = pd.DataFrame(
    [score.values() for score in scores],
    columns=scores[0].keys(),
)

# Attach the score columns side by side, then reduce 'date' to plain
# datetime.date objects.
scored_news = pd.concat([scored_news, scores_df], axis=1)
scored_news['date'] = pd.to_datetime(scored_news['date']).dt.date

In [None]:
plt.style.use("fivethirtyeight")
%matplotlib inline

mean_c = scored_news.groupby(['date','ticker']).mean()
mean_c = mean_c.unstack(level='ticker')
mean_c = mean_c.xs('compound',axis=1)
mean_c.plot.bar()

In [None]:
# Count headlines before and after removing repeats of the same headline for
# the same ticker.
num_news_before = len(scored_news['headline'])
scored_news_clean = scored_news.drop_duplicates(subset=['ticker','headline'])
num_news_after = len(scored_news_clean['headline'])

# Select the rows for ticker 'fb' on 2019-01-03 with a boolean mask.
# 'date' holds datetime.date objects, so it is compared against a date, not a
# string; indexing the frame with the string '2019-01-03' would be treated as
# a column lookup and raise KeyError.
target_date = pd.Timestamp('2019-01-03').date()
is_fb_on_day = (
    (scored_news_clean['ticker'] == 'fb')
    & (scored_news_clean['date'] == target_date)
)

# 'compound' is intentionally kept here: a bare drop('compound') would look
# for a row label of that name, and the plotting cell removes the column.
# assign() builds a new frame, so nothing is written into a slice of
# scored_news_clean.
single_day = (
    scored_news_clean.loc[is_fb_on_day]
    .drop(['ticker', 'date'], axis=1)
    .assign(time=lambda day: pd.to_datetime(day['time']).dt.time)
    .set_index('time')
    .sort_index()
)

In [None]:
TITLE = "Positive, negative and neutral sentiment for FB on 2019-01-03"
COLORS = ["red", "orange", "green"]

# Drop the columns that are not plotted, then relabel the remaining ones by
# position (presumably neg / neu / pos in that order -- confirm against
# single_day.columns).
plot_day = single_day.drop(columns=['headline', 'compound'])
plot_day.columns = ['negative', 'neutral', 'positive']

# One stacked bar per index entry; the legend is anchored outside the axes.
day_ax = plot_day.plot.bar(
    stacked=True,
    figsize=(10, 6),
    title=TITLE,
    color=COLORS,
)
day_ax.legend(bbox_to_anchor=(1.2, 0.5))