In [2]:
import os
import pandas as pd
%matplotlib inline
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/anna/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

Use news api to get all news articles and headline sentiment related to Covid-19 or Coronavirus

In [4]:
# Read api key environment variable
from dotenv import load_dotenv
load_dotenv()
api_key = os.getenv("newsapikey")

In [5]:
import newsapi

In [6]:
# Create a newsapi client
from newsapi import NewsApiClient
newsapi = NewsApiClient(api_key=api_key)


In [7]:
# Set start and end datetimes of for 30 days allowed access to articles
from datetime import datetime, timedelta
end_date = datetime.now()
start_date = end_date + timedelta(-30)
end_date=end_date.strftime("%Y-%m-%d")
start_date=start_date.strftime("%Y-%m-%d")


In [8]:
# Fetch the Covid19 news articles
covid19_news = newsapi.get_everything(
    from_param=start_date,
    to=end_date,
    q="Covid 19",
    language="en",
    page_size=100,
    sort_by="relevancy"
)
# Print number articles found for reference
print(f"Total articles related to Covid-19: {covid19_news['totalResults']}")

Total articles related to Covid-19: 263672


In [9]:
# Fetch the Coronavirus news articles
corona_news = newsapi.get_everything(
    from_param=start_date,
    to=end_date,
    q="coronavirus",
    language="en",
    page_size=100,
    sort_by="relevancy"
)
# Print number articles found for reference 
print(f"Total articles related to Coronavirus: {corona_news['totalResults']}")

Total articles related to Coronavirus: 264587


In [10]:
# Create the Covid-19 and Coronavirus combined sentiment scores DataFrame
#covid-19 sentiment list to dataframe
covid19_sentiment_list=[]
for article in covid19_news["articles"]:        
   try: 
        text = article["content"]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        scores={"Compound":compound, "Negative":neg, "Neutral":neu, "Positive":pos, "text":text}
        covid19_sentiment_list.append(scores)
        
   except: 
        pass

covid19_sentiment_df=pd.DataFrame(covid19_sentiment_list)
covid19_sentiment_df.head()

Unnamed: 0,Compound,Negative,Neutral,Positive,text
0,0.8934,0.0,0.722,0.278,"Do you remember earlier this year, when people..."
1,0.0772,0.0,0.966,0.034,In addition to the tally of confirmed COVID-19...
2,-0.2924,0.056,0.944,0.0,"Airbus is, of course, one of the world’s two m..."
3,-0.7717,0.193,0.807,0.0,"As a workplace strategist, I am constantly ask..."
4,0.3304,0.071,0.847,0.082,"Most researchers, however, dont believe a huma..."


In [11]:
# Create the corona sentiment scores DataFrame
corona_sentiment_list=[]
for article in corona_news["articles"]:        
   try: 
        text = article["content"]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        scores={"Compound":compound, "Negative":neg, "Neutral":neu, "Positive":pos, "text":text}
        corona_sentiment_list.append(scores)
        
   except: 
        pass

corona_sentiment_df=pd.DataFrame(corona_sentiment_list)
corona_sentiment_df.tail()

Unnamed: 0,Compound,Negative,Neutral,Positive,text
91,0.1779,0.073,0.831,0.096,Image copyrightGeoff Adams-Spink\r\nMany disab...
92,-0.5423,0.193,0.731,0.076,Media captionProtesters overturn car to use as...
93,-0.3182,0.086,0.836,0.078,One of the first national coronavirus contacts...
94,0.4019,0.053,0.849,0.098,Kenyans Geoffrey Kamworor and Joyciline Jepkos...
95,-0.4588,0.086,0.914,0.0,Media captionImages from last year's commemora...


In [12]:
#Combine the data frames to one large Covid-19/Coronavirus Dataframe
#use frames to join the dataframes at the last row
frames=[covid19_sentiment_df, corona_sentiment_df]
allcovidnews_sentiment_df=pd.concat(frames)
allcovidnews_sentiment_df.head()


Unnamed: 0,Compound,Negative,Neutral,Positive,text
0,0.8934,0.0,0.722,0.278,"Do you remember earlier this year, when people..."
1,0.0772,0.0,0.966,0.034,In addition to the tally of confirmed COVID-19...
2,-0.2924,0.056,0.944,0.0,"Airbus is, of course, one of the world’s two m..."
3,-0.7717,0.193,0.807,0.0,"As a workplace strategist, I am constantly ask..."
4,0.3304,0.071,0.847,0.082,"Most researchers, however, dont believe a huma..."


In [13]:
allcovidnews_sentiment_df.tail()

Unnamed: 0,Compound,Negative,Neutral,Positive,text
91,0.1779,0.073,0.831,0.096,Image copyrightGeoff Adams-Spink\r\nMany disab...
92,-0.5423,0.193,0.731,0.076,Media captionProtesters overturn car to use as...
93,-0.3182,0.086,0.836,0.078,One of the first national coronavirus contacts...
94,0.4019,0.053,0.849,0.098,Kenyans Geoffrey Kamworor and Joyciline Jepkos...
95,-0.4588,0.086,0.914,0.0,Media captionImages from last year's commemora...


In [16]:
# Describe the  Sentiment Related to Covid19/Coronavirus
allsentiment_df=allcovidnews_sentiment_df.describe()
allsentiment_df

Unnamed: 0,Compound,Negative,Neutral,Positive
count,193.0,193.0,193.0,193.0
mean,0.029748,0.056254,0.877922,0.065819
std,0.430906,0.061729,0.087105,0.068947
min,-0.9042,0.0,0.628,0.0
25%,-0.3182,0.0,0.829,0.0
50%,0.0,0.053,0.885,0.057
75%,0.3818,0.093,0.943,0.098
max,0.8934,0.311,1.0,0.278


---

# Tokenizer

Use NLTK to get token words

In [18]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [19]:
# Expand the default stopwords list if necessary
stop_words=set(stopwords.words('english'))

In [20]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/anna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
# Complete the tokenizer function
def tokenizer(sentiment_data):
   # """Tokenizes text."""
    tokens_list=[] 
    for row in sentiment_data['text']: 
        text_block=row
        sentence_tokenized=sent_tokenize(text_block)
        # Create a list of the words
        tokenized_words=[word_tokenize(sentence) for sentence in sentence_tokenized]

        # Convert the words to lowercase
        first_result=[word.lower() for word in tokenized_words[0] if word.lower() not in stop_words]
        #print(first_result)

        # Remove the punctuation
        second_result= [word for word in first_result if word.isalnum()]

        # Remove the stop words
        third_result= [word for word in second_result if not word in stop_words]

        # Lemmatize Words into root words
        # Instantiate the lemmatizer
        lemmatizer = WordNetLemmatizer()
        lem_words=[lemmatizer.lemmatize(word) for word in third_result]

        tokens={"token":lem_words}
        tokens_list.append(tokens)  
        
    return tokens_list

In [22]:
allcovid_tokens=tokenizer(allcovidnews_sentiment_df)
allcovid_tokens[0]

{'token': ['remember',
  'earlier',
  'year',
  'people',
  'sure',
  'pandemic',
  'would',
  'end',
  'weather',
  'warmed']}

In [23]:
# Create a new tokens column for covid19/coronavirus
from pandas import DataFrame
allcovid_tokens_df=pd.DataFrame(allcovid_tokens)
allcovid_tokens_df.head()

Unnamed: 0,token
0,"[remember, earlier, year, people, sure, pandem..."
1,"[addition, tally, confirmed, case, secondary, ..."
2,"[official, initially, wanted, track, location,..."
3,"[workplace, strategist, constantly, asked, wor..."
4,"[smartphones, set, play, significant, role, he..."


In [24]:
# Create a new tokens column for covid19/coronavirus and join to sentiment df
allcovidnews_sentiment_df=pd.concat([allcovidnews_sentiment_df,allcovid_tokens_df], axis=1, join="inner")
allcovidnews_sentiment_df.head()

Unnamed: 0,Compound,Negative,Neutral,Positive,text,token
0,0.8934,0.0,0.722,0.278,"Do you remember earlier this year, when people...","[remember, earlier, year, people, sure, pandem..."
1,0.0772,0.0,0.966,0.034,In addition to the tally of confirmed COVID-19...,"[addition, tally, confirmed, case, secondary, ..."
2,-0.886,0.278,0.722,0.0,Officials initially wanted to track the locati...,"[official, initially, wanted, track, location,..."
3,-0.7717,0.193,0.807,0.0,"As a workplace strategist, I am constantly ask...","[workplace, strategist, constantly, asked, wor..."
4,0.6597,0.0,0.833,0.167,Our smartphones are set to play a significant ...,"[smartphones, set, play, significant, role, he..."


---

# NGrams and Frequency Analysis

Find top 5 words related to covid19 and coronavirus

In [25]:
from collections import Counter
from nltk import ngrams

In [26]:
allcovid_tokens[0]

{'token': ['remember',
  'earlier',
  'year',
  'people',
  'sure',
  'pandemic',
  'would',
  'end',
  'weather',
  'warmed']}

In [27]:
#trying to make list of dictionarys containing strings as values into one big string
#get list of all tokens as a single string for covid-19 and coronavirus articles
allcovid_big_token_list=[]
for dictionary in allcovid_tokens:
    allcovid_big_token_list.extend(dictionary['token'])


In [28]:
def ngram_count(token_list): 
    # Make all articles in the text to one single string
    bigrams = ngrams(token_list, n=2)
    top_5 = dict(Counter(bigrams).most_common(5))
    return list(top_5.items())

In [29]:
# Generate the Covid-19/Coronavirus N-grams where N=2
ngram_count(allcovid_big_token_list)

[(('char', 'image'), 25),
 (('coronavirus', 'pandemic'), 13),
 (('image', 'copyrightgetty'), 13),
 (('chat', 'u'), 11),
 (('u', 'facebook'), 11)]

In [30]:
# using token_count function to generate top 5 words related to covid/corona news
def token_count(token_list, N=5):
    """Returns the top N tokens from the frequency count"""
    # Combine all articles in corpus into one large string
    return Counter(token_list).most_common(N)

In [31]:
# Get the top 5 words for Covid19/Coronavirus
allcovid_top5=token_count(allcovid_big_token_list,N=5)
allcovid_top5

[('char', 67),
 ('coronavirus', 63),
 ('image', 49),
 ('caption', 37),
 ('pandemic', 35)]

In [17]:
allsentiment_df

Unnamed: 0,Compound,Negative,Neutral,Positive
count,193.0,193.0,193.0,193.0
mean,0.029748,0.056254,0.877922,0.065819
std,0.430906,0.061729,0.087105,0.068947
min,-0.9042,0.0,0.628,0.0
25%,-0.3182,0.0,0.829,0.0
50%,0.0,0.053,0.885,0.057
75%,0.3818,0.093,0.943,0.098
max,0.8934,0.311,1.0,0.278


In [22]:
allsentiment_df['Negative'][7]

0.311

In [25]:
# Construct a trading signal based on all covid news sentiment
sentiment_signal_df=[]
if allsentiment_df['Neutral'][1] > (allsentiment_df['Positive'][1] and allsentiment_df['Negative'][1]):
    sentiment_signal = 0.0
elif allsentiment_df['Negative'][1] > (allsentiment_df['Positive'][1] and allsentiment_df['Neutral'][1]):
    sentiment_signal= -1.0
elif allsentiment_df['Positive'][1] > (allsentiment_df['Negative'][1] and allsentiment_df['Netural'][1]):
    sentiment_signal = 1.0
