# Trump Pandemic Tweet Sentiment Analysis

In [25]:
import pandas_market_calendars as mcal
import re
import os
from path import Path
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np
import hvplot.pandas

In [26]:
# Download/Update the VADER Lexicon.
# When the package is already present locally, nltk only reports that it is
# up to date (see the log output of this cell).
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer; this single instance is reused by the
# scoring cell further down to compute polarity scores for each tweet.
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/baileycameron/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Dataset Cleaning Part 1
The loaded dataset of Trump tweets contains duplicate and missing values. Duplicate values are caused by stalls in the scraping program, while missing values are due to tweets that were censored by Twitter.

In [27]:
# Load the scraped Trump tweets (file covers January 2020 through September 30, 2020).
# The 'Time' column is intentionally left as a raw ISO-8601 string: the next cell
# slices it by character position. The former `parse_dates=True` /
# `infer_datetime_format=True` arguments were dropped because, without an
# `index_col`, they parsed nothing, and `infer_datetime_format` is deprecated.
trump_df = pd.read_csv('trump_tweets_Jan_2020_Sep_30_2020.csv')

# The CSV carries its old positional index as an unnamed first column.
trump_df = trump_df.drop(columns="Unnamed: 0")

# Scraper stalls produced repeated rows; keep the last copy of each (Time, Tweet) pair.
trump_df = trump_df.drop_duplicates(subset=['Time', 'Tweet'], keep='last')

# Remove rows with any missing value (e.g. tweets whose text was unavailable).
trump_df = trump_df.dropna()
trump_df.tail()

Unnamed: 0,Time,Tweet
2442,2020-01-03T13:54:43.000Z,General Qassem Soleimani has killed or badly w...
2443,2020-01-03T12:44:30.000Z,"Iran never won a war, but never lost a negotia..."
2444,2020-01-02T13:58:01.000Z,A lot of very good people were taken down by a...
2445,2020-01-02T13:42:41.000Z,"Sohrab Ahmari, New York Post “The Trump Campai..."
2446,2020-01-01T01:30:35.000Z,HAPPY NEW YEAR!


In [28]:
# cleanup 'Time' column

def split_timestamps(df):
    """Split the raw 'Time' string column into 'Date' and 'Time Stamp' columns.

    Each 'Time' value is treated as text laid out like
    ``2020-01-03T13:54:43.000Z``: characters 0-9 become the date
    (``YYYY-MM-DD``) and characters 11-18 become the clock time (``HH:MM:SS``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Time' column. It is modified in place (the two new
        columns are added to it), matching the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with 'Date' and 'Time Stamp' columns.
    """
    # Work on the frame that was passed in. The previous implementation looped
    # over the global `trump_df`, which silently ignored the `df` argument.
    stamps = df['Time'].astype(str)

    # Positional slices instead of datetime parsing, so the outputs stay strings.
    df['Date'] = stamps.str[:10]
    df['Time Stamp'] = stamps.str[11:19]

    return df

# Add the 'Date' / 'Time Stamp' columns, then discard the raw 'Time' column.
# 'Date' stays a plain string here; cells that need real datetimes convert it
# on their own copy.
trump_df = split_timestamps(trump_df).drop(columns='Time')
trump_df.tail()

Unnamed: 0,Tweet,Date,Time Stamp
2442,General Qassem Soleimani has killed or badly w...,2020-01-03,13:54:43
2443,"Iran never won a war, but never lost a negotia...",2020-01-03,12:44:30
2444,A lot of very good people were taken down by a...,2020-01-02,13:58:01
2445,"Sohrab Ahmari, New York Post “The Trump Campai...",2020-01-02,13:42:41
2446,HAPPY NEW YEAR!,2020-01-01,01:30:35


In [10]:
# Constant column of ones, one entry per tweet row.
trump_df['counter'] = [1] * len(trump_df)

-----------
## Identifying Pandemic Tweets
Not every tweet is about the pandemic. To identify pandemic-related tweets, a case-insensitive regex was used to flag any tweet containing at least one of the following keywords:
* china
* flu
* vaccine
* covid
* virus
* corona
* chinese
* pharma

In [413]:
# Flag pandemic-related tweets.
# NOTE(review): the pattern is an unanchored substring match, so 'flu' also
# hits words such as "influence" - confirm this is acceptable.
flu_pattern = re.compile(r'(china|flu|vaccine|covid|virus|corona|chinese|pharma)')

# 1 when the lower-cased tweet text contains any keyword, otherwise 0.
flu_column = [
    1 if flu_pattern.search(str(tweet).lower()) else 0
    for tweet in trump_df['Tweet']
]
trump_df['COVID_Tweet'] = flu_column

# Keep only the flagged (pandemic) tweets for sentiment scoring.
vaccine_df = trump_df.loc[trump_df['COVID_Tweet'] == 1]
vaccine_df.head()

Unnamed: 0,Tweet,Date,Time Stamp,COVID_Tweet
9,Many more people would have died from the Chin...,2020-09-30,18:01:48,1
37,Governor Andrew Cuomo of New York wants to put...,2020-09-25,21:58:06,1
53,White House News Conference today at 6:00 P.M....,2020-09-23,13:21:10,1
77,Joe Biden delivered remarks to union members a...,2020-09-18,22:58:15,1
78,Joe Biden says this is a race between Scranton...,2020-09-18,22:56:18,1


---------------------
## Preliminary Tweet Analysis
Preliminary analysis was done to determine whether pandemic-related tweets occurred frequently enough to justify 'pandemic tweets' as a feature in the subsequent machine learning models.

In [429]:
# Compare the running total of all tweets with the running total of pandemic tweets.
graph_df = (
    trump_df
    .assign(Date=pd.to_datetime(trump_df['Date']))  # real datetimes for the x-axis
    .sort_values(by='Date')
    .set_index('Date')
)

# Running count of flagged tweets in chronological order.
graph_df['Cumulative COVID Tweets'] = graph_df['COVID_Tweet'].cumsum()

# Despite the label, this is a running count of every tweet (1, 2, ..., N).
graph_df['Daily Tweet'] = list(range(1, len(trump_df) + 1))

graph_df.hvplot(
    y=['Daily Tweet', 'Cumulative COVID Tweets'],
    title='Cumulative Tally of Trump Daily Tweets to Pandemic Tweets',
    width=700,
    height=400,
    xticks=6,
)


In [415]:
# Per-date totals of the numeric columns. `numeric_only=True` keeps the text
# columns ('Tweet', 'Time Stamp') out of the aggregation; without it, recent
# pandas versions would concatenate every tweet string for each day.
daily_covid_tweets = trump_df.groupby('Date').sum(numeric_only=True)

# NOTE(review): this counts dates that have at least one tweet in the dataset,
# not calendar days, so it can be smaller than the length of the period.
total_days = len(daily_covid_tweets.index)

# Dates on which at least one tweet was flagged as pandemic-related.
COVID_days = len(daily_covid_tweets[daily_covid_tweets['COVID_Tweet'] != 0])

print(f'Total days from January 1, 2020 to September 30, 2020: {total_days}')
print(f'Number of days which contain pandemic tweets: {COVID_days}\n')

print(f'Precentage of tweets about pandemic to total tweets: {round((len(vaccine_df)/len(trump_df))*100,2)}%')
print(f'Percentage of days tweeted about pandemic to total days: {round(COVID_days/total_days*100,2)}%\n')

Total days from January 1, 2020 to September 30, 2020: 271
Number of days which contain pandemic tweets: 126

Precentage of tweets about pandemic to total tweets: 11.42%
Percentage of days tweeted about pandemic to total days: 46.49%




**Based on the above analysis, it was decided that there were enough pandemic related tweets to serve as a feature in the subsequent machine learning models.**

--------------
## Calculating Sentiment Score
The following section uses VADER sentiment analysis to score each pandemic tweet, producing compound, positive, negative, and neutral scores.

In [431]:
# Score every pandemic tweet with VADER and collect one record per tweet.
sentiments = []

for text, date in zip(vaccine_df['Tweet'], vaccine_df['Date']):
    try:
        sentiment = analyzer.polarity_scores(text)
    except AttributeError:
        # Best-effort: a tweet the analyzer cannot process is skipped silently.
        continue

    sentiments.append({
        "text": text,
        "date": date,
        "compound": sentiment["compound"],
        "positive": sentiment["pos"],
        "negative": sentiment["neg"],
        "neutral": sentiment["neu"],
    })

# One row per scored tweet
sentiment_df = pd.DataFrame(sentiments)

sentiment_df.head()

Unnamed: 0,text,date,compound,positive,negative,neutral
0,Many more people would have died from the Chin...,2020-09-30,-0.6114,0.072,0.124,0.804
1,Governor Andrew Cuomo of New York wants to put...,2020-09-25,0.8553,0.17,0.0,0.83
2,White House News Conference today at 6:00 P.M....,2020-09-23,0.8188,0.32,0.0,0.68
3,Joe Biden delivered remarks to union members a...,2020-09-18,0.34,0.094,0.0,0.906
4,Joe Biden says this is a race between Scranton...,2020-09-18,-0.6476,0.0,0.115,0.885


---------
## Data Cleaning - Part 2
Since Trump may post multiple pandemic tweets in one day, the sentiment scores were averaged per day. On days where Trump did not post a pandemic tweet, missing values were added.

In [432]:
# Average the sentiment scores of all pandemic tweets posted on the same date.
# The columns are selected with a list: selecting groupby columns with a bare
# tuple of names is deprecated and rejected by newer pandas versions.
score_columns = ['compound', 'positive', 'negative', 'neutral']
daily_sent_df = sentiment_df.groupby('date')[score_columns].mean().reset_index()

# Empty frame holding every date present in the tweet dataset.
# NOTE(review): this only covers dates with at least one tweet of any kind;
# a date with no tweets at all does not appear here.
all_dates = trump_df['Date'].unique()
placeholder = pd.DataFrame(all_dates, columns=['date'])

# Outer-join on the date so that dates without a pandemic tweet show up as NaN rows.
placeholder = placeholder.set_index('date')
daily_sent_df = daily_sent_df.set_index('date')
df = pd.concat([placeholder, daily_sent_df], axis=1, join='outer')

# Dates are 'YYYY-MM-DD' strings, so lexical order is chronological order.
df = df.sort_index()
df.head(16)

  


Unnamed: 0,compound,positive,negative,neutral
2020-01-01,,,,
2020-01-02,,,,
2020-01-03,,,,
2020-01-04,,,,
2020-01-05,,,,
2020-01-06,,,,
2020-01-07,,,,
2020-01-08,,,,
2020-01-09,,,,
2020-01-10,,,,


Missing values were filled in according to the last available data point.

In [433]:
# Carry the most recent daily sentiment forward over dates without a pandemic tweet.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
df_clean = df.ffill()

# Dates before the first pandemic tweet have nothing to carry forward, so they
# are scored 0 (per the original note: Jan 1 to Jan 15, no virus tweets).
df_clean = df_clean.fillna(0)
df_clean.head(16)

Unnamed: 0,compound,positive,negative,neutral
2020-01-01,0.0,0.0,0.0,0.0
2020-01-02,0.0,0.0,0.0,0.0
2020-01-03,0.0,0.0,0.0,0.0
2020-01-04,0.0,0.0,0.0,0.0
2020-01-05,0.0,0.0,0.0,0.0
2020-01-06,0.0,0.0,0.0,0.0
2020-01-07,0.0,0.0,0.0,0.0
2020-01-08,0.0,0.0,0.0,0.0
2020-01-09,0.0,0.0,0.0,0.0
2020-01-10,0.0,0.0,0.0,0.0


Sentiments that were recorded on non-trading days were removed from the dataset. This was done to align with the stock price availability used in the machine learning models.

In [434]:
# Restrict the daily compound score to NYSE trading days.
nyse = mcal.get_calendar('NYSE')

# Trading sessions in the study window, formatted like the string dates that
# index df_clean; to_frame() yields a frame indexed by those strings with a
# single helper column named 0.
valid_days = (
    nyse.valid_days(start_date='2020-01-01', end_date='2020-09-30')
    .strftime('%Y-%m-%d')
    .to_frame()
)

# The inner join keeps only dates present in both frames; the helper column is
# then dropped so only 'compound' remains.
df_final = pd.concat(
    [valid_days, df_clean[['compound']].copy()],
    join='inner',
    axis=1,
).drop(columns=0)

df_final.head(16)


Unnamed: 0,compound
2020-01-02,0.0
2020-01-03,0.0
2020-01-06,0.0
2020-01-07,0.0
2020-01-08,0.0
2020-01-09,0.0
2020-01-10,0.0
2020-01-13,0.0
2020-01-14,0.0
2020-01-15,0.0


Exported the resulting CSV file for the machine learning models.

In [420]:
# Export the trading-day compound sentiment to CSV for the downstream models.
# The date index is written as the first column of the file.
df_final.to_csv('trump_sentiment_Jan_2020_Sep_30_2020.csv')

-------
## Data Analysis - Part 2
The final dataframe was graphed to check for trends in the sentiment. 

Using a 10-day rolling window, the sentiment analysis shows that Trump's overall sentiment is trending negative.

In [436]:
# Plot the daily compound score together with its 10-row rolling mean.
graph2_df = df_final.copy()
graph2_df.index = pd.to_datetime(graph2_df.index)  # real datetimes for the x-axis

# The first 9 rows of 'trend' are NaN because the window is not yet full.
graph2_df = graph2_df.assign(
    trend=graph2_df['compound'].rolling(window=10).mean()
)

graph2_df.hvplot(
    y=['compound', 'trend'],
    height=500,
    width=900,
    title='Trump Daily Sentiment Score - Jan 2020 to Sep 30, 2020',
)
