# Sentiment Analysis of COVID-19 Tweets: When did the Public Panic Set In?

    Notebook by Allison Kelly - allisonkelly42@gmail.com
    

# Imports

In [1]:
%matplotlib inline

# Generic Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, time

# Get JSON
import json

# Text preprocessing libraries
import string
import contractions
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# Obtain Data

View method to obtain data <a href="https://github.com/akelly66/COVID-Tweet-Sentiment/blob/master/tweet-scraping/COVID-tweets-true.ipynb">here</a>. <br>
<br>The tweet query parameters were as follows:

- <b>Keywords: </b> "coronavirus OR Wuhan virus OR 2019-nCoV OR China flu"<br>
- <b>Date Range: </b> 28 Jan 2020 - 03 Feb 2020<br>
- <b>Location:</b> United States of America<br><br>


In [2]:
df = pd.read_csv("expanded_query_tweets.csv")
df.drop_duplicates(inplace=True)
df = df.query("lang == 'en'")
df.head()

Unnamed: 0,created_at,id,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,lang,matching_rules,possibly_sensitive,quoted_status_id,quoted_status_id_str,quoted_status,quoted_status_permalink,display_text_range,extended_tweet,extended_entities
1,Sun Feb 02 23:59:59 +0000 2020,1224120307717410816,1224120307717410816,RT @QuestForSense: Amazing Timelapse as China ...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,en,[{'tag': None}],,,,,,,,
2,Sun Feb 02 23:59:58 +0000 2020,1224120306668883971,1224120306668883971,RT @selinawangtv: Bloomberg SCOOP on #coronavi...,"<a href=""http://twitter.com/#!/download/ipad"" ...",False,,,,,...,en,[{'tag': None}],,,,,,,,
3,Sun Feb 02 23:59:58 +0000 2020,1224120305431375872,1224120305431375872,RT @Marfoogle: I have become Ill. But no worri...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,en,[{'tag': None}],,,,,,,,
4,Sun Feb 02 23:59:58 +0000 2020,1224120305322467329,1224120305322467329,RT @NPRHealth: U.S. Hospitals Unprepared For A...,"<a href=""http://twitter.com/#!/download/ipad"" ...",False,,,,,...,en,[{'tag': None}],False,,,,,,,
5,Sun Feb 02 23:59:58 +0000 2020,1224120304731029504,1224120304731029504,"RT @SecAzar: At this time, the risk to America...","<a href=""http://twitter.com/download/android"" ...",False,,,,,...,en,[{'tag': None}],,1.223347e+18,1.223347e+18,{'created_at': 'Fri Jan 31 20:47:02 +0000 2020...,"{'url': 'https://t.co/eb4YN1H7QN', 'expanded':...",,,


In [3]:
print(len(df))
print(df.info())
df.describe()

2375
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2375 entries, 1 to 4396
Data columns (total 36 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   created_at                 2375 non-null   object 
 1   id                         2375 non-null   int64  
 2   id_str                     2375 non-null   int64  
 3   text                       2375 non-null   object 
 4   source                     2375 non-null   object 
 5   truncated                  2375 non-null   bool   
 6   in_reply_to_status_id      108 non-null    float64
 7   in_reply_to_status_id_str  108 non-null    float64
 8   in_reply_to_user_id        117 non-null    float64
 9   in_reply_to_user_id_str    117 non-null    float64
 10  in_reply_to_screen_name    112 non-null    object 
 11  user                       2375 non-null   object 
 12  geo                        0 non-null      object 
 13  coordinates                0 non-null      

Unnamed: 0,id,id_str,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,contributors,quote_count,reply_count,retweet_count,favorite_count,quoted_status_id,quoted_status_id_str
count,2375.0,2375.0,108.0,108.0,117.0,117.0,0.0,2375.0,2375.0,2375.0,2375.0,189.0,189.0
mean,1.223425e+18,1.223425e+18,1.223349e+18,1.223349e+18,2.647967e+17,2.647967e+17,,0.065684,0.134737,0.506526,1.038316,1.222812e+18,1.222812e+18
std,145045700000000.0,145045700000000.0,252446100000000.0,252446100000000.0,4.316935e+17,4.316935e+17,,1.009173,1.312387,6.461853,9.356226,2511830000000000.0,2511830000000000.0
min,1.223394e+18,1.223394e+18,1.221944e+18,1.221944e+18,786764.0,786764.0,,0.0,0.0,0.0,0.0,1.190002e+18,1.190002e+18
25%,1.223394e+18,1.223394e+18,1.223345e+18,1.223345e+18,70541620.0,70541620.0,,0.0,0.0,0.0,0.0,1.222912e+18,1.222912e+18
50%,1.223395e+18,1.223395e+18,1.223391e+18,1.223391e+18,1344897000.0,1344897000.0,,0.0,0.0,0.0,0.0,1.223229e+18,1.223229e+18
75%,1.223395e+18,1.223395e+18,1.223393e+18,1.223393e+18,7.302201e+17,7.302201e+17,,0.0,0.0,0.0,0.0,1.223373e+18,1.223373e+18
max,1.22412e+18,1.22412e+18,1.224119e+18,1.224119e+18,1.202716e+18,1.202716e+18,,37.0,36.0,176.0,218.0,1.224101e+18,1.224101e+18


In [4]:
import ast
test = ast.literal_eval(df.retweeted_status[1])

In [5]:
test['extended_tweet']['full_text']

'Amazing Timelapse as China Completes First of Two Hospitals in Wuhan within 10 days having 1,000 beds and 1,400 medical staff to treat those infected with the #coronavirus #CoronavirusOutbreak https://t.co/2LH0xhNsHf'

In [6]:
def get_full_tweet(series):
    series = series.dropna()
    full_tweets = []
    for value in series:
   
        converted_value = ast.literal_eval(value)
        full_tweet = converted_value['text']
        full_tweets.append(full_tweet)
    
    extended_tweet_df = pd.DataFrame(full_tweets, index=series.index, columns=['full_tweet'])
    return extended_tweet_df

In [7]:
extended_tweets = get_full_tweet(df.retweeted_status)

In [8]:
df = pd.DataFrame.join(df, extended_tweets)

In [9]:
df['full_tweet'].fillna(df['text'],inplace=True)

# Preprocess Tweets

The preprocessing portion of this project will only include processing text data, so we'll single out that column now. Further preprocessing on the full dataset will be included in the following section. 

In [10]:
tweet_df = df.loc[:,['created_at','full_tweet']]
tweet_df.head()


Unnamed: 0,created_at,full_tweet
1,Sun Feb 02 23:59:59 +0000 2020,Amazing Timelapse as China Completes First of ...
2,Sun Feb 02 23:59:58 +0000 2020,Bloomberg SCOOP on #coronavirus impact: Chines...
3,Sun Feb 02 23:59:58 +0000 2020,"I have become Ill. But no worries, Its just st..."
4,Sun Feb 02 23:59:58 +0000 2020,U.S. Hospitals Unprepared For A Quickly Spread...
5,Sun Feb 02 23:59:58 +0000 2020,"At this time, the risk to Americans remains lo..."


In [29]:
def clean_tweet(tweet):
    
    '''
    This function takes a tweet variable,
    removes punctuation and linebreaks,
    sets all words to lowercase, and 
    returns the cleaned tweet as a single
    variable list.
    '''
    
    # Grabbing most common punctuation symbols and ellipsis symbol
    punctuation_list = list(string.punctuation) + ["…"]
    
    cleaned_tweet = []
    
    for symbol in punctuation_list:
        tweet = tweet.replace(symbol, "").lower()
        tweet = tweet.rstrip()
      
    cleaned_tweet.append(tweet)
    
    return cleaned_tweet

cleaned_tweet_test = clean_tweet(tweet_df.full_tweet[1])
cleaned_tweet_test        

['amazing timelapse as china completes first of two hospitals in wuhan within 10 days having 1000 beds and 1400 med httpstco3f3pibqhjx']

In [33]:
def tokenize(clean_tweet):
    
    '''
    This function takes a cleaned tweet,
    joins into one string (if not already),
    runs the tweet through NLTK work tokenizer, 
    removes English stopwords, and returns
    the tokenized tweet in list format.
    '''
        
    joined_tweet = ' '.join(clean_tweet)
    stopwords_list = stopwords.words('english')
    
    tokenized_tweet = word_tokenize(joined_tweet)
    tokenized_tweet = [w for w in tokenized_tweet if w not in stopwords_list]
    return tokenized_tweet

tokenized_tweet_test = tokenize(cleaned_tweet_test)
tokenized_tweet_test

['amazing',
 'timelapse',
 'china',
 'completes',
 'first',
 'two',
 'hospitals',
 'wuhan',
 'within',
 '10',
 'days',
 '1000',
 'beds',
 '1400',
 'med',
 'httpstco3f3pibqhjx']

In [38]:
def remove_url(tokenized_tweet):
    '''
    This function takes a tokenized tweet,
    applies a regex search for a url,
    removes the url, and returns 
    the tokenized tweet.
    '''
    url_re = re.compile(r'^https', re.IGNORECASE)
    for word in tokenized_tweet:
        if url_re.search(word) is not None:
            tokenized_tweet.remove(word)
        else:
            continue
    return tokenized_tweet

no_url_test = remove_url(tokenized_tweet_test)
no_url_test

['amazing',
 'timelapse',
 'china',
 'completes',
 'first',
 'two',
 'hospitals',
 'wuhan',
 'within',
 '10',
 'days',
 '1000',
 'beds',
 '1400',
 'med']

In [43]:
def process_tweet(tweet):
    
    cleaned = clean_tweet(tweet)
    tokenized = tokenize(cleaned)
    processed_tweet = remove_url(tokenized)
    
    return processed_tweet

tweet_df['processed_tweets'] = tweet_df.full_tweet.apply(process_tweet)

In [44]:
tweet_df.head()

Unnamed: 0,created_at,full_tweet,processed_tweets
1,Sun Feb 02 23:59:59 +0000 2020,Amazing Timelapse as China Completes First of ...,"[amazing, timelapse, china, completes, first, ..."
2,Sun Feb 02 23:59:58 +0000 2020,Bloomberg SCOOP on #coronavirus impact: Chines...,"[bloomberg, scoop, coronavirus, impact, chines..."
3,Sun Feb 02 23:59:58 +0000 2020,"I have become Ill. But no worries, Its just st...","[become, ill, worries, stuff, related, existin..."
4,Sun Feb 02 23:59:58 +0000 2020,U.S. Hospitals Unprepared For A Quickly Spread...,"[us, hospitals, unprepared, quickly, spreading..."
5,Sun Feb 02 23:59:58 +0000 2020,"At this time, the risk to Americans remains lo...","[time, risk, americans, remains, low, working,..."


# Exploratory Data Analysis

In [13]:
# Drop geo and contributer data
df.drop(['geo', 'coordinates', 'place', 'contributors'], axis=1, inplace=True)

In [14]:
df.drop(['id','id_str','source'], axis=1,inplace=True)

In [15]:
df.created_at = df.created_at.astype(str)
df.created_at = pd.to_datetime(df.created_at)

As seen below, the earliest date from this set of tweets is January 31, though once my request limits reset, I'll be able to get (hopefully) the rest of the tweets dating back to January 28.

In [16]:
df.created_at.min()

Timestamp('2020-01-31 23:54:31+0000', tz='UTC')