# Sentiment Analysis of COVID-19 Tweets: When did the Public Panic Set In?

    Notebook by Allison Kelly - allisonkelly42@gmail.com
    

# Imports

In [None]:
%matplotlib inline

# Generic Imports
import pandas as pd
pd.set_option('display.max_colwidth', 100) # See more text
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, time

# Get JSON
import json

# Text preprocessing libraries
import string
import contractions
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

# Exploratory data analysis libraries
from wordcloud import WordCloud

# Obtain Data

The method used to obtain the data is documented <a href="https://github.com/akelly66/COVID-Tweet-Sentiment/blob/master/tweet-scraping/COVID-tweets-true.ipynb">here</a>. <br>
<br>The tweet query parameters were as follows:

- <b>Keywords: </b> "coronavirus OR Wuhan virus OR 2019-nCoV OR China flu"<br>
- <b>Date Range: </b> 28 Jan 2020 - 03 Feb 2020<br>
- <b>Location:</b> United States of America<br><br>


In [None]:
# Load the scraped tweets, drop exact duplicate rows, and keep English tweets only
df = (
    pd.read_csv("expanded_query_tweets.csv")
    .drop_duplicates()
    .query("lang == 'en'")
)
df.head()

In [None]:
# Row count after de-duplication and language filtering
print(len(df))
# DataFrame.info() writes its summary itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line)
df.info()
df.describe()

In [None]:
import ast

# Parse the stringified retweeted_status stored under row label 1 into a Python object
test = ast.literal_eval(df.loc[1, 'retweeted_status'])

In [None]:
# Show the untruncated text nested inside the sample's 'extended_tweet' entry
test['extended_tweet']['full_text']

In [None]:
def get_full_tweet(series):
    '''
    Extract the text of the original tweet from a column of
    stringified retweeted_status objects.

    Parameters
    ----------
    series : pandas.Series
        Each non-null value is the string form of a dict describing the
        retweeted tweet. Null values (tweets that are not retweets) are
        skipped.

    Returns
    -------
    pandas.DataFrame
        A single 'full_tweet' column indexed by the labels of the
        non-null rows of `series`.

    Raises
    ------
    KeyError
        If a status has neither an extended full text nor a 'text' entry.
    '''
    series = series.dropna()
    full_tweets = []
    for value in series:
        status = ast.literal_eval(value)

        # Prefer the untruncated text under 'extended_tweet' -> 'full_text'
        # when it is present; 'text' alone is presumably cut short for long
        # tweets (verify against the Twitter payload documentation).
        extended = status.get('extended_tweet') or {}
        full_tweet = extended.get('full_text')
        if full_tweet is None:
            full_tweet = status['text']
        full_tweets.append(full_tweet)

    extended_tweet_df = pd.DataFrame(full_tweets, index=series.index, columns=['full_tweet'])
    return extended_tweet_df

In [None]:
# Pull the original tweet text out of every retweet's retweeted_status payload
extended_tweets = get_full_tweet(df['retweeted_status'])

In [None]:
# Preview the extracted retweet text
extended_tweets.head()

In [None]:
# Attach the 'full_tweet' column by row index; rows that are not retweets get NaN
df = df.join(extended_tweets)

In [None]:
# For tweets that are not retweets, fall back to the tweet's own text.
# Assign the result back to the column: an in-place fillna on the selected
# column is chained assignment and is not guaranteed to modify df.
df['full_tweet'] = df['full_tweet'].fillna(df['text'])

# Preprocess Tweets

The preprocessing portion of this project only processes the tweet text, so we'll single out that column (along with each tweet's `created_at` value) now. Further preprocessing on the full dataset will be included in the following section. 

In [None]:
# Keep only the created_at and full_tweet columns for the text-processing steps
tweet_df = df.loc[:,['created_at','full_tweet']]
tweet_df.head()

In [None]:
def remove_url(row):
    '''
    Strip URLs from a single tweet.

    Every run of non-whitespace characters that starts with "http"
    is deleted; the rest of the text (including the whitespace
    around the link) is returned unchanged.
    '''
    url_pattern = r'http\S+'
    return re.sub(url_pattern, "", row)

# Overwrite the tweet text with its URL-free version
tweet_df['full_tweet'] = tweet_df['full_tweet'].apply(remove_url)

In [None]:
def clean_tweet(tweet):

    '''
    Normalize the text of one tweet.

    Removes every ASCII punctuation character and the ellipsis
    symbol ("…") while keeping '#' so hashtags stay recognizable,
    lowercases the text, and strips trailing whitespace. Whitespace
    inside the tweet (including linebreaks) is left as is.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        A one-element list holding the cleaned tweet.
    '''

    # Most common punctuation symbols plus the ellipsis symbol, minus '#'
    removable = (string.punctuation + "…").replace('#', '')

    # Delete all unwanted symbols in a single pass, then normalize once
    # instead of re-lowercasing and re-stripping for every symbol.
    stripped = tweet.translate(str.maketrans('', '', removable))

    return [stripped.lower().rstrip()]

# Smoke-test the cleaner on the tweet stored under row label 3
cleaned_tweet_test = clean_tweet(tweet_df.loc[3, 'full_tweet'])
cleaned_tweet_test

In [None]:
def tokenize(clean_tweet):

    '''
    Tokenize a cleaned tweet and drop English stopwords.

    Parameters
    ----------
    clean_tweet : list of str or str
        The cleaned tweet, either as a list of text fragments (joined
        with single spaces before tokenizing) or as one string that is
        used as is.

    Returns
    -------
    list of str
        Tokens produced by NLTK's TweetTokenizer, with every token found
        in NLTK's English stopword list removed.
    '''

    # Joining a plain string would insert a space between every character,
    # so only join when we were actually handed a collection of fragments.
    if isinstance(clean_tweet, str):
        joined_tweet = clean_tweet
    else:
        joined_tweet = ' '.join(clean_tweet)

    # A set gives constant-time membership checks for each token
    stopword_set = set(stopwords.words('english'))

    tokenizer = TweetTokenizer()
    tokenized_tweet = tokenizer.tokenize(joined_tweet)
    return [word for word in tokenized_tweet if word not in stopword_set]

    

# Smoke-test the tokenizer on the cleaned sample tweet
tokenized_tweet_test = tokenize(cleaned_tweet_test)
tokenized_tweet_test

In [None]:
from nltk.stem import WordNetLemmatizer

def lem_tweet(tweet):
    '''
    Lemmatize every token of a tokenized tweet with NLTK's
    WordNetLemmatizer (called with its default settings) and
    return the results as a list in the original token order.
    '''
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tweet))

# Smoke-test the lemmatizer on the tokenized sample tweet
lemmed_tweet_test = lem_tweet(tokenized_tweet_test)

In [None]:
# def stem_tweet(tweet):
    
#     stemmer = SnowballStemmer('english')
#     stemmed_tweet = [stemmer.stem(word) for word in tweet]
    
#     return stemmed_tweet

# stem_test = stem_tweet(no_url_test)
# stem_test

In [None]:
def process_tweet(tweet):
    '''
    Run one raw tweet through the text pipeline:
    clean_tweet -> tokenize -> lem_tweet.

    The stemming step is currently disabled. Returns the list of
    lemmatized tokens.
    '''
    return lem_tweet(tokenize(clean_tweet(tweet)))

# Store each tweet's lemmatized token list in a new column
tweet_df['processed_tweets'] = tweet_df['full_tweet'].apply(process_tweet)

In [None]:
# Renumber the rows from 0 and discard the old index labels
tweet_df = tweet_df.reset_index(drop=True)

In [None]:
# Preview the processed tweets
tweet_df.head()

# Exploratory Data Analysis

In [None]:
# Show up to 100 characters per column when displaying frames
pd.options.display.max_colwidth = 100

In [None]:
# Flatten every tweet's token list into one space-separated string
all_words = " ".join(
    token
    for tokens in tweet_df['processed_tweets']
    for token in tokens
)

In [None]:
# Build the word cloud from all processed tokens (fixed random_state for a repeatable layout)
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(all_words)

# Draw it on an explicit axes with the axis decorations hidden
fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()