# Coronavirus Tweets: Pandemic Panic

In [95]:
import pandas as pd
from nltk.corpus import stopwords
import numpy as np
import re
import string
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from tweet_processing_funcs import *
from nltk import FreqDist

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb


from lime import lime_text

In [92]:
# Load the tweet dataset; the CSV already carries the `sentiment` and
# `polarity` columns used further down.
df = pd.read_csv('tweet_polarity.csv')
# Preview the first rows to confirm the file loaded as expected.
df.head()

Unnamed: 0,contributors,coordinates,created_at,display_text_range,entities,extended_entities,extended_tweet,favorite_count,favorited,filter_level,...,retweeted,retweeted_status,source,text,truncated,user,expanded,full_text,sentiment,polarity
0,,,Fri Jan 31 23:58:59 +0000 2020,,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,,0,False,low,...,False,{'created_at': 'Fri Jan 31 23:30:08 +0000 2020...,"<a href=""http://twitter.com/download/iphone"" r...","RT @CNN: There have been more than 9,800 cases...",False,"{'id': 292918761, 'id_str': '292918761', 'name...",{'created_at': 'Fri Jan 31 23:30:08 +0000 2020...,"RT @CNN: There have been more than 9,800 cases...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.5
1,,,Fri Jan 31 23:58:59 +0000 2020,,"{'hashtags': [], 'urls': [], 'user_mentions': ...",,,0,False,low,...,False,{'created_at': 'Fri Jan 31 22:19:00 +0000 2020...,"<a href=""http://twitter.com/download/iphone"" r...","RT @JaneLytv: 43. Zero Hedge, a pro-Trump webs...",False,"{'id': 262334207, 'id_str': '262334207', 'name...",{'created_at': 'Fri Jan 31 22:19:00 +0000 2020...,"43. Zero Hedge, a pro-Trump website, has doxxe...","{'neg': 0.096, 'neu': 0.853, 'pos': 0.05, 'com...",0.0
2,,,Fri Jan 31 23:58:59 +0000 2020,,"{'hashtags': [], 'urls': [], 'user_mentions': ...",,,0,False,low,...,False,{'created_at': 'Fri Jan 31 21:08:39 +0000 2020...,"<a href=""http://twitter.com/download/iphone"" r...",RT @skarlamangla: This flu season has killed 1...,False,"{'id': 2451840349, 'id_str': '2451840349', 'na...",{'created_at': 'Fri Jan 31 21:08:39 +0000 2020...,RT @skarlamangla: This flu season has killed 1...,"{'neg': 0.429, 'neu': 0.571, 'pos': 0.0, 'comp...",0.0
3,,,Fri Jan 31 23:58:59 +0000 2020,,"{'hashtags': [], 'urls': [], 'user_mentions': ...",,,0,False,low,...,False,{'created_at': 'Fri Jan 31 05:06:49 +0000 2020...,"<a href=""http://twitter.com/download/iphone"" r...",RT @jamesmassola: Medical experts raise concer...,False,"{'id': 513208128, 'id_str': '513208128', 'name...",{'created_at': 'Fri Jan 31 05:06:49 +0000 2020...,Medical experts raise concerns about Indonesia...,"{'neg': 0.0, 'neu': 0.846, 'pos': 0.154, 'comp...",1.0
4,,,Fri Jan 31 23:58:59 +0000 2020,,"{'hashtags': [], 'urls': [], 'user_mentions': ...",,,0,False,low,...,False,{'created_at': 'Fri Jan 31 01:27:50 +0000 2020...,"<a href=""http://twitter.com/download/iphone"" r...","RT @popplioikawa: the flu: *results in 500,000...",False,"{'id': 993138436667314176, 'id_str': '99313843...",{'created_at': 'Fri Jan 31 01:27:50 +0000 2020...,"the flu: *results in 500,000 hospitalizations ...","{'neg': 0.113, 'neu': 0.845, 'pos': 0.042, 'co...",0.0


In [105]:
import string
import contractions
import re
import nltk

def remove_url_and_RT(row):
    '''
    Strip URLs and the leading retweet marker from a tweet.

    Parameters
    ----------
    row : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with every http:// or https:// URL removed and a
        leading standalone "RT" token removed. Whitespace around the
        removed pieces is left untouched.
    '''
    # The character class covers the characters commonly found in URLs
    # (path, query string, fragment, percent-escapes). The narrower class
    # used previously stopped at '-', '_', '?', '=' ... and left URL
    # fragments behind in the text.
    row = re.sub(r'https?://[A-Za-z0-9._~:/?#@!$&*+;=%-]+', "", row)
    # Require a word boundary after "RT" so that only the retweet marker is
    # dropped, not the first two letters of words such as "RTs" or "RTX".
    row = re.sub(r'^RT\b', "", row)
    return row

def clean_tweet(tweet):
    '''
    Lowercase a tweet and strip punctuation and non-ASCII characters.

    Every character of string.punctuation except '#' (kept so hashtags
    survive), plus the ellipsis and the curly apostrophe, is deleted.
    Non-ASCII characters are deleted and trailing whitespace is stripped.
    Interior whitespace, including line breaks, is left in place.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    list of str
        A one-element list holding the cleaned tweet.
    '''
    # Grabbing most common punctuation symbols and ellipsis symbol,
    # keeping '#'.
    punctuation = string.punctuation.replace('#', '') + "…" + '’'
    strip_punctuation = str.maketrans('', '', punctuation)

    # Each step runs exactly once; previously lowercasing, rstrip and the
    # non-ASCII regex were repeated for every punctuation symbol.
    # Lowercasing comes before the non-ASCII filter because lowercasing some
    # non-ASCII characters yields ASCII ones.
    tweet = tweet.lower()

    # Cleaning non-ASCII characters
    tweet = re.sub("([^\x00-\x7F])+", "", tweet)

    tweet = tweet.translate(strip_punctuation)

    # Removing trailing characters
    tweet = tweet.rstrip()

    return [tweet]

def tokenize(clean_tweet):
    '''
    Tokenize a cleaned tweet and filter the tokens.

    Parameters
    ----------
    clean_tweet : str or iterable of str
        The cleaned tweet, either as a single string or as a collection
        of strings that are joined with spaces. (The parameter shares its
        name with the clean_tweet function and hides it inside this body.)

    Returns
    -------
    list of str
        Tokens produced by NLTK's TweetTokenizer with English stopwords
        removed, "us" replaced by "usa", and purely numeric tokens dropped.
    '''
    # A bare string must be used as is: ' '.join("abc") would give "a b c".
    if isinstance(clean_tweet, str):
        joined_tweet = clean_tweet
    else:
        joined_tweet = ' '.join(clean_tweet)

    # A set gives constant-time membership tests in the filter below.
    stopwords_set = set(stopwords.words('english'))

    tokenized_tweet = TweetTokenizer().tokenize(joined_tweet)

    # Removing stopwords
    tokenized_tweet = [word for word in tokenized_tweet if word not in stopwords_set]

    # Subbing 'usa' for 'us'
    tokenized_tweet = ['usa' if word == 'us' else word for word in tokenized_tweet]

    # Removing numbers
    tokenized_tweet = [word for word in tokenized_tweet if not word.isnumeric()]

    return tokenized_tweet

def lem_tweet(tweet):
    '''
    Lemmatize every token of a tokenized tweet.

    Accepts the tweet as a list of word tokens and returns a new
    list in which each token has been passed through NLTK's
    WordNetLemmatizer.
    '''
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tweet))

def stem_tweet(tweet):
    '''
    Stem every token of a tokenized tweet.

    Accepts the tweet as a list of word tokens and returns a new
    list in which each token has been passed through NLTK's
    English SnowballStemmer.
    '''
    stem = SnowballStemmer('english').stem
    return list(map(stem, tweet))

def process_tweet(tweet):
    '''
    Run an original tweet through the whole
    preprocessing pipeline: URL / retweet-marker
    removal, cleaning, tokenizing, stemming and
    finally lemmatizing. Returns the resulting
    list of tokens.
    '''
    tokens = tokenize(clean_tweet(remove_url_and_RT(tweet)))

    # The lemmatizer is applied to the stemmed tokens, not the raw ones.
    return lem_tweet(stem_tweet(tokens))

In [113]:
# Preprocess every tweet, then glue its tokens back into one
# space-separated string so the vectorizers receive plain text.
tweets = df['full_text']
processed_tweets = tweets.apply(process_tweet).apply(' '.join)

# Polarity values are cast to strings so they act as class labels.
labels = df['polarity'].astype('str')

# Hold out 10% of the tweets for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    processed_tweets,
    labels,
    test_size=0.1,
    random_state=1,
)

In [16]:
# Class balance across the full labelled dataset.
labels.value_counts()

0.0    4346
1.0    2774
0.5    2180
Name: polarity, dtype: int64

In [19]:
# Class balance within the training split.
y_train.value_counts()

0.0    3908
1.0    2497
0.5    1965
Name: polarity, dtype: int64

In [111]:
def tokenizer(tweet):
    '''
    Split a tweet string into tokens with NLTK's TweetTokenizer.

    The tokenizer is configured to strip Twitter handles and to
    preserve the original casing of the tokens.
    '''
    return TweetTokenizer(strip_handles=True, preserve_case=True).tokenize(tweet)

# Vectorization Strategies

## Tfidf Vectorization

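TF-IDF weights each token by how often it appears in a tweet, scaled down by how many tweets contain it, so tokens that show up in nearly every tweet carry less weight. Three variants are built below: unigrams only, unigrams plus bigrams, and unigrams through trigrams.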

In [102]:
# TF-IDF vectorizers built on the custom tweet tokenizer. They differ only in
# the n-gram range: 1, 1-2 and 1-3 tokens respectively.
tfidf_unigram = TfidfVectorizer(
    tokenizer=tokenizer,
    stop_words='english',
    ngram_range=(1, 1),
)
tfidf_bigram = TfidfVectorizer(
    tokenizer=tokenizer,
    stop_words='english',
    ngram_range=(1, 2),
)
tfidf_trigram = TfidfVectorizer(
    tokenizer=tokenizer,
    stop_words='english',
    ngram_range=(1, 3),
)

## Count Vectorization

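Count vectorization represents each tweet by the raw number of times each token (or n-gram) occurs in it, with no down-weighting of common tokens. The same three n-gram ranges are used as for TF-IDF.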

In [11]:
# Raw-count vectorizers built on the custom tweet tokenizer. They differ only
# in the n-gram range: 1, 1-2 and 1-3 tokens respectively.
count_unigram = CountVectorizer(
    tokenizer=tokenizer,
    stop_words='english',
    ngram_range=(1, 1),
)
count_bigram = CountVectorizer(
    tokenizer=tokenizer,
    stop_words='english',
    ngram_range=(1, 2),
)
count_trigram = CountVectorizer(
    tokenizer=tokenizer,
    stop_words='english',
    ngram_range=(1, 3),
)

## Word2Vec

Explain

## GloVe

Explain

# Classification Algorithms and Validation Metrics

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [39]:
def classification(X_train, y_train):
    '''
    Fit a fixed set of baseline classifiers on the same training data.

    Parameters
    ----------
    X_train
        Training feature matrix, passed unchanged to every estimator's
        fit().
    y_train
        Training labels, passed unchanged to every estimator's fit().

    Returns
    -------
    list
        The fitted models, in this order: logistic regression,
        multinomial naive Bayes, random forest, decision tree
        (entropy criterion), XGBoost, linear-kernel SVM.
    '''
    estimators = [
        # Logistic Regression
        LogisticRegression(),
        # Multinomial Naive Bayes
        MultinomialNB(),
        # Random Forest
        RandomForestClassifier(),
        # Decision Trees
        DecisionTreeClassifier(criterion='entropy'),
        # XGBoost -- referenced through the module alias only; binding a
        # local variable named `xgb` would shadow the module inside this
        # function and raise UnboundLocalError.
        # NOTE(review): newer XGBoost releases appear to require
        # integer-encoded class labels -- confirm against the string labels
        # used in this notebook.
        xgb.XGBClassifier(),
        # Support Vector Machines
        SVC(kernel='linear'),
    ]

    # fit() returns the fitted estimator itself, so the result list holds
    # the trained models in the order declared above.
    return [estimator.fit(X_train, y_train) for estimator in estimators]

In [117]:
def model_validation(vectorizer, X_train, X_test, y_test, y_train=None):
    '''
    Vectorize the text, fit every baseline classifier and score each one.

    Parameters
    ----------
    vectorizer
        Object exposing fit_transform() and transform(); it is fitted on
        X_train only.
    X_train
        Training documents handed to vectorizer.fit_transform().
    X_test
        Test documents handed to vectorizer.transform().
    y_test
        True labels for X_test, used to compute accuracy.
    y_train
        Training labels for X_train. Declared last, with a default, only so
        that the original four-argument signature stays valid; it must be
        supplied.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with columns 'Model' (the classifier's class
        name) and 'Accuracy' (accuracy on the test set).

    Raises
    ------
    ValueError
        If y_train is not provided.
    '''
    if y_train is None:
        raise ValueError("y_train is required to fit the classifiers")

    # Learn the vocabulary from the training text only, then reuse it for
    # the test text so both matrices share the same columns.
    train_features = vectorizer.fit_transform(X_train)
    test_features = vectorizer.transform(X_test)

    # The classifiers are trained on the training labels (the previous code
    # passed the test documents in their place).
    classifiers = classification(train_features, y_train)

    results = [
        {
            'Model': type(classifier).__name__,
            'Accuracy': accuracy_score(y_test, classifier.predict(test_features)),
        }
        for classifier in classifiers
    ]

    return pd.DataFrame(results, columns=['Model', 'Accuracy'])

# Hyperparameter Tuning