# Sentiment Analysis of COVID-19 Tweets: When did the Public Panic Set In? Part 4: Supervised Classification Modeling

    Notebook by Allison Kelly - allisonkelly42@gmail.com
    
This notebook is preceded by parts <a href="https://github.com/akelly66/COVID-Tweet-Sentiment/blob/master/tweet-scraping/Twitter-API-Scraping.ipynb">1</a>, <a href="https://github.com/akelly66/COVID-Tweet-Sentiment/blob/master/text-processing/NLP-Text-Processing.ipynb">2</a> and <a href="https://github.com/akelly66/COVID-Tweet-Sentiment/blob/master/EDA/tweet-EDA.ipynb">3</a>. Part 4 will focus on the modeling portion, but is still very much in ins infancy. Markdown cells and complete documentation to come. 

# Imports

In [1]:
import pandas as pd

from gensim.models import word2vec
from nltk import word_tokenize
from ast import literal_eval

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
train_tweets = pd.read_csv('Data/processed_train.csv', 
                       usecols=['polarity', 'processed_tweets'],
                       # Converting string to list
                       converters={"processed_tweets": literal_eval})
train_tweets.head()

Unnamed: 0,polarity,processed_tweets
0,0,"[switchfoot, httptwitpiccom, 2y1zl, awww, that..."
1,0,"[upset, cant, update, facebook, texting, might..."
2,0,"[kenichan, dived, many, time, ball, managed, s..."
3,0,"[whole, body, feel, itchy, like, fire]"
4,0,"[nationwideclass, behaving, im, mad, cant, see]"


In [3]:
target = train_tweets['polarity']
data = train_tweets['processed_tweets']

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42, test_size=.20)

train_df = pd.concat([X_train, y_train], axis=1) 
test_df = pd.concat([X_test, y_test], axis=1)

In [5]:
train_df.polarity.value_counts()

0    640506
4    639494
Name: polarity, dtype: int64

In [6]:
train_sample = train_df.sample(n=10000, random_state = 42)
train_sample.polarity.value_counts()

4    5017
0    4983
Name: polarity, dtype: int64

In [7]:
all_words_list = [item for sublist in train_sample.processed_tweets for item in sublist]

In [8]:
total_vocab = set(all_words_list)

In [9]:
print(len(all_words_list))
len(total_vocab)

77167


18750

In [10]:
vectorizer = TfidfVectorizer()

In [11]:
train_tweet_list = X_train.apply(('').join)
test_tweet_list = X_test.apply(('').join)

In [12]:
tfidf_train = vectorizer.fit_transform(train_tweet_list)
tfidf_test = vectorizer.transform(test_tweet_list)

In [13]:
tfidf_train.shape

(1280000, 1259141)

In [20]:
nb_classifier = MultinomialNB()
rf_classifier = RandomForestClassifier(n_estimators=5)

In [15]:
nb_classifier.fit(tfidf_train, y_train)
nb_train_preds = nb_classifier.predict(tfidf_train)
nb_test_preds = nb_classifier.predict(tfidf_test)

In [17]:
nb_train_score = accuracy_score(y_train, nb_train_preds)
nb_test_score = accuracy_score(y_test, nb_test_preds)
print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))

Multinomial Naive Bayes
Training Accuracy: 0.9961 		 Testing Accuracy: 0.5118


In [None]:
rf_classifier.fit(tfidf_train, y_train)
rf_train_preds = rf_classifier.predict(tfidf_train)
rf_test_preds = rf_classifier.predict(tfidf_test)

In [None]:
rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)
print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))