In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
data_raw_test = pd.read_csv("data/corona_sentiment/Corona_NLP_test.csv")
data_raw_train = pd.read_csv("data/corona_sentiment/Corona_NLP_train.csv", encoding = "ISO-8859-1")

In [3]:
raw_data = pd.concat([data_raw_test, data_raw_train])
display(raw_data)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...,...,...,...,...
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral
41153,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative
41154,44953,89905,,14-04-2020,You know itÂs getting tough when @KameronWild...,Positive
41155,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral


In [4]:
raw_data.isna().sum()

UserName            0
ScreenName          0
Location         9424
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

Tweet body and labels, plus shuffle

In [5]:
raw_data = raw_data[["OriginalTweet", "Sentiment"]].sample(frac=1)

In [6]:
set(raw_data["Sentiment"].to_list())

{'Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral', 'Positive'}

In [7]:
label_encoding = {'Extremely Negative': 0,  'Negative': 1,  'Neutral': 2, 'Positive': 3, 'Extremely Positive': 4} 

In [8]:
display(raw_data["Sentiment"])

16946    Extremely Positive
29321              Negative
1184                Neutral
31448               Neutral
22802              Positive
                ...        
9926     Extremely Negative
21605              Negative
35046              Positive
21551              Positive
29080              Negative
Name: Sentiment, Length: 44955, dtype: object

In [9]:
raw_data["Sentiment"] = raw_data["Sentiment"].replace(label_encoding)

In [10]:
features = raw_data["OriginalTweet"].to_numpy()

In [11]:
features = CountVectorizer().fit_transform(features)

In [12]:
labels = raw_data["Sentiment"].to_numpy()

In [13]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.3, random_state = 42)

In [14]:
# clf = LogisticRegression()
clf = RandomForestClassifier()
clf.fit(train_features, train_labels)

RandomForestClassifier()

In [15]:
clf.score(test_features, test_labels)

0.48276117743011787

In [16]:
pred_labels = clf.predict(test_features)
confusion_matrix(test_labels,pred_labels)

array([[ 492,  882,  131,  325,   11],
       [ 124, 1566,  555,  994,   45],
       [  12,  285, 1652,  552,    5],
       [  22,  662,  579, 2260,  158],
       [  13,  168,  116, 1337,  541]])

In [17]:
f1_score(test_labels,pred_labels, average = 'weighted')

0.4701466073641439