In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report,confusion_matrix

### Training labelled sentiment data

First let's load the sentiment data. In part 1 of this series we prepared the sentiment data so that it is divided in the input and output (target) variables. Let's load the prepared data.

In [2]:
target = np.load('target.npy')/4
sent_input = np.load('sentiment_input.npy')

print(sent_input.shape)
print(target.shape)

(1600000, 100)
(1600000, 1)


In [3]:
# Splitting input and output variables into training and test sets
X = sent_input
y = target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7)

In [4]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

  return f(**kwargs)


LinearDiscriminantAnalysis()

### Model Overview

In [5]:
y_pred = lda.predict(X_test)
print(y_pred)

[0. 0. 1. ... 1. 1. 1.]


In [6]:
def get_metrics(y_test, y_predicted):  
    # true positives / (true positives+false positives)
    precision = precision_score(y_test, y_predicted, pos_label=None,
                                    average='weighted')             
    # true positives / (true positives + false negatives)
    recall = recall_score(y_test, y_predicted, pos_label=None,
                              average='weighted')
    
    # harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, pos_label=None, average='weighted')
    
    # true positives + true negatives/ total
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

In [7]:
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

accuracy = 0.694, precision = 0.694, recall = 0.694, f1 = 0.694


In [8]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.70      0.68      0.69    201003
         1.0       0.69      0.71      0.70    198997

    accuracy                           0.69    400000
   macro avg       0.69      0.69      0.69    400000
weighted avg       0.69      0.69      0.69    400000



In [9]:
print(confusion_matrix(y_test,y_pred))

[[137296  63707]
 [ 58605 140392]]


In [10]:
error_lda = pd.DataFrame({
        'Actual Values': np.array(y_test).flatten(),
        'Predicted Values': y_pred.flatten()})
error_lda

Unnamed: 0,Actual Values,Predicted Values
0,0.0,0.0
1,0.0,0.0
2,0.0,1.0
3,0.0,0.0
4,1.0,1.0
...,...,...
399995,0.0,1.0
399996,0.0,0.0
399997,0.0,1.0
399998,0.0,1.0


In [11]:
# the probability of each sample belonging to each of the two classes
y_pred_proba = lda.predict_proba(X_test)

In [12]:
print(y_pred_proba)
print(y_pred_proba)

[[0.76496166 0.23503834]
 [0.63058882 0.36941118]
 [0.22380843 0.77619157]
 ...
 [0.32994998 0.67005002]
 [0.38721803 0.61278197]
 [0.34716863 0.65283137]]
[[0.76496166 0.23503834]
 [0.63058882 0.36941118]
 [0.22380843 0.77619157]
 ...
 [0.32994998 0.67005002]
 [0.38721803 0.61278197]
 [0.34716863 0.65283137]]


### Predicting the sentiment of covid tweets

In [13]:
covid_data = np.load('vectorized_100.npy')

In [14]:
print(covid_data.shape)

(8981, 100)


In [15]:
covid_predictions = lda.predict(covid_data)

In [16]:
covid_predicted_values = pd.DataFrame({
        'Predicted Values': covid_predictions.flatten()})

covid_predicted_values

Unnamed: 0,Predicted Values
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
8976,0.0
8977,0.0
8978,0.0
8979,0.0


In [17]:
covid_predicted_values.shape

(8981, 1)

In [18]:
# the probability of each sample belonging to each of the two classes
covid_predictions_probability = lda.predict_proba(covid_data)

In [19]:
covid_negative_probability=[]
covid_positive_probability=[]
for i in covid_predictions_probability:
    covid_positive_probability.append(i[1])
    covid_negative_probability.append(i[0])

In [20]:
print(len(covid_positive_probability))
print(len(covid_negative_probability))

8981
8981


In [21]:
data_country_random = pd.read_csv('data_country_random.csv')

In [22]:
data_country_random['predictions'] = covid_predicted_values
data_country_random['positive_probability'] = covid_positive_probability
data_country_random['negative_probability'] = covid_negative_probability

In [23]:
data_country_random.head()

Unnamed: 0.1,Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,...,cleaned_text,cleaned_tags,len_hashtag,language,country,day,time,predictions,positive_probability,negative_probability
0,0,UNCDF,"New York, NY","Making Finance Work for the Poor: #LDCs, #Fina...",2013-05-15 15:37:50,14838,936,404.0,,2020-08-11 08:34:02,...,humans unlimited needs planet limited capacity...,"sdgs, covid19",2.0,en,United States,2020-08-11,08:34:02,0.0,0.406396,0.593604
1,1,Zachary Burkett,"Austin, TX",I have NO FAITH in the Trump Administration. I...,2013-11-09 21:20:12,35,87,409.0,,2020-08-14 04:12:05,...,realdonaldtrump never run away covid19 nov 202...,covid19,1.0,en,United States,2020-08-14,04:12:05,0.0,0.33215,0.66785
2,2,Reshma Jose,"Chalakudy, India",Bold and beautiful......!,2019-03-21 04:22:32,44,191,7.0,,2020-08-17 06:40:59,...,simply sitting never means nothing morningmoti...,"morningmotivation, covid19, chingam",3.0,en,India,2020-08-17,06:40:59,0.0,0.419405,0.580595
3,3,@Caerdael,Genève,Artiste peintre \n \n@IFJGlobal @ICIJorg @amne...,2010-03-15 16:16:41,278,190,16673.0,,2020-08-16 09:25:31,...,millions covid19 funds flow el salvador imfnew...,covid19,1.0,en,Switzerland,2020-08-16,09:25:31,0.0,0.224099,0.775901
4,4,Douglas Clark,University of Saskatchewan,Associate Professor at @usaskSENS. Tweets on h...,2012-10-05 15:56:52,993,832,16845.0,,2020-07-28 03:02:35,...,probably wouldnt possible design pathogen coul...,covid19,1.0,en,Canada,2020-07-28,03:02:35,0.0,0.400789,0.599211


In [24]:
# printing out first 50 positive tweets and the probability at how likely it is positive vs negative
data_country_random[['text','positive_probability','negative_probability']].loc[data_country_random['predictions'] == 1][:50]

Unnamed: 0,text,positive_probability,negative_probability
12,A recent survey suggests that people are consi...,0.578747,0.421253
13,ICYMI With an expected rise in transfer studen...,0.624745,0.375255
14,karantacker narrates his 48hr ordeal which beg...,0.515527,0.484473
15,Are you a good communicator \nteambuilding w...,0.638869,0.361131
17,Over 10 lakh people were tested for COVID19 fo...,0.532405,0.467595
19,Nature hinting at medicine for covid Look it ...,0.641246,0.358754
22,In our recent blog we take a look at the futur...,0.57016,0.42984
23,we can still enjoy rooftops and beers summer2...,0.638086,0.361914
31,The effect of COVID19 on DigitalDiplomacy watc...,0.645479,0.354521
50,Fuck your RNAVaccine to try and change my DNA ...,0.521533,0.478467


In [25]:
# printing out first 50 negative tweets and the probability at how likely it is positive vs negative
data_country_random[['text','positive_probability','negative_probability']].loc[data_country_random['predictions'] == 0][:50]

Unnamed: 0,text,positive_probability,negative_probability
0,Humans have unlimited needs but the planet has...,0.406396,0.593604
1,realDonaldTrump you will never run away from C...,0.33215,0.66785
2,Simply sitting never means doing nothing morni...,0.419405,0.580595
3,As millions of COVID19 funds flow into El Salv...,0.224099,0.775901
4,It probably wouldnt be possible to design a pa...,0.400789,0.599211
5,State Health Director Warns Coronavirus Is Wid...,0.391273,0.608727
6,CoronaUpdates for Virudhunagar \n\nCOVID19 cas...,0.347414,0.652586
7,WestBengal Trinamool Congress TMC MLA from Egr...,0.231383,0.768617
8,Demand for services like ours has only increas...,0.356197,0.643803
9,Todays COVID19 in info and updates\n\nGOC web...,0.417757,0.582243


In [26]:
data_country_random.to_csv("covid_predictions_with_proba.csv")