In [1]:
import pandas as pd
import numpy as np
import pickle

import re
import string
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from nltk import classify
from nltk import NaiveBayesClassifier

from sklearn.utils import shuffle
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

### Sources

Dateset 1:  7410 from evalita16 dataset  http://www.di.unito.it/~tutreeb/sentipolc-evalita16/data.html       
Dataset 2:  1989 from evalita16 dataset2 datset link (http:???) total number of tweets 4513 of which 1989 rehydrated and available    
Dataset 3:  165815 from an open dataset for sentiment analysis: https://github.com/charlesmalafosse/open-dataset-for-sentiment-analysis   
 
For dataset 1 and 2 see also:   
http://www.di.unito.it/~tutreeb/sentipolc-evalita16/sentipolc-guidelines2016UPDATED130916.pdf 

In [2]:
# Import training/testing datasets
imp_directory = r'C:\Users\barsanti\Desktop\Courses\Computational social media\project\2_training tools\preclassified_data'

dataset1_path = r'\ita_training_set_sentipolc16.csv'
data1 = pd.read_csv(imp_directory + dataset1_path)

dataset2_path = r'\ita_kar_all_tweets.pkl'
pickle2_in = open(imp_directory + dataset2_path, 'rb')
data2 = pickle.load(pickle2_in)

dataset3_path = r'\ita_betsentiment-IT-tweets-sentiment-players.csv'
data3 = pd.read_csv(imp_directory + dataset3_path, encoding = "ISO-8859-1")

### Functions

In [3]:
#### Define pre-processing subfunctions ####

def removeNonAlphaNumChars(tweets):
    tweets = [re.sub(r'\-', r' ', tweet) for tweet in tweets]  # replace - with SPACE
    tweets = [re.sub(r'[^\w\s]', r'', tweet) for tweet in tweets]  #remove other NON alpha numeric, excluding whitespace
    return tweets

def replaceContractions(tweets):
    tweets = [re.sub(r"[’\']", r' ', tweet) for tweet in tweets]  ## !!!! WEIRD APOSTROPHE BEWARE !!!
    return tweets

# FROM DEFFRO: https://github.com/Deffro/text-preprocessing-techniques
def removeUnicode(tweets):
    """ Removes unicode strings like "\u002c" and "x96" """
    tweets = [re.sub(r'(\\u[0-9A-Fa-f]+)',r'', tweet) for tweet in tweets]
    tweets = [re.sub(r'[^\x00-\x7f]',r'',tweet) for tweet in tweets]
    return tweets

def replaceURL(tweets):
    """ Replaces url address with "url" """
    tweets = [re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL', tweet) for tweet in tweets]
    tweets = [re.sub(r'#([^\s]+)', r'\1', tweet) for tweet in tweets]
    return tweets

def replaceAtUser(tweets):
    """ Replaces "@user" with "atUser" """
    tweets = [re.sub('@[^\s]+','atUser',tweet) for tweet in tweets]
    return tweets

def removeHashtagInFrontOfWord(tweets):
    """ Removes hastag in front of a word """
    tweets = [re.sub(r'#([^\s]+)', r'\1', tweet) for tweet in tweets]
    return tweets
# end DEFFRO


#### DEFINE MAIN PREPROCESSING FUNCTIONS ####

def cleanTweets(tweets):
    tweets = [tweet.lower() for tweet in tweets]  #convert to lowercase
    tweets = replaceURL(tweets) #replace URLs
    tweets = replaceAtUser(tweets)  # replace @user
    tweets = removeHashtagInFrontOfWord(tweets)  # remove hashtag
    tweets = replaceContractions(tweets) # replace ' contractions with space (e.g. l'ordre = l ordre)
    tweets = removeNonAlphaNumChars(tweets)  # remove non alphanumeric characters
    return tweets

def tokenizeTweets(tweets):
    tokens = [nltk.word_tokenize(tweet) for tweet in tweets]
    return tokens

def applyStoplist(tokens):
    tokens = [[word for word in token if len(word) > 2 and (word not in stoplist)] for token in tokens]
    return tokens

def stemTokens(tokens):
    tokens = [[stemmer.stem(word) for word in token] for token in tokens]
    return tokens

In [68]:
def noiseremoval_and_labelling(tweets, polarity):

    tweets_clean = cleanTweets(tweets)
    tweets_tokens = tokenizeTweets(tweets_clean)
    tweets_tokens = applyStoplist(tweets_tokens)
    tweets_tokens = stemTokens(tweets_tokens)
    
    tweets_labeled = [[token, polarity] for token in tweets_tokens]
    
    return tweets_labeled, tweets_tokens

def noiseremoval(tweets):

    tweets_clean = cleanTweets(tweets)
    tweets_tokens = tokenizeTweets(tweets_clean)
    tweets_tokens = applyStoplist(tweets_tokens)
    tweets_tokens = stemTokens(tweets_tokens)
    
    return tweets_tokens

## Convert pos/neg/neu tokens to a dictionary for NLTK NB Classifier ##
def createDict(tokenList):
    for tweetTokens in tokenList:
        yield dict([token, True] for token in tweetTokens)

### Preclassified tweets import

In [5]:
#### Reorganize all datasets into a unique dataframe
all_tweets = pd.DataFrame(0, index = [0], columns = ['origin','id','text','pol'])

for index in data1.index:
    polarity = data1.loc[index,'opos'] - data1.loc[index,'oneg']
    all_tweets.loc[index,:] = ['evalita16', data1.loc[index,'idtwitter'], data1.loc[index,'text'], polarity]
    
for index in data2.index:
    polarity = data2.loc[index,'pos'] - data2.loc[index,'neg']
    all_tweets.loc[all_tweets.shape[0]+1,:] = ['karolos', data2.loc[index,'dtwitter'], data2.loc[index,'TEXT'], polarity]

for index in range(50000):
    if data3.loc[index,'sentiment'] == 'NEUTRAL':
        polarity = 0
    elif data3.loc[index,'sentiment'] == 'POSITIVE':
        polarity = 1
    elif data3.loc[index,'sentiment'] == 'NEGATIVE':
        polarity = -1
    else:
        continue
    
    all_tweets.loc[all_tweets.shape[0]+1,:] = ['sport', data3.loc[index,'tweet_id'], data3.loc[index,'tweet_text'], polarity]

In [6]:
neutral = all_tweets.loc[all_tweets.loc[:,"pol"]== 0]
positive = all_tweets.loc[all_tweets.loc[:,"pol"]== 1]
negative = all_tweets.loc[all_tweets.loc[:,"pol"]== -1]
print("No. neutral: ", len(neutral))
print("No. positive: ", len(positive))
print("No. negative: ", len(negative))
total = len(neutral) + len(positive) + len(negative)

No. neutral:  44932
No. positive:  9048
No. negative:  5260


### Tweets preprocessing

In [75]:
lang = 'italian'
# Clean, tokenize and label the tweets
stoplist = stopwords.words(lang)
other_stopwords = "atUser URL"
stoplist = stoplist + other_stopwords.split() # final list of stopwords
stemmer = SnowballStemmer(lang) # set stemmer
    
it_neu_labeled, it_preClass_neu_tokens = noiseremoval_and_labelling(neutral.loc[:,'text'], 'NEUTRAL')
it_pos_labeled, it_preClass_pos_tokens = noiseremoval_and_labelling(positive.loc[:,'text'], 'POSITIVE')
it_neg_labeled, it_preClass_neg_tokens = noiseremoval_and_labelling(negative.loc[:,'text'], 'NEGATIVE')

it_dataset = it_pos_labeled + it_neg_labeled + it_neu_labeled
it_dataset = shuffle(pd.DataFrame(it_dataset))
it_dataset_train = it_dataset[:int(total*0.7)]
it_dataset_test = it_dataset[int(total*0.7):]
print(it_dataset_train[:5])

# split features and labels for SKLEARN classifier
features = it_dataset.loc[:, 0]
labels  = it_dataset.loc[:, 1]

                                                       0         1
51184                                          [bentorn]   NEUTRAL
39715                                   [apprec, pogba6]   NEUTRAL
27538  [55mln, ricav, magliett, sol, 24h, ronaldoalla...   NEUTRAL
23438  [oggi, poi, quand, chied, qual, giorn, bell, v...   NEUTRAL
11683  [govern, mont, mirabil, esemp, sinerg, fra, we...  NEGATIVE


##### NLTK NB Classifier 

In [None]:
## Convert pos/neg/neu tokens to a dictionary for NLTK NB Classifier ##
posDict = createDict(it_preClass_pos_tokens)
negDict = createDict(it_preClass_neg_tokens)
neuDict = createDict(it_preClass_neu_tokens)

positiveDataset = [(tweet_dict, "Positive") for tweet_dict in posDict]
negativeDataset = [(tweet_dict, "Negative") for tweet_dict in negDict]
neutralDataset = [(tweet_dict, "Neutral") for tweet_dict in neuDict]
modelData = positiveDataset + negativeDataset + neutralDataset
modelData = shuffle(modelData)

print("No. tweets in dictionary: ", len(modelData))
print(modelData[:3])

In [None]:
## NAIVE BAYES FROM NLTK ##
# NB classifier

# Training/test set
it_train = modelData[:int(total*0.7)] #70%
it_test = modelData[int(total*0.7):] #30%

NB_classifier = NaiveBayesClassifier.train(it_train)

print("\nAccuracy (NLTK NB classifier):", classify.accuracy(NB_classifier, it_test ))

###### SKLEARN  classifiers

In [76]:
## Prepare for classifier training (SKLEARN) ##
def dummy(doc):
    return doc

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(tokenizer=dummy,preprocessor=dummy,max_df=0.9, min_df=10)
X=cv.fit_transform(features).toarray()
y=labels

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

In [77]:
## NAIVE BAYES USING SKLEARN ##

#Import Multinomial Naive Bayes model (https://towardsdatascience.com/sentiment-analysis-of-tweets-using-multinomial-naive-bayes-1009ed24276b)
from sklearn.naive_bayes import MultinomialNB

#Create a multinomial NB Classifier
mnb = MultinomialNB()

#Train the model using the training sets
mnb.fit(X_train, y_train)

#Predict the response for test dataset
y_pred_NB = mnb.predict(X_test)

In [10]:
## RANDOM FOREST USING SKLEARN ##

# Fitting classifier to the Training set
from sklearn.ensemble import RandomForestClassifier

RFclassifier=RandomForestClassifier(n_estimators=200, random_state=0)
RFclassifier.fit(X_train,y_train)
# Predicting the Test set results
y_pred_RF = RFclassifier.predict(X_test)

In [32]:
from sklearn import metrics
from sklearn.metrics import classification_report
# NB report
print (classification_report(y_test, y_pred_NB))
print("\nAccuracy (NB):", metrics.accuracy_score(y_test, y_pred_NB))

              precision    recall  f1-score   support

    NEGATIVE       0.44      0.51      0.47      1571
     NEUTRAL       0.86      0.87      0.86     13520
    POSITIVE       0.55      0.46      0.50      2681

    accuracy                           0.78     17772
   macro avg       0.62      0.61      0.61     17772
weighted avg       0.77      0.78      0.77     17772


Accuracy (NB): 0.7757146072473554


In [None]:
# Random forest report
print ("\n", classification_report(y_test, y_pred_RF))
print("\nAccuracy (RF):", metrics.accuracy_score(y_test, y_pred_RF))

### Test with manually classified tweets

In [12]:
folder_path = r"C:\Users\barsanti\Desktop\Courses\Computational social media\project\2_training tools"
ita_doc_all = r"\tweets_manual_class_it.xlsx"
ita_doc_cov = r"\tweets_manual_class_it_covid.xlsx"

In [13]:
## load manually classified tweets (CH/Covid)
it_labeled_all = pd.read_excel(folder_path + ita_doc_all)
it_labeled_cov = pd.read_excel(folder_path + ita_doc_cov)
print(it_labeled_all.head())
print(len(it_labeled_all))
print(it_labeled_cov.head())
print(len(it_labeled_cov))

   Unnamed: 0                   id         day  hour  \
0           0  1244239906630775040  2020-03-29    12   
1           1  1244240206485742080  2020-03-29    12   
2           2  1244240370415930880  2020-03-29    12   
3           3  1244240702164340992  2020-03-29    12   
4           4  1244241096156332032  2020-03-29    12   

                             time  \
0  Sun Mar 29 12:28:05 +0000 2020   
1  Sun Mar 29 12:29:16 +0000 2020   
2  Sun Mar 29 12:29:55 +0000 2020   
3  Sun Mar 29 12:31:14 +0000 2020   
4  Sun Mar 29 12:32:48 +0000 2020   

                                                text lang  \
0                             @LordOfVenice Forza...   it   
1                    @Crocket81893187 :-( \nForza...   it   
2  Tra i pochi campionati ancora attivi, quello d...   it   
3  Desideriamo offire il nostro contributo concre...   it   
4  Aspettando che tutto torni come prima anche se...   it   

                                               place pol covid  
0  {'id'

In [14]:
# All tweets
neu = it_labeled_all.loc[it_labeled_all.loc[:,"pol"]== 0]
pos = it_labeled_all.loc[it_labeled_all.loc[:,"pol"]== 1]
neg = it_labeled_all.loc[it_labeled_all.loc[:,"pol"]== -1]
print("No. neutral (all): ", len(neu))
print("No. positive (all): ", len(pos))
print("No. negative (all): ", len(neg))
tot = len(neu) + len(pos) + len(neg)

it_neu_all_labeled, it_preClass_neu_all_tokens = noiseremoval_and_labelling(neu.loc[:,'text'], 'NEUTRAL')
it_pos_all_labeled, it_preClass_pos_all_tokens = noiseremoval_and_labelling(pos.loc[:,'text'], 'POSITIVE')
it_neg_all_labeled, it_preClass_neg_all_tokens = noiseremoval_and_labelling(neg.loc[:,'text'], 'NEGATIVE')

it_dataset_all = it_pos_all_labeled + it_neg_all_labeled + it_neu_all_labeled
it_dataset_all = shuffle(pd.DataFrame(it_dataset_all))


No. neutral (all):  90
No. positive (all):  60
No. negative (all):  61


In [15]:
# Covid tweets
neu_cov = it_labeled_cov.loc[it_labeled_cov.loc[:,"pol"]== 0]
pos_cov = it_labeled_cov.loc[it_labeled_cov.loc[:,"pol"]== 1]
neg_cov = it_labeled_cov.loc[it_labeled_cov.loc[:,"pol"]== -1]
print("No. neutral (cov): ", len(neu))
print("No. positive (cov): ", len(pos))
print("No. negative (cov): ", len(neg))
tot_cov = len(neu_cov) + len(pos_cov) + len(neg_cov)

it_neu_cov_labeled, it_preClass_neu_cov_tokens = noiseremoval_and_labelling(neu_cov.loc[:,'text'], 'NEUTRAL')
it_pos_cov_labeled, it_preClass_pos_cov_tokens = noiseremoval_and_labelling(pos_cov.loc[:,'text'], 'POSITIVE')
it_neg_cov_labeled, it_preClass_neg_cov_tokens = noiseremoval_and_labelling(neg_cov.loc[:,'text'], 'NEGATIVE')

it_dataset_cov = it_pos_cov_labeled + it_neg_cov_labeled + it_neu_cov_labeled
it_dataset_cov = shuffle(pd.DataFrame(it_dataset_cov))

No. neutral (cov):  90
No. positive (cov):  60
No. negative (cov):  61


In [16]:
# split features and labels for SKLEARN classifier

features_all = it_dataset_all.loc[:, 0]
labels_all  = it_dataset_all.loc[:, 1]
features_cov = it_dataset_cov.loc[:, 0]
labels_cov  = it_dataset_cov.loc[:, 1]

features1 = pd.concat([features_all, features], axis=0)
labels1  = pd.concat([labels_all, labels], axis=0)

features2 = pd.concat([features_cov, features], axis=0)
labels2  = pd.concat([labels_cov, labels], axis=0)

print("No. features (preclassified): ", len(features))
print("No. features (all): ", len(features1)-len(features))
print("No. features (covid): ", len(features2)-len(features))

# print(features[:3])
# print(features1[:3])
# print(features1[249:252])

No. features (preclassified):  59240
No. features (all):  211
No. features (covid):  112


In [17]:
## use manually labelled tweets (all) only as test set 

X1=cv.fit_transform(features1).toarray()
y1=labels1
# X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size = 249, shuffle = "FALSE", stratify=None)

test_all_index = len(it_dataset_all) # USE THIS INSTEAD OF SKLEARN HELPER TO ENSURE CORRECT SPLIT
X_test1, X_train1 = X1[:test_all_index], X1[test_all_index:]
y_test1, y_train1 = y1[:test_all_index], y1[test_all_index:]
print("Size (test set): ", len(X_test1))
print(y_test1[:3])

# sklearn multinomial NB classifier #
mnb.fit(X_train1, y_train1)
y_pred_NB1 = mnb.predict(X_test1)
print (classification_report(y_test1, y_pred_NB1))
print("\nAccuracy (NB) of manually classified tweets, all:", metrics.accuracy_score(y_test1, y_pred_NB1))

# sklearn random forest classfier #
RFclassifier.fit(X_train1,y_train1)
y_pred_RF1 = RFclassifier.predict(X_test1)
print (classification_report(y_test1, y_pred_RF1))
print("\nAccuracy (RF) of manually classified tweets, all:", metrics.accuracy_score(y_test1, y_pred_RF1))

Size (test set):  211
106    NEGATIVE
132     NEUTRAL
115    NEGATIVE
Name: 1, dtype: object
              precision    recall  f1-score   support

    NEGATIVE       0.67      0.13      0.22        61
     NEUTRAL       0.46      0.90      0.61        90
    POSITIVE       0.75      0.30      0.43        60

    accuracy                           0.51       211
   macro avg       0.63      0.44      0.42       211
weighted avg       0.60      0.51      0.45       211


Accuracy (NB) of manually classified tweets, all: 0.5071090047393365
              precision    recall  f1-score   support

    NEGATIVE       0.57      0.07      0.12        61
     NEUTRAL       0.42      0.88      0.57        90
    POSITIVE       0.50      0.13      0.21        60

    accuracy                           0.43       211
   macro avg       0.50      0.36      0.30       211
weighted avg       0.49      0.43      0.34       211


Accuracy (RF) of manually classified tweets, all: 0.4312796208530806


In [133]:
## use manually labelled tweets (covid) only as test set

X2=cv.fit_transform(features2).toarray()
y2=labels2
# X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size = 250, shuffle = "FALSE", stratify=None)

test_cov_index = len(it_dataset_cov) # USE THIS INSTEAD OF SKLEARN HELPER TO ENSURE CORRECT SPLIT
X_test2, X_train2 = X2[:test_cov_index], X2[test_cov_index:]
y_test2, y_train2 = y2[:test_cov_index], y2[test_cov_index:]
print("Size (test set): ", len(X_test2))
print(y_test2[:3])

# sklearn multinomial NB classifier #
mnb.fit(X_train2, y_train2)
y_pred_NB2 = mnb.predict(X_test2)
print (classification_report(y_test2, y_pred_NB2))
print("\nAccuracy (NB) of manually classified tweets, all:", metrics.accuracy_score(y_test2, y_pred_NB2))

Size (test set):  112
65    NEUTRAL
67    NEUTRAL
93    NEUTRAL
Name: 1, dtype: object
              precision    recall  f1-score   support

    NEGATIVE       0.30      0.55      0.39        22
     NEUTRAL       0.71      0.44      0.55        68
    POSITIVE       0.53      0.73      0.62        22

    accuracy                           0.52       112
   macro avg       0.52      0.57      0.52       112
weighted avg       0.60      0.52      0.53       112


Accuracy (NB) of manually classified tweets, all: 0.5178571428571429


In [None]:
# sklearn random forest classfier #
RFclassifier.fit(X_train2, y_train2)
y_pred_RF2 = RFclassifier.predict(X_test2)
print (classification_report(y_test2, y_pred_RF2))
print("\nAccuracy (RF) of manually classified tweets, all:", metrics.accuracy_score(y_test2, y_pred_RF2))

### Export the classifier for the analysis
    NAIVE BAYES USING SKLEARN

### Plot dynamic classification results

In [21]:
# List of days
days = ['2020-03-29','2020-03-30','2020-03-31']

for y in range(1,31):
    nday = str(y)
    if len(nday) == 1:
        nday = "0" + nday
    day = "2020-04-" +  nday
    days.append(day)

for y in range(1,12):
    nday = str(y)
    if len(nday) == 1:
        nday = "0" + nday
    day = "2020-05-" +  nday
    days.append(day)

In [112]:
# Import tweets
CH_tweets_path = r"C:\Users\barsanti\Desktop\Courses\Computational social media\project\CH_dataset"

# Export folder
export_path = r"C:\Users\barsanti\Desktop\Courses\Computational social media\project\1_descriptive analysis"

df_list = []
ref_list = []

for day in days:
    
    # Import the df of tweets as pickle file
    with open(CH_tweets_path + "\df_tweets_CH_" + day + ".pkl", "rb") as fp: 
        data_CH = pickle.load(fp)
    with open(CH_tweets_path + "\df_ref_tweets_CH_" + day + ".pkl", "rb") as fp: 
        data_ref_CH = pickle.load(fp)
        
    df_list.append(data_CH)
    ref_list.append(data_ref_CH)

In [118]:
def check(words, sentence):
    for word in words:
        if word in sentence:
            return True
    return False

def check_covidrelated_text(text):
    covid_hashtags = ["COVID2019", "COVID-19", "CORONAVIRUS", "CORONA", "VIRUS", "AUSGANSPERRE", "QUARANTINE",
           "BEVÖLKERUNG", "MASKEN", "STAYHOME", "COVID_19", "STAYATHOME", "CORONAKRISE",
            "CORONAVIRUSDELCASTIGOMINORE", "CORONAINFOCH", "COVID", "Desinfektion", "Spital",
           "Kranken", "infektion", "schutz", "Restriktionen", "BAG_OFSP_UFSP", "protectyourselfandothers",
           "Bundesrat"]
    
    melissa_words = ["covid", "covid19", "covid2019", "covid-19", "covid-2019", "covid_19", "covid_2019", "COVIDー19",
            "COVIDー2019", "corona", "virus", "coronavirus", "coronakrise", "coronacrisis", "corona-crise", 
            "pandemic", "coronapandemic", "coronavirusapandemic", "covidpandemic", "covid19pandemic", "covid2019pandemic" 
            "iorestoacasa", "forzalombardia", "quarantena", "covidswitzerland", "wtfockdown", "stayhome", "stayathome",
            "staysafe", "stayhomesavelives", "stayhomestaysafe", "lockdown", "socialdistancing", "distancing", "quarantine",
            "quarantinelife", "confinement", "confinementjour", "restezchezvous", "lavuedepuismonconfinement", 
            "BloqueoNoSolidaridadSi", "protectyourselfandothers", "coronavirusdelcastigominore", "coronainfoch", "masken",
            "masques", "masks", "homeoffice", "wfh", "workfromhome", "BAG_OFSP_UFSP", "restriktionen"]
    
    it_covid_hashtags = ['CoronavirusDelCastigoMinore','coronavirus','COVID19','Covid_19', 'COVID19italia', 
                     'iorestoacasa', 'andratuttobene', 'pandemia', 'Coronavirus', 'COVIDー19', 
                     'coronavirusitalIa', 'COVID2019italia', 'Covid19', 'versusvirus', 'covid19',
                     'coronavirusitalia', 'COVID2019','covid_19italia', 'StayAtHome', 'QuarantineLife']
    
    covid_hashtags = covid_hashtags + it_covid_hashtags + melissa_words
    
    covid_hashtags_upper = [k.upper() for k in covid_hashtags]
    
    sentence = text.upper()
    if check(covid_hashtags_upper, sentence):
        return 'yes'
    else:
        return 'no'
    
for n, df in enumerate(df_list):
    df_ref = ref_list[n]
    for i in df.index:
        if df.loc[i,'covid'] == 'no':
            df.loc[i,'covid'] = check_covidrelated_text(df_ref.loc[i,'ref_text'])

In [22]:
# Create a dataframe to collect all the classification info
df_classified = pd.DataFrame(0, index = days, columns = [('generic','pos'),
                                                         ('generic','neu'),
                                                         ('generic','neg'),
                                                         ('covid','pos'),
                                                         ('covid','neu'),
                                                         ('covid','neg')])

In [134]:
df_classified

Unnamed: 0,"(generic, pos)","(generic, neu)","(generic, neg)","(covid, pos)","(covid, neu)","(covid, neg)"
2020-03-29,0,0,0,0,0,0
2020-03-30,0,0,0,0,0,0
2020-03-31,0,0,0,0,0,0
2020-04-01,0,0,0,0,0,0
2020-04-02,0,0,0,0,0,0
2020-04-03,0,0,0,0,0,0
2020-04-04,0,0,0,0,0,0
2020-04-05,0,0,0,0,0,0
2020-04-06,0,0,0,0,0,0
2020-04-07,0,0,0,0,0,0


In [130]:
# Classify all the tweets
# Using sklearn mNB classifier
for n, day in enumerate(days):
    # classify all the tweets
    tweets = df_list[n].loc[:,'text']
    tokens = noiseremoval(tweets)
    tokens = pd.Series(tokens)
    new_features = pd.concat([tokens, features], axis=0)
    new_X=cv.fit_transform(new_features).toarray()
    y_pred = mnb.predict(new_X)
    
    for i, pol in enumerate(y_pred):
        if i < len(tokens):
            if pol == 'POSITIVE' and df_list[n].loc[:,'cov'] == 'yes':
                df_list[n].loc[i,'pol'] = 1
                df_classified.loc[day, '(covid,pos)'] += 1
            elif pol == 'POSITIVE' and df_list[n].loc[:,'cov'] == 'no':
                df_list[n].loc[i,'pol'] = 1
                df_classified.loc[day, '(generic,pos)'] += 1
            elif pol == 'NEGATIVE' and df_list[n].loc[:,'cov'] == 'yes':
                df_list[n].loc[i,'pol'] = -1
                df_classified.loc[day, '(covid,neg)'] += 1            
            elif pol == 'NEGATIVE' and df_list[n].loc[:,'cov'] == 'no':
                df_list[n].loc[i,'pol'] = -1
                df_classified.loc[day, '(generic,neg)'] += 1
            elif pol == 'NEUTRAL' and df_list[n].loc[:,'cov'] == 'yes':
                df_list[n].loc[i,'pol'] = 0
                df_classified.loc[day, '(covid,neu)'] += 1            
            elif pol == 'NEUTRAL' and df_list[n].loc[:,'cov'] == 'no':
                df_list[n].loc[i,'pol'] = 0
                df_classified.loc[day, '(generic,neu)'] += 1

df_classified.head()            

ValueError: shapes (64876,4987) and (4351,3) not aligned: 4987 (dim 1) != 4351 (dim 0)

In [None]:
fig = plt.subplots(figsize=(12, 6))
plt.bar(days,df_classified)
plt.xticks(rotation=90)
plt.tight_layout()
plt.title('Polarity')
plt.show()

In [129]:
# Test on a single sentence
sentence = ['non sono triste perchè mi è morto il arcotangente alla cosecante di alfa','sono triste perchè mi è morto il gatto']
tokens = noiseremoval(sentence)
tokens = pd.Series(tokens)
new_features = pd.concat([tokens, features], axis=0)

new_X=cv.fit_transform(new_features).toarray()

y_pred = mnb.predict(new_X)
y_pred[:len(tokens)]

array(['NEUTRAL', 'NEGATIVE'], dtype='<U8')

In [125]:
tweets = df_list[0].loc[:,'text']
tokens = noiseremoval(tweets)
tokens = pd.Series(tokens)
new_features = pd.concat([tokens, features], axis=0)
new_X=cv.fit_transform(new_features).toarray()

In [127]:
y_pred = mnb.predict(new_X)

ValueError: shapes (64876,4987) and (4351,3) not aligned: 4987 (dim 1) != 4351 (dim 0)