In [9]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

from textblob import TextBlob

import matplotlib.pyplot as plt

from nltk.corpus import stopwords

from textblob.classifiers import NaiveBayesClassifier

## Gaussian Naive Bayes with TextBlob 

In [2]:
namelist=['2021Apr.csv', '2020Oct.csv','2020Apr.csv','2019Oct.csv','2019Apr.csv','2018Oct.csv','2018Apr.csv','2017Oct.csv','2017Apr.csv']
df=pd.DataFrame(columns=['retweet_count', 'favorite_count','full_text','created_at'])


for i in namelist:
    dfi=pd.read_csv(i)
    dffi=dfi[['retweet_count', 'favorite_count','full_text','created_at']] # drop 'retweeted' as it's suspiciously all false
    df=pd.concat([df, dffi])

In [8]:
df

Unnamed: 0,retweet_count,favorite_count,full_text,created_at
0,0,0,Pregnant women should be offered Pfizer or Mod...,2021-04-20 23:07:25
1,17,0,RT @VernersViews: Covid Vaccines are 100% safe...,2021-04-20 23:02:16
2,85,0,RT @hanimomo: According to Observer: https://t...,2021-04-20 23:03:16
3,11,0,RT @BordersAgainst: University of Oxford who h...,2021-04-20 23:04:47
4,42,0,RT @CarryOnMargate: Woman had three brain surg...,2021-04-20 23:04:25
...,...,...,...,...
1195,1,0,RT @kasstanb: #Vegan #bodies &amp; #vaccine to...,2017-04-08 05:56:03
1196,91,0,RT @alanmcn1: come and join my new lab @IMIBir...,2017-04-08 09:37:09
1197,91,0,RT @alanmcn1: come and join my new lab @IMIBir...,2017-04-07 20:15:14
1198,46,0,RT @MailOnline: Scientists are working on an a...,2017-04-07 20:19:05


In [26]:
di=df.copy()

In [27]:
di=di[['full_text','created_at']]

In [28]:
# clean tweets
import preprocessor as p
import re
# this package remove Reserved words (RT, FAV),Emojis,Smileys
# https://towardsdatascience.com/basic-tweet-preprocessing-in-python-efd8360d529e

di.full_text=[p.clean(i) for i in di.full_text]

# remove puncutations 
di.full_text = [re.sub(r'[^\w\s]', '', i) for i in di.full_text]

# remove colon 
di.full_text=[re.sub(':', '',i) for i in di.full_text]

# lowercase all sentences
di.full_text=[x.lower() for x in di.full_text]

In [29]:
di

Unnamed: 0,full_text,created_at
0,pregnant women should be offered pfizer or mod...,2021-04-20 23:07:25
1,covid vaccines are safe for pregnant women t...,2021-04-20 23:02:16
2,according to observer,2021-04-20 23:03:16
3,university of oxford who have developed a vac...,2021-04-20 23:04:47
4,woman had three brain surgeries due to blood ...,2021-04-20 23:04:25
...,...,...
1195,amp toxins court orders mother to immunise,2017-04-08 05:56:03
1196,come and join my new lab phd studentship on ...,2017-04-08 09:37:09
1197,come and join my new lab phd studentship on ...,2017-04-07 20:15:14
1198,scientists are working on an acne vaccine tha...,2017-04-07 20:19:05


In [30]:
# extract month to build classification label 
di.created_at=[i[:7] for i in df.created_at]

In [31]:
# remove stopwords

dd=[]
for item in di['full_text']:    
    string = ' '.join([w for w in item.split() if w not in stopwords.words('english')])
    dd.append(string)

di['full_text']=dd

In [37]:
# removing stopwords takes long time. So better to export the results to later use it directly
di.to_csv('datasets without stopwords.csv')


In [2]:
di=pd.read_csv('datasets without stopwords.csv')

In [3]:
# make all tweet string
di['full_text']=[str(i) for i in di.full_text]

In [4]:
# POS
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# 获取单词的词性
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

# sentence = 'football is a family of team sports that involve, to varying degrees, kicking a ball to score a goal.'
# tokens = word_tokenize(sentence)  # 分词
# tagged_sent = pos_tag(tokens)     # 获取单词词性


In [5]:
# lemmatise
wnl = WordNetLemmatizer()
newlist=[]
for sentence in di.full_text:
    tokens = word_tokenize(sentence)  # tokenisation
    tagged_sent = pos_tag(tokens)     # get POS 
    
    lemmas_sent = []
    for tag in tagged_sent:
        wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN # assign a POS or assign noun
        lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos))
    temp=' '.join(lemmas_sent)
    newlist.append(temp)

In [47]:
di.full_text=newlist
di=di[['full_text', 'created_at']]


In [49]:
di.to_csv('cleaned dataset.csv')

In [16]:
di=pd.read_csv('cleaned dataset.csv')
di=di[['full_text', 'created_at']]
di.full_text=[str(i) for i in di.full_text]

In [17]:
# create dataset 1 which just has 3 time period. (split train-test and see performance) 
da=di.copy()
da=da[da.created_at.isin(['2021-04', '2020-10', '2020-04'])]

# create dataset 2 which just has 6 time period.
db=di.copy()
db=db[db.created_at.isin(['2021-04', '2020-10', '2020-04', '2019-10', '2019-04', '2018-10'])]

# create dataset 3 which just has 9 time period.
dc=di.copy()

# both NB and RNN will be tested in this way. 

In [8]:
di.created_at.unique()

array(['2021-04', '2020-10', '2020-04', '2019-10', '2019-04', '2018-10',
       '2018-04', '2017-10', '2017-04'], dtype=object)

In [18]:
da

Unnamed: 0,full_text,created_at
0,pregnant woman offer pfizer moderna vaccine sa...,2021-04
1,covid vaccine safe pregnant woman lieoriginal ...,2021-04
2,accord observer,2021-04
3,university oxford develop vaccine alongside as...,2021-04
4,woman three brain surgery due blood clot jampj...,2021-04
...,...,...
3595,team also successfully produce synthetic sarsc...,2020-04
3596,djokovic refuse vaccine marvellous rest supers...,2020-04
3597,cpi join national taskforce identify develop v...,2020-04
3598,wish share optimism paul sadly year watch anti...,2020-04


In [19]:
# da
# separate into training and test sections, check length of each

jtad_train_df, jtad_test_df = train_test_split(da, test_size=0.2, random_state=1)

print("training is ", len(jtad_train_df), " values long")
print("testing is ", len(jtad_test_df), " values long")

#convert the data frames to lists as textblob.classifier does not take data frames
jtad_train_list = jtad_train_df.values.tolist()
jtad_test_list = jtad_test_df.values.tolist()

# Train a textblob classifier on the training set
cl = NaiveBayesClassifier(jtad_train_list)

# check accuracy on test set
cl.accuracy(jtad_test_list)

training is  2880  values long
testing is  720  values long


0.9013888888888889

In [20]:
# db
# separate into training and test sections, check length of each

jtad_train_df, jtad_test_df = train_test_split(db, test_size=0.2, random_state=1)

print("training is ", len(jtad_train_df), " values long")
print("testing is ", len(jtad_test_df), " values long")

#convert the data frames to lists as textblob.classifier does not take data frames
jtad_train_list = jtad_train_df.values.tolist()
jtad_test_list = jtad_test_df.values.tolist()

# Train a textblob classifier on the training set
cl = NaiveBayesClassifier(jtad_train_list)

# check accuracy on test set
cl.accuracy(jtad_test_list)

training is  5760  values long
testing is  1440  values long


0.8513888888888889

In [21]:
# dc
# separate into training and test sections, check length of each

jtad_train_df, jtad_test_df = train_test_split(dc, test_size=0.2, random_state=1)

print("training is ", len(jtad_train_df), " values long")
print("testing is ", len(jtad_test_df), " values long")

#convert the data frames to lists as textblob.classifier does not take data frames
jtad_train_list = jtad_train_df.values.tolist()
jtad_test_list = jtad_test_df.values.tolist()

# Train a textblob classifier on the training set
cl = NaiveBayesClassifier(jtad_train_list)

# check accuracy on test set
cl.accuracy(jtad_test_list)

training is  8640  values long
testing is  2160  values long


0.7921296296296296

In [26]:
# Order the train data
# da
# separate into training and test sections, check length of each

jtad_train_df, jtad_test_df = train_test_split(da, test_size=0.2, random_state=1)
jtad_train_df.sort_values(by=['created_at'])

print("training is ", len(jtad_train_df), " values long")
print("testing is ", len(jtad_test_df), " values long")


#convert the data frames to lists as textblob.classifier does not take data frames
jtad_train_list = jtad_train_df.values.tolist()
jtad_test_list = jtad_test_df.values.tolist()

# Train a textblob classifier on the training set
cl = NaiveBayesClassifier(jtad_train_list)

# check accuracy on test set
print('accuracy for sorted train set is', cl.accuracy(jtad_test_list))


training is  2880  values long
testing is  720  values long
accuracy for sorted train set is 0.9013888888888889


In [27]:
# Order the train data
# db
# separate into training and test sections, check length of each

jtad_train_df, jtad_test_df = train_test_split(db, test_size=0.2, random_state=1)
jtad_train_df.sort_values(by=['created_at'])

print("training is ", len(jtad_train_df), " values long")
print("testing is ", len(jtad_test_df), " values long")


#convert the data frames to lists as textblob.classifier does not take data frames
jtad_train_list = jtad_train_df.values.tolist()
jtad_test_list = jtad_test_df.values.tolist()

# Train a textblob classifier on the training set
cl = NaiveBayesClassifier(jtad_train_list)

# check accuracy on test set
print('accuracy for sorted train set is', cl.accuracy(jtad_test_list))


training is  5760  values long
testing is  1440  values long
accuracy for sorted train set is 0.8513888888888889


In [28]:
# Order the train data
# dc
# separate into training and test sections, check length of each

jtad_train_df, jtad_test_df = train_test_split(dc, test_size=0.2, random_state=1)
jtad_train_df.sort_values(by=['created_at'])

print("training is ", len(jtad_train_df), " values long")
print("testing is ", len(jtad_test_df), " values long")


#convert the data frames to lists as textblob.classifier does not take data frames
jtad_train_list = jtad_train_df.values.tolist()
jtad_test_list = jtad_test_df.values.tolist()

# Train a textblob classifier on the training set
cl = NaiveBayesClassifier(jtad_train_list)

# check accuracy on test set
print('accuracy for sorted train set is', cl.accuracy(jtad_test_list))


training is  8640  values long
testing is  2160  values long
accuracy for sorted train set is 0.7921296296296296


In [30]:
# now ordered and unordered training have same results.
# is this the correct way to put in ordered data? -Anran

In [None]:
# put 3 data sets into NBC and RNN
# get accuracy, put into table. 6 values

In [None]:
# order the train set and test with NBC. 3 values

In [None]:
# plot word cloud for each period

In [None]:
# any further edition for RNN?

In [50]:
# separate into training and test sections, check length of each

jtad_train_df, jtad_test_df = train_test_split(di, test_size=0.2, random_state=1)

print("training is ", len(jtad_train_df), " values long")
print("testing is ", len(jtad_test_df), " values long")

training is  8640  values long
testing is  2160  values long


In [23]:
#convert the data frames to lists as textblob.classifier does not take data frames

jtad_train_list = jtad_train_df.values.tolist()
jtad_test_list = jtad_test_df.values.tolist()

In [24]:
# Train a textblob classifier on the training set
cl = NaiveBayesClassifier(jtad_train_list)

In [25]:
# check accuracy on test set

cl.accuracy(jtad_test_list)

0.8009259259259259

In [74]:
df1=df.copy()

In [75]:
df1['polarity']=[TextBlob(i).sentiment.polarity for i in df1.full_text] 
df1['subjectivity']=[TextBlob(i).sentiment.subjectivity for i in df1.full_text] 

In [76]:
df1=df1.drop(['full_text'], axis=1)

In [77]:
df1

Unnamed: 0,retweet_count,favorite_count,created_at,polarity,subjectivity
0,0,0,2021-04,0.333333,0.500000
1,17,0,2021-04,0.416667,0.500000
2,85,0,2021-04,0.000000,0.000000
3,11,0,2021-04,0.050000,0.650000
4,42,0,2021-04,-0.125000,0.375000
...,...,...,...,...,...
1195,1,0,2017-04,0.000000,0.000000
1196,91,0,2017-04,0.136364,0.454545
1197,91,0,2017-04,0.136364,0.454545
1198,46,0,2017-04,0.000000,0.000000


In [78]:
X = df1.drop(columns=['created_at'], axis=1)
y = df1['created_at']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
model = GaussianNB()
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [79]:
y_pred = model.predict(X_test)
print("Number of mislabeled points out of a total %d points : %d"    % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 2160 points : 1799


In [None]:
# below is before text cleaning...surprisingly more accurate

In [30]:
X = df2.drop(columns=['created_at'], axis=1)
y = df2['created_at']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
model = GaussianNB()
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [32]:
y_pred = model.predict(X_test)
print("Number of mislabeled points out of a total %d points : %d"    % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 2160 points : 1786


## Multinomial Naive Bayes with Tfidf

In [80]:
df2=df.copy()

In [81]:
tfidf_vectorizer = TfidfVectorizer()
df2['full_text'] = tfidf_vectorizer.fit_transform(df2['full_text']).toarray()

In [82]:
df2['full_text'].sum()

0.5592335247934099

In [93]:
X = df2.drop(columns=['created_at'], axis=1)
y = df2['created_at']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [95]:
y_pred = model.predict(X_test)
print("Number of mislabeled points out of a total %d points : %d"    % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 2160 points : 1940
