In [12]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

from textblob import TextBlob

import matplotlib.pyplot as plt

from nltk.corpus import stopwords

from textblob.classifiers import NaiveBayesClassifier

## Preprocessing

In [16]:
# import raw data  
namelist=['2021Apr.csv', '2020Oct.csv','2020Apr.csv','2019Oct.csv','2019Apr.csv','2018Oct.csv','2018Apr.csv','2017Oct.csv','2017Apr.csv']

df=pd.DataFrame(columns=['full_text','created_at'])

for i in namelist:
    dfi=pd.read_csv(i)
    dffi=dfi[['full_text','created_at']]
    df=pd.concat([df, dffi])

In [19]:
# inspect the merged data frame
df.head()

Unnamed: 0,full_text,created_at
0,Pregnant women should be offered Pfizer or Mod...,2021-04-20 23:07:25
1,RT @VernersViews: Covid Vaccines are 100% safe...,2021-04-20 23:02:16
2,RT @hanimomo: According to Observer: https://t...,2021-04-20 23:03:16
3,RT @BordersAgainst: University of Oxford who h...,2021-04-20 23:04:47
4,RT @CarryOnMargate: Woman had three brain surg...,2021-04-20 23:04:25


In [26]:
# make a copy in case we make irreversible mistakes 
di=df.copy()

In [28]:
# clean tweets
import preprocessor as p
import re
# this package remove reserved words (RT, FAV),Emojis,Smileys
# https://towardsdatascience.com/basic-tweet-preprocessing-in-python-efd8360d529e

# remove reserved words (RT, FAV),Emojis,Smileys
di.full_text=[p.clean(i) for i in di.full_text]

# remove puncutations 
di.full_text = [re.sub(r'[^\w\s]', '', i) for i in di.full_text]

# remove colon 
di.full_text=[re.sub(':', '',i) for i in di.full_text]

# lowercase all sentences
di.full_text=[x.lower() for x in di.full_text]


In [30]:
# extract month to build classification label 
di.created_at=[i[:7] for i in df.created_at]

In [31]:
# remove stopwords

dd=[]
for item in di['full_text']:    
    string = ' '.join([w for w in item.split() if w not in stopwords.words('english')])
    dd.append(string)

di['full_text']=dd

In [37]:
# optional
# removing stopwords takes long time. So better to export the results to later use it directly
di.to_csv('datasets without stopwords.csv')

In [2]:
# optional
# import data
di=pd.read_csv('datasets without stopwords.csv')

In [3]:
# make all tweet string
di['full_text']=[str(i) for i in di.full_text]

In [4]:
# Applying lemmatisation which transforms each words to its basic form.  
# POS(part of speech) tagging for lemmatisation
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# create function to adapt to POS tag in WordNet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None



In [5]:
# lemmatise each tweet
wnl = WordNetLemmatizer()
newlist=[]
for sentence in di.full_text:
    tokens = word_tokenize(sentence)  # tokenisation
    tagged_sent = pos_tag(tokens)     # get POS 
    
    lemmas_sent = []
    for tag in tagged_sent:
        wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN # assign a POS or assign noun
        lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos))
    temp=' '.join(lemmas_sent)
    newlist.append(temp)

In [47]:
# replace the old text column
di.full_text=newlist
# drop unnecessary columns, e.g. extra index
di=di[['full_text', 'created_at']]


In [47]:
# We inspect that 'amp' appears many times, which is a part of ampersand that didn't get cleanned thoroughly before
# so an extra step: remove amp
di.full_text=[i.replace(' amp ', ' ') for i in di.full_text]

In [48]:
# store the cleaned dataset 
di.to_csv('cleaned dataset.csv')

## Gaussian Naive Bayes with TextBlob 

In [20]:
# import dataset
di=pd.read_csv('cleaned dataset.csv')
di=di[['full_text', 'created_at']]
di.full_text=[str(i) for i in di.full_text]

In [4]:
# create dataset 1 which just has 3 time period. 
da=di.copy()
da=da[da.created_at.isin(['2021-04', '2020-10', '2020-04'])]

# create dataset 2 which just has 6 time period.
db=di.copy()
db=db[db.created_at.isin(['2021-04', '2020-10', '2020-04', '2019-10', '2019-04', '2018-10'])]

# create dataset 3 which just has 9 time period.
dc=di.copy()


In [5]:
# NBC model for dataset1 
# separate into training and test sections, check length of each

jtad_train_df, jtad_test_df = train_test_split(da, test_size=0.2, random_state=1)

print("training is ", len(jtad_train_df), " values long")
print("testing is ", len(jtad_test_df), " values long")

#convert the data frames to lists as textblob.classifier does not take data frames
jtad_train_list = jtad_train_df.values.tolist()
jtad_test_list = jtad_test_df.values.tolist()

# Train a textblob classifier on the training set
cl = NaiveBayesClassifier(jtad_train_list)

# check accuracy on test set
cl.accuracy(jtad_test_list)

training is  2880  values long
testing is  720  values long


0.9027777777777778

In [6]:
# NBC model for dataset2
# separate into training and test sections, check length of each

jtad_train_df, jtad_test_df = train_test_split(db, test_size=0.2, random_state=1)

print("training is ", len(jtad_train_df), " values long")
print("testing is ", len(jtad_test_df), " values long")

#convert the data frames to lists as textblob.classifier does not take data frames
jtad_train_list = jtad_train_df.values.tolist()
jtad_test_list = jtad_test_df.values.tolist()

# Train a textblob classifier on the training set
cl = NaiveBayesClassifier(jtad_train_list)

# check accuracy on test set
cl.accuracy(jtad_test_list)

training is  5760  values long
testing is  1440  values long


0.8444444444444444

In [7]:
# NBC model for dataset3
# separate into training and test sections, check length of each

jtad_train_df, jtad_test_df = train_test_split(dc, test_size=0.2, random_state=1)

print("training is ", len(jtad_train_df), " values long")
print("testing is ", len(jtad_test_df), " values long")

#convert the data frames to lists as textblob.classifier does not take data frames
jtad_train_list = jtad_train_df.values.tolist()
jtad_test_list = jtad_test_df.values.tolist()

# Train a textblob classifier on the training set
cl = NaiveBayesClassifier(jtad_train_list)

# check accuracy on test set
cl.accuracy(jtad_test_list)

training is  8640  values long
testing is  2160  values long


0.7925925925925926

In [26]:
# Order the train data and repeat the process to explore whether order matters for NBC
# NBC model for dataset1
# separate into training and test sections, check length of each

jtad_train_df, jtad_test_df = train_test_split(da, test_size=0.2, random_state=1)
jtad_train_df.sort_values(by=['created_at'])

print("training is ", len(jtad_train_df), " values long")
print("testing is ", len(jtad_test_df), " values long")


#convert the data frames to lists as textblob.classifier does not take data frames
jtad_train_list = jtad_train_df.values.tolist()
jtad_test_list = jtad_test_df.values.tolist()

# Train a textblob classifier on the training set
cl = NaiveBayesClassifier(jtad_train_list)

# check accuracy on test set
print('accuracy for sorted train set is', cl.accuracy(jtad_test_list))


training is  2880  values long
testing is  720  values long
accuracy for sorted train set is 0.9013888888888889


In [27]:
# Order the train data and repeat the process to explore whether order matters for NBC
# NBC model for dataset2
# separate into training and test sections, check length of each

jtad_train_df, jtad_test_df = train_test_split(db, test_size=0.2, random_state=1)
jtad_train_df.sort_values(by=['created_at'])

print("training is ", len(jtad_train_df), " values long")
print("testing is ", len(jtad_test_df), " values long")


#convert the data frames to lists as textblob.classifier does not take data frames
jtad_train_list = jtad_train_df.values.tolist()
jtad_test_list = jtad_test_df.values.tolist()

# Train a textblob classifier on the training set
cl = NaiveBayesClassifier(jtad_train_list)

# check accuracy on test set
print('accuracy for sorted train set is', cl.accuracy(jtad_test_list))


training is  5760  values long
testing is  1440  values long
accuracy for sorted train set is 0.8513888888888889


In [28]:
# Order the train data and repeat the process to explore whether order matters for NBC
# NBC model for dataset3
# separate into training and test sections, check length of each

jtad_train_df, jtad_test_df = train_test_split(dc, test_size=0.2, random_state=1)
jtad_train_df.sort_values(by=['created_at'])

print("training is ", len(jtad_train_df), " values long")
print("testing is ", len(jtad_test_df), " values long")


#convert the data frames to lists as textblob.classifier does not take data frames
jtad_train_list = jtad_train_df.values.tolist()
jtad_test_list = jtad_test_df.values.tolist()

# Train a textblob classifier on the training set
cl = NaiveBayesClassifier(jtad_train_list)

# check accuracy on test set
print('accuracy for sorted train set is', cl.accuracy(jtad_test_list))


training is  8640  values long
testing is  2160  values long
accuracy for sorted train set is 0.7921296296296296


In [30]:
# now ordered and unordered training have same results.
# is this the correct way to put in ordered data? -Anran

## Multinomial Naive Bayes with Tfidf

In [80]:
df2=df.copy()

In [81]:
tfidf_vectorizer = TfidfVectorizer()
df2['full_text'] = tfidf_vectorizer.fit_transform(df2['full_text']).toarray()

In [82]:
df2['full_text'].sum()

0.5592335247934099

In [93]:
X = df2.drop(columns=['created_at'], axis=1)
y = df2['created_at']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [95]:
y_pred = model.predict(X_test)
print("Number of mislabeled points out of a total %d points : %d"    % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 2160 points : 1940
