In [87]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

from textblob import TextBlob

import matplotlib.pyplot as plt

## Gaussian Naive Bayes with TextBlob 

In [69]:
namelist=['2021Apr.csv', '2020Oct.csv','2020Apr.csv','2019Oct.csv','2019Apr.csv','2018Oct.csv','2018Apr.csv','2017Oct.csv','2017Apr.csv']
df=pd.DataFrame(columns=['retweet_count', 'favorite_count','full_text','created_at'])


for i in namelist:
    dfi=pd.read_csv(i)
    dffi=dfi[['retweet_count', 'favorite_count','full_text','created_at']] # drop 'retweeted' as it's suspiciously all false
    df=pd.concat([df, dffi])

In [70]:
df

Unnamed: 0,retweet_count,favorite_count,full_text,created_at
0,0,0,Pregnant women should be offered Pfizer or Mod...,2021-04-20 23:07:25
1,17,0,RT @VernersViews: Covid Vaccines are 100% safe...,2021-04-20 23:02:16
2,85,0,RT @hanimomo: According to Observer: https://t...,2021-04-20 23:03:16
3,11,0,RT @BordersAgainst: University of Oxford who h...,2021-04-20 23:04:47
4,42,0,RT @CarryOnMargate: Woman had three brain surg...,2021-04-20 23:04:25
...,...,...,...,...
1195,1,0,RT @kasstanb: #Vegan #bodies &amp; #vaccine to...,2017-04-08 05:56:03
1196,91,0,RT @alanmcn1: come and join my new lab @IMIBir...,2017-04-08 09:37:09
1197,91,0,RT @alanmcn1: come and join my new lab @IMIBir...,2017-04-07 20:15:14
1198,46,0,RT @MailOnline: Scientists are working on an a...,2017-04-07 20:19:05


In [71]:
# clean tweets
import preprocessor as p
import re
# this package remove Reserved words (RT, FAV),Emojis,Smileys
# https://towardsdatascience.com/basic-tweet-preprocessing-in-python-efd8360d529e

df.full_text=[p.clean(i) for i in df.full_text]

# remove puncutations 
df.full_text = [re.sub(r'[^\w\s]', '', i) for i in df.full_text]

# remove colon 
df.full_text=[re.sub(':', '',i) for i in df.full_text]

In [72]:
# extract month to build classification label 
df.created_at=[i[:7] for i in df.created_at]

In [74]:
df1=df.copy()

In [75]:
df1['polarity']=[TextBlob(i).sentiment.polarity for i in df1.full_text] 
df1['subjectivity']=[TextBlob(i).sentiment.subjectivity for i in df1.full_text] 

In [76]:
df1=df1.drop(['full_text'], axis=1)

In [77]:
df1

Unnamed: 0,retweet_count,favorite_count,created_at,polarity,subjectivity
0,0,0,2021-04,0.333333,0.500000
1,17,0,2021-04,0.416667,0.500000
2,85,0,2021-04,0.000000,0.000000
3,11,0,2021-04,0.050000,0.650000
4,42,0,2021-04,-0.125000,0.375000
...,...,...,...,...,...
1195,1,0,2017-04,0.000000,0.000000
1196,91,0,2017-04,0.136364,0.454545
1197,91,0,2017-04,0.136364,0.454545
1198,46,0,2017-04,0.000000,0.000000


In [78]:
X = df1.drop(columns=['created_at'], axis=1)
y = df1['created_at']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
model = GaussianNB()
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [79]:
y_pred = model.predict(X_test)
print("Number of mislabeled points out of a total %d points : %d"    % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 2160 points : 1799


In [None]:
# below is before text cleaning...surprisingly more accurate

In [30]:
X = df2.drop(columns=['created_at'], axis=1)
y = df2['created_at']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
model = GaussianNB()
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [32]:
y_pred = model.predict(X_test)
print("Number of mislabeled points out of a total %d points : %d"    % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 2160 points : 1786


## Multinomial Naive Bayes with Tfidf

In [80]:
df2=df.copy()

In [81]:
tfidf_vectorizer = TfidfVectorizer()
df2['full_text'] = tfidf_vectorizer.fit_transform(df2['full_text']).toarray()

In [82]:
df2['full_text'].sum()

0.5592335247934099

In [93]:
X = df2.drop(columns=['created_at'], axis=1)
y = df2['created_at']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [95]:
y_pred = model.predict(X_test)
print("Number of mislabeled points out of a total %d points : %d"    % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 2160 points : 1940
