In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from tabulate import tabulate
import re
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize as w_t 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
from autocorrect import Speller
from sklearn.feature_extraction.text import TfidfTransformer
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
nltk.download('wordnet')
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/BharathBandaru/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/BharathBandaru/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/BharathBandaru/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 1. Data Partitioning

In [None]:
# checking number of true and false values for each labeled data files from last phase of project.
# output folder path
# IGNORE THIS STEP IF ERROR PERSIST
os.chdir("../A_2_output")
fl = os.listdir()
df_list = []
for i in fl:
    print(i)
    df = pd.read_csv(i)
    df_list.append(df)
    #print(df.describe())
    a = list(df.label)
    print("True's: {0} False's: {1}".format(
        a.count(True), a.count(False)),"\n")

In [15]:
# we choose vaccination as we have majority of T's

# using twitter data as my primary dataset(assigned primary topic)
df = pd.read_csv("../A_2_output/twitter_topic_vaccination.csv")
# using change org data as my secondary dataset
sec_df = pd.read_csv("../A_2_output/change.org_topic_vaccination.csv")

print('Shape of the prim data: ', df.shape)
df["label"] = df["label"].astype(int)
print(df.head(),"\n\n")

print('Shape of the sec data: ', sec_df.shape)
sec_df["label"] = sec_df["label"].astype(int)
print(sec_df.head())

Shape of the prim data:  (1195, 2)
                                                text  label
0  Putin After Announcing #CovidVaccine #Russian ...      1
1  Courtesy: WA! #WhatsApp #COVID #CovidVaccine h...      1
2  4 of the vaccines Jared bought are expected to...      1
3  One day you will realize CDC Guidelines magica...      0
4  Im far from lying.  Current CDC guidelines is ...      1 


Shape of the sec data:  (1500, 2)
                                                text  label
0         Language Education in the Time of COVID-19      0
1                                 COVID-19 Test Kits      0
2                                 COVID 19 IN PRISON      0
3                                     Get Waled Home      0
4  Make pass/fail available for Mississippi State...      0


In [16]:
# split train and test data 
# [train - 70%, test - 30%]
text_train, text_test, y_train, y_test = train_test_split(df.text,df.label,test_size = 0.33)

print(text_train.shape)
print(text_test.shape)
print(y_train.shape)
print(y_test.shape)

(800,)
(395,)
(800,)
(395,)


## 2. Baseline model training
Here we are using below models:
1. Logistic regression with no penality
2. Logistic regression with l2 penality
3. RandomForest Classificatin 

In [18]:
# Normalizing data(pre-process) and tranforming data to matrix
vectorizer = TfidfVectorizer(min_df=3, stop_words="english").fit(text_train)
X_train = vectorizer.transform(text_train)
X_test = vectorizer.transform(text_test)
X_sec_data = vectorizer.transform(sec_df.text)

### a. Logistic regression

In [19]:
# Logistic regression model with no penality
lr_model = LogisticRegression(penalty="none", 
                              multi_class="multinomial",
                              solver="lbfgs").fit(X_train, y_train)
y_pred_lr_test = lr_model.predict(X_test)
y_sec_data_predict = lr_model.predict(X_sec_data)

####  Primary test dataset

In [20]:
# primary test data accuracy with logistic regression(no penality)
print("Logistic regression with no penality - Accuracy: ", accuracy_score(y_test, y_pred_lr_test),"\n")
print(classification_report(y_test, y_pred_lr_test))

Logistic regression with no penality - Accuracy:  0.8734177215189873 

              precision    recall  f1-score   support

           0       0.88      0.85      0.86       187
           1       0.87      0.89      0.88       208

    accuracy                           0.87       395
   macro avg       0.87      0.87      0.87       395
weighted avg       0.87      0.87      0.87       395



####  Secondary dataset

In [21]:
# secondary test data accuracy with logistic regression(no penality)
print("Logistic regression with no penality - Accuracy[SECONDARY DATA]: ",
      accuracy_score(sec_df.label, y_sec_data_predict),"\n")
print(classification_report(sec_df.label, y_sec_data_predict))

Logistic regression with no penality - Accuracy[SECONDARY DATA]:  0.5373333333333333 

              precision    recall  f1-score   support

           0       0.99      0.53      0.69      1474
           1       0.03      0.77      0.05        26

    accuracy                           0.54      1500
   macro avg       0.51      0.65      0.37      1500
weighted avg       0.98      0.54      0.68      1500



### b. Logistic regression with l2 penality

In [22]:
# Logistic regression model with penality
lr2_model_w_penality = LogisticRegression(penalty="l2", 
                               solver="lbfgs",
                               multi_class="multinomial",
                               max_iter=1001,
                               C=9).fit(X_train, y_train)
y_pred_lr_penality_test = lr2_model_w_penality.predict(X_test)
y_sec_data_predict_w_pen = lr2_model_w_penality.predict(X_sec_data)

####  Primary test dataset

In [23]:
# primary test data accuracy using logistic regression(no penality)
print("Logistic regression with l2 penality - Accuracy: ",accuracy_score(y_test, y_pred_lr_penality_test),"\n")
print(classification_report(y_test, y_pred_lr_penality_test))

Logistic regression with l2 penality - Accuracy:  0.8835443037974684 

              precision    recall  f1-score   support

           0       0.90      0.84      0.87       187
           1       0.87      0.92      0.89       208

    accuracy                           0.88       395
   macro avg       0.89      0.88      0.88       395
weighted avg       0.88      0.88      0.88       395



#### Secondary dataset

In [24]:
# secondary test data accuracy using logistic regression(l2 penality)
print("Logistic regression with no penality - Accuracy[SECONDARY DATA]: ",
      accuracy_score(sec_df.label, y_sec_data_predict_w_pen),"\n")
print(classification_report(sec_df.label, y_sec_data_predict_w_pen))

Logistic regression with no penality - Accuracy[SECONDARY DATA]:  0.5753333333333334 

              precision    recall  f1-score   support

           0       0.99      0.57      0.73      1474
           1       0.03      0.73      0.06        26

    accuracy                           0.58      1500
   macro avg       0.51      0.65      0.39      1500
weighted avg       0.98      0.58      0.71      1500



### c. Random Forest Classification

In [25]:
# Classification model with RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
rf_pred = clf.predict(X_test)
rf_sec_pred = clf.predict(X_sec_data)

In [26]:
# primary and secondary test data accuracy using Random Forest
print("RandomForestClassifier - primary test data - Accuracy: ", accuracy_score(y_test, rf_pred),"\n")
print("RandomForestClassifier - secondary test data - Accuracy: ", accuracy_score(sec_df.label, rf_sec_pred),"\n")

RandomForestClassifier - primary test data - Accuracy:  0.9012658227848102 

RandomForestClassifier - secondary test data - Accuracy:  0.6453333333333333 



## 3. Model evaluation-1
Below is the tabular form of above evaluated data with all models accuracy.

In [27]:
table = [["Logistic Regression with no penality - Primary test data", accuracy_score(y_test, y_pred_lr_test)], 
                                              ['Logistic Regression with no penality - Secondary data', accuracy_score(sec_df.label, y_sec_data_predict)],
                                              ['Logistic Regression with l2 penality - Primary test data', accuracy_score(y_test, y_pred_lr_penality_test)],
                                              ['Logistic Regression with l2 penality - Secondary data', accuracy_score(sec_df.label, y_sec_data_predict_w_pen)],
                                              ['Random Forest Classifier - Primary test data', accuracy_score(y_test, rf_pred)],
                                              ['Random Forest Classifier - Secondary data', accuracy_score(sec_df.label, rf_sec_pred)],
                                             ]
print(tabulate(table, headers=['Model', 'accuracy']))

Model                                                       accuracy
--------------------------------------------------------  ----------
Logistic Regression with no penality - Primary test data    0.873418
Logistic Regression with no penality - Secondary data       0.537333
Logistic Regression with l2 penality - Primary test data    0.883544
Logistic Regression with l2 penality - Secondary data       0.575333
Random Forest Classifier - Primary test data                0.901266
Random Forest Classifier - Secondary data                   0.645333


## 4. Feature engineering

### a. Pre-processing
1. Removing all non alphabetic using regular expression([^A-Za-z#]).
2. Converting text to lower case
3. Converting each words to there stems(using stremmer)

In [28]:
# Pre processing data(stem, applying regex to remove links, etc..) 

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

#extract hash tags list(map(lambda x, y: x - y, [2, 4, 6], [1, 3, 5]))
def extract_hash_tags(s):
    return set(part[1:] for part in s.split() if part.startswith('#'))

# Tokenize and lemmatize
def preprocess(text):
    text = (re.sub('[^#A-Za-z]', ' ', re.sub(r'http\S+', '', text))).lower()
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    result.extend(list(map(lambda x:"#"+x,extract_hash_tags(text))))       
    return " ".join(result)

In [29]:
post_df_text = []
post_sec_df_text = []

for doc in df.text:
    post_df_text.append(preprocess(doc))
for doc in sec_df.text:
    post_sec_df_text.append(preprocess(doc))

In [30]:
print(len(post_df_text))
print(len(post_sec_df_text))
post_df_text[:3]

1195
1500


['putin announc covidvaccin russian #covidvaccine #russian',
 'courtesi whatsapp covid covidvaccin #covidvaccine #covid #whatsapp',
 'vaccin jar buy expect fail trump refus join global']

In [31]:
# creating the feature matrix 
matrix = CountVectorizer(max_features=1500)
X = matrix.fit_transform(post_df_text).toarray()
y = df.label

In [32]:
# splitting data to train and test
X_train_new, X_test_new, y_train_new, y_test_new, df_train, df_test = train_test_split(X, y, post_df_text, test_size = 0.33)

X_sec_test_new = matrix.fit_transform(post_sec_df_text).toarray()
y_sec_test_new = sec_df.label

print(X_train_new.shape)
print(y_train_new.shape)
print(X_test_new.shape)
print(y_test_new.shape)
print(X_sec_test_new.shape)
print(y_sec_test_new.shape)

(800, 1500)
(800,)
(395, 1500)
(395,)
(1500, 1500)
(1500,)


In [33]:
# Transform matrix to tf*idf vector
tf_transformer = TfidfTransformer(use_idf=False).fit(X)
X_train_tf = tf_transformer.transform(X)

### b. Calculating accuracy before adding features

In [34]:
# Train a text categorization model before feature addition through vector

# Logistic regression model with l2 penality after feature addition
lr2_model = LogisticRegression(penalty="l2", 
                               solver="lbfgs",
                               multi_class="multinomial",
                               max_iter=1001,
                               C=10).fit(X_train_new, y_train_new)
y_lr2_test_w_pre = lr2_model.predict(X_test_new)

print(accuracy_score(y_test_new, y_lr2_test_w_pre))


0.9164556962025316


In [35]:
clf = RandomForestClassifier()
clf.fit(np.array(X_train_new), y_train_new)
pred = clf.predict(X_test_new)
before_pre_accu = accuracy_score(y_test_new,pred)
print("Primary test data: ",before_pre_accu)

Primary test data:  0.9417721518987342


In [36]:
n_gram_range = (1, 1)
stop_words = "english"

# here newdf is the text with vaccinations as true
newdf = df.query('label == 1')
newdf = pd.DataFrame(newdf["text"])
# Extract all words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit(newdf.text)
all_words = count.get_feature_names_out()

In [37]:
newdf.shape

(687, 1)

In [38]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding = model.encode(df.text)
word_embeddings = model.encode(all_words)
top = 5
distances = cosine_similarity(doc_embedding, word_embeddings)
keywords = [all_words[index] for index in distances.argsort()[0][-top:]]

In [41]:
# extracting list of hashtags
listofhtags = []
for each in newdf.text:
    x = re.findall(r"#(\w+)", each)
    listofhtags.extend(x)
listofhtags = set(listofhtags)
listofhtags

{'038',
 '4ginkashmir',
 'ABNTelugu',
 'Aadhar',
 'AatmaNirbharBharat',
 'America',
 'AstraZeneca',
 'Australia',
 'BREAKING',
 'BharatBiotech',
 'BigPharma',
 'BiharElections',
 'BiharElections2020',
 'Biharis',
 'BillGates',
 'BillGatesBioTerrorist',
 'Bitcoin',
 'Brahmans',
 'Brazil',
 'Breaking',
 'BurnolForAirasia',
 'CCP',
 'CDC',
 'CDCGuidelines',
 'COVAXIN',
 'COVID',
 'COVID19',
 'COVID1984',
 'COVID19Aus',
 'COVID19India',
 'COVID19pt',
 'COVID19vaccine',
 'COVID2019',
 'COVIDIOTS',
 'COVIDLongHaulers',
 'COVIDVACCINE',
 'COVIDVIC19',
 'COVIDVaccine',
 'COVID_19',
 'COVID__19',
 'COVIDvaccine',
 'COVIDー19',
 'COVISHIELD',
 'Cargo',
 'ChildAbuse',
 'China',
 'Chinese',
 'Corona',
 'CoronaUpdatesInIndia',
 'CoronaVaccine',
 'CoronaVirus',
 'CoronaVirusUpdates',
 'Coronavirus',
 'CoronavirusOutbreak',
 'CoronavirusPandemic',
 'CoronavirusVaccine',
 'Covaxin',
 'Covid',
 'Covid19',
 'Covid19Millionaires',
 'Covid19UK',
 'CovidHoax',
 'CovidVaccine',
 'CovidVaccineRace',
 'Covid_1

### c. Adding Features

In [43]:
# feature engineering functions 
def check_new_keywords(text):
    text = (re.sub('[^#A-Za-z]', ' ', re.sub(r'http\S+', '', text))).lower()
    for each in text.split():
        if(each in keywords):
            return 1
    return 0

def check_word_vaccine(text):
    return  1 if "vaccin" in text else 0

def check_word_immune(text):
    return  1 if "immun" in text else 0

def check_htags(text):
    x = re.findall(r"#(\w+)", text)
    return len(x)>0

train_new_feature_array = {"1":[],"2":[],"3":[],"4":[]}
for doc in df_train:
    train_new_feature_array["1"].append(check_new_keywords(doc))
    train_new_feature_array["2"].append(check_word_vaccine(doc))
    train_new_feature_array["3"].append(check_word_immune(doc))
    train_new_feature_array["4"].append(check_htags(doc))
    
test_new_feature_array = {"1":[],"2":[],"3":[],"4":[]}
for doc in df_test:
    test_new_feature_array["1"].append(check_new_keywords(doc))
    test_new_feature_array["2"].append(check_word_vaccine(doc))
    test_new_feature_array["3"].append(check_word_immune(doc))
    test_new_feature_array["4"].append(check_htags(doc))
    
sec_test_new_feature_array = {"1":[],"2":[],"3":[],"4":[]}
for doc in post_sec_df_text:    
    sec_test_new_feature_array["1"].append(check_new_keywords(doc))
    sec_test_new_feature_array["2"].append(check_word_vaccine(doc))
    sec_test_new_feature_array["3"].append(check_word_immune(doc))
    sec_test_new_feature_array["4"].append(check_htags(doc))

In [44]:
# adding 4 new features to train,test and secondary data sets
X_train_final = np.insert(X_train_new, X_train_new.shape[1], train_new_feature_array["1"], axis=1)
X_test_final = np.insert(X_test_new, X_test_new.shape[1], test_new_feature_array["1"], axis=1)
X_sec_test_final = np.insert(X_sec_test_new, X_sec_test_new.shape[1], sec_test_new_feature_array["1"], axis=1)

remain_feature = ["2","3","4"]
for i in remain_feature:
    X_train_final = np.insert(X_train_final, X_train_final.shape[1], train_new_feature_array[i], axis=1)
    X_test_final = np.insert(X_test_final, X_test_final.shape[1], test_new_feature_array[i], axis=1)
    X_sec_test_final = np.insert(X_sec_test_final, X_sec_test_final.shape[1], sec_test_new_feature_array[i], axis=1)
print("# shape of datasets after adding new features")
print(X_train_final.shape)
print(X_test_final.shape)
print(X_sec_test_final.shape)

# shape of datasets after adding new features
(800, 1504)
(395, 1504)
(1500, 1504)


### d. Calculating accuracy after adding features

In [45]:
# Logistic regression model with no penality after feature addition
lr_model_fea = LogisticRegression(penalty="none", 
                              multi_class="multinomial",
                              solver="lbfgs").fit(X_train_final, y_train_new)
y_pred_lr_test_fea = lr_model_fea.predict(X_test_final)
y_pred_lr_sec_test_fea = lr_model_fea.predict(X_sec_test_final)
print("Primary test data ACCURACY: ",accuracy_score(y_test_new, y_pred_lr_test_fea))
print(classification_report(y_test_new, y_pred_lr_test_fea))

Primary test data ACCURACY:  0.9063291139240506
              precision    recall  f1-score   support

           0       0.92      0.87      0.89       179
           1       0.89      0.94      0.92       216

    accuracy                           0.91       395
   macro avg       0.91      0.90      0.90       395
weighted avg       0.91      0.91      0.91       395



In [46]:
print("Secondary test data ACCURACY: ",accuracy_score(y_sec_test_new, y_pred_lr_sec_test_fea))
print(classification_report(y_sec_test_new, y_pred_lr_sec_test_fea))

Secondary test data ACCURACY:  0.9393333333333334
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      1474
           1       0.16      0.62      0.26        26

    accuracy                           0.94      1500
   macro avg       0.58      0.78      0.61      1500
weighted avg       0.98      0.94      0.96      1500



In [47]:
# Logistic regression model with l2 penality after feature addition
lr2_model_fea = LogisticRegression(penalty="l2", 
                               solver="lbfgs",
                               multi_class="multinomial",
                               max_iter=1001,
                               C=10).fit(X_train_final, y_train_new)
y_lr2_test_fea = lr2_model_fea.predict(X_test_final)
y_pred_lr2_sec_test_fea = lr2_model_fea.predict(X_sec_test_final)
print("Primary test data ACCURACY: ",accuracy_score(y_test_new, y_lr2_test_fea))
print(classification_report(y_test_new, y_lr2_test_fea))

Primary test data ACCURACY:  0.9240506329113924
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       179
           1       0.92      0.94      0.93       216

    accuracy                           0.92       395
   macro avg       0.92      0.92      0.92       395
weighted avg       0.92      0.92      0.92       395



In [48]:
print("Secondary test data ACCURACY: ",accuracy_score(y_sec_test_new, y_pred_lr2_sec_test_fea))
print(classification_report(y_sec_test_new, y_pred_lr2_sec_test_fea))

Secondary test data ACCURACY:  0.9793333333333333
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1474
           1       0.43      0.62      0.51        26

    accuracy                           0.98      1500
   macro avg       0.71      0.80      0.75      1500
weighted avg       0.98      0.98      0.98      1500



In [49]:
# Train a text categorization model after feature addition through vector
clf = RandomForestClassifier()
clf.fit(np.array(X_train_final), y_train_new)
pred = clf.predict(X_test_final)
sec_pred = clf.predict(X_sec_test_final)
print("Primary test data ACCURACY: ",accuracy_score(y_test_new,pred))
print(classification_report(y_test_new, pred))

Primary test data ACCURACY:  0.9417721518987342
              precision    recall  f1-score   support

           0       0.93      0.94      0.94       179
           1       0.95      0.94      0.95       216

    accuracy                           0.94       395
   macro avg       0.94      0.94      0.94       395
weighted avg       0.94      0.94      0.94       395



In [50]:
print("Secondary test data ACCURACY: ",accuracy_score(y_sec_test_new,sec_pred))
print(classification_report(y_sec_test_new, sec_pred))

Secondary test data ACCURACY:  0.9906666666666667
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1474
           1       0.80      0.62      0.70        26

    accuracy                           0.99      1500
   macro avg       0.90      0.81      0.85      1500
weighted avg       0.99      0.99      0.99      1500



In [52]:
table = [["Logistic Regression with no penality - Primary test data", accuracy_score(y_test_new, y_pred_lr_test_fea)], 
                                              ['Logistic Regression with no penality - Secondary data', accuracy_score(y_sec_test_new, y_pred_lr_sec_test_fea)],
                                              ['Logistic Regression with l2 penality - Primary test data', accuracy_score(y_test_new, y_lr2_test_fea)],
                                              ['Logistic Regression with l2 penality - Secondary data', accuracy_score(y_sec_test_new, y_pred_lr2_sec_test_fea)],
                                              ['Random Forest Classifier - Primary test data', accuracy_score(y_test_new,pred)],
                                              ['Random Forest Classifier - Secondary data', accuracy_score(y_sec_test_new,sec_pred)]
                                             ]
print(tabulate(table, headers=['Model with feature engineering', 'accuracy']))

Model with feature engineering                              accuracy
--------------------------------------------------------  ----------
Logistic Regression with no penality - Primary test data    0.906329
Logistic Regression with no penality - Secondary data       0.939333
Logistic Regression with l2 penality - Primary test data    0.924051
Logistic Regression with l2 penality - Secondary data       0.979333
Random Forest Classifier - Primary test data                0.941772
Random Forest Classifier - Secondary data                   0.990667
