In [3]:
# Importing Libraries
import pandas as pd
import numpy as np
import os
import re
import random
import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


#### Loading PRIMARY(change.org) and SECONDARY(Twitter) Datasets

In [2]:
# Loading PRIMARY Datasets - change.org
df_lockdown_change_org = pd.read_csv('change.org_topic_lockdowns.csv')
df_masking_and_distancing_change_org = pd.read_csv('change.org_topic_masking_and_distancing.csv')
df_vaccination_change_org = pd.read_csv('change.org_topic_vaccination.csv')

# Loading SECONDARY Datasets - Twitter
df_lockdown_twitter = pd.read_csv('twitter_topic_lockdowns.csv')
df_masking_and_distancing_twitter = pd.read_csv('twitter_topic_masking_and_distancing.csv')
df_vaccination_change_twitter = pd.read_csv('twitter_topic_vaccination.csv')

#### Checking the Class Distribution of Three Different Topics - Lockdown, Masking and Distancing, Vaccination

#### i. Class Distribution for Lockdown Topic:

In [4]:
print("Class Distribution in percent for Lockdown Topic:\n{}".format(df_lockdown_change_org['label'].value_counts()/df_lockdown_change_org.shape[0]))

Class Distribution in percent for Lockdown Topic:
False    0.72
True     0.28
Name: label, dtype: float64


#### ii. Class Distribution for Masking and Distancing Topic:

In [5]:
print("Class Distribution in percent for Masking and Distancing Topic:\n{}".format(df_masking_and_distancing_change_org['label'].value_counts()/df_masking_and_distancing_change_org.shape[0]))

Class Distribution in percent for Masking and Distancing Topic:
False    0.953333
True     0.046667
Name: label, dtype: float64


#### iii. Class Distribution for Vaccination Topic:

In [6]:
print("Class Distribution in percent for Vaccination Topic:\n{}".format(df_vaccination_change_org['label'].value_counts()/df_vaccination_change_org.shape[0]))

Class Distribution in percent for Vaccination Topic:
False    0.982
True     0.018
Name: label, dtype: float64


#### Clearly the Class Distribution for Lockdown is best among all three Topics. Hence, This Dataset is suitable for training.

### 1. Data partitioning

In [7]:
# Checking the first 5 rows
df_lockdown_change_org.head()

Unnamed: 0,text,label
0,Postpone CBSE Board Exams,False
1,Pay rise for paramedics and nurses,False
2,Too Soon To Open Georgia!,True
3,Request to reconsider DPS Ruby Park school tui...,False
4,Covid-19 Aesthetics / Salon / Massage Industry...,False


In [8]:
df_lockdown_change_org.shape

(1500, 2)

#### Splitting PRIMARY dataset into train and test

In [9]:
# Shuffling the Lockdown Train and Test sets by randomly selecting rows and also maintaining same 
# class distribution as the original Dataset
x, y = df_lockdown_change_org['text'], df_lockdown_change_org['label']
x_train_PRIMARY, x_test_PRIMARY, y_train_PRIMARY, y_test_PRIMARY = train_test_split(x, y, test_size=0.3, stratify=y, shuffle=True)

In [10]:
# Checking the class Distribution after the split for Train and Test set
print("Percentage Distribution of labels in PRIMARY Train set:\n{}".format(y_train_PRIMARY.value_counts()/y_train_PRIMARY.shape[0]))
print()
print("Percentage Distribution of labels in PRIMARY Test set:\n{}".format(y_test_PRIMARY.value_counts()/y_test_PRIMARY.shape[0]))

Percentage Distribution of labels in PRIMARY Train set:
False    0.72
True     0.28
Name: label, dtype: float64

Percentage Distribution of labels in PRIMARY Test set:
False    0.72
True     0.28
Name: label, dtype: float64


#### Splitting SECONDARY dataset into x and y

In [11]:
x_test_SECONDARY, y_test_SECONDARY = df_lockdown_twitter['text'], df_lockdown_twitter['label']

In [12]:
# Checking the class Distribution after the split for Train and Test set
print("Percentage Distribution of labels in SECONDARY Test set:\n{}".format(y_test_SECONDARY.value_counts()/y_test_SECONDARY.shape[0]))

Percentage Distribution of labels in SECONDARY Test set:
False    0.960833
True     0.039167
Name: label, dtype: float64


#### Creating Bag of Words Vectorizer

In [13]:
vectorizer = CountVectorizer()
x_train_PRIMARY_bow = vectorizer.fit_transform(x_train_PRIMARY)
x_test_PRIMARY_bow = vectorizer.transform(x_test_PRIMARY)
x_test_SECONDARY_bow = vectorizer.transform(x_test_SECONDARY)

In [14]:
# Let's check the Features of the BoW 
vectorizer.get_feature_names()



['00',
 '000',
 '10',
 '100',
 '1080',
 '10th',
 '11',
 '1199',
 '12',
 '120',
 '12th',
 '14',
 '15',
 '151',
 '1532',
 '16',
 '163',
 '16th',
 '17',
 '18s',
 '19',
 '1b',
 '20',
 '200',
 '2019',
 '2020',
 '2021',
 '21',
 '2192',
 '220k',
 '24',
 '25',
 '27',
 '28',
 '2months',
 '2nd',
 '30',
 '300',
 '31st',
 '365u',
 '40',
 '400',
 '4700',
 '4th',
 '50',
 '500',
 '52000',
 '626',
 '70',
 '90',
 '99',
 '9th',
 'aamc',
 'aapi',
 'aarogyasri',
 'ab',
 'abandonment',
 'abbadi',
 'abbott',
 'abdullah',
 'abeysekera',
 'abhishek',
 'able',
 'abolish',
 'about',
 'abuse',
 'abusive',
 'academic',
 'academies',
 'academy',
 'acceptance',
 'access',
 'accessible',
 'acciones',
 'accompany',
 'accountability',
 'acompañante',
 'across',
 'act',
 'action',
 'actions',
 'active',
 'actively',
 'actra',
 'ad',
 'added',
 'addison',
 'adelphi',
 'adequate',
 'adhanom',
 'adjust',
 'adjustments',
 'admin',
 'admission',
 'admissions',
 'admitting',
 'adom',
 'adult',
 'advantage',
 'advisory',
 'ad

In [15]:
len(vectorizer.get_feature_names())

2969

#### There are 2984 number of features in the current Bag of Words model.
***
But different canonicals forms of the same word are present in the vocabulary.
For example, abuse and abusive, action and actions, active and activities etc.

### 2. Baseline model training

#### Naive Bayes Model

In [16]:
%%time
nb_model = GaussianNB().fit(x_train_PRIMARY_bow.toarray(), y_train_PRIMARY)
#nb_model = MultinomialNB(alpha=1.0).fit(x_train_PRIMARY, y_train_PRIMARY)
y_hat_nb_test_PRIMARY = nb_model.predict(x_test_PRIMARY_bow.toarray())
y_hat_nb_test_SECONDARY = nb_model.predict(x_test_SECONDARY_bow.toarray())

CPU times: user 161 ms, sys: 92.4 ms, total: 254 ms
Wall time: 271 ms


#### Logistic regression Model

In [17]:
%%time
lr_model = LogisticRegression().fit(x_train_PRIMARY_bow.toarray(), y_train_PRIMARY)
y_hat_lr_test_PRIMARY = lr_model.predict(x_test_PRIMARY_bow.toarray())
y_hat_lr_test_SECONDARY = nb_model.predict(x_test_SECONDARY_bow.toarray())

CPU times: user 565 ms, sys: 44.5 ms, total: 610 ms
Wall time: 365 ms


### 3. Model evaluation 1

#### Model Evaluation for Naive Bayes Classifier

In [18]:
print(classification_report(y_test_PRIMARY, y_hat_nb_test_PRIMARY))

              precision    recall  f1-score   support

       False       0.80      0.66      0.73       324
        True       0.40      0.58      0.47       126

    accuracy                           0.64       450
   macro avg       0.60      0.62      0.60       450
weighted avg       0.69      0.64      0.66       450



In [19]:
print(f1_score(y_test_PRIMARY, y_hat_nb_test_PRIMARY))

0.4740259740259741


In [20]:
print(classification_report(y_test_SECONDARY, y_hat_nb_test_SECONDARY))

              precision    recall  f1-score   support

       False       0.95      0.85      0.90      1153
        True       0.00      0.00      0.00        47

    accuracy                           0.82      1200
   macro avg       0.48      0.42      0.45      1200
weighted avg       0.92      0.82      0.86      1200



In [21]:
print(f1_score(y_test_SECONDARY, y_hat_nb_test_SECONDARY))

0.0


#### Accuracy of Naive Bayes algorithm on Primary and secondary Test Set are 0.64 and 0.82
#### F1 Score of  Naive Bayes algorithm on Primary and Secondary Test Set are 0.47 and 0.0 

#### Model Evaluation for Logistic Regression Classifier

In [22]:
print(classification_report(y_test_PRIMARY, y_hat_lr_test_PRIMARY))

              precision    recall  f1-score   support

       False       0.83      0.94      0.88       324
        True       0.78      0.51      0.62       126

    accuracy                           0.82       450
   macro avg       0.81      0.73      0.75       450
weighted avg       0.82      0.82      0.81       450



In [23]:
print(f1_score(y_test_PRIMARY, y_hat_lr_test_PRIMARY))

0.6153846153846154


In [24]:
print(classification_report(y_test_SECONDARY, y_hat_lr_test_SECONDARY))

              precision    recall  f1-score   support

       False       0.95      0.85      0.90      1153
        True       0.00      0.00      0.00        47

    accuracy                           0.82      1200
   macro avg       0.48      0.42      0.45      1200
weighted avg       0.92      0.82      0.86      1200



In [25]:
print(f1_score(y_test_SECONDARY, y_hat_lr_test_SECONDARY))

0.0


#### Accuracy of Logistic Regression algorithm on Primary and secondary Test Set are 0.82 and 0.82
#### F1 Score of Logistic Regression algorithm on Primary and Secondary Test Set are 0.61 and 0.0     

#### Conclusion: Logistic Regression Performs better than Naive Bayes

### 4. Feature engineering

#### Stemming and lemmatising

In [26]:
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

#### Function to preprocess text:
     1. Lower case all the words
     2. Remove stop words
     3. Perform stemming or lemmatization

In [27]:
def preprocess(document, stem=True):
    'changes document to lower case and removes stopwords'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]
    
    
    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    document = " ".join(words)
    
    return document

#### Function to derive first feature:
     - returns 1 if all the words are in capital else 0

In [28]:
def feature_function1(text):
    # Flags texts that has all the words in Upper case
    return int(bool(re.match(r"^[A-Z \d\W]+$", text)))

#### Function to derive second feature:
     - returns 1 if word Close or close is present in the text else 0

In [48]:
def feature_function2(text):
    # Flags texts that contains word Close or close
    return int(bool(re.match(r"[C|c]lose",text)))

#### Function to derive third feature:
     - returns 1 if word Shutdown or shutdown is present in the text else 0

In [30]:
def feature_function3(text):
    # Flags texts that contains word Shutdown or shutdown
    return int(bool(re.match(r"[S|s]hutdown", text)))

In [31]:
# Preprocess text
x_train_PRIMARY_processed = [preprocess(text, stem=True) for text in x_train_PRIMARY]
x_test_PRIMARY_processed = [preprocess(text, stem=True) for text in x_test_PRIMARY]
x_test_SECONDARY_processed = [preprocess(text, stem=True) for text in x_test_SECONDARY]

In [32]:
# Bag of words model
vectorizer = CountVectorizer(stop_words='english')
x_train_PRIMARY_bow_final = vectorizer.fit_transform(x_train_PRIMARY)
x_test_PRIMARY_bow_final = vectorizer.transform(x_test_PRIMARY)
x_test_SECONDARY_bow_final = vectorizer.transform(x_test_SECONDARY)

In [33]:
# Checking the dimension of each BOW matrix before adding new features
print(x_train_PRIMARY_bow_final.shape)
print(x_test_PRIMARY_bow_final.shape)
print(x_test_SECONDARY_bow_final.shape)

(1050, 2820)
(450, 2820)
(1200, 2820)


In [34]:
# Deriving FIRST feature for PRIMARY train, PRIMARY test, and SECONDARY test Datasets
new_feature1_PRIMARY_train = x_train_PRIMARY.apply(feature_function1)
new_feature1_PRIMARY_test  = x_test_PRIMARY.apply(feature_function1)
new_feature1_SECONDARY_test  = x_test_SECONDARY.apply(feature_function1)

# Deriving SECOND feature for PRIMARY train, PRIMARY test, and SECONDARY test Datasets
new_feature2_PRIMARY_train = x_train_PRIMARY.apply(feature_function2)
new_feature2_PRIMARY_test  = x_test_PRIMARY.apply(feature_function2)
new_feature2_SECONDARY_test  = x_test_SECONDARY.apply(feature_function2)

# Deriving THIRD feature for PRIMARY train, PRIMARY test, and SECONDARY test Datasets
new_feature3_PRIMARY_train = x_train_PRIMARY.apply(feature_function3)
new_feature3_PRIMARY_test  = x_test_PRIMARY.apply(feature_function3)
new_feature3_SECONDARY_test  = x_test_SECONDARY.apply(feature_function3)

# Adding FIRST feature to PRIMARY train, PRIMARY test, and SECONDARY test Datasets
x_train_PRIMARY_bow_final = np.insert(x_train_PRIMARY_bow_final.todense(), x_train_PRIMARY_bow_final.shape[1], new_feature1_PRIMARY_train, axis=1)
x_test_PRIMARY_bow_final = np.insert(x_test_PRIMARY_bow_final.todense(), x_test_PRIMARY_bow_final.shape[1], new_feature1_PRIMARY_test, axis=1)
x_test_SECONDARY_bow_final = np.insert(x_test_SECONDARY_bow_final.todense(), x_test_SECONDARY_bow_final.shape[1], new_feature1_SECONDARY_test, axis=1)

# Adding SECOND feature to PRIMARY train, PRIMARY test, and SECONDARY test Datasets
x_train_PRIMARY_bow_final = np.insert(x_train_PRIMARY_bow_final, x_train_PRIMARY_bow_final.shape[1], new_feature2_PRIMARY_train, axis=1)
x_test_PRIMARY_bow_final = np.insert(x_test_PRIMARY_bow_final, x_test_PRIMARY_bow_final.shape[1], new_feature2_PRIMARY_test, axis=1)
x_test_SECONDARY_bow_final = np.insert(x_test_SECONDARY_bow_final, x_test_SECONDARY_bow_final.shape[1], new_feature2_SECONDARY_test, axis=1)

# Adding THIRD feature to PRIMARY train, PRIMARY test, and SECONDARY test Datasets
x_train_PRIMARY_bow_final = np.insert(x_train_PRIMARY_bow_final, x_train_PRIMARY_bow_final.shape[1], new_feature3_PRIMARY_train, axis=1)
x_test_PRIMARY_bow_final = np.insert(x_test_PRIMARY_bow_final, x_test_PRIMARY_bow_final.shape[1], new_feature3_PRIMARY_test, axis=1)
x_test_SECONDARY_bow_final = np.insert(x_test_SECONDARY_bow_final, x_test_SECONDARY_bow_final.shape[1], new_feature3_SECONDARY_test, axis=1)


In [35]:
# Checking the dimension of each BOW matrix after adding new features
print(x_train_PRIMARY_bow_final.shape)
print(x_test_PRIMARY_bow_final.shape)
print(x_test_SECONDARY_bow_final.shape)

(1050, 2823)
(450, 2823)
(1200, 2823)


In [36]:
vectorizer.get_feature_names()



['00',
 '000',
 '10',
 '100',
 '1080',
 '10th',
 '11',
 '1199',
 '12',
 '120',
 '12th',
 '14',
 '15',
 '151',
 '1532',
 '16',
 '163',
 '16th',
 '17',
 '18s',
 '19',
 '1b',
 '20',
 '200',
 '2019',
 '2020',
 '2021',
 '21',
 '2192',
 '220k',
 '24',
 '25',
 '27',
 '28',
 '2months',
 '2nd',
 '30',
 '300',
 '31st',
 '365u',
 '40',
 '400',
 '4700',
 '4th',
 '50',
 '500',
 '52000',
 '626',
 '70',
 '90',
 '99',
 '9th',
 'aamc',
 'aapi',
 'aarogyasri',
 'ab',
 'abandonment',
 'abbadi',
 'abbott',
 'abdullah',
 'abeysekera',
 'abhishek',
 'able',
 'abolish',
 'abuse',
 'abusive',
 'academic',
 'academies',
 'academy',
 'acceptance',
 'access',
 'accessible',
 'acciones',
 'accompany',
 'accountability',
 'acompañante',
 'act',
 'action',
 'actions',
 'active',
 'actively',
 'actra',
 'ad',
 'added',
 'addison',
 'adelphi',
 'adequate',
 'adhanom',
 'adjust',
 'adjustments',
 'admin',
 'admission',
 'admissions',
 'admitting',
 'adom',
 'adult',
 'advantage',
 'advisory',
 'advocating',
 'aestheti

#### Naive Bayes Model

In [37]:
%%time
nb_model = GaussianNB().fit(np.asarray(x_train_PRIMARY_bow_final), y_train_PRIMARY)
y_hat_nb_test_PRIMARY = nb_model.predict(np.asarray(x_test_PRIMARY_bow_final))
y_hat_nb_test_SECONDARY = nb_model.predict(np.asarray(x_test_SECONDARY_bow_final))

CPU times: user 148 ms, sys: 22.2 ms, total: 171 ms
Wall time: 196 ms


#### Logistic Regression Model

In [38]:
%%time
lr_model = LogisticRegression().fit(np.asarray(x_train_PRIMARY_bow_final), y_train_PRIMARY)
y_hat_lr_test_PRIMARY = lr_model.predict(np.asarray(x_test_PRIMARY_bow_final))
y_hat_lr_test_SECONDARY = nb_model.predict(np.asarray(x_test_SECONDARY_bow_final))

CPU times: user 506 ms, sys: 33.8 ms, total: 539 ms
Wall time: 357 ms


### 5. Model evaluation 2

#### Model Evaluation for Naive Bayes Classifier

In [39]:
print(classification_report(y_test_PRIMARY, y_hat_nb_test_PRIMARY))

              precision    recall  f1-score   support

       False       0.82      0.65      0.72       324
        True       0.41      0.63      0.49       126

    accuracy                           0.64       450
   macro avg       0.61      0.64      0.61       450
weighted avg       0.70      0.64      0.66       450



In [40]:
print(f1_score(y_test_PRIMARY, y_hat_nb_test_PRIMARY))

0.49375


In [41]:
print(classification_report(y_test_SECONDARY, y_hat_nb_test_SECONDARY))

              precision    recall  f1-score   support

       False       0.96      0.77      0.85      1153
        True       0.02      0.13      0.04        47

    accuracy                           0.74      1200
   macro avg       0.49      0.45      0.45      1200
weighted avg       0.92      0.74      0.82      1200



In [42]:
print(f1_score(y_test_SECONDARY, y_hat_nb_test_SECONDARY))

0.03773584905660377


#### Accuracy of Naive Bayes algorithm on Primary and secondary Test Set are 0.64 and 0.74
#### F1 Score of Naive Bayes algorithm on Primary and Secondary Test Set are 0.49 and 0.03 

#### Model Evaluation for Logistic Regression Classifier

In [43]:
print(classification_report(y_test_PRIMARY, y_hat_lr_test_PRIMARY))

              precision    recall  f1-score   support

       False       0.82      0.95      0.88       324
        True       0.79      0.47      0.59       126

    accuracy                           0.82       450
   macro avg       0.80      0.71      0.73       450
weighted avg       0.81      0.82      0.80       450



In [44]:
print(f1_score(y_test_PRIMARY, y_hat_lr_test_PRIMARY))

0.5870646766169153


In [45]:
print(classification_report(y_test_SECONDARY, y_hat_lr_test_SECONDARY))

              precision    recall  f1-score   support

       False       0.96      0.77      0.85      1153
        True       0.02      0.13      0.04        47

    accuracy                           0.74      1200
   macro avg       0.49      0.45      0.45      1200
weighted avg       0.92      0.74      0.82      1200



In [46]:
print(f1_score(y_test_SECONDARY, y_hat_lr_test_SECONDARY))

0.03773584905660377


#### Accuracy of Logistic Regression algorithm on Primary and secondary Test Set are 0.64 and 0.74
#### F1 Score of Logistic Regression algorithm on Primary and Secondary Test Set are 0.58 and 0.03 

#### The Performace of Logistic Regression algorithm is better than Naive Bayes on new set of features

### 6. Reflection

##### The Performace of Logistic Regression Classifier:  
    Accuracy:
    Accuracy dropped from 0.82 to 0.64 on PRIMARY DATASET.
    Accuracy dropped from 0.82 to 0.74 on SECONDARY DATASET.
    
    F1-Score:
    F1-Score dropped from 0.61 to 0.58 on PRIMARY DATASET.
    F1-Score increased from 0.0 to 0.03 on SECONDARY DATASET.