### Class Imbalance issue - Try out Undersampling Methods

In [1]:
# credits: https://github.com/saimadhu-polamuri/DataAspirant_codes/tree/master/handle_imbalance_data
# and: https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.TomekLinks.html

#### Import libraries

In [2]:
# Around SEP 2021: install and use use the following module versions
# imbalanced-learn 0.7.0 [pip install imbalanced-learn==0.8.1]
# scipy(>=0.19.1) [conda install -c conda-forge scipy=1.7.1]
# scikit-learn(>=0.22) [conda install -c conda-forge scikit-learn=1.0]
# numpy(>=1.13.3) [conda install -c conda-forge numpy=1.21.2]
# joblib(>=0.11) [conda install -c conda-forge joblib=1.0.1]

In [2]:
import re
import numpy as np
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
import csv
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.filterwarnings("ignore",category=sklearn.exceptions.UndefinedMetricWarning)

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import RUSBoostClassifier

#### Prep Data

In [3]:
data = pd.read_csv('data/MasterTokens.csv', encoding='ISO-8859-1', dtype='str', usecols=['RecID', 'smrNouns', 'smrAdverbs'])
len(data)

4684

In [4]:
data.head(2)

Unnamed: 0,RecID,smrNouns,smrAdverbs
0,11947603240,part song lie,pretti
1,12643331537,follow igcom club gpsi,


#### Prep Training data

In [5]:
df_GTD_Rec = pd.read_csv('data/GTxM_Pass3/GTxM_Pass3_GTD_UpTodate.csv', dtype='str')
len(df_GTD_Rec)

1057

In [6]:
df_GTD_Rec.head(2)

Unnamed: 0,RecID,Label,Target
0,1207761446513319936,Politics,6
1,1180079141087055872,Politics,6


In [7]:
# train on pass 3 GTD
df_train = pd.merge(data, df_GTD_Rec, on="RecID")
len(df_train)

1057

In [8]:
df_train.groupby(['Target','Label']).size()

Target  Label         
1       Business           75
10      Social Stories     32
11      Sports             75
2       Entertainment     153
3       Environmental      17
4       Health              4
5       Human Rights       86
6       Politics          456
7       Law and Order      12
9       Obituary          147
dtype: int64

In [9]:
# remove lower than 50 count SMRs
df_train = df_train[df_train.Label != 'Environmental']
df_train = df_train[df_train.Label != 'Health']
df_train = df_train[df_train.Label != 'Law and Order']
df_train = df_train[df_train.Label != 'Social Stories']

In [10]:
# remove Business and Obituary from training since test has 0 and only 1 SMRs
# insufficient for testing
# df_train = df_train[df_train.Label != 'Business']
# df_train = df_train[df_train.Label != 'Obituary']

In [11]:
len(df_train)

992

In [12]:
df_train.Target = df_train['Target'].astype(int)

In [13]:
# Code the Targets 0-5
df_train.loc[(df_train.Target == 1), 'Target'] = 0 # Business
df_train.loc[(df_train.Target == 2), 'Target'] = 1 # Entertainment
df_train.loc[(df_train.Target == 5), 'Target'] = 2 # Human Rights
df_train.loc[(df_train.Target == 6), 'Target'] = 3 # Politics
df_train.loc[(df_train.Target == 9), 'Target'] = 4 # Obituary
df_train.loc[(df_train.Target == 11), 'Target'] = 5 # Sports

In [14]:
df_train.groupby(['Target','Label']).size()

Target  Label        
0       Business          75
1       Entertainment    153
2       Human Rights      86
3       Politics         456
4       Obituary         147
5       Sports            75
dtype: int64

#### Prep Test data

In [15]:
df_CGT_Rec = pd.read_csv('data/GTxM_Pass4/GTxM_CGT_Labeled_Pass4.csv', dtype='str')
df_CGT_Rec.head(2)

Unnamed: 0,RecID,Label,Target
0,1135851552495865857,Social Stories,10
1,1151389038781390848,Human Rights,5


In [16]:
len(df_CGT_Rec)

425

In [17]:
# Remove 'World Politics' as per research decision in pass 3
# unify UK and USA politics
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'World Politics']
df_CGT_Rec.loc[(df_CGT_Rec.Label == 'USA Politics'), 'Label'] = 'Politics'
df_CGT_Rec.loc[(df_CGT_Rec.Label == 'UK Politics'), 'Label'] = 'Politics'
df_CGT_Rec.loc[(df_CGT_Rec.Target == '14'), 'Target'] = '6'
df_CGT_Rec.loc[(df_CGT_Rec.Target == '15'), 'Target'] = '6'
len(df_CGT_Rec)

327

In [18]:
# remove lower than 50 traing count SMRs
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Environmental']
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Health']
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Law and Order']
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Social Stories']
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Travel']

In [19]:
# Remove Obitaury - insufficient samples
# df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Obituary']

In [20]:
df_test = pd.merge(data, df_CGT_Rec, on="RecID")
len(df_test)

223

In [21]:
df_test.head(2)

Unnamed: 0,RecID,smrNouns,smrAdverbs,Label,Target
0,222818213392678912,list parti apparatchik power parti leader pres...,actual ntw enough pretti probabl kind total ba...,Politics,6
1,1124056098925944832,support critic messag design chang paramount s...,fulli total definit actual,Entertainment,2


In [22]:
# Add a placebo SMR for Business (since it is missing in the test data and required for keras encoding)
df_test.loc[len(df_test)] = ['101','business placebo record tweet keras encoding purpose','This is business placebo supporting tweet for keras encoding purpose only','Business','1']


In [23]:
df_test.groupby(['Target','Label']).size()

Target  Label        
1       Business           1
11      Sports            16
2       Entertainment     28
5       Human Rights      24
6       Politics         154
9       Obituary           1
dtype: int64

In [24]:
df_test.Target = df_test['Target'].astype(int)

In [25]:
# # Code the Targets 0-5
df_test.loc[(df_test.Target == 1), 'Target'] = 0 # Business
df_test.loc[(df_test.Target == 2), 'Target'] = 1 # Entertainment
df_test.loc[(df_test.Target == 5), 'Target'] = 2 # Human Rights
df_test.loc[(df_test.Target == 6), 'Target'] = 3 # Politics
df_test.loc[(df_test.Target == 9), 'Target'] = 4 # Obituary
df_test.loc[(df_test.Target == 11), 'Target'] = 5 # Sports


In [26]:
df_test.groupby(['Target','Label']).size()

Target  Label        
0       Business           1
1       Entertainment     28
2       Human Rights      24
3       Politics         154
4       Obituary           1
5       Sports            16
dtype: int64

#### Baseline SVM Predictions

In [27]:
scoring = {'acc': 'accuracy',
           'prec': 'precision_weighted',
           'recall': 'recall_weighted',
           'f1': 'f1_weighted'}

#test_ratio = 0.20
corpus = df_train['smrNouns'] + df_train['smrAdverbs']
# corpus = df_train['smrNouns'] + df_train['smrNER'] +df_train['smrAdverbs'] + df_test['smrAdjectives']
# corpus = df_train['smrNER'] + df_test['smrAdjectives']
corpus = corpus.fillna(value='')
vec = 'TFIDF'
# vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=10000)
vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=5000)
data_vec = vectorizer.fit_transform(corpus)
# vec_dtm = pd.DataFrame(data_vec.toarray(), columns=vectorizer.get_feature_names_out())
# resolution: https://stackoverflow.com/questions/70215049/attributeerror-tfidfvectorizer-object-has-no-attribute-get-feature-names-out
vec_dtm = pd.DataFrame(data_vec.toarray(), columns=vectorizer.get_feature_names())
vec_dtm.index = df_train['RecID']
y = df_train['Target']
y.index = df_train['RecID']
y=y.astype('int')
X = vec_dtm




In [28]:
clf = SVC(kernel='linear')
clf.fit(X, y)

SVC(kernel='linear')

In [29]:
# setup test data
corpus_test = df_test['smrNouns'] + df_test['smrAdverbs']
# corpus_test = df_test['smrNouns'] + df_test['smrNER'] + df_test['smrAdverbs'] + df_test['smrAdjectives']
# corpus_test = df_test['smrNER'] + df_test['smrAdjectives']
corpus_test = corpus_test.fillna(value='')
# vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=10000)
vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=5000)
data_vec = vectorizer.fit_transform(corpus_test)
vec_dtm = pd.DataFrame(data_vec.toarray(), columns=vectorizer.get_feature_names())
vec_dtm.index = df_test['RecID']
y_test = df_test['Target']
y_test.index = df_test['RecID']
y_test=y_test.astype('int')
X_test = vec_dtm



In [30]:
test_pred = clf.predict(X_test)

Feature names unseen at fit time:
- aaron
- aberdeen
- abram
- absolut away
- absolut forward
- ...
Feature names seen at fit time, yet now missing:
- abc
- absolut instead
- absolut togeth
- accomplic
- accuraci
- ...



In [31]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'SVM', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [32]:
result

['GTxM Pass 4',
 'SVM',
 0,
 51.78571428571429,
 51.001240151736205,
 51.78571428571429,
 50.87569323942704]

##### Generate Confusion Matrix

In [33]:
svm_cm = confusion_matrix(y_test, test_pred)
df_svm_cm = pd.DataFrame(svm_cm, 
                        columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])

# Removed Business and Obituary since there is none in test data
# df_svm_cm = pd.DataFrame(svm_cm, 
#                         columns=['Entertainment','Human Rights','Politics','Sports'],
#                         index=['Entertainment','Human Rights','Politics','Sports'])
df_svm_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_SVM_Baseline_CM.csv')

In [34]:
df_svm_cm

Unnamed: 0,Business,Entertainment,Human Rights,Politics,Obituary,Sports
Business,0,1,0,0,0,0
Entertainment,1,9,0,17,1,0
Human Rights,0,6,0,18,0,0
Politics,2,43,1,107,1,0
Obituary,0,0,0,1,0,0
Sports,0,10,0,6,0,0


#### RUSBoostClassifier Predictions

In [35]:
# from sklearn.datasets import make_classification
clf_rusB = RUSBoostClassifier(random_state=0)
clf_rusB.fit(X, y)

RUSBoostClassifier(random_state=0)

In [36]:
test_pred = clf_rusB.predict(X_test)

Feature names unseen at fit time:
- aaron
- aberdeen
- abram
- absolut away
- absolut forward
- ...
Feature names seen at fit time, yet now missing:
- abc
- absolut instead
- absolut togeth
- accomplic
- accuraci
- ...



In [37]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'RUSB', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [38]:
result

['GTxM Pass 4',
 'RUSB',
 0,
 46.875,
 56.08135283961598,
 46.875,
 49.47779029178071]

##### Generate Confusion Matrix

In [39]:
cm = confusion_matrix(y_test, test_pred)
df_cm = pd.DataFrame(cm, 
                        columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])
df_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_RUSB_CM.csv')

In [40]:
df_cm

Unnamed: 0,Business,Entertainment,Human Rights,Politics,Obituary,Sports
Business,0,0,0,1,0,0
Entertainment,2,10,1,12,0,3
Human Rights,0,6,1,14,1,2
Politics,2,21,1,90,6,34
Obituary,0,0,0,1,0,0
Sports,0,3,0,9,0,4


#### BalancedRandomForestClassifier Predictions

In [45]:
# from imblearn.ensemble import BalancedRandomForestClassifier
clf_brf = BalancedRandomForestClassifier(max_depth=2, random_state=0)
clf_brf.fit(X,y)

BalancedRandomForestClassifier(max_depth=2, random_state=0)

In [46]:
test_pred = clf_brf.predict(X_test)

Feature names unseen at fit time:
- aaron
- aberdeen
- abram
- absolut away
- absolut forward
- ...
Feature names seen at fit time, yet now missing:
- abc
- absolut instead
- absolut togeth
- accomplic
- accuraci
- ...



In [47]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'BRF', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [48]:
result

['GTxM Pass 4',
 'BRF',
 0,
 18.303571428571427,
 65.64923943740737,
 18.303571428571427,
 19.201826645222873]

##### Generate Confusion Matrix

In [49]:
cm = confusion_matrix(y_test, test_pred)
df_cm = pd.DataFrame(cm, 
                        columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])
df_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_BRF_CM.csv')

In [50]:
df_cm

Unnamed: 0,Business,Entertainment,Human Rights,Politics,Obituary,Sports
Business,0,1,0,0,0,0
Entertainment,0,20,0,0,6,2
Human Rights,0,12,1,2,5,4
Politics,13,84,12,19,23,3
Obituary,0,1,0,0,0,0
Sports,1,13,0,0,1,1
