### Class Imbalance issue - Try out OverSampling Methods

In [1]:
# credits: https://github.com/saimadhu-polamuri/DataAspirant_codes/tree/master/handle_imbalance_data
# and: https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.TomekLinks.html

#### Import libraries

In [2]:
# Around SEP 2021: install and use use the following module versions
# imbalanced-learn 0.7.0 [pip install imbalanced-learn==0.8.1]
# scipy(>=0.19.1) [conda install -c conda-forge scipy=1.7.1]
# scikit-learn(>=0.22) [conda install -c conda-forge scikit-learn=1.0]
# numpy(>=1.13.3) [conda install -c conda-forge numpy=1.21.2]
# joblib(>=0.11) [conda install -c conda-forge joblib=1.0.1]

In [1]:
import re
import numpy as np
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
import csv
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.filterwarnings("ignore",category=sklearn.exceptions.UndefinedMetricWarning)

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTEN
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import KMeansSMOTE
from imblearn.over_sampling import SVMSMOTE

#### Prep Data

In [2]:
data = pd.read_csv('data/MasterTokens.csv', encoding='ISO-8859-1', dtype='str', usecols=['RecID', 'smrNouns', 'smrAdverbs'])
len(data)

4684

In [3]:
data.head(2)

Unnamed: 0,RecID,smrNouns,smrAdverbs
0,11947603240,part song lie,pretti
1,12643331537,follow igcom club gpsi,


#### Prep Training data

In [4]:
df_GTD_Rec = pd.read_csv('data/GTxM_Pass3/GTxM_Pass3_GTD_UpTodate.csv', dtype='str')
len(df_GTD_Rec)

1057

In [5]:
df_GTD_Rec.head(2)

Unnamed: 0,RecID,Label,Target
0,1207761446513319936,Politics,6
1,1180079141087055872,Politics,6


In [6]:
# train on pass 3 GTD
df_train = pd.merge(data, df_GTD_Rec, on="RecID")
len(df_train)

1057

In [7]:
df_train.groupby(['Target','Label']).size()

Target  Label         
1       Business           75
10      Social Stories     32
11      Sports             75
2       Entertainment     153
3       Environmental      17
4       Health              4
5       Human Rights       86
6       Politics          456
7       Law and Order      12
9       Obituary          147
dtype: int64

In [8]:
# remove lower than 50 count SMRs
df_train = df_train[df_train.Label != 'Environmental']
df_train = df_train[df_train.Label != 'Health']
df_train = df_train[df_train.Label != 'Law and Order']
df_train = df_train[df_train.Label != 'Social Stories']

In [9]:
# remove Business and Obituary from training since test has 0 and only 1 SMRs
# insufficient for testing
# df_train = df_train[df_train.Label != 'Business']
# df_train = df_train[df_train.Label != 'Obituary']

In [10]:
len(df_train)

992

In [11]:
df_train.Target = df_train['Target'].astype(int)

In [12]:
# Code the Targets 0-5
df_train.loc[(df_train.Target == 1), 'Target'] = 0 # Business
df_train.loc[(df_train.Target == 2), 'Target'] = 1 # Entertainment
df_train.loc[(df_train.Target == 5), 'Target'] = 2 # Human Rights
df_train.loc[(df_train.Target == 6), 'Target'] = 3 # Politics
df_train.loc[(df_train.Target == 9), 'Target'] = 4 # Obituary
df_train.loc[(df_train.Target == 11), 'Target'] = 5 # Sports

In [13]:
df_train.groupby(['Target','Label']).size()

Target  Label        
0       Business          75
1       Entertainment    153
2       Human Rights      86
3       Politics         456
4       Obituary         147
5       Sports            75
dtype: int64

#### Prep Test data

In [14]:
df_CGT_Rec = pd.read_csv('data/GTxM_Pass4/GTxM_CGT_Labeled_Pass4.csv', dtype='str')
df_CGT_Rec.head(2)

Unnamed: 0,RecID,Label,Target
0,1135851552495865857,Social Stories,10
1,1151389038781390848,Human Rights,5


In [15]:
len(df_CGT_Rec)

425

In [16]:
# Remove 'World Politics' as per research decision in pass 3
# unify UK and USA politics
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'World Politics']
df_CGT_Rec.loc[(df_CGT_Rec.Label == 'USA Politics'), 'Label'] = 'Politics'
df_CGT_Rec.loc[(df_CGT_Rec.Label == 'UK Politics'), 'Label'] = 'Politics'
df_CGT_Rec.loc[(df_CGT_Rec.Target == '14'), 'Target'] = '6'
df_CGT_Rec.loc[(df_CGT_Rec.Target == '15'), 'Target'] = '6'
len(df_CGT_Rec)

327

In [17]:
# remove lower than 50 traing count SMRs
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Environmental']
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Health']
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Law and Order']
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Social Stories']
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Travel']

In [18]:
# Remove Obitaury - insufficient samples
# df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Obituary']

In [19]:
df_test = pd.merge(data, df_CGT_Rec, on="RecID")
len(df_test)

223

In [20]:
df_test.head(2)

Unnamed: 0,RecID,smrNouns,smrAdverbs,Label,Target
0,222818213392678912,list parti apparatchik power parti leader pres...,actual ntw enough pretti probabl kind total ba...,Politics,6
1,1124056098925944832,support critic messag design chang paramount s...,fulli total definit actual,Entertainment,2


In [21]:
# Add a placebo SMR for Business (since it is missing in the test data and required for keras encoding)
df_test.loc[len(df_test)] = ['101','business placebo record tweet keras encoding purpose','This is business placebo supporting tweet for keras encoding purpose only','Business','1']


In [22]:
df_test.groupby(['Target','Label']).size()

Target  Label        
1       Business           1
11      Sports            16
2       Entertainment     28
5       Human Rights      24
6       Politics         154
9       Obituary           1
dtype: int64

In [23]:
df_test.Target = df_test['Target'].astype(int)

In [24]:
# # Code the Targets 0-5
df_test.loc[(df_test.Target == 1), 'Target'] = 0 # Business
df_test.loc[(df_test.Target == 2), 'Target'] = 1 # Entertainment
df_test.loc[(df_test.Target == 5), 'Target'] = 2 # Human Rights
df_test.loc[(df_test.Target == 6), 'Target'] = 3 # Politics
df_test.loc[(df_test.Target == 9), 'Target'] = 4 # Obituary
df_test.loc[(df_test.Target == 11), 'Target'] = 5 # Sports


In [25]:
df_test.groupby(['Target','Label']).size()

Target  Label        
0       Business           1
1       Entertainment     28
2       Human Rights      24
3       Politics         154
4       Obituary           1
5       Sports            16
dtype: int64

#### Baseline Predictions

In [26]:
scoring = {'acc': 'accuracy',
           'prec': 'precision_weighted',
           'recall': 'recall_weighted',
           'f1': 'f1_weighted'}

#test_ratio = 0.20
corpus = df_train['smrNouns'] + df_train['smrAdverbs']
# corpus = df_train['smrNouns'] + df_train['smrNER'] +df_train['smrAdverbs'] + df_test['smrAdjectives']
# corpus = df_train['smrNER'] + df_test['smrAdjectives']
corpus = corpus.fillna(value='')
vec = 'TFIDF'
# vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=10000)
vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=5000)
data_vec = vectorizer.fit_transform(corpus)
# vec_dtm = pd.DataFrame(data_vec.toarray(), columns=vectorizer.get_feature_names_out())
# resolution: https://stackoverflow.com/questions/70215049/attributeerror-tfidfvectorizer-object-has-no-attribute-get-feature-names-out
vec_dtm = pd.DataFrame(data_vec.toarray(), columns=vectorizer.get_feature_names())
vec_dtm.index = df_train['RecID']
y = df_train['Target']
y.index = df_train['RecID']
y=y.astype('int')
X = vec_dtm




In [27]:
clf = SVC(kernel='linear')
clf.fit(X, y)

SVC(kernel='linear')

In [28]:
# setup test data
corpus_test = df_test['smrNouns'] + df_test['smrAdverbs']
# corpus_test = df_test['smrNouns'] + df_test['smrNER'] + df_test['smrAdverbs'] + df_test['smrAdjectives']
# corpus_test = df_test['smrNER'] + df_test['smrAdjectives']
corpus_test = corpus_test.fillna(value='')
# vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=10000)
vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=5000)
data_vec = vectorizer.fit_transform(corpus_test)
vec_dtm = pd.DataFrame(data_vec.toarray(), columns=vectorizer.get_feature_names())
vec_dtm.index = df_test['RecID']
y_test = df_test['Target']
y_test.index = df_test['RecID']
y_test=y_test.astype('int')
X_test = vec_dtm



In [29]:
test_pred = clf.predict(X_test)

Feature names unseen at fit time:
- aaron
- aberdeen
- abram
- absolut away
- absolut forward
- ...
Feature names seen at fit time, yet now missing:
- abc
- absolut instead
- absolut togeth
- accomplic
- accuraci
- ...



In [30]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'SVM', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [31]:
result

['GTxM Pass 4',
 'SVM',
 0,
 51.78571428571429,
 51.001240151736205,
 51.78571428571429,
 50.87569323942704]

##### Generate Confusion Matrix

In [32]:
svm_cm = confusion_matrix(y_test, test_pred)
df_svm_cm = pd.DataFrame(svm_cm, 
                        columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])

# Removed Business and Obituary since there is none in test data
# df_svm_cm = pd.DataFrame(svm_cm, 
#                         columns=['Entertainment','Human Rights','Politics','Sports'],
#                         index=['Entertainment','Human Rights','Politics','Sports'])
df_svm_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_SVM_Baseline_CM.csv')

In [33]:
df_svm_cm

Unnamed: 0,Business,Entertainment,Human Rights,Politics,Obituary,Sports
Business,0,1,0,0,0,0
Entertainment,1,9,0,17,1,0
Human Rights,0,6,0,18,0,0
Politics,2,43,1,107,1,0
Obituary,0,0,0,1,0,0
Sports,0,10,0,6,0,0


#### ROS Predictions

In [34]:
# from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)

In [35]:
X_ros,y_ros = ros.fit_resample(X,y)

In [36]:
X_ros.shape,y_ros.shape

((2736, 5000), (2736,))

In [37]:
clf_ros = SVC(kernel='linear')
clf_ros.fit(X_ros, y_ros)

SVC(kernel='linear')

In [38]:
test_pred = clf_ros.predict(X_test)

Feature names unseen at fit time:
- aaron
- aberdeen
- abram
- absolut away
- absolut forward
- ...
Feature names seen at fit time, yet now missing:
- abc
- absolut instead
- absolut togeth
- accomplic
- accuraci
- ...



In [39]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'SVM_ROS', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [40]:
result

['GTxM Pass 4',
 'SVM_ROS',
 0,
 45.089285714285715,
 54.08392857142857,
 45.089285714285715,
 47.026347617281985]

##### Generate Confusion Matrix

In [41]:
svm_cm = confusion_matrix(y_test, test_pred)
df_svm_cm = pd.DataFrame(svm_cm, 
                        columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])

# Removed Business and Obituary since there is none in test data
# df_svm_cm = pd.DataFrame(svm_cm, 
#                         columns=['Entertainment','Human Rights','Politics','Sports'],
#                         index=['Entertainment','Human Rights','Politics','Sports'])
df_svm_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_SVM_ROS_CM.csv')

In [42]:
df_svm_cm

Unnamed: 0,Business,Entertainment,Human Rights,Politics,Obituary,Sports
Business,0,1,0,0,0,0
Entertainment,2,11,0,14,1,0
Human Rights,1,5,1,17,0,0
Politics,3,59,2,89,1,0
Obituary,0,0,0,1,0,0
Sports,0,12,0,4,0,0


#### SMOTE Predictions

In [43]:
# from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)

In [44]:
X_sm,y_sm = sm.fit_resample(X,y)

In [45]:
X_sm.shape,y_sm.shape

((2736, 5000), (2736,))

In [46]:
clf_sm = SVC(kernel='linear')
clf_sm.fit(X_sm,y_sm)

SVC(kernel='linear')

In [47]:
test_pred = clf_sm.predict(X_test)

Feature names unseen at fit time:
- aaron
- aberdeen
- abram
- absolut away
- absolut forward
- ...
Feature names seen at fit time, yet now missing:
- abc
- absolut instead
- absolut togeth
- accomplic
- accuraci
- ...



In [48]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'SVM_SM', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [49]:
result

['GTxM Pass 4',
 'SVM_SM',
 0,
 45.535714285714285,
 50.48449612403101,
 45.535714285714285,
 46.8926290992499]

##### Generate Confusion Matrix

In [50]:
svm_cm = confusion_matrix(y_test, test_pred)
df_svm_cm = pd.DataFrame(svm_cm, 
                        columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])

# Removed Business and Obituary since there is none in test data
# df_svm_cm = pd.DataFrame(svm_cm, 
#                         columns=['Entertainment','Human Rights','Politics','Sports'],
#                         index=['Entertainment','Human Rights','Politics','Sports'])
df_svm_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_SVM_SM_CM.csv')

In [51]:
df_svm_cm

Unnamed: 0,Business,Entertainment,Human Rights,Politics,Obituary,Sports
Business,0,1,0,0,0,0
Entertainment,2,10,0,15,1,0
Human Rights,1,6,0,17,0,0
Politics,3,57,1,92,1,0
Obituary,0,0,0,1,0,0
Sports,0,12,0,4,0,0


#### SMOTEN Predictions

In [53]:
# from imblearn.over_sampling import SMOTEN
smn = SMOTEN(random_state=0)

In [54]:
X_smn,y_smn = smn.fit_resample(X,y)

In [55]:
X_smn.shape,y_smn.shape

((2736, 5000), (2736,))

In [56]:
clf_smn = SVC(kernel='linear')
clf_smn.fit(X_smn,y_smn)

SVC(kernel='linear')

In [57]:
test_pred = clf_smn.predict(X_test)

Feature names unseen at fit time:
- aaron
- aberdeen
- abram
- absolut away
- absolut forward
- ...
Feature names seen at fit time, yet now missing:
- abc
- absolut instead
- absolut togeth
- accomplic
- accuraci
- ...



In [58]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'SVM_SMN', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [59]:
result

['GTxM Pass 4',
 'SVM_SMN',
 0,
 0.4464285714285714,
 0.002136021872863978,
 0.4464285714285714,
 0.004251700680272109]

##### Generate Confusion Matrix

In [60]:
svm_cm = confusion_matrix(y_test, test_pred)
df_svm_cm = pd.DataFrame(svm_cm, 
                        columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])

# Removed Business and Obituary since there is none in test data
# df_svm_cm = pd.DataFrame(svm_cm, 
#                         columns=['Entertainment','Human Rights','Politics','Sports'],
#                         index=['Entertainment','Human Rights','Politics','Sports'])
df_svm_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_SVM_SMN_CM.csv')

In [61]:
df_svm_cm

Unnamed: 0,Business,Entertainment,Human Rights,Politics,Obituary,Sports
Business,1,0,0,0,0,0
Entertainment,25,0,0,0,1,2
Human Rights,21,0,0,0,0,3
Politics,146,2,1,0,2,3
Obituary,1,0,0,0,0,0
Sports,15,1,0,0,0,0


#### ADASYN Predictions

In [62]:
# from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=42)

In [63]:
X_ada,y_ada = ada.fit_resample(X,y)

In [64]:
X_ada.shape,y_ada.shape

((2709, 5000), (2709,))

In [65]:
clf_ada = SVC(kernel='linear')
clf_ada.fit(X_ada,y_ada)

SVC(kernel='linear')

In [66]:
test_pred = clf_ada.predict(X_test)

Feature names unseen at fit time:
- aaron
- aberdeen
- abram
- absolut away
- absolut forward
- ...
Feature names seen at fit time, yet now missing:
- abc
- absolut instead
- absolut togeth
- accomplic
- accuraci
- ...



In [67]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'SVM_ADA', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [68]:
result

['GTxM Pass 4',
 'SVM_ADA',
 0,
 44.642857142857146,
 55.64203065517932,
 44.642857142857146,
 46.6410223469576]

##### Generate Confusion Matrix

In [69]:
svm_cm = confusion_matrix(y_test, test_pred)
df_svm_cm = pd.DataFrame(svm_cm, 
                        columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])

# Removed Business and Obituary since there is none in test data
# df_svm_cm = pd.DataFrame(svm_cm, 
#                         columns=['Entertainment','Human Rights','Politics','Sports'],
#                         index=['Entertainment','Human Rights','Politics','Sports'])
# df_svm_cm = pd.DataFrame(svm_cm)
df_svm_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_SVM_ADA_CM.csv')

In [70]:
df_svm_cm

Unnamed: 0,Business,Entertainment,Human Rights,Politics,Obituary,Sports
Business,0,1,0,0,0,0
Entertainment,2,11,0,14,1,0
Human Rights,0,6,1,17,0,0
Politics,2,62,1,88,1,0
Obituary,0,0,0,1,0,0
Sports,0,12,0,4,0,0


#### BorderlineSMOTE Predictions

In [71]:
# from imblearn.over_sampling import BorderlineSMOTE
bsm = BorderlineSMOTE(random_state=42)

In [72]:
X_bsm,y_bsm = bsm.fit_resample(X,y)

In [73]:
X_bsm.shape,y_bsm.shape

((2736, 5000), (2736,))

In [74]:
clf_bsm = SVC(kernel='linear')
clf_bsm.fit(X_bsm,y_bsm)

SVC(kernel='linear')

In [75]:
test_pred = clf_bsm.predict(X_test)

Feature names unseen at fit time:
- aaron
- aberdeen
- abram
- absolut away
- absolut forward
- ...
Feature names seen at fit time, yet now missing:
- abc
- absolut instead
- absolut togeth
- accomplic
- accuraci
- ...



In [76]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'SVM_BSM', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [77]:
result

['GTxM Pass 4',
 'SVM_BSM',
 0,
 47.767857142857146,
 50.9225383920506,
 47.767857142857146,
 48.42324630386914]

##### Generate Confusion Matrix

In [78]:
svm_cm = confusion_matrix(y_test, test_pred)
df_svm_cm = pd.DataFrame(svm_cm, 
                        columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])

# Removed Business and Obituary since there is none in test data
# df_svm_cm = pd.DataFrame(svm_cm, 
#                         columns=['Entertainment','Human Rights','Politics','Sports'],
#                         index=['Entertainment','Human Rights','Politics','Sports'])
df_svm_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_SVM_BSM_CM.csv')

In [79]:
df_svm_cm

Unnamed: 0,Business,Entertainment,Human Rights,Politics,Obituary,Sports
Business,0,1,0,0,0,0
Entertainment,2,10,0,15,1,0
Human Rights,0,6,0,18,0,0
Politics,2,53,1,97,1,0
Obituary,0,0,0,1,0,0
Sports,0,12,0,4,0,0


#### KMeansSMOTE Predictions

In [80]:
# from imblearn.over_sampling import KMeansSMOTE
ksm = KMeansSMOTE(random_state=42)

In [81]:
X_ksm,y_ksm = ksm.fit_resample(X,y)

In [82]:
X_ksm.shape,y_ksm.shape

((2740, 5000), (2740,))

In [83]:
clf_ksm = SVC(kernel='linear')
clf_ksm.fit(X_ksm,y_ksm)

SVC(kernel='linear')

In [84]:
test_pred = clf_ksm.predict(X_test)

Feature names unseen at fit time:
- aaron
- aberdeen
- abram
- absolut away
- absolut forward
- ...
Feature names seen at fit time, yet now missing:
- abc
- absolut instead
- absolut togeth
- accomplic
- accuraci
- ...



In [85]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'SVM_KSM', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [86]:
result

['GTxM Pass 4',
 'SVM_KSM',
 0,
 47.32142857142857,
 50.10105793218147,
 47.32142857142857,
 47.93613707165109]

##### Generate Confusion Matrix

In [87]:
svm_cm = confusion_matrix(y_test, test_pred)
df_svm_cm = pd.DataFrame(svm_cm, 
                        columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])

# Removed Business and Obituary since there is none in test data
# df_svm_cm = pd.DataFrame(svm_cm, 
#                         columns=['Entertainment','Human Rights','Politics','Sports'],
#                         index=['Entertainment','Human Rights','Politics','Sports'])
df_svm_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_SVM_KSM_CM.csv')

In [88]:
df_svm_cm

Unnamed: 0,Business,Entertainment,Human Rights,Politics,Obituary,Sports
Business,0,1,0,0,0,0
Entertainment,2,9,0,16,1,0
Human Rights,0,7,0,17,0,0
Politics,2,52,2,97,1,0
Obituary,0,0,0,1,0,0
Sports,0,10,0,6,0,0


#### SVMSMOTE Predictions

In [89]:
# from imblearn.over_sampling import SVMSMOTE
ssm = SVMSMOTE(random_state=42)

In [90]:
X_ssm,y_ssm = ssm.fit_resample(X,y)

In [91]:
X_ssm.shape,y_ssm.shape

((2736, 5000), (2736,))

In [92]:
clf_ssm = SVC(kernel='linear')
clf_ssm.fit(X_ssm,y_ssm)

SVC(kernel='linear')

In [93]:
test_pred = clf_ssm.predict(X_test)

Feature names unseen at fit time:
- aaron
- aberdeen
- abram
- absolut away
- absolut forward
- ...
Feature names seen at fit time, yet now missing:
- abc
- absolut instead
- absolut togeth
- accomplic
- accuraci
- ...



In [94]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'SVM_SSM', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [95]:
result

['GTxM Pass 4',
 'SVM_SSM',
 0,
 46.42857142857143,
 51.199633699633694,
 46.42857142857143,
 47.74270623742455]

##### Generate Confusion Matrix

In [96]:
svm_cm = confusion_matrix(y_test, test_pred)
df_svm_cm = pd.DataFrame(svm_cm, 
                        columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])

# Removed Business and Obituary since there is none in test data
# df_svm_cm = pd.DataFrame(svm_cm, 
#                         columns=['Entertainment','Human Rights','Politics','Sports'],
#                         index=['Entertainment','Human Rights','Politics','Sports'])
df_svm_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_SVM_SSM_CM.csv')

In [97]:
df_svm_cm

Unnamed: 0,Business,Entertainment,Human Rights,Politics,Obituary,Sports
Business,0,1,0,0,0,0
Entertainment,2,10,0,14,2,0
Human Rights,1,6,0,17,0,0
Politics,2,55,1,94,2,0
Obituary,0,0,0,1,0,0
Sports,0,12,0,4,0,0


#### SMOTEENN Predictions

In [100]:
from imblearn.combine import SMOTEENN
sme = SMOTEENN(random_state=42)

In [101]:
X_sme,y_sme = sme.fit_resample(X,y)

In [102]:
X_sme.shape,y_sme.shape

((2527, 5000), (2527,))

In [103]:
clf_sme = SVC(kernel='linear')
clf_sme.fit(X_sme,y_sme)

SVC(kernel='linear')

In [104]:
test_pred = clf_sme.predict(X_test)

Feature names unseen at fit time:
- aaron
- aberdeen
- abram
- absolut away
- absolut forward
- ...
Feature names seen at fit time, yet now missing:
- abc
- absolut instead
- absolut togeth
- accomplic
- accuraci
- ...



In [106]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'SVM_SME', 0,
    test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [107]:
result

['GTxM Pass 4',
 'SVM_SME',
 0,
 12.053571428571429,
 70.20631067961165,
 12.053571428571429,
 5.191491099134411]

##### Generate Confusion Matrix

In [108]:
svm_cm = confusion_matrix(y_test, test_pred)
df_svm_cm = pd.DataFrame(svm_cm, 
                        columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])

# Removed Business and Obituary since there is none in test data
# df_svm_cm = pd.DataFrame(svm_cm, 
#                         columns=['Entertainment','Human Rights','Politics','Sports'],
#                         index=['Entertainment','Human Rights','Politics','Sports'])
df_svm_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_SVM_SME_CM.csv')

In [109]:
df_svm_cm

Unnamed: 0,Business,Entertainment,Human Rights,Politics,Obituary,Sports
Business,0,1,0,0,0,0
Entertainment,3,24,0,0,1,0
Human Rights,1,23,0,0,0,0
Politics,4,142,4,3,1,0
Obituary,0,1,0,0,0,0
Sports,1,15,0,0,0,0


#### SMOTETomek Predictions

In [110]:
from imblearn.combine import SMOTETomek
smt = SMOTETomek(random_state=42)

In [111]:
X_smt,y_smt = smt.fit_resample(X,y)

In [112]:
X_smt.shape,y_smt.shape

((2734, 5000), (2734,))

In [113]:
clf_smt = SVC(kernel='linear')
clf_smt.fit(X_smt,y_smt)

SVC(kernel='linear')

In [114]:
test_pred = clf_smt.predict(X_test)

Feature names unseen at fit time:
- aaron
- aberdeen
- abram
- absolut away
- absolut forward
- ...
Feature names seen at fit time, yet now missing:
- abc
- absolut instead
- absolut togeth
- accomplic
- accuraci
- ...



In [115]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'SVM_SMT', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [116]:
result

['GTxM Pass 4',
 'SVM_SMT',
 0,
 44.19642857142857,
 49.632947198275865,
 44.19642857142857,
 45.8395004625347]

##### Generate Confusion Matrix

In [117]:
svm_cm = confusion_matrix(y_test, test_pred)
df_svm_cm = pd.DataFrame(svm_cm, 
                        columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])

# Removed Business and Obituary since there is none in test data
# df_svm_cm = pd.DataFrame(svm_cm, 
#                         columns=['Entertainment','Human Rights','Politics','Sports'],
#                         index=['Entertainment','Human Rights','Politics','Sports'])
df_svm_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_SVM_SMT_CM.csv')

In [118]:
df_svm_cm

Unnamed: 0,Business,Entertainment,Human Rights,Politics,Obituary,Sports
Business,0,1,0,0,0,0
Entertainment,2,9,0,16,1,0
Human Rights,1,6,0,17,0,0
Politics,3,59,1,90,1,0
Obituary,0,0,0,1,0,0
Sports,0,12,0,4,0,0
