# Lamb - Voting Ensemble in Spam Classification: An Approach for Optimizing Model Selection

### Kaggle Database Link

https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset

### Other Articles to Consider

http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf

https://www.proquest.com/docview/2798556468?pq-origsite=primo

https://machinelearningmastery.com/cross-validation-for-imbalanced-classification/

https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/

### Load Libraries

In [1]:
import pandas as pd
import numpy as np
import itertools
import collections
import re
import nltk
import string
import opendatasets as od
import pickle
from nltk.corpus import stopwords
from nltk import bigrams
from nltk.stem.porter import PorterStemmer
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics, svm
from sklearn.metrics import precision_score, recall_score, roc_curve, confusion_matrix, jaccard_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from keras.layers import SimpleRNN, LSTM, Dense, Dropout, Activation, Flatten
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier, AdaBoostClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.model_selection import RandomizedSearchCV
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

### Pickle Load

In [2]:
# Load pickled variables
# with open('session.pkl', 'rb') as f:
#     data = pickle.load(f)

In [3]:
# #pickeling file to allow for loading calculation directionly into memory without recomputing
# ##as this is a computationally intensive wb this allows for incremental progress to be made without recommputing the whole file
# with open('session.pkl', 'wb') as f:
#     pickle.dump({
#             'cnt' : cnt,
#             'ltt' : ltt,
#             'lrt' : lrt,
#             'gnt' : gnt,
#             'bnt' : bnt,
#             'mbt' : mbt,
#             'srt' : srt,
#             'spt' : spt,
#             'slt' : slt,
#             'rft' : rft,
#             'gbt' : gbt,
#             'ett' : ett,
#             'adt' : adt,
#             'xgt' : xgt,
#             'xrt' : xrt,
#             'cnr' : cnr,
#             'ltr' : ltr,
#             'lrr' : lrr,
#             'gnr' : gnr,
#             'bnr' : bnr,
#             'mbr' : mbr,
#             'srr' : srr,
#             'spr' : spr,
#             'slr' : slr,
#             'rfr' : rfr,
#             'gbr' : gbr,
#             'etr' : etr,
#             'adr' : adr,
#             'xgr' : xgr,
#             'xrr' : xrr
#                 }, f)

### Importing Data

In [4]:
#loading corpus into data frame
df = pd.read_csv("spam.csv", encoding = "ISO-8859-1", engine = "python")
print(df.shape)

(5572, 5)


In [5]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### Cleaning Data

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [7]:
#looking at the data in the unnamed columns
df[df['Unnamed: 2'].isnull() == False].head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
95,spam,Your free ringtone is waiting to be collected....,PO Box 5249,"MK17 92H. 450Ppw 16""",
281,ham,\Wen u miss someone,the person is definitely special for u..... B...,why to miss them,"just Keep-in-touch\"" gdeve.."""
444,ham,\HEY HEY WERETHE MONKEESPEOPLE SAY WE MONKEYAR...,HOWU DOIN? FOUNDURSELF A JOBYET SAUSAGE?LOVE ...,,
671,spam,SMS. ac sun0819 posts HELLO:\You seem cool,"wanted to say hi. HI!!!\"" Stop? Send STOP to ...",,
710,ham,Height of Confidence: All the Aeronautics prof...,"this wont even start........ Datz confidence..""",,


In [8]:
df[df['Unnamed: 3'].isnull() == False].head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
95,spam,Your free ringtone is waiting to be collected....,PO Box 5249,"MK17 92H. 450Ppw 16""",
281,ham,\Wen u miss someone,the person is definitely special for u..... B...,why to miss them,"just Keep-in-touch\"" gdeve.."""
899,spam,Your free ringtone is waiting to be collected....,PO Box 5249,"MK17 92H. 450Ppw 16""",
1038,ham,"Edison has rightly said, \A fool can ask more ...",GN,GE,"GNT:-)"""
2170,ham,\CAN I PLEASE COME UP NOW IMIN TOWN.DONTMATTER...,JUST REALLYNEED 2DOCD.PLEASE DONTPLEASE DONTIG...,"U NO THECD ISV.IMPORTANT TOME 4 2MORO\""""",


In [9]:
df[df['Unnamed: 4'].isnull() == False].head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
281,ham,\Wen u miss someone,the person is definitely special for u..... B...,why to miss them,"just Keep-in-touch\"" gdeve.."""
1038,ham,"Edison has rightly said, \A fool can ask more ...",GN,GE,"GNT:-)"""
2255,ham,I just lov this line: \Hurt me with the truth,I don't mind,i wil tolerat.bcs ur my someone..... But,"Never comfort me with a lie\"" gud ni8 and swe..."
3525,ham,\HEY BABE! FAR 2 SPUN-OUT 2 SPK AT DA MO... DE...,HAD A COOL NYTHO,TX 4 FONIN HON,"CALL 2MWEN IM BK FRMCLOUD 9! J X\"""""
4668,ham,"When I was born, GOD said, \Oh No! Another IDI...",GOD said,"\""OH No! COMPETITION\"". Who knew","one day these two will become FREINDS FOREVER!"""


In [10]:
#the unknown columns are sparsely populated and most that are are populated appear to contain irrelevant information 
#(such as time or address info).  droping these columns
to_drop = ['Unnamed: 2',"Unnamed: 3","Unnamed: 4"]
df = df.drop(columns = to_drop)
print(df.shape)

(5572, 2)


In [11]:
#renamining columns
rename_list = {'v1':'label','v2':'documents'}
df = df.rename(columns=rename_list)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   label      5572 non-null   object
 1   documents  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [12]:
#neither column has any null values, but lets check to make sure there is non-blank text in the documents
df_temp = df['documents'].str.len() - df['documents'].str.count(' ')
sum(df_temp == 0)

0

In [13]:
#okay so all the documents contain at least some characters.  Lets check that our label is a binary indicator as expected
label_list = df.label.unique()
print(label_list)

['ham' 'spam']


In [14]:
#creating one hotkey on label
label_binary = pd.get_dummies(df.label)
label_binary= label_binary.drop(columns='ham')
label_binary = label_binary.rename(columns={'spam':'label_binary'})
df = pd.concat([df,label_binary],axis=1)

In [15]:
#checking hotkey join and binary hotkey labeling
print(df.shape)
print(df[df['label']=='ham'].label_binary.unique())
print(df[df['label']=='spam'].label_binary.unique())

(5572, 3)
[0]
[1]


In [16]:
#checking for duplicates
df.duplicated().sum()

403

In [17]:
#dropping duplicated
df = df.drop_duplicates()
df.shape

(5169, 3)

### EDA

In [18]:
#looking at the frequency of ham versus spam
label_count = df.groupby('label').count()
print(label_count)

       documents  label_binary
label                         
ham         4516          4516
spam         653           653


In [19]:
#lets look at how wordy our documents are - first creating a word count
documents = df['documents'].tolist()
word_count = [] 
for i in documents:
    word_count.append(len(i.split()))
print(len(word_count))

5169


In [20]:
#calculating mean, standard deviations, min, and max
min_val = min(word_count)
max_val =max(word_count)
mean_val = np.mean(word_count)
var_val = np.std(word_count)
stat_label = pd.Series(('min','max','mean','std'))
stats = pd.Series((min_val,max_val,mean_val,var_val))
d = {'label':stat_label,'value':stats}
df_stat = pd.DataFrame(data=d)
df_stat

Unnamed: 0,label,value
0,min,1.0
1,max,171.0
2,mean,15.340685
3,std,11.067417


In [21]:
#adding the word count into the data frame
df['word_count'] = np.array(word_count)
df.shape

(5169, 4)

In [22]:
#looking at a few of these one word documents
df[df['word_count'] == 1].head()

Unnamed: 0,label,documents,label_binary,word_count
260,ham,Yup,0,1
275,ham,Thanx...,0,1
283,ham,Okie...,0,1
286,ham,Ok..,0,1
782,ham,Beerage?,0,1


In [23]:
#what percentage of the documents have only 1 word
sum(df['word_count'] == 1)/len(df)

0.003869220352099052

In [24]:
range(len(documents))

range(0, 5169)

In [25]:
#look at the most common words - first prep a word list
word_list = []
for i in range(len(documents)):
    word_list.append(documents[i].lower().split())
master_word_list = list(itertools.chain(*word_list))

In [26]:
#now count the words
count_words = collections.Counter(master_word_list)
count_words.most_common(20)

[('i', 2095),
 ('to', 2055),
 ('you', 1832),
 ('a', 1281),
 ('the', 1223),
 ('and', 919),
 ('u', 890),
 ('in', 785),
 ('is', 766),
 ('my', 676),
 ('for', 653),
 ('your', 618),
 ('me', 579),
 ('of', 552),
 ('have', 532),
 ('on', 476),
 ('call', 468),
 ('are', 457),
 ('that', 453),
 ('it', 440)]

### Text Preprocessing

In [27]:
#making text lowercase
df['documents_clean'] = df['documents'].str.lower()

In [28]:
#replacing URLs with keyword "URL"
df['documents_clean'] = df['documents_clean'].str.replace(r'https?://\S+|www\.\S+', 'url')

  df['documents_clean'] = df['documents_clean'].str.replace(r'https?://\S+|www\.\S+', 'url')


In [29]:
#loading stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(len(stop_words))

179


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CGLam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
#removing stop words
df['documents_clean'] = df['documents_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [31]:
#remove punctuation
df['documents_clean'] = df['documents_clean'].str.replace(r'[^\w\s]+', '')

  df['documents_clean'] = df['documents_clean'].str.replace(r'[^\w\s]+', '')


In [32]:
#re-reviewing most common words to see if it makes sense to create any custom stop words
word_list_2 = []
documents_2 = df['documents_clean'].tolist()
for i in range(len(documents_2)):
    word_list_2.append(documents_2[i].lower().split())
master_word_list_2 = list(itertools.chain(*word_list_2))
count_words_2 = collections.Counter(master_word_list_2)
count_words_2.most_common(20)

[('u', 1001),
 ('call', 487),
 ('im', 447),
 ('2', 443),
 ('get', 364),
 ('ur', 316),
 ('go', 269),
 ('4', 257),
 ('ltgt', 254),
 ('ok', 251),
 ('free', 243),
 ('know', 239),
 ('got', 231),
 ('like', 231),
 ('good', 217),
 ('come', 210),
 ('ill', 206),
 ('you', 200),
 ('time', 199),
 ('now', 198)]

In [33]:
#creating custom stop words
custom_stopwords = {'u','im','ur','ill','you'}

In [34]:
#remove custom stop words
df['documents_clean'] = df['documents_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in (custom_stopwords)]))

In [35]:
#remove non-character tokens
df['documents_clean'] = df['documents_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.isalpha()]))

In [36]:
#applying stemming
stemmer = PorterStemmer()
df['documents_clean'] = df['documents_clean'].apply(lambda x: ' '.join([stemmer.stem(y) for y in x.split()]))

In [37]:
df.head()

Unnamed: 0,label,documents,label_binary,word_count,documents_clean
0,ham,"Go until jurong point, crazy.. Available only ...",0,20,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,0,6,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,28,free entri wkli comp win fa cup final tkt may ...
3,ham,U dun say so early hor... U c already then say...,0,11,dun say earli hor c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,13,nah think goe usf live around though


In [38]:
#export preprocessed data to excel for further review 
#df.to_excel('preprocessed.xlsx')

### Tokenize the Data

In [39]:
def define_tokenizer(x):
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer
    
def encode(x2, tokenizer):
    encoded_sentences = tokenizer.texts_to_sequences(x2)
    encoded_sentences = tf.keras.preprocessing.sequence.pad_sequences(encoded_sentences, padding='post')
    return encoded_sentences

In [40]:
tokenizer = define_tokenizer(df['documents_clean'])
s_strings = encode(df['documents_clean'],tokenizer)

In [41]:
#checking that we have appropriate number of documents
len(s_strings)

5169

In [42]:
#quick look at encoding...text of first clean document
df['documents_clean'][0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [43]:
#encoding of that document
s_strings[0]

array([   2, 2952,  271,  540,  568,  954,   43,   66,  325,  955,   88,
       2089,  956,   11, 2953,   64,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0])

In [44]:
#pulling these words out of the dictionary to make sure we encoded as expected
d = tokenizer.word_index
print(d['go'])
print(d['jurong'])
print(d['point'])
print(d['crazi'])
print(d['avail'])
print(d['bugi'])
print(d['n'])
print(d['great'])
print(d['world'])
print(d['la'])
print(d['e'])
print(d['buffet'])
print(d['cine'])
print(d['got'])
print(d['amor'])
print(d['wat'])

2
2952
271
540
568
954
43
66
325
955
88
2089
956
11
2953
64


### Keras Cross Validation Design

In [45]:
#converting tokenized data to pandas dataframe
#we will eventually convert back to array but the pd will make the cross validation setup easier
s_pandas = pd.DataFrame(data=s_strings)
s_pandas['label'] = df['label_binary'].tolist()
s_pandas['doc'] = df['documents_clean'].tolist()
print(s_pandas.shape)
print(s_pandas.head())

(5169, 81)
     0     1    2     3     4     5     6    7     8    9  ...  71  72  73  \
0    2  2952  271   540   568   954    43   66   325  955  ...   0   0   0   
1    6   226  569   326  1413     0     0    0     0    0  ...   0   0   0   
2    9   413  796   797   129  2090  1087  304  2091  206  ...   0   0   0   
3  146    40  263  2092    97    73    40    0     0    0  ...   0   0   0   
4  750    22  338   696   176   128   339    0     0    0  ...   0   0   0   

   74  75  76  77  78  label  \
0   0   0   0   0   0      0   
1   0   0   0   0   0      0   
2   0   0   0   0   0      1   
3   0   0   0   0   0      0   
4   0   0   0   0   0      0   

                                                 doc  
0  go jurong point crazi avail bugi n great world...  
1                                ok lar joke wif oni  
2  free entri wkli comp win fa cup final tkt may ...  
3                    dun say earli hor c alreadi say  
4               nah think goe usf live around though

In [46]:
#creating the cross validation datasets
y = s_pandas['label'].to_numpy()
y = y.reshape(-1,1)
x = s_pandas.drop(columns=['label'])

x_train, x1_test, y_train, y1_test = train_test_split(x, y, test_size=0.5, random_state=9)

print(x_train.shape)
print(y_train.shape)

(2584, 80)
(2584, 1)


In [47]:
#split into test and validation layer
x_val, x_test, y_val, y_test = train_test_split(x1_test, y1_test, test_size=0.5, random_state=9)
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)
print(x_test.shape)
print(y_test.shape)

(2584, 80)
(2584, 1)
(1292, 80)
(1292, 1)
(1293, 80)
(1293, 1)


In [48]:
#checking that i split this correctly
print(x_train.head())
print(y_train[:4])

         0     1    2     3    4    5    6     7    8    9  ...  70  71  72  \
2194  4640    14   91    48    0    0    0     0    0    0  ...   0   0   0   
2259  4678   253   27  1122    0    0    0     0    0    0  ...   0   0   0   
3644    65   487   20    58  986  394   36   798   14  107  ...   0   0   0   
107      7  3025  433   199  416    8  204  1108  161   20  ...   0   0   0   
2584   208    73  300  1270   43  500  202     0    0    0  ...   0   0   0   

      73  74  75  76  77  78  \
2194   0   0   0   0   0   0   
2259   0   0   0   0   0   0   
3644   0   0   0   0   0   0   
107    0   0   0   0   0   0   
2584   0   0   0   0   0   0   

                                                    doc  
2194                                velli good ye pleas  
2259                             nohe join today itself  
3644      happi sad one thing past is it more good morn  
107   know grumpi old peopl mom like better lie alwa...  
2584                  yup alreadi thanx pr

In [49]:
#splitting out the text doc from the encoded data
train_doc = x_train['doc']
test_doc = x_test['doc']
val_doc = x_val['doc']
x_train = x_train.drop(columns=['doc'])
x_test = x_test.drop(columns=['doc'])
x_val = x_val.drop(columns=['doc'])
print(train_doc.shape)
print(test_doc.shape)
print(x_train.shape)
print(x_test.shape)

(2584,)
(1293,)
(2584, 79)
(1293, 79)


In [50]:
#our data needs to be in array format so we conver the new dataframes back to arrays
x_train_array = x_train.to_numpy()
x_test_array = x_test.to_numpy()
x_val_array = x_val.to_numpy()
#and checking shaping
print(len(x_train_array))
print(len(x_test_array))
print(len(x_val_array))

2584
1293
1292


In [51]:
print(x_train_array.shape)
print(x_test_array.shape)
print(x_val_array.shape)
print(y_train.shape)
print(y_test.shape)

(2584, 79)
(1293, 79)
(1292, 79)
(2584, 1)
(1293, 1)


In [52]:
#saving index of records that made it into test versus train
train_index = x_train.index
val_index = x_val.index
test_index = x_test.index

### One Hotkey Encoding (Count Vectorization)

In [53]:
#creating the one hotkey on clean documents
# vec = CountVectorizer()
# X_train_count = vec.fit_transform(df['documents_clean'].values)
# X_train_count.toarray()

In [54]:
#checking shape
# len(X_train_count.toarray())

In [55]:
#moving into pandas dataframe
# df_one = pd.DataFrame(X_train_count.toarray())
# len(df_one)

In [56]:
#using index to split into train and test
# df_one_train = df_one.iloc[train_index]
# df_one_test = df_one.iloc[test_index]
# print(df_one_train.shape)
# print(df_one_test.shape)

### One Hotkey Encoding (TFID Vectorization)

In [57]:
#creating the one hotkey on clean documents
vec2 = TfidfVectorizer(max_features=7000)
X_train_tfidf = vec2.fit_transform(df['documents_clean'].values)
X_train_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [58]:
#checking shape
len(X_train_tfidf.toarray())

5169

In [59]:
#moving into pandas dataframe
df_vec = pd.DataFrame(X_train_tfidf.toarray())
df_vec.shape

(5169, 6793)

In [60]:
#using index to split into train and test
df_train = df_vec.iloc[train_index]
df_test = df_vec.iloc[test_index]
df_val = df_vec.iloc[val_index]
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(2584, 6793)
(1292, 6793)
(1293, 6793)


### Defining Metrics Function

In [61]:
def class_metrics(method):
    a = print("Accuracy:",metrics.accuracy_score(y_test, method))
    p = print("Precision:",metrics.precision_score(y_test, method))
    r =print("Recall:",metrics.recall_score(y_test, method))
    return a, p, r;

In [62]:
def class_accuracy(method):
    a = metrics.accuracy_score(y_test, method)
    return a

In [63]:
def class_precision(method):
    p = metrics.precision_score(y_test, method)
    return p;

In [64]:
def class_recall(method):
    r =metrics.recall_score(y_test, method)
    return r;

In [65]:
def class_f1(method):
    r =metrics.f1_score(y_test, method)
    return r;

In [66]:
def predict_metrics(method):
    a = print("Accuracy:",metrics.accuracy_score(y_train, method))
    p = print("Precision:",metrics.precision_score(y_train, method))
    r =print("Recall:",metrics.recall_score(y_train, method))
    return a, p, r;

### Setting Up Tensors for each of the Datasets

In [67]:
#creating the tensor
# ts_train = tf.data.Dataset.from_tensor_slices((x_train_array, y_train))
# ts_holdout = tf.data.Dataset.from_tensor_slices((x_test_array, y_test))

In [68]:
#padding
#ts_train = ts_train.padded_batch(32, padded_shapes=([-1], []))
#ts_holdout = ts_holdout.padded_batch(32, padded_shapes=([-1], []))

### Establishing Early Stopping Criteria for NN training

In [69]:
callback = keras.callbacks.EarlyStopping(monitor='precision',patience=15,min_delta=.001,verbose=1,mode="max")

### Under Sampling Pipeline

In [70]:
under = RandomUnderSampler(sampling_strategy=0.5)

### Weighting Scheme for Neural Networks

In [71]:
class_weight = {0: 1, 1: 8}

### Logistic Regression

In [72]:
lr = LogisticRegression(solver="lbfgs",random_state=9,warm_start=False,class_weight='balanced')
lrm = lr.fit(df_train, y_train.ravel())
lrr = lrm.predict(df_train)
lrt = lrm.predict(df_test)
lrv = lrm.predict(df_val)

In [73]:
class_metrics(lrt)

Accuracy: 0.9613302397525135
Precision: 0.85
Recall: 0.8395061728395061


(None, None, None)

In [74]:
print(lrm.get_params())

{'C': 1.0, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 9, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


### Problems with Naive Bayes

http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf

Problems with Naive Bayes:
    Doesnt handle imbalanced data well
    Assumes feature indepedence

### Gaussian Naive Bayes Model

In [75]:
gn = GaussianNB()
gn_steps = [('under', under), ('model', gn)]
gn_pipeline = Pipeline(steps=gn_steps)
gnm = gn_pipeline.fit(df_train, y_train)
gnr = gnm.predict(df_train)
gnt = gnm.predict(df_test)
gnv = gnm.predict(df_val)

In [76]:
class_metrics(gnt)

Accuracy: 0.8089713843774169
Precision: 0.38605898123324395
Recall: 0.8888888888888888


(None, None, None)

In [77]:
print(gnm.get_params())

{'memory': None, 'steps': [('under', RandomUnderSampler(sampling_strategy=0.5)), ('model', GaussianNB())], 'verbose': False, 'under': RandomUnderSampler(sampling_strategy=0.5), 'model': GaussianNB(), 'under__random_state': None, 'under__replacement': False, 'under__sampling_strategy': 0.5, 'model__priors': None, 'model__var_smoothing': 1e-09}


### Bernouli Naive Bayes Model 

In [78]:
bn = BernoulliNB()
bn_steps = [('under', under), ('model', bn)]
bn_pipeline = Pipeline(steps=bn_steps)
bnm = bn_pipeline.fit(df_train, y_train)
bnr = bnm.predict(df_train)
bnt = bnm.predict(df_test)
bnv = bnm.predict(df_val)

In [79]:
class_metrics(bnt)

Accuracy: 0.9574632637277649
Precision: 0.9908256880733946
Recall: 0.6666666666666666


(None, None, None)

In [80]:
print(bn.get_params())

{'alpha': 1.0, 'binarize': 0.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': 'warn'}


### MultiNomimail Bayes Model

In [81]:
mb = MultinomialNB()
mb_steps = [('under', under), ('model', mb)]
mb_pipeline = Pipeline(steps=mb_steps)
mbm = mb_pipeline.fit(df_train, y_train)
mbr = mbm.predict(df_train)
mbt = mbm.predict(df_test)
mbv = mbm.predict(df_val)

In [82]:
class_metrics(mbt)

Accuracy: 0.9690641918020109
Precision: 0.9121621621621622
Recall: 0.8333333333333334


(None, None, None)

In [83]:
print(mb.get_params())

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': 'warn'}


### Hypertuned SVM

In [84]:
# param_grid_sv = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid','linear']}
# grid_sv = GridSearchCV(svm.SVC(random_state=9,class_weight='balanced'),param_grid_sv,refit=True,verbose=3,scoring='precision',cv=5)
# shm = grid_sv.fit(df_train,y_train.ravel())
# print(grid_sv.best_estimator_)

In [85]:
# svr = grid_sv.predict(df_train)
# svt = grid_sv.predict(df_test)

In [86]:
# class_metrics(svt)

In [87]:
# df_sv_grid = pd.DataFrame(data=grid_sv.cv_results_)

In [88]:
# df_sv_grid.to_excel("SVC_GridSearch.xlsx")

###  Best Fit SVM (precision)

In [89]:
# sb = svm.SVC(kernel='poly',random_state=9,C=.1,gamma=1,class_weight='balanced')
# sbm = sb.fit(df_train, y_train.ravel())
# sbr = sbm.predict(df_train)
# sbt = sbm.predict(df_test)

In [90]:
# class_metrics(sbt)

In [91]:
# print(sbm.get_params())

### Best Fit SVM (f1)

In [92]:
sb2 = svm.SVC(kernel='rbf',random_state=9,C=10,gamma=0.1,class_weight='balanced')
sbm2 = sb2.fit(df_train, y_train.ravel())
sbr2 = sbm2.predict(df_train)
sbt2 = sbm2.predict(df_test)
sbv2 = sbm2.predict(df_val)

In [93]:
class_metrics(sbt2)

Accuracy: 0.9675174013921114
Precision: 0.9285714285714286
Recall: 0.8024691358024691


(None, None, None)

In [94]:
print(sbm2.get_params())

{'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': 'balanced', 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': 9, 'shrinking': True, 'tol': 0.001, 'verbose': False}


### Hyertuned Random Forest Classifier

In [95]:
# n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# max_features = ['auto', 'sqrt']
# max_depth = [int(x) for x in np.linspace(10, 1000, num = 11)]
# max_depth.append(None)
# min_samples_split = [2, 5, 10]
# min_samples_leaf = [1, 2, 4]
# bootstrap = [True, False]
# random_grid = {
#                'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap
#                 }
# print(random_grid)

In [96]:
# rf = RandomForestClassifier(class_weight="balanced",random_state=9)
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=9, scoring='precision')
# rf_random.fit(df_train, y_train.ravel())

In [97]:
# class_metrics(rf_random)

In [98]:
# rhr = rf_random.predict(df_train)
# rgt = rf_random.predict(df_test)

In [99]:
# df_rf_grid = pd.DataFrame(data=rf_random.cv_results_)

In [100]:
# df_rf_grid.to_excel("RF_GridSearch.xlsx")

### Best Fit RF (precision)

In [101]:
# rf = RandomForestClassifier(
#     class_weight="balanced",
#     random_state=9,
#     n_estimators=100,
#     min_samples_split=2,
#     min_samples_leaf=1,
#     max_features='sqrt',
#     max_depth=307,
#     bootstrap=True)
# rfm = rf.fit(df_train, y_train.ravel())
# rfr = rfm.predict(df_train)
# rft = rfm.predict(df_test)

In [102]:
# class_metrics(rft)

In [103]:
#looking at default parameters
# print(rf.get_params())
# print('Max Depth of any Tree: ',max([estimator.tree_.max_depth for estimator in rf.estimators_]))

### Best Fit RF (f1)

In [104]:
rf2 = RandomForestClassifier(
    class_weight="balanced",
    random_state=9,
    n_estimators=1577,
    min_samples_split=10,
    min_samples_leaf=1,
    max_features='auto',
    max_depth=307,
    bootstrap=False)
rfm2 = rf2.fit(df_train, y_train.ravel())
rfr2 = rfm2.predict(df_train)
rft2 = rfm2.predict(df_test)
rfv2 = rfm2.predict(df_val)

  warn(


In [105]:
class_metrics(rft2)

Accuracy: 0.9791183294663574
Precision: 0.9655172413793104
Recall: 0.8641975308641975


(None, None, None)

In [106]:
#looking at default parameters
print(rf2.get_params())
print('Max Depth of any Tree: ',max([estimator.tree_.max_depth for estimator in rf2.estimators_]))

{'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 307, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 1577, 'n_jobs': None, 'oob_score': False, 'random_state': 9, 'verbose': 0, 'warm_start': False}
Max Depth of any Tree:  307


### Hyertuned Gradient Boosted Trees

In [107]:
# n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 3)]
# max_features = ['log2', 'sqrt']
# max_depth = [int(x) for x in np.linspace(1, 10, num = 3)]
# max_depth.append(None)
# min_samples_split = [2, 5, 10]
# min_samples_leaf = [1, 2, 4]
# random_grid = {
#                'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                 }
# print(random_grid)

In [108]:
# gb = GradientBoostingClassifier(random_state=9)
# gb_grid = GridSearchCV(gb,random_grid, cv = 5, verbose=2, scoring='precision')
# gb_grid.fit(df_train, y_train.ravel())
# print(gb_grid.best_estimator_)

In [109]:
# print(gb_grid.best_params_)

![image-2.png](attachment:image-2.png)

In [110]:
#df_gb_grid = pd.DataFrame(data=gb_grid.cv_results_)

In [111]:
#df_gb_grid.to_excel("Gradient_Boosted_GridSearch.xlsx")

### Best Fit Gradient Boosted Trees (precision)

In [112]:
# gb = GradientBoostingClassifier(warm_start=False,
#                                 random_state=9,
#                                 max_depth=1,
#                                 max_features='log2',
#                                 min_samples_leaf=1,
#                                 min_samples_split=2,
#                                 n_estimators=100)
                                
# gbm = gb.fit(df_train, y_train.ravel())
# gbr = gbm.predict(df_train)
# gbt = gbm.predict(df_test)

In [113]:
# class_metrics(gbt)

In [114]:
# print(gb.get_params())

### Best Fit Gradient Boosted Trees (f1)

In [115]:
gb2 = GradientBoostingClassifier(warm_start=False,
                                random_state=9,
                                max_depth=None,
                                max_features='sqrt',
                                min_samples_leaf=4,
                                min_samples_split=10,
                                n_estimators=100)
                                
gbm2 = gb2.fit(df_train, y_train.ravel())
gbr2 = gbm2.predict(df_train)
gbt2 = gbm2.predict(df_test)
gbv2 = gbm2.predict(df_val)

In [116]:
class_metrics(gbt2)

Accuracy: 0.974477958236659
Precision: 0.9640287769784173
Recall: 0.8271604938271605


(None, None, None)

In [117]:
print(gb2.get_params())

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': 9, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


### Model - Hyertuned ExtraTrees

In [118]:
# n_estimators_e = [int(x) for x in np.linspace(start = 100, stop = 500, num = 3)]
# max_features_e = ['log2', 'sqrt']
# min_samples_split_e = [2, 5, 10]
# min_samples_leaf_e = [1, 2, 4]
# bootstrap_e = ['bool',False]
# criterion_e = ['gini','log_loss','entropy']
# random_grid_e = {
#                'n_estimators': n_estimators_e,
#                'max_features': max_features_e,
#                'min_samples_split': min_samples_split_e,
#                'min_samples_leaf': min_samples_leaf_e,
#                 'bootstrap': bootstrap_e,
#                 'criterion': criterion_e
#                 }
# print(random_grid_e)

In [119]:
# et = ExtraTreesClassifier(warm_start=False,random_state=9,class_weight='Balanced')
# et_grid = GridSearchCV(et,random_grid_e, cv = 5, verbose=2, scoring='precision')
# et_grid.fit(df_train, y_train.ravel())
# print(et_grid.best_estimator_)

In [120]:
# print(et_grid.best_params_)

In [121]:
# df_et_grid = pd.DataFrame(data=et_grid.cv_results_)

In [122]:
# df_et_grid.to_excel("ET_GridSearch.xlsx")

### Best Fit Extra Trees (precision)

In [123]:
# et = ExtraTreesClassifier(
#         warm_start=False,
#         random_state=9,
#         bootstrap=True,
#         class_weight='balanced',
#         criterion='log_loss',
#         max_features='log2',
#         min_samples_split=10)
# etm = et.fit(df_train, y_train.ravel())
# etr = etm.predict(df_train)
# ett = etm.predict(df_test)

In [124]:
# class_metrics(ett)

In [125]:
# print(et.get_params())

### Best Fit Extra Trees (F1)

In [126]:
et2 = ExtraTreesClassifier(
        warm_start=False,
        random_state=9,
        bootstrap=False,
        class_weight='balanced',
        criterion='log_loss',
        max_features='log2',
        min_samples_split=10)
etm2 = et2.fit(df_train, y_train.ravel())
etr2 = etm2.predict(df_train)
ett2 = etm2.predict(df_test)
etv2 = etm2.predict(df_val)

In [127]:
class_metrics(ett2)

Accuracy: 0.9752513534416086
Precision: 0.9710144927536232
Recall: 0.8271604938271605


(None, None, None)

In [128]:
print(et2.get_params())

{'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'log_loss', 'max_depth': None, 'max_features': 'log2', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 9, 'verbose': 0, 'warm_start': False}


### Hypertuned ADA Boosting

In [129]:
# param_grid_ad = {'n_estimators': [int(x) for x in np.linspace(start = 20, stop = 300, num = 10)], 
#                  'learning_rate': [.1,.5,1]}
# grid_ad = GridSearchCV(AdaBoostClassifier(random_state=9),param_grid_ad,refit=True,verbose=3,scoring='precision',cv=5)
# adm = grid_ad.fit(df_train,y_train.ravel())

In [130]:
# print(grid_ad.best_estimator_)

In [131]:
# df_ad_grid = pd.DataFrame(data=grid_ad.cv_results_)

In [132]:
# df_ad_grid.to_excel("AD_GridSearch.xlsx")

### Best Fit ADA Boosting (precision)

In [133]:
# ad = AdaBoostClassifier(random_state=9,learning_rate=0.1,n_estimators=20)
# adm = ad.fit(df_train, y_train.ravel())
# adr = adm.predict(df_train)
# adt = adm.predict(df_test)

In [134]:
# class_metrics(adt)

In [135]:
# print(ad.get_params())

### Best Fit ADA Bososting (f1)

In [136]:
ad2 = AdaBoostClassifier(random_state=9,learning_rate=0.5,n_estimators=268)
adm2 = ad2.fit(df_train, y_train.ravel())
adr2 = adm2.predict(df_train)
adt2 = adm2.predict(df_test)
adv2 = adm2.predict(df_val)

In [137]:
class_metrics(adt2)

Accuracy: 0.9644238205723125
Precision: 0.9027777777777778
Recall: 0.8024691358024691


(None, None, None)

In [138]:
print(ad2.get_params())

{'algorithm': 'SAMME.R', 'base_estimator': 'deprecated', 'estimator': None, 'learning_rate': 0.5, 'n_estimators': 268, 'random_state': 9}


### XGB Hypertuning

In [139]:
# params = {'max_depth': [3, 6, 10],
#               'learning_rate': [ 0.1, 0.5],
#               'subsample': np.arange(0.5, 1.0, 0.2),
#               'colsample_bytree': np.arange(0.5, 1.0, 0.2),
#               'colsample_bylevel': np.arange(0.5, 1.0, 0.2),
#               'n_estimators': [50, 100],
#               'num_class': [1]
#               }

In [140]:
# grid_xg = GridSearchCV(XGBClassifier(random_state=9,scale_pos_weight=20),params,refit=True,verbose=3,scoring='precision',cv=5,n_jobs=-2)
# xgm = grid_xg.fit(df_train,y_train.ravel())

In [141]:
# print(grid_xg.best_estimator_)

In [142]:
# df_xg_grid = pd.DataFrame(data=grid_xg.cv_results_)

In [143]:
# df_xg_grid.to_excel("XG_GridSearch.xlsx")

### Model - XGB Classifier (precision)

In [144]:
# xg = XGBClassifier(
#                     random_state=9,
#                     colsample_bylevel=0.5, 
#                     colsample_bytree=0.7, 
#                     learning_rate=0.5, 
#                     max_depth=6, 
#                     n_estimators=100, 
#                     num_class=1, 
#                     subsample=0.9,
#                     scale_pos_weight=20)
# xgm = xg.fit(df_train, y_train.ravel())
# xgr = xgm.predict(df_train)
# xgt = xgm.predict(df_test)

In [145]:
# class_metrics(xgt)

In [146]:
# print(xg.get_params())

### Best Fit XGB Classifier (f1)

In [147]:
xg2 = XGBClassifier(
                    random_state=9,
                    colsample_bylevel=0.5, 
                    colsample_bytree=0.5, 
                    learning_rate=0.1, 
                    max_depth=10, 
                    n_estimators=250, 
                    num_class=1, 
                    subsample=0.75,
                    scale_pos_weight=20)
xgm2 = xg2.fit(df_train, y_train.ravel())
xgr2 = xgm2.predict(df_train)
xgt2 = xgm2.predict(df_test)
xgv2 = xgm2.predict(df_val)

In [148]:
class_metrics(xgt2)

Accuracy: 0.9675174013921114
Precision: 0.8658536585365854
Recall: 0.8765432098765432


(None, None, None)

In [149]:
print(xg2.get_params())

{'objective': 'binary:logistic', 'use_label_encoder': None, 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': 0.5, 'colsample_bynode': None, 'colsample_bytree': 0.5, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'gpu_id': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.1, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 10, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 250, 'n_jobs': None, 'num_parallel_tree': None, 'predictor': None, 'random_state': 9, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 20, 'subsample': 0.75, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'num_class': 1}


### Model XGBRF Classifier Hypertuning

In [150]:
# params_brf = {'max_depth': [3, 6, 10, 15],
#               'learning_rate': [ 0.1, 0.5, 1],
#               'subsample': np.arange(0.5, 1.0, 0.1),
#               'colsample_bytree': np.arange(0.5, 1.0, 0.1),
#               'colsample_bylevel': np.arange(0.5, 1.0, 0.1),
#               'n_estimators': [50, 100, 250],
#               'num_class': [1]
#               }

### Model - XGBRF Classifier

In [151]:
xr = XGBRFClassifier(random_state=9)
xrm = xr.fit(df_train, y_train)
xrr = xrm.predict(df_train)
xrt = xrm.predict(df_test)
xrv = xrm.predict(df_val)

In [152]:
class_metrics(xrt)

Accuracy: 0.9404485692188709
Precision: 0.8346456692913385
Recall: 0.654320987654321


(None, None, None)

In [153]:
print(xr.get_params())

{'colsample_bynode': 0.8, 'learning_rate': 1.0, 'reg_lambda': 1e-05, 'subsample': 0.8, 'objective': 'binary:logistic', 'use_label_encoder': None, 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bytree': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'gpu_id': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 100, 'n_jobs': None, 'num_parallel_tree': None, 'predictor': None, 'random_state': 9, 'reg_alpha': None, 'sampling_method': None, 'scale_pos_weight': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}


### Model_CNN

In [154]:
md_cnn = tf.keras.Sequential()
md_cnn.add(tf.keras.layers.Embedding(input_dim=len(d)+1, output_dim=64, input_length=x_train.shape[1]))
md_cnn.add(tf.keras.layers.GlobalAveragePooling1D())
md_cnn.add(tf.keras.layers.Dense(64, activation='relu'))
md_cnn.add(tf.keras.layers.Dense(32, activation='relu'))
md_cnn.add(tf.keras.layers.Dense(1, activation='sigmoid'))

optimizer = keras.optimizers.SGD(learning_rate=0.02, momentum = 0.95, nesterov=True)
md_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'Precision', 'Recall'])

md_cnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 79, 64)            436480    
                                                                 
 global_average_pooling1d (G  (None, 64)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 442,753
Trainable params: 442,753
Non-trainable params: 0
__________________________________________________

In [155]:
md_cnn.fit(x_train_array, y_train, epochs=50, callbacks=[callback],class_weight=class_weight)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 44: early stopping


<keras.callbacks.History at 0x15dffb85060>

In [156]:
cnt = md_cnn.predict(x_test_array, batch_size=None)
cnr = md_cnn.predict(x_train_array, batch_size=None)
cnv = md_cnn.predict(x_val_array, batch_size=None)
cnt = np.concatenate(cnt).round().astype(int)
cnr = np.concatenate(cnr).round().astype(int)
cnv = np.concatenate(cnv).round().astype(int)



In [157]:
class_metrics(cnt)

Accuracy: 0.9667440061871616
Precision: 0.9280575539568345
Recall: 0.7962962962962963


(None, None, None)

### Model - BiDirectional LSTM with Hidden Layers

In [158]:
#embedding layer and bidirectional layer
md_bidir_w_hidden = tf.keras.Sequential()
md_bidir_w_hidden.add(tf.keras.layers.Embedding(input_dim= len(d) + 1,output_dim=100,input_length=x_train.shape[1]))
md_bidir_w_hidden.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
md_bidir_w_hidden.add(tf.keras.layers.Dense(64, activation='relu'))
md_bidir_w_hidden.add(tf.keras.layers.Dense(32, activation='relu'))
md_bidir_w_hidden.add(tf.keras.layers.Dense(1, activation='sigmoid'))
md_bidir_w_hidden.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 79, 100)           682000    
                                                                 
 bidirectional (Bidirectiona  (None, 128)              84480     
 l)                                                              
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 776,849
Trainable params: 776,849
Non-trainable params: 0
________________________________________________

In [159]:
#compile the model
optimizer = keras.optimizers.SGD(learning_rate=0.02, momentum = 0.95, nesterov=True)
md_bidir_w_hidden.compile(loss="binary_crossentropy",optimizer=optimizer,metrics=['accuracy', 'Precision', 'Recall'])

In [160]:
#fit the model
md_bidir_w_hidden.fit(x_train_array, y_train, epochs=50, callbacks=[callback],class_weight=class_weight)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 34: early stopping


<keras.callbacks.History at 0x15d92337220>

In [161]:
ltt = md_bidir_w_hidden.predict(x_test_array, batch_size=None)
ltr = md_bidir_w_hidden.predict(x_train_array, batch_size=None)
ltv = md_bidir_w_hidden.predict(x_val_array, batch_size=None)
ltt = np.concatenate(ltt).round().astype(int)
ltr = np.concatenate(ltr).round().astype(int)
ltv = np.concatenate(ltv).round().astype(int)



In [162]:
class_metrics(ltt)

Accuracy: 0.9644238205723125
Precision: 0.8717948717948718
Recall: 0.8395061728395061


(None, None, None)

In [163]:
out = pd.DataFrame(data=ltt)
out['actual'] = y_test
out.to_excel('ltr.xlsx')

### Model - Bert

In [164]:
METRICS = [tf.keras.metrics.BinaryAccuracy(name='accuracy'),tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')]

In [165]:
# bert_df = pd.DataFrame(data=y_train,columns=['y'])
# bert_df['Doc'] = train_doc.tolist()
# print(bert_df.head())
# print(bert_df.shape)

In [166]:
# bert_df['y'].value_counts()

In [167]:
# spam_df = bert_df[bert_df['y'] == 1]
# print(spam_df.shape)
# ham_df = bert_df[bert_df['y'] == 0]
# print(ham_df.shape)
# ham_downsample_df = ham_df.sample(spam_df.shape[0])
# print(ham_downsample_df.shape)
# test_balanced_df = pd.concat([spam_df,ham_downsample_df])
# print(test_balanced_df.shape)
# # print(test_balanced_df['y'].value_counts())

In [168]:
# test_doc_balanced = test_balanced_df['Doc']
# y_test_balanced = test_balanced_df['y']

In [169]:
# import tensorflow_hub as hub
# import tensorflow_text as text
# import tensorflow as tf

In [170]:
# bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
# bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [171]:
# #Functional Model

# # Bert layers
# #Text input passed to bert_preprocess
# text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
# preprocessed_text = bert_preprocess(text_input)
# #Processed text passed to bert_encoder
# outputs = bert_encoder(preprocessed_text)

# # Neural network layers
# l = tf.keras.layers.Dense(64, activation='relu')
# l = tf.keras.layers.Dense(32, activation='relu')
# l = tf.keras.layers.Dense(1, activation='sigmoid')

# #Dropout
# l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
# l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# # Use inputs and outputs to construct a final model
# model_bert = tf.keras.Model(inputs=[text_input], outputs = [l])

In [172]:
# model_bert.summary()

In [173]:
# model_bert.compile(optimizer='adam',loss='binary_crossentropy',metrics=METRICS)

In [174]:
# model_bert.fit(test_doc_balanced, y_test_balanced, epochs=2)

In [175]:
# test_predict = model_bert.predict(test_doc)
# test_predict = test_predict.flatten()

NameError: name 'model_bert' is not defined

In [None]:
# test_predict = test_predict.round().astype(int)
# test_predict

In [None]:
# class_metrics(test_predict)

In [None]:
# class_f1(test_predict)

### Model - Bert

In [None]:
# import tensorflow as tf
# from tensorflow.keras.layers import Dense, Input
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.models import Model
# from tensorflow.keras.callbacks import ModelCheckpoint

# import transformers
# from tqdm.notebook import tqdm
# from tokenizers import BertWordPieceTokenizer

In [None]:
# from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# def bert_encode(data, maximum_length) :
#     input_ids = []
#     attention_masks = []

#     for text in data:
#         encoded = tokenizer.encode_plus(
#             text, 
#             add_special_tokens=True,
#             max_length=maximum_length,
#             pad_to_max_length=True,

#             return_attention_mask=True,
#         )
#         input_ids.append(encoded['input_ids'])
#         attention_masks.append(encoded['attention_mask'])
        
#     return np.array(input_ids),np.array(attention_masks)

In [None]:
# texts = test_doc_balanced
# target = y_test_balanced

# train_input_ids, train_attention_masks = bert_encode(texts,60)

In [None]:
# import tensorflow as tf
# from tensorflow.keras.optimizers import Adam

# def create_model(bert_model):
    
#     input_ids = tf.keras.Input(shape=(60,),dtype='int32')
#     attention_masks = tf.keras.Input(shape=(60,),dtype='int32')

#     output = bert_model([input_ids,attention_masks])
#     output = output[1]
#     output = tf.keras.layers.Dense(32,activation='relu')(output)
#     output = tf.keras.layers.Dropout(0.2)(output)
#     output = tf.keras.layers.Dense(1,activation='sigmoid')(output)
    
#     model = tf.keras.models.Model(inputs = [input_ids,attention_masks],outputs = output)
#     model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
#     return model

In [None]:
# from transformers import TFBertModel
# bert_model = TFBertModel.from_pretrained('bert-base-uncased')

In [None]:
# model = create_model(bert_model)
# model.summary()

In [None]:
# history = model.fit(
#     [train_input_ids, train_attention_masks],
#     target,
#     validation_split=0.2, 
#     epochs=2,
#     batch_size=10
# )


In [None]:
# b2_test_predict = model_bert.predict(test_doc)
# b2_test_predict = b2_test_predict.round().astype(int)

In [None]:
# class_metrics(b2_test_predict)

In [None]:
# class_f1(b2_test_predict)

### Model Bert

In [176]:
from transformers import BertTokenizer,TFBertModel
from tensorflow.keras.optimizers import Adam
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [177]:
def encode(text, maxlen):
    input_ids=[]
    attention_masks=[]

    for row in text:
        encoded = tokenizer.encode_plus(
            row,
            add_special_tokens=True,
            max_length=maxlen,
            pad_to_max_length=True,
            return_attention_mask=True,
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return np.array(input_ids),np.array(attention_masks)

In [178]:
X_train_input_ids, X_train_attention_masks = encode(train_doc.values, maxlen=64)
X_test_input_ids, X_test_attention_masks = encode(test_doc.values, maxlen=64)
X_val_input_ids, X_val_attention_masks = encode(val_doc.values, maxlen=64)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [179]:
def build_model(bert_model):
    input_word_ids = tf.keras.Input(shape=(64,),dtype='int32')
    attention_masks = tf.keras.Input(shape=(64,),dtype='int32')

    sequence_output = bert_model([input_word_ids,attention_masks])
    output = sequence_output[1]
    output = tf.keras.layers.Dense(32,activation='relu')(output)
    output = tf.keras.layers.Dropout(0.2)(output)
    output = tf.keras.layers.Dense(1,activation='sigmoid')(output)

    model = tf.keras.models.Model(inputs = [input_word_ids,attention_masks], outputs = output)
    model.compile(Adam(lr=1e-5),loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [180]:
model3 = build_model(bert_model)
model3.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 64)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 64)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_2[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 64,                                            

  super().__init__(name, **kwargs)


In [181]:
history = model3.fit(
    [X_train_input_ids, X_train_attention_masks],
    y_train,
    batch_size=32,
    epochs=5,
    validation_data=([X_test_input_ids, X_test_attention_masks], y_test),
    class_weight=class_weight)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [236]:
ber = model3.predict([X_train_input_ids, X_train_attention_masks], batch_size=None)
ber = np.concatenate(ber).round().astype(int)
bet = model3.predict([X_test_input_ids, X_test_attention_masks], batch_size=None)
bet = np.concatenate(bet).round().astype(int)
bev = model3.predict([X_val_input_ids, X_val_attention_masks], batch_size=None)
bev = np.concatenate(bev).round().astype(int)



In [183]:
class_metrics(bet)

Accuracy: 0.9752513534416086
Precision: 0.9166666666666666
Recall: 0.8827160493827161


(None, None, None)

In [232]:
ber = ber.tolist()
bet = bet.tolist()
bev = bev.tolist()

### Model - BiDirectional LSTM with Convuluational Layer

In [184]:
#embedding layer and bidirectional layer
#md_combo = tf.keras.Sequential()
#md_combo.add(tf.keras.layers.Embedding(input_dim= len(d) + 1,output_dim=64,input_length=x_train.shape[1]))
#md_combo.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
#md_combo.add(tf.keras.layers.GlobalAveragePooling2D(64,input_shape=(3,32,32)))
#md_combo.add(Flatten())
#md_combo.add(tf.keras.layers.Dense(64, activation='relu'))
#md_combo.add(tf.keras.layers.Dense(32, activation='relu'))
#md_combo.add(tf.keras.layers.Dense(1, activation='sigmoid'))
#md_combo.summary()

In [185]:
#compile the model
#optimizer = keras.optimizers.SGD(learning_rate=0.02, momentum = 0.95, nesterov=True)
#md_combo.compile(loss="binary_crossentropy",optimizer=optimizer,metrics=['accuracy', 'Precision', 'Recall'])

In [186]:
#fit the model
#md_combo.fit(x_train_array, y_train, epochs=50, callbacks=[callback])

### Report (Precision Hyptertuned)

In [187]:
# methods = [lrt, gnt, bnt, mbt, sbt, rft, gbt, ett, adt, xgt, cnt, ltt]
# method_name = ['LG','GNB','BNB','MNB','SVM','RF','GBT','ET','ADA','XGB','CNN','LTSM']
# accuracy = []
# precision = []
# recall = []
# f1 = []
# for i in methods:
#     accuracy.append(class_accuracy(i))
#     precision.append(class_precision(i))
#     recall.append(class_recall(i))
#     f1.append(class_f1(i))
# results = pd.DataFrame(
#     {'Method': method_name,
#      'Accuracy': accuracy,
#      'Precision': precision,
#      'Recall':recall,
#      'F1':f1
#     })


In [188]:
# results

### 3 Approach Voting Results

In [189]:
def vote_prediction(method1,method2,method3):
    pred = []
    for i in range(len(method1)):
        pred.append(round((method1[i]+method2[i]+method3[i])/3))
    return pred;

In [190]:
def vote_prediction5(method1,method2,method3,method4,method5):
    pred = []
    for i in range(len(method1)):
        pred.append(round((method1[i]+method2[i]+method3[i]+method4[i]+method5[i])/5))
    return pred;

### Report (Train Data)

In [250]:
methods1 = [lrr,mbr,sbr2,rfr2,gbr2,etr2,adr2,xgr2,xrr,cnr,ltr,ber]
method_name1 = ['Logistic Regression','Naïve Bayes','SVM','RF','Gradient Boosted','ET','Adaptive Boosting','XGBoost','RF with XGBoost','CNN','LTSM','BERT']
accuracy1 = []
precision1 = []
recall1 = []
f11 = []
for i in methods1:
    accuracy1.append(metrics.accuracy_score(y_train,i))
    precision1.append(metrics.precision_score(y_train,i))
    recall1.append(metrics.recall_score(y_train,i))
    f11.append(metrics.f1_score(y_train,i))
results1 = pd.DataFrame(
    {'Method': method_name1,
     'Accuracy': accuracy1,
     'Precision': precision1,
     'Recall':recall1,
     'F1':f11
    })

In [246]:
results1

Unnamed: 0,Method,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.989164,0.935211,0.985163,0.959538
1,Naive Bayes-Gaussian,0.868421,0.497784,1.0,0.664694
2,Naive Bayes-Bernouli,0.971749,1.0,0.783383,0.878536
3,Naive Bayes-MultiNomial,0.986455,0.966049,0.928783,0.94705
4,SVM,0.999226,0.997033,0.997033,0.997033
5,RF,0.999613,1.0,0.997033,0.998514
6,Gradient-Boosted Trees,0.999226,1.0,0.994065,0.997024
7,Extremely Randomized Trees,0.999613,1.0,0.997033,0.998514
8,Adapative Boosting,0.999613,1.0,0.997033,0.998514
9,XGBoost,0.997678,0.985337,0.997033,0.99115


### Report (Val Data)

In [252]:
methods2 = [lrv,mbv,sbv2,rfv2,gbv2,etv2,adv2,xgv2,xrv,cnv,ltv,bev]
method_name2 = ['Logistic Regression','Naïve Bayes','SVM','RF','Gradient Boosted','ET','Adaptive Boosting','XGBoost','RF with XGBoost','CNN','LTSM','BERT']
accuracy2 = []
precision2 = []
recall2 = []
f12 = []
for i in methods2:
    accuracy2.append(metrics.accuracy_score(y_val,i))
    precision2.append(metrics.precision_score(y_val,i))
    recall2.append(metrics.recall_score(y_val,i))
    f12.append(metrics.f1_score(y_val,i))
results2 = pd.DataFrame(
    {'Method': method_name2,
     'Accuracy': accuracy2,
     'Precision': precision2,
     'Recall':recall2,
     'F1':f12
    })

In [194]:
results2

Unnamed: 0,Method,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.965944,0.861842,0.850649,0.856209
1,Naive Bayes-Gaussian,0.835139,0.412463,0.902597,0.566191
2,Naive Bayes-Bernouli,0.966718,0.99115,0.727273,0.838951
3,Naive Bayes-MultiNomial,0.978328,0.95,0.863636,0.904762
4,SVM,0.97678,0.969697,0.831169,0.895105
5,RF,0.979876,0.963768,0.863636,0.910959
6,Gradient-Boosted Trees,0.98065,0.964029,0.87013,0.914676
7,Extremely Randomized Trees,0.977554,0.94964,0.857143,0.901024
8,Adapative Boosting,0.974458,0.941606,0.837662,0.886598
9,XGBoost,0.943498,0.818898,0.675325,0.740214


### Report (Test Data)

In [253]:
methods3 = [lrt,mbt,sbt2,rft2,gbt2,ett2,adt2,xgt2,xrt,cnt,ltt,bet]
method_name3 = ['Logistic Regression','Naïve Bayes','SVM','RF','Gradient Boosted','ET','Adaptive Boosting','XGBoost','RF with XGBoost','CNN','LTSM','BERT']
accuracy3 = []
precision3 = []
recall3 = []
f13 = []
for i in methods3:
    accuracy3.append(class_accuracy(i))
    precision3.append(class_precision(i))
    recall3.append(class_recall(i))
    f13.append(class_f1(i))
results3 = pd.DataFrame(
    {'Method': method_name3,
     'Accuracy': accuracy3,
     'Precision': precision3,
     'Recall':recall3,
     'F1':f13
    })

In [254]:
results3

Unnamed: 0,Method,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.96133,0.85,0.839506,0.84472
1,Naïve Bayes,0.969064,0.912162,0.833333,0.870968
2,SVM,0.967517,0.928571,0.802469,0.860927
3,RF,0.979118,0.965517,0.864198,0.912052
4,Gradient Boosted,0.974478,0.964029,0.82716,0.890365
5,ET,0.975251,0.971014,0.82716,0.893333
6,Adaptive Boosting,0.964424,0.902778,0.802469,0.849673
7,XGBoost,0.967517,0.865854,0.876543,0.871166
8,RF with XGBoost,0.940449,0.834646,0.654321,0.733564
9,CNN,0.966744,0.928058,0.796296,0.857143


### Creating a Train Data Predictions DataFrame

This dataframe will be used to first identify false positives and then calculate the jaccard score amongst false negatives

In [197]:
# yy = s_pandas['label'].iloc[train_index]
# train_predictions = [lrr, gnr, bnr, mbr, sbr2, rfr2, gbr2, etr2, adr2, xgr2, cnr, ltr]
# for i in train_predictions:
#     i = i.tolist()
# train_results = list(zip(yy,lrr, gnr, bnr, mbr, sbr2, rfr2, gbr2, etr2, adr2, xgr2, cnr, ltr))
# df_results = pd.DataFrame(data=train_results,columns=['actual','LG','GNB','BNB','MNB','SVM','RF','GBT','ET','ADA','XGB','CNN','LTSM'])

In [198]:
# tn_list = []
# fp_list = []
# fn_list = []
# tp_list = []
# for i in range(len(train_predictions)):
#     tn, fp, fn, tp = confusion_matrix(yy, train_predictions[i], labels=[0, 1]).ravel()
#     tn_list.append(tn)
#     fp_list.append(fp)
#     fn_list.append(fn)
#     tp_list.append(tp)
# confusion = list(zip(method_name,tn_list,fp_list,fn_list,tp_list))
# df_confusion = pd.DataFrame(data=confusion,columns=['Method','True Positive','False Positive','False Negative','True Positive'])
# df_confusion

### Creating a Test Data Predictions DataFrame

In [199]:
# yz = s_pandas['label'].iloc[test_index]
# test_predictions = [lrt, gnt, bnt, mbt, sbt2, rft2, gbt2, ett2, adt2, xgt2, cnt, ltt]
# for i in test_predictions:
#     i = i.tolist()
# test_results = list(zip(yz,lrt, gnt, bnt, mbt, sbt2, rft2, gbt2, ett2, adt2, xgt2, cnt, ltt))
# df_results_test = pd.DataFrame(data=test_results,columns=['actual','LG','GNB','BNB','MNB','SVM','RF','GBT','ET','ADA','XGB','CNN','LTSM'])

In [200]:
# tn2_list = []
# fp2_list = []
# fn2_list = []
# tp2_list = []
# for i in range(len(test_predictions)):
#     tn, fp, fn, tp = confusion_matrix(yz, test_predictions[i], labels=[0, 1]).ravel()
#     tn2_list.append(tn)
#     fp2_list.append(fp)
#     fn2_list.append(fn)
#     tp2_list.append(tp)
# confusion2 = list(zip(method_name,tn2_list,fp2_list,fn2_list,tp2_list))
# df_confusion2 = pd.DataFrame(data=confusion2,columns=['Method','True Positive','False Positive','False Negative','True Positive'])
# df_confusion2

### Creating a DataFrame of False Positives

In [201]:
# fp_list = []
# for i in method_name:
#     fp_list.append(np.where((df_results['actual'] == 1) & (df_results[i] == 0),1,0).tolist())

In [202]:
# df_fp = pd.DataFrame(data=fp_list)
# df_fp = df_fp.transpose()
# df_fp = df_fp.set_axis(method_name,axis=1)

In [203]:
# len(method_name)

### Jaccard Score Matrix

In [204]:
# #calculate the jaccard scores
# method1 = []
# method2 = []
# jac = []
# for x in method_name:
#     for y in method_name:
#         method1.append(x)
#         method2.append(y)
#         jac.append(jaccard_score(df_fp[x],df_fp[y]))

In [205]:
# #put into a matrix
# d = list(zip(method1,method2,jac))
# df_j = pd.DataFrame(data=d,columns=('method1','method2','jac'))
# table = pd.pivot_table(df_j,values='jac',index='method1',columns='method2',aggfunc=np.sum)
# table

### Testing False Positive Metrics

In [206]:
# ET_train_predictions = etm.predict(df_train).tolist()
# ADA_train_predictions = adm.predict(df_train).tolist()
# actual = y_train.tolist()
# results = list(zip(actual,ET_train_predictions,ADA_train_predictions))

In [207]:
# df_results = pd.DataFrame(data=results,columns=['actual','ET','ADA'])

In [208]:
# df_results.to_excel("FP.xlsx")

In [209]:
# predict_metrics(ET_train_predictions)

In [210]:
# predict_metrics(ADA_train_predictions)

In [211]:
# from sklearn.metrics import confusion_matrix
# tn, fp, fn, tp = confusion_matrix(actual, ADA_train_predictions, labels=[0, 1]).ravel()
# print(tn, fp, fn, tp)  # 1 1 1 1

In [212]:
# tn, fp, fn, tp = confusion_matrix(actual, ET_train_predictions, labels=[0, 1]).ravel()
# print(tn, fp, fn, tp)  # 1 1 1 1

In [213]:
# actual[0]

In [214]:
# df_results['ADA_FP'] = np.where(df_results['ADA'] == 0 & df_results['actual'] == 1,1,0)

In [215]:
# sum(df_results.ADA_FP.tolist())

In [216]:
a = [0,0,0,0,1,0,0,1,0,0]
b = [0,1,0,0,0,0,0,1,0,0]

In [217]:
jaccard_score(a,b)

0.3333333333333333

In [218]:
c = [0,1,0,0,0,0,0,0,0,0]
d = [0,0,0,0,0,0,0,1,0,0]

In [219]:
jaccard_score(c,d)

0.0

### Pickle Predictions

In [220]:
#pickle predictions
# with open('session.pkl', 'wb') as f:
#     pickle.dump({
#             'cnt' : cnt,
#             'ltt' : ltt,
#             'lrt' : lrt,
#             'gnt' : gnt,
#             'bnt' : bnt,
#             'mbt' : mbt,
#             'srt' : srt,
#             'spt' : spt,
#             'slt' : slt,
#             'rft' : rft,
#             'gbt' : gbt,
#             'ett' : ett,
#             'adt' : adt,
#             'xgt' : xgt,
#             'xrt' : xrt,
#             'cnr' : cnr,
#             'ltr' : ltr,
#             'lrr' : lrr,
#             'gnr' : gnr,
#             'bnr' : bnr,
#             'mbr' : mbr,
#             'srr' : srr,
#             'spr' : spr,
#             'slr' : slr,
#             'rfr' : rfr,
#             'gbr' : gbr,
#             'etr' : etr,
#             'adr' : adr,
#             'xgr' : xgr,
#             'xrr' : xrr
#                 }, f)

### Scratchwork

In [221]:
# pred_one = [0,1,1,0,1,0,0,1,0,0]
# pred_two = [0,0,0,0,1,0,0,1,0,0]
# actual = [0,1,0,0,1,0,0,1,0,0]

In [222]:
# print(metrics.accuracy_score(actual, pred_one))
# print(metrics.precision_score(actual, pred_one))
# print(metrics.recall_score(actual, pred_one))



In [223]:
# print(metrics.accuracy_score(actual, pred_two))
# print(metrics.precision_score(actual, pred_two))
# print(metrics.recall_score(actual, pred_two))

In [224]:
# print(metrics.accuracy_score(y_test, ett2))
# print(metrics.precision_score(y_test, ett2))
# print(metrics.recall_score(y_test, ett2))

In [225]:
# tn, fp, fn, tp = confusion_matrix(y_test, ett2, labels=[0, 1]).ravel()
# print(tn)
# print(fp)
# print(fn)
# print(tp)
# print()

### Optimized Voting Assemble

In [256]:
f1_grid = []
precision_grid = []
opt_grid_name = []
methods_grid_name = ['Logistic Regression','Naïve Bayes','SVM','RF','Gradient Boosted','ET','Adaptive Boosting','XGBoost','RF with XGBoost','CNN','LTSM','BERT']
methods_grid = [lrv,mbv,sbv2,rfv2,gbv2,etv2,adv2,xgv2,xrv,cnv,ltv,bev]
for x in range(len(methods_grid)):
    for y in range(len(methods_grid)):
        for z in range(len(methods_grid)):
            a = methods_grid_name[x]
            b = methods_grid_name[y]
            c = methods_grid_name[z]
            if ((a == b) or (a == c) or (b == c)):
                name = 'xxx'
            else: 
                name = a + ',' + b +',' + c
            opt_grid_name.append(name)
            f1_grid.append(metrics.f1_score(y_val,vote_prediction(methods_grid[x],methods_grid[y],methods_grid[z])))
            precision_grid.append(metrics.precision_score(y_val,vote_prediction(methods_grid[x],methods_grid[y],methods_grid[z])))
df_vote_grid = pd.DataFrame(
    {'Method': opt_grid_name,
     'F1': f1_grid,
     'Precision': precision_grid
    })

In [257]:
df_filtered = df_vote_grid[df_vote_grid['Method'] != 'xxx']

In [258]:
df_filtered.sort_values(by=['F1','Precision'],ascending=False).to_excel('f1_grid.xlsx')

### Ensemble Method

In [259]:
singh_voting = vote_prediction(cnt,gbt2,rft2)
cl_voting = vote_prediction(cnt,gbt2,bet)

methods = [lrt,mbt,sbt2,rft2,gbt2,ett2,adt2,xgt2,xrt,cnt,ltt,bet, singh_voting,cl_voting]
method_name =  ['Logistic Regression','Naïve Bayes','SVM','RF','Gradient Boosted','ET','Adaptive Boosting','XGBoost','RF with XGBoost','CNN','LTSM','BERT',"Sighn Voting Ensemble","Alternative Voting Ensemble"]
accuracy3 = []
precision3 = []
recall3 = []
f13 = []
for i in methods:
    accuracy3.append(class_accuracy(i))
    precision3.append(class_precision(i))
    recall3.append(class_recall(i))
    f13.append(class_f1(i))
results3 = pd.DataFrame(
    {'Method': method_name,
     'Accuracy': accuracy3,
     'Precision': precision3,
     'Recall':recall3,
     'F1':f13
    })
results3

Unnamed: 0,Method,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.96133,0.85,0.839506,0.84472
1,Naïve Bayes,0.969064,0.912162,0.833333,0.870968
2,SVM,0.967517,0.928571,0.802469,0.860927
3,RF,0.979118,0.965517,0.864198,0.912052
4,Gradient Boosted,0.974478,0.964029,0.82716,0.890365
5,ET,0.975251,0.971014,0.82716,0.893333
6,Adaptive Boosting,0.964424,0.902778,0.802469,0.849673
7,XGBoost,0.967517,0.865854,0.876543,0.871166
8,RF with XGBoost,0.940449,0.834646,0.654321,0.733564
9,CNN,0.966744,0.928058,0.796296,0.857143
