In [16]:
#Importing the libraries
import numpy as np
import pandas as pd
import os
import io
import pdb
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn import linear_model
import pyswarms as ps
from statistics import mean, stdev
from sklearn.model_selection import cross_val_score
from tpot import TPOTClassifier
import matplotlib.pyplot as plt

In [20]:
#Machine learning models
vectorizer=CountVectorizer(stop_words='english',lowercase=True)

MNBC=MultinomialNB(alpha=0.5743650,class_prior=None,fit_prior=False)    #Multinomial Naive BAYES
SGDC=SGDClassifier(loss="log_loss", alpha=0.0001, max_iter=1000, tol=1e-3, epsilon=0.1)   #Stochastic Gradient Descent
DTC=DecisionTreeClassifier(criterion = "entropy", splitter = "best")  #entropy can also be used  #Decision Tree
RFC=RandomForestClassifier(criterion = "entropy")  #entropy must be used after gini    #Random Forest
MLPC=MLPClassifier(hidden_layer_sizes=5,max_iter=10000,solver='lbfgs')  #Multi-layer Perceptron
ABC=AdaBoostClassifier(n_estimators=100)          #AdaBoost Classifier
GBC=GradientBoostingClassifier(n_estimators=100)        #GradientBoosting Classifier

#Voting Ensemble Technique
VCH=VotingClassifier([('MNBC',MNBC),('SGDC',SGDC),('DTC',DTC),('RFC',RFC),('MLPC',MLPC),('ABC',ABC),('GBC',GBC)],voting='hard')
VCS=VotingClassifier([('MNBC',MNBC),('SGDC',SGDC),('DTC',DTC),('RFC',RFC),('MLPC',MLPC),('ABC',ABC),('GBC',GBC)],voting='soft')

##Bagging Ensemble Technique
BMNBC=BaggingClassifier(MNBC)
BSGDC=BaggingClassifier(SGDC)
BDTC=BaggingClassifier(DTC)
BRFC=BaggingClassifier(RFC)
BMLPC=BaggingClassifier(MLPC)
BGBC=BaggingClassifier(GBC)
BABC=BaggingClassifier(ABC)

#Stacking Ensemble Technique
SCMNBC=StackingClassifier([('MNBC',MNBC),('SGDC',SGDC),('DTC',DTC),('RFC',RFC),('MLPC',MLPC),('ABC',ABC),('GBC',GBC)], final_estimator=MNBC)
SCSGDC=StackingClassifier([('MNBC',MNBC),('SGDC',SGDC),('DTC',DTC),('RFC',RFC),('MLPC',MLPC),('ABC',ABC),('GBC',GBC)], final_estimator=SGDC)
SCDTC=StackingClassifier([('MNBC',MNBC),('SGDC',SGDC),('DTC',DTC),('RFC',RFC),('MLPC',MLPC),('ABC',ABC),('GBC',GBC)], final_estimator=DTC)
SCRFC=StackingClassifier([('MNBC',MNBC),('SGDC',SGDC),('DTC',DTC),('RFC',RFC),('MLPC',MLPC),('ABC',ABC),('GBC',GBC)], final_estimator=RFC)
SCMLPC=StackingClassifier([('MNBC',MNBC),('SGDC',SGDC),('DTC',DTC),('RFC',RFC),('MLPC',MLPC),('ABC',ABC),('GBC',GBC)], final_estimator=MLPC)
SCGBC=StackingClassifier([('MNBC',MNBC),('SGDC',SGDC),('DTC',DTC),('RFC',RFC),('MLPC',MLPC),('ABC',ABC),('GBC',GBC)], final_estimator=GBC)
SCABC=StackingClassifier([('MNBC',MNBC),('SGDC',SGDC),('DTC',DTC),('RFC',RFC),('MLPC',MLPC),('ABC',ABC),('GBC',GBC)], final_estimator=ABC)

skf=StratifiedKFold(n_splits=4,shuffle=True)
models=[MNBC,SGDC,DTC,RFC,MLPC]
split_list=[5,4,3,2]

In [6]:
def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path=os.path.join(root, filename)
            lines=[]
            f=io.open(path, 'r', encoding='latin1')
            for line in f:
                lines.append(line)
            f.close
            message='\n'.join(lines)
            yield path, message

def dataFrameFromDirectory(path, classification):
    rows=[]
    index=[]
    for filename, message in readFiles(path):
        rows.append({'message':message, 'class':classification})
        index.append(filename)
    return pd.DataFrame(rows, index=index)

data=pd.DataFrame({'message':[], 'class':[]})

data=data.append(dataFrameFromDirectory('D:/Major Project/App_Data_Set/Spam_Assassin_Dataset/Spam','Spam'))
data=data.append(dataFrameFromDirectory('D:/Major Project/App_Data_Set/Spam_Assassin_Dataset/Ham','Ham'))
X=data['message']
y=data['class']


'''Pre-processing'''
IDF = TfidfVectorizer().fit_transform(X)
Tr_tokens=vectorizer.fit_transform(X)



MNBC_accu_stratified=list()
SGDC_accu_stratified=list()
DTC_accu_stratified=list()
RFC_accu_stratified=list()
MPLC_accu_stratified=list()
scores=np.array

In [7]:
#Applying Stratified K_fold_Cross_Validation
def SKF_Split(x):
    skf=StratifiedKFold(n_splits=x,shuffle=True)
    #Stratified K_Fold_spliting
    for train_index, test_index in skf.split(Tr_tokens,y):
        global X_train, X_test, y_train, y_test
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
    return X_train, X_test, y_train, y_test

In [18]:
def Stacking_MNBC_Classification():
    print('Stacking is being applied for MNBC Classifier, Training and testing arerunning')
    SKF_Split(4)
    SCMNBC.fit(X_train, y_train)
    pred=SCMNBC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = SCMNBC.score(X_train,y_train)*100, SCMNBC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)
Stacking_MNBC_Classification()

Stacking is being applied for MNBC Classifier, Training and testing arerunning


(99.49804796430564, 98.28680203045685, 100.0, 99.136)

In [21]:
def Stacking_SGDC_Classification():
    print('Stacking is being applied for SGDC Classifier, Training and testing arerunning')
    SKF_Split(4)
    SCSGDC.fit(X_train, y_train)
    pred=SCSGDC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = SCSGDC.score(X_train,y_train)*100, SCSGDC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)
Stacking_SGDC_Classification()

Stacking is being applied for SGDC Classifier, Training and testing arerunning


(99.96281836772634, 99.87105093488073, 100.0, 99.93548387096774)

In [22]:
def Stacking_DTC_Classification():
    print('Stacking is being applied for DTC Classifier, Training and testing arerunning')
    SKF_Split(4)
    SCDTC.fit(X_train, y_train)
    pred=SCDTC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = SCDTC.score(X_train,y_train)*100, SCDTC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)
Stacking_DTC_Classification()

Stacking is being applied for DTC Classifier, Training and testing arerunning


KeyboardInterrupt: 

In [None]:
def Stacking_RFC_Classification():
    print('Stacking is being applied for RFC Classifier, Training and testing arerunning')
    SKF_Split(4)
    SCRFC.fit(X_train, y_train)
    pred=SCRFC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = SCRFC.score(X_train,y_train)*100, SCRFC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)
Stacking_RFC_Classification()

In [None]:
def Stacking_MLPC_Classification():
    print('Stacking is being applied for MLPC Classifier, Training and testing arerunning')
    SKF_Split(4)
    SCMLPC.fit(X_train, y_train)
    pred=SCMLPC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = SCMLPC.score(X_train,y_train)*100, SCMLPC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)
Stacking_MLPC_Classification()

In [None]:
def Stacking_GBC_Classification():
    print('Stacking is being applied for GBC Classifier, Training and testing arerunning')
    SKF_Split(4)
    SCGBC.fit(X_train, y_train)
    pred=SCGBC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = SCGBC.score(X_train,y_train)*100, SCGBC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)
Stacking_GBC_Classification()

In [None]:
def Stacking_ABC_Classification():
    print('Stacking is being applied for ABC Classifier, Training and testing arerunning')
    SKF_Split(4)
    SCABC.fit(X_train, y_train)
    pred=SCABC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = SCABC.score(X_train,y_train)*100, SCABC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)
Stacking_ABC_Classification()

In [9]:
def VotingHard_Classification():
    print('VCH Training and testing is running')
    SKF_Split(4)
    VCH.fit(X_train, y_train)
    pred=VCH.predict(X_test)
    Training_Accuracy, Testing_Accuracy = VCH.score(X_train,y_train)*100, VCH.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)
VotingHard_Classification()

VCH Training and testing is running


(99.98140918386316, 100.0, 99.9354422207876, 99.96771068776236)

In [14]:
def VotingSoft_Classification():
    print('VCS Training and testing is running')
    SKF_Split(4)
    VCS.fit(X_train, y_train)
    pred=VCS.predict(X_test)
    Training_Accuracy, Testing_Accuracy = VCS.score(X_train,y_train)*100, VCS.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)
VotingSoft_Classification()

VCS Training and testing is running


(100.0, 100.0, 100.0, 100.0)

In [12]:
#Bagging MNB
def BMNB_Classification():
    print("Bagging classification of Multinomial Naive BAYES's Training and testing is running")
    BMNBC.fit(X_train,y_train)
    pred=BMNBC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = BMNBC.score(X_train,y_train)*100, BMNBC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [13]:
#Bagging GBC
def BGBC_Classification():
    print("Bagging classification of Gradientboost Classifier Training and testing is running")
    BGBC.fit(X_train, y_train)
    pred=BGBC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = BGBC.score(X_train, y_train)*100, BGBC.score(X_test, y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [14]:
#AdaBoost Classification
def BABC_Classification():
    print("Bagging classification of Adaboost Classifier Training and testing is running")
    BABC.fit(X_train, y_train)
    pred=BABC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = BABC.score(X_train, y_train)*100, BABC.score(X_test, y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [15]:
#Stochastic Gradient Descent (SGD) Calssification
def BSGDC_Classification():
    print("Bagging classification of Stochastic Gradient descent training and testing is running")
    BSGDC.fit(X_train,y_train)
    pred=BSGDC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = BSGDC.score(X_train,y_train)*100, BSGDC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [16]:
#Decision Tree Classification
def BDTC_Classification():
    print("Bagging classification of Decision Tree Classifier Training and testing is running")
    BDTC.fit(X_train,y_train)
    pred=BDTC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = BDTC.score(X_train,y_train)*100, BDTC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [17]:
#Random Forest Classification
def BRFC_Classification():
    print("Bagging classification of Random Forest Training and testing is running")
    BRFC.fit(X_train,y_train)
    pred=BRFC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = BRFC.score(X_train,y_train)*100, BRFC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [18]:
#Multi-layer Perceptron Calssification
def BMLPC_Classification():
    print("Bagging classification of Multi-layer Perceptron Training and testing is running")
    BMLPC.fit(X_train,y_train)
    pred=BMLPC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = BMLPC.score(X_train,y_train)*100, BMLPC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [9]:
#Multinomial Naive Bayes Calssification
def MNB_Classification(alpha):
    print("Multinomial Naive BAYES's Training and testing is running")
    MNBC.fit(X_train,y_train)
    pred=MNBC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = MNBC.score(X_train,y_train)*100, MNBC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [None]:
#GradientBoost Classification
def GBC_Classification(n_estimators, learning_rate):
    print("Gradientboost Classifier Training and testing is running")
    GBC.fit(X_train, y_train)
    pred=GBC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = GBC.score(X_train, y_train)*100, GBC.score(X_test, y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [None]:
#AdaBoost Classification
def ABC_Classification(n_estimators, learning_rate):
    print("Adaboost Classifier Training and testing is running")
    ABC.fit(X_train, y_train)
    pred=ABC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = ABC.score(X_train, y_train)*100, ABC.score(X_test, y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [29]:
#Stochastic Gradient Descent (SGD) Calssification
def SGDC_Classification(alpha,epsilon,tol):
    print("Stochastic Gradient descent training and testing is running")
    SGDC.fit(X_train,y_train)
    pred=SGDC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = SGDC.score(X_train,y_train)*100, SGDC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [8]:
#Decision Tree Classification
def DTC_Classification(max_depth,min_samples_leaf):
    print("Decision Tree Classifier Training and testing is running")
    DTC.fit(X_train,y_train)
    pred=DTC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = DTC.score(X_train,y_train)*100, DTC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [9]:
#Random Forest Classification
def RFC_Classification(n_estimators,max_depth):
    print("Random Forest Training and testing is running")
    RFC.fit(X_train,y_train)
    pred=RFC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = RFC.score(X_train,y_train)*100, RFC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [40]:
#Multi-layer Perceptron Calssification
def MLPC_Classification(hidden_layer_sizes,alpha):
    print("Multi-layer Perceptron Training and testing is running")
    MLPC.fit(X_train,y_train)
    pred=MLPC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = MLPC.score(X_train,y_train)*100, MLPC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [22]:
def Main_Base_model():
    MNB_ACC_LST=list()
    SGDC_ACC_LST=list()
    DTC_ACC_LST=list()
    RFC_ACC_LST=list()
    MLPC_ACC_LST=list()
    GBC_ACC_LST=list()
    ABC_ACC_LST=list()
    
    for ele in split_list:
        print("Stratified K-Fold has been applied for {} Splits".format(ele))
        SKF_Split(ele)
        acc1=MNB_Classification()
        acc2=SGDC_Classification()
        acc3=DTC_Classification()
        acc4=RFC_Classification()
        acc5=MLPC_Classification()
        acc6=GBC_Classification()
        acc7=ABC_Classification()
        MNB_ACC_LST.append(acc1*100)
        SGDC_ACC_LST.append(acc2*100)
        DTC_ACC_LST.append(acc3*100)
        RFC_ACC_LST.append(acc4*100)
        MLPC_ACC_LST.append(acc5*100)
        GBC_ACC_LST.append(acc6*100)
        ABC_ACC_LST.append(acc7*100)
        
    Accuracy_Table=[('Stochastic Gradient Descent',SGDC_ACC_LST[0][0],SGDC_ACC_LST[0][1],SGDC_ACC_LST[0][2],SGDC_ACC_LST[0][3],SGDC_ACC_LST[1][0],SGDC_ACC_LST[1][1],SGDC_ACC_LST[1][2],SGDC_ACC_LST[1][3],SGDC_ACC_LST[2][0],SGDC_ACC_LST[2][1],SGDC_ACC_LST[2][2],SGDC_ACC_LST[2][3],SGDC_ACC_LST[3][0],SGDC_ACC_LST[3][1],SGDC_ACC_LST[3][2],SGDC_ACC_LST[3][3]),
                ('Multinomial Naive BAYES',MNB_ACC_LST[0][0],MNB_ACC_LST[0][1],MNB_ACC_LST[0][2],MNB_ACC_LST[0][3],MNB_ACC_LST[1][0],MNB_ACC_LST[1][1],MNB_ACC_LST[1][2],MNB_ACC_LST[1][3],MNB_ACC_LST[2][0],MNB_ACC_LST[2][1],MNB_ACC_LST[2][2],MNB_ACC_LST[2][3],MNB_ACC_LST[3][0],MNB_ACC_LST[3][1],MNB_ACC_LST[3][2],MNB_ACC_LST[3][3]),
                ('Random Forest',RFC_ACC_LST[0][0],RFC_ACC_LST[0][1],RFC_ACC_LST[0][2],RFC_ACC_LST[0][3],RFC_ACC_LST[1][0],RFC_ACC_LST[1][1],RFC_ACC_LST[1][2],RFC_ACC_LST[1][3],RFC_ACC_LST[2][0],RFC_ACC_LST[2][1],RFC_ACC_LST[2][2],RFC_ACC_LST[2][3],RFC_ACC_LST[3][0],RFC_ACC_LST[3][1],RFC_ACC_LST[3][2],RFC_ACC_LST[3][3]),
                ('Decision Tree',DTC_ACC_LST[0][0],DTC_ACC_LST[0][1],DTC_ACC_LST[0][2],DTC_ACC_LST[0][3],DTC_ACC_LST[1][0],DTC_ACC_LST[1][1],DTC_ACC_LST[1][2],DTC_ACC_LST[1][3],DTC_ACC_LST[2][0],DTC_ACC_LST[2][1],DTC_ACC_LST[2][2],DTC_ACC_LST[2][3],DTC_ACC_LST[3][0],DTC_ACC_LST[3][1],DTC_ACC_LST[3][2],DTC_ACC_LST[3][3]),
                ('Multi-later Perceptron',MLPC_ACC_LST[0][0],MLPC_ACC_LST[0][1],MLPC_ACC_LST[0][2],MLPC_ACC_LST[0][3],MLPC_ACC_LST[1][0],MLPC_ACC_LST[1][1],MLPC_ACC_LST[1][2],MLPC_ACC_LST[1][3],MLPC_ACC_LST[2][0],MLPC_ACC_LST[2][1],MLPC_ACC_LST[2][2],MLPC_ACC_LST[2][3],MLPC_ACC_LST[3][0],MLPC_ACC_LST[3][1],MLPC_ACC_LST[3][2],MLPC_ACC_LST[3][3]),
                ('GradientBoosting Classifier',GBC_ACC_LST[0][0],GBC_ACC_LST[0][1],GBC_ACC_LST[0][2],GBC_ACC_LST[0][3],GBC_ACC_LST[1][0],GBC_ACC_LST[1][1],GBC_ACC_LST[1][2],GBC_ACC_LST[1][3],GBC_ACC_LST[2][0],GBC_ACC_LST[2][1],GBC_ACC_LST[2][2],GBC_ACC_LST[2][3],GBC_ACC_LST[3][0],GBC_ACC_LST[3][1],GBC_ACC_LST[3][2],GBC_ACC_LST[3][3]),
                ('AdaBoost Classifier',ABC_ACC_LST[0][0],ABC_ACC_LST[0][1],ABC_ACC_LST[0][2],ABC_ACC_LST[0][3],ABC_ACC_LST[1][0],ABC_ACC_LST[1][1],ABC_ACC_LST[1][2],ABC_ACC_LST[1][3],ABC_ACC_LST[2][0],ABC_ACC_LST[2][1],ABC_ACC_LST[2][2],ABC_ACC_LST[2][3],ABC_ACC_LST[3][0],ABC_ACC_LST[3][1],ABC_ACC_LST[3][2],ABC_ACC_LST[3][3])
               ]
    Result_table=pd.DataFrame(Accuracy_Table,columns=["Classifier","80-20 Split","Precision Score","Recall Score","F1-Score","75-25 Split","Precision Score","Recall Score","F1-Score","67-33 Split","Precision Score","Recall Score","F1-Score","50-50 Split","Precision Score","Recall Score","F1-Score"])
    
    '''80-20 Split Accuracy Plot '''
    N = 4
    ind = np.arange(N) 
    width = 0.12

    MNB_vals = [MNB_ACC_LST[0][0],MNB_ACC_LST[0][1],MNB_ACC_LST[0][2],MNB_ACC_LST[0][3]]
    bar1 = plt.bar(ind, MNB_vals, width, color = 'r')

    SGDC_vals = [SGDC_ACC_LST[0][0],SGDC_ACC_LST[0][1],SGDC_ACC_LST[0][2],SGDC_ACC_LST[0][3]]
    bar2 = plt.bar(ind+width, SGDC_vals, width, color='g')

    DTC_vals = [DTC_ACC_LST[0][0],DTC_ACC_LST[0][1],DTC_ACC_LST[0][2],DTC_ACC_LST[0][3]]
    bar3 = plt.bar(ind+width*2, DTC_vals, width, color = 'b')

    RFC_vals = [RFC_ACC_LST[0][0],RFC_ACC_LST[0][1],RFC_ACC_LST[0][2],RFC_ACC_LST[0][3]]
    bar4 = plt.bar(ind+width*3, RFC_vals, width, color='y')
    
    MLPC_vals = [MLPC_ACC_LST[0][0],MLPC_ACC_LST[0][1],MLPC_ACC_LST[0][2],MLPC_ACC_LST[0][3]]
    bar5 = plt.bar(ind+width*4, MLPC_vals, width, color='c')
    
    GBC_vals = [GBC_ACC_LST[0][0],GBC_ACC_LST[0][1],GBC_ACC_LST[0][2],GBC_ACC_LST[0][3]]
    bar6 = plt.bar(ind+width*4, GBC_vals, width, color='m')
    
    ABC_vals = [ABC_ACC_LST[0][0],ABC_ACC_LST[0][1],ABC_ACC_LST[0][2],ABC_ACC_LST[0][3]]
    bar7 = plt.bar(ind+width*4, ABC_vals, width, color='k')
    
        
    plt.xlabel("Accuracy Fields")
    plt.ylabel("Accuracies")
    plt.title("Accuracies for 80-20 Split")

    plt.xticks(ind+width,['Testing Accuracy', 'Precision Score', 'Recall Score', 'F1-Score'])
    plt.legend( (bar1, bar2, bar3, bar4, bar5, bar6, bar7), ('MNB', 'SGDC', 'DTC', 'RFC', 'MLPC', 'GBC', 'ABC') )
    plt.show()
    
    '''75-25 Split Accuracy Plot '''
    
    N = 4
    ind = np.arange(N) 
    width = 0.12

    MNB_vals = [MNB_ACC_LST[1][0],MNB_ACC_LST[1][1],MNB_ACC_LST[1][2],MNB_ACC_LST[1][3]]
    bar1 = plt.bar(ind, MNB_vals, width, color = 'r')

    SGDC_vals = [SGDC_ACC_LST[1][0],SGDC_ACC_LST[1][1],SGDC_ACC_LST[1][2],SGDC_ACC_LST[1][3]]
    bar2 = plt.bar(ind+width, SGDC_vals, width, color='g')

    DTC_vals = [DTC_ACC_LST[1][0],DTC_ACC_LST[1][1],DTC_ACC_LST[1][2],DTC_ACC_LST[1][3]]
    bar3 = plt.bar(ind+width*2, DTC_vals, width, color = 'b')

    RFC_vals = [RFC_ACC_LST[1][0],RFC_ACC_LST[1][1],RFC_ACC_LST[1][2],RFC_ACC_LST[1][3]]
    bar4 = plt.bar(ind+width*3, RFC_vals, width, color='y')
    
    MLPC_vals = [MLPC_ACC_LST[1][0],MLPC_ACC_LST[1][1],MLPC_ACC_LST[1][2],MLPC_ACC_LST[1][3]]
    bar5 = plt.bar(ind+width*4, MLPC_vals, width, color='c')
    
    GBC_vals = [GBC_ACC_LST[1][0],GBC_ACC_LST[1][1],GBC_ACC_LST[1][2],GBC_ACC_LST[1][3]]
    bar6 = plt.bar(ind+width*4, GBC_vals, width, color='m')
    
    ABC_vals = [ABC_ACC_LST[1][0],ABC_ACC_LST[1][1],ABC_ACC_LST[1][2],ABC_ACC_LST[1][3]]
    bar7 = plt.bar(ind+width*4, ABC_vals, width, color='k')
        
    plt.xlabel("Accuracy Fields")
    plt.ylabel("Accuracies")
    plt.title("Accuracies for 75-25 Split")

    plt.xticks(ind+width,['Testing Accuracy', 'Precision Score', 'Recall Score', 'F1-Score'])
    plt.legend( (bar1, bar2, bar3, bar4, bar5, bar6, bar7), ('MNB', 'SGDC', 'DTC', 'RFC', 'MLPC', 'GBC', 'ABC') )
    plt.show()
    
    
    '''67-33 Split Accuracy Plot '''
    N = 4
    ind = np.arange(N) 
    width = 0.12

    MNB_vals = [MNB_ACC_LST[2][0],MNB_ACC_LST[2][1],MNB_ACC_LST[2][2],MNB_ACC_LST[2][3]]
    bar1 = plt.bar(ind, MNB_vals, width, color = 'r')

    SGDC_vals = [SGDC_ACC_LST[2][0],SGDC_ACC_LST[2][1],SGDC_ACC_LST[2][2],SGDC_ACC_LST[2][3]]
    bar2 = plt.bar(ind+width, SGDC_vals, width, color='g')

    DTC_vals = [DTC_ACC_LST[2][0],DTC_ACC_LST[2][1],DTC_ACC_LST[2][2],DTC_ACC_LST[2][3]]
    bar3 = plt.bar(ind+width*2, DTC_vals, width, color = 'b')

    RFC_vals = [RFC_ACC_LST[2][0],RFC_ACC_LST[2][1],RFC_ACC_LST[2][2],RFC_ACC_LST[2][3]]
    bar4 = plt.bar(ind+width*3, RFC_vals, width, color='y')
    
    MLPC_vals = [MLPC_ACC_LST[2][1],MLPC_ACC_LST[2][1],MLPC_ACC_LST[2][2],MLPC_ACC_LST[2][3]]
    bar5 = plt.bar(ind+width*4, MLPC_vals, width, color='c')
    
    GBC_vals = [GBC_ACC_LST[2][0],GBC_ACC_LST[2][1],GBC_ACC_LST[2][2],GBC_ACC_LST[2][3]]
    bar6 = plt.bar(ind+width*4, GBC_vals, width, color='m')
    
    ABC_vals = [ABC_ACC_LST[2][0],ABC_ACC_LST[2][1],ABC_ACC_LST[2][2],ABC_ACC_LST[2][3]]
    bar7 = plt.bar(ind+width*4, ABC_vals, width, color='k')
    
        
    plt.xlabel("Accuracy Fields")
    plt.ylabel("Accuracies")
    plt.title("Accuracies for 67-33 Split")

    plt.xticks(ind+width,['Testing Accuracy', 'Precision Score', 'Recall Score', 'F1-Score'])
    plt.legend( (bar1, bar2, bar3, bar4, bar5, bar6, bar7), ('MNB', 'SGDC', 'DTC', 'RFC', 'MLPC', 'GBC', 'ABC') )
    plt.show()
    
    
    
    '''50-50 Split Accuracy Plot '''
    N = 4
    ind = np.arange(N) 
    width = 0.12

    MNB_vals = [MNB_ACC_LST[3][0],MNB_ACC_LST[3][1],MNB_ACC_LST[3][2],MNB_ACC_LST[3][3]]
    bar1 = plt.bar(ind, MNB_vals, width, color = 'r')

    SGDC_vals = [SGDC_ACC_LST[3][0],SGDC_ACC_LST[3][1],SGDC_ACC_LST[3][2],SGDC_ACC_LST[3][3]]
    bar2 = plt.bar(ind+width, SGDC_vals, width, color='g')

    DTC_vals = [DTC_ACC_LST[3][0],DTC_ACC_LST[3][1],DTC_ACC_LST[3][2],DTC_ACC_LST[3][3]]
    bar3 = plt.bar(ind+width*2, DTC_vals, width, color = 'b')

    RFC_vals = [RFC_ACC_LST[3][0],RFC_ACC_LST[3][1],RFC_ACC_LST[3][2],RFC_ACC_LST[3][3]]
    bar4 = plt.bar(ind+width*3, RFC_vals, width, color='y')
    
    MLPC_vals = [MLPC_ACC_LST[3][1],MLPC_ACC_LST[3][1],MLPC_ACC_LST[3][2],MLPC_ACC_LST[3][3]]
    bar5 = plt.bar(ind+width*4, MLPC_vals, width, color='c')
    
    GBC_vals = [GBC_ACC_LST[3][0],GBC_ACC_LST[3][1],GBC_ACC_LST[3][2],GBC_ACC_LST[3][3]]
    bar6 = plt.bar(ind+width*4, GBC_vals, width, color='m')
    
    ABC_vals = [ABC_ACC_LST[3][0],ABC_ACC_LST[3][1],ABC_ACC_LST[3][2],ABC_ACC_LST[3][3]]
    bar7 = plt.bar(ind+width*4, ABC_vals, width, color='k')
    
        
    plt.xlabel("Accuracy Fields")
    plt.ylabel("Accuracies")
    plt.title("Accuracies for 50-50 Split")

    plt.xticks(ind+width,['Testing Accuracy', 'Precision Score', 'Recall Score', 'F1-Score'])
    plt.legend( (bar1, bar2, bar3, bar4, bar5, bar6, bar7), ('MNB', 'SGDC', 'DTC', 'RFC', 'MLPC', 'GBC', 'ABC') )
    plt.show()
    
    
    
    return Result_table
Main_Base_model()

Stratified K-Fold has been applied for 5 Splits


TypeError: MNB_Classification() missing 1 required positional argument: 'alpha'

In [19]:
def Main_Bagging():
    BMNB_ACC_LST=list()
    BSGDC_ACC_LST=list()
    BDTC_ACC_LST=list()
    BRFC_ACC_LST=list()
    BMLPC_ACC_LST=list()
    BGBC_ACC_LST=list()
    BABC_ACC_LST=list()
    
    for ele in split_list:
        print("Stratified K-Fold has been applied for {} Splits".format(ele))
        SKF_Split(ele)
        acc1=BMNB_Classification()
        acc2=BSGDC_Classification()
        acc3=BDTC_Classification()
        acc4=BRFC_Classification()
        acc5=BMLPC_Classification()
        acc6=BGBC_Classification()
        acc7=BABC_Classification()
        BMNB_ACC_LST.append(acc1*100)
        BSGDC_ACC_LST.append(acc2*100)
        BDTC_ACC_LST.append(acc3*100)
        BRFC_ACC_LST.append(acc4*100)
        BMLPC_ACC_LST.append(acc5*100)
        BGBC_ACC_LST.append(acc6*100)
        BABC_ACC_LST.append(acc7*100)
        
    Accuracy_Table=[('Stochastic Gradient Descent',BSGDC_ACC_LST[0][0],BSGDC_ACC_LST[0][1],BSGDC_ACC_LST[0][2],BSGDC_ACC_LST[0][3],BSGDC_ACC_LST[1][0],BSGDC_ACC_LST[1][1],BSGDC_ACC_LST[1][2],BSGDC_ACC_LST[1][3],BSGDC_ACC_LST[2][0],BSGDC_ACC_LST[2][1],BSGDC_ACC_LST[2][2],BSGDC_ACC_LST[2][3],BSGDC_ACC_LST[3][0],BSGDC_ACC_LST[3][1],BSGDC_ACC_LST[3][2],BSGDC_ACC_LST[3][3]),
                ('Multinomial Naive BAYES',BMNB_ACC_LST[0][0],BMNB_ACC_LST[0][1],BMNB_ACC_LST[0][2],BMNB_ACC_LST[0][3],BMNB_ACC_LST[1][0],BMNB_ACC_LST[1][1],BMNB_ACC_LST[1][2],BMNB_ACC_LST[1][3],BMNB_ACC_LST[2][0],BMNB_ACC_LST[2][1],BMNB_ACC_LST[2][2],BMNB_ACC_LST[2][3],BMNB_ACC_LST[3][0],BMNB_ACC_LST[3][1],BMNB_ACC_LST[3][2],BMNB_ACC_LST[3][3]),
                ('Random Forest',BRFC_ACC_LST[0][0],BRFC_ACC_LST[0][1],BRFC_ACC_LST[0][2],BRFC_ACC_LST[0][3],BRFC_ACC_LST[1][0],BRFC_ACC_LST[1][1],BRFC_ACC_LST[1][2],BRFC_ACC_LST[1][3],BRFC_ACC_LST[2][0],BRFC_ACC_LST[2][1],BRFC_ACC_LST[2][2],BRFC_ACC_LST[2][3],BRFC_ACC_LST[3][0],BRFC_ACC_LST[3][1],BRFC_ACC_LST[3][2],BRFC_ACC_LST[3][3]),
                ('Decision Tree',BDTC_ACC_LST[0][0],BDTC_ACC_LST[0][1],BDTC_ACC_LST[0][2],BDTC_ACC_LST[0][3],BDTC_ACC_LST[1][0],BDTC_ACC_LST[1][1],BDTC_ACC_LST[1][2],BDTC_ACC_LST[1][3],BDTC_ACC_LST[2][0],BDTC_ACC_LST[2][1],BDTC_ACC_LST[2][2],BDTC_ACC_LST[2][3],BDTC_ACC_LST[3][0],BDTC_ACC_LST[3][1],BDTC_ACC_LST[3][2],BDTC_ACC_LST[3][3]),
                ('Multi-later Perceptron',BMLPC_ACC_LST[0][0],BMLPC_ACC_LST[0][1],BMLPC_ACC_LST[0][2],BMLPC_ACC_LST[0][3],BMLPC_ACC_LST[1][0],BMLPC_ACC_LST[1][1],BMLPC_ACC_LST[1][2],BMLPC_ACC_LST[1][3],BMLPC_ACC_LST[2][0],BMLPC_ACC_LST[2][1],BMLPC_ACC_LST[2][2],BMLPC_ACC_LST[2][3],BMLPC_ACC_LST[3][0],BMLPC_ACC_LST[3][1],BMLPC_ACC_LST[3][2],BMLPC_ACC_LST[3][3]),
                ('GradientBoosting Classifier',BGBC_ACC_LST[0][0],BGBC_ACC_LST[0][1],BGBC_ACC_LST[0][2],BGBC_ACC_LST[0][3],BGBC_ACC_LST[1][0],BGBC_ACC_LST[1][1],BGBC_ACC_LST[1][2],BGBC_ACC_LST[1][3],BGBC_ACC_LST[2][0],BGBC_ACC_LST[2][1],BGBC_ACC_LST[2][2],BGBC_ACC_LST[2][3],BGBC_ACC_LST[3][0],BGBC_ACC_LST[3][1],BGBC_ACC_LST[3][2],BGBC_ACC_LST[3][3]),
                ('AdaBoost Classifier',BABC_ACC_LST[0][0],BABC_ACC_LST[0][1],BABC_ACC_LST[0][2],BABC_ACC_LST[0][3],BABC_ACC_LST[1][0],BABC_ACC_LST[1][1],BABC_ACC_LST[1][2],BABC_ACC_LST[1][3],BABC_ACC_LST[2][0],BABC_ACC_LST[2][1],BABC_ACC_LST[2][2],BABC_ACC_LST[2][3],BABC_ACC_LST[3][0],BABC_ACC_LST[3][1],BABC_ACC_LST[3][2],BABC_ACC_LST[3][3])
               ]
    Result_table=pd.DataFrame(Accuracy_Table,columns=["Classifier","80-20 Split","Precision Score","Recall Score","F1-Score","75-25 Split","Precision Score","Recall Score","F1-Score","67-33 Split","Precision Score","Recall Score","F1-Score","50-50 Split","Precision Score","Recall Score","F1-Score"])
    
    '''80-20 Split Accuracy Plot '''
    N = 4
    ind = np.arange(N) 
    width = 0.12

    MNB_vals = [BMNB_ACC_LST[0][0],BMNB_ACC_LST[0][1],BMNB_ACC_LST[0][2],BMNB_ACC_LST[0][3]]
    bar1 = plt.bar(ind, MNB_vals, width, color = 'r')

    SGDC_vals = [BSGDC_ACC_LST[0][0],BSGDC_ACC_LST[0][1],BSGDC_ACC_LST[0][2],BSGDC_ACC_LST[0][3]]
    bar2 = plt.bar(ind+width, SGDC_vals, width, color='g')

    DTC_vals = [BDTC_ACC_LST[0][0],BDTC_ACC_LST[0][1],BDTC_ACC_LST[0][2],BDTC_ACC_LST[0][3]]
    bar3 = plt.bar(ind+width*2, DTC_vals, width, color = 'b')

    RFC_vals = [BRFC_ACC_LST[0][0],BRFC_ACC_LST[0][1],BRFC_ACC_LST[0][2],BRFC_ACC_LST[0][3]]
    bar4 = plt.bar(ind+width*3, RFC_vals, width, color='y')
    
    MLPC_vals = [BMLPC_ACC_LST[0][0],BMLPC_ACC_LST[0][1],BMLPC_ACC_LST[0][2],BMLPC_ACC_LST[0][3]]
    bar5 = plt.bar(ind+width*4, MLPC_vals, width, color='c')
    
    GBC_vals = [BGBC_ACC_LST[0][0],BGBC_ACC_LST[0][1],BGBC_ACC_LST[0][2],BGBC_ACC_LST[0][3]]
    bar6 = plt.bar(ind+width*4, GBC_vals, width, color='m')
    
    ABC_vals = [BABC_ACC_LST[0][0],BABC_ACC_LST[0][1],BABC_ACC_LST[0][2],BABC_ACC_LST[0][3]]
    bar7 = plt.bar(ind+width*4, ABC_vals, width, color='k')
    
        
    plt.xlabel("Accuracy Fields")
    plt.ylabel("Accuracies")
    plt.title("Accuracies for 80-20 Split")

    plt.xticks(ind+width,['Testing Accuracy', 'Precision Score', 'Recall Score', 'F1-Score'])
    plt.legend( (bar1, bar2, bar3, bar4, bar5, bar6, bar7), ('BMNB', 'BSGDC', 'BDTC', 'BRFC', 'BMLPC', 'BGBC', 'BABC') )
    plt.show()
    
    '''75-25 Split Accuracy Plot '''
    
    N = 4
    ind = np.arange(N) 
    width = 0.12

    MNB_vals = [BMNB_ACC_LST[1][0],BMNB_ACC_LST[1][1],BMNB_ACC_LST[1][2],BMNB_ACC_LST[1][3]]
    bar1 = plt.bar(ind, MNB_vals, width, color = 'r')

    SGDC_vals = [BSGDC_ACC_LST[1][0],BSGDC_ACC_LST[1][1],BSGDC_ACC_LST[1][2],BSGDC_ACC_LST[1][3]]
    bar2 = plt.bar(ind+width, SGDC_vals, width, color='g')

    DTC_vals = [BDTC_ACC_LST[1][0],BDTC_ACC_LST[1][1],BDTC_ACC_LST[1][2],BDTC_ACC_LST[1][3]]
    bar3 = plt.bar(ind+width*2, DTC_vals, width, color = 'b')

    RFC_vals = [BRFC_ACC_LST[1][0],BRFC_ACC_LST[1][1],BRFC_ACC_LST[1][2],BRFC_ACC_LST[1][3]]
    bar4 = plt.bar(ind+width*3, RFC_vals, width, color='y')
    
    MLPC_vals = [BMLPC_ACC_LST[1][0],BMLPC_ACC_LST[1][1],BMLPC_ACC_LST[1][2],BMLPC_ACC_LST[1][3]]
    bar5 = plt.bar(ind+width*4, MLPC_vals, width, color='c')
    
    GBC_vals = [BGBC_ACC_LST[1][0],BGBC_ACC_LST[1][1],BGBC_ACC_LST[1][2],BGBC_ACC_LST[1][3]]
    bar6 = plt.bar(ind+width*4, GBC_vals, width, color='m')
    
    ABC_vals = [BABC_ACC_LST[1][0],BABC_ACC_LST[1][1],BABC_ACC_LST[1][2],BABC_ACC_LST[1][3]]
    bar7 = plt.bar(ind+width*4, ABC_vals, width, color='k')
        
    plt.xlabel("Accuracy Fields")
    plt.ylabel("Accuracies")
    plt.title("Accuracies for 75-25 Split")

    plt.xticks(ind+width,['Testing Accuracy', 'Precision Score', 'Recall Score', 'F1-Score'])
    plt.legend( (bar1, bar2, bar3, bar4, bar5, bar6, bar7), ('BMNB', 'BSGDC', 'BDTC', 'BRFC', 'BMLPC', 'BGBC', 'BABC') )
    plt.show()
    
    
    '''67-33 Split Accuracy Plot '''
    N = 4
    ind = np.arange(N) 
    width = 0.12

    MNB_vals = [BMNB_ACC_LST[2][0],BMNB_ACC_LST[2][1],BMNB_ACC_LST[2][2],BMNB_ACC_LST[2][3]]
    bar1 = plt.bar(ind, MNB_vals, width, color = 'r')

    SGDC_vals = [BSGDC_ACC_LST[2][0],BSGDC_ACC_LST[2][1],BSGDC_ACC_LST[2][2],BSGDC_ACC_LST[2][3]]
    bar2 = plt.bar(ind+width, SGDC_vals, width, color='g')

    DTC_vals = [BDTC_ACC_LST[2][0],BDTC_ACC_LST[2][1],BDTC_ACC_LST[2][2],BDTC_ACC_LST[2][3]]
    bar3 = plt.bar(ind+width*2, DTC_vals, width, color = 'b')

    RFC_vals = [BRFC_ACC_LST[2][0],BRFC_ACC_LST[2][1],BRFC_ACC_LST[2][2],BRFC_ACC_LST[2][3]]
    bar4 = plt.bar(ind+width*3, RFC_vals, width, color='y')
    
    MLPC_vals = [BMLPC_ACC_LST[2][1],BMLPC_ACC_LST[2][1],BMLPC_ACC_LST[2][2],BMLPC_ACC_LST[2][3]]
    bar5 = plt.bar(ind+width*4, MLPC_vals, width, color='c')
    
    GBC_vals = [BGBC_ACC_LST[2][0],BGBC_ACC_LST[2][1],BGBC_ACC_LST[2][2],BGBC_ACC_LST[2][3]]
    bar6 = plt.bar(ind+width*4, GBC_vals, width, color='m')
    
    ABC_vals = [BABC_ACC_LST[2][0],BABC_ACC_LST[2][1],BABC_ACC_LST[2][2],BABC_ACC_LST[2][3]]
    bar7 = plt.bar(ind+width*4, ABC_vals, width, color='k')
    
        
    plt.xlabel("Accuracy Fields")
    plt.ylabel("Accuracies")
    plt.title("Accuracies for 67-33 Split")

    plt.xticks(ind+width,['Testing Accuracy', 'Precision Score', 'Recall Score', 'F1-Score'])
    plt.legend( (bar1, bar2, bar3, bar4, bar5, bar6, bar7), ('BMNB', 'BSGDC', 'BDTC', 'BRFC', 'BMLPC', 'BGBC', 'BABC') )
    plt.show()
    
    
    
    '''50-50 Split Accuracy Plot '''
    N = 4
    ind = np.arange(N) 
    width = 0.12

    MNB_vals = [BMNB_ACC_LST[3][0],BMNB_ACC_LST[3][1],BMNB_ACC_LST[3][2],BMNB_ACC_LST[3][3]]
    bar1 = plt.bar(ind, MNB_vals, width, color = 'r')

    SGDC_vals = [BSGDC_ACC_LST[3][0],BSGDC_ACC_LST[3][1],BSGDC_ACC_LST[3][2],BSGDC_ACC_LST[3][3]]
    bar2 = plt.bar(ind+width, SGDC_vals, width, color='g')

    DTC_vals = [BDTC_ACC_LST[3][0],BDTC_ACC_LST[3][1],BDTC_ACC_LST[3][2],BDTC_ACC_LST[3][3]]
    bar3 = plt.bar(ind+width*2, DTC_vals, width, color = 'b')

    RFC_vals = [BRFC_ACC_LST[3][0],BRFC_ACC_LST[3][1],BRFC_ACC_LST[3][2],BRFC_ACC_LST[3][3]]
    bar4 = plt.bar(ind+width*3, RFC_vals, width, color='y')
    
    MLPC_vals = [BMLPC_ACC_LST[3][1],BMLPC_ACC_LST[3][1],BMLPC_ACC_LST[3][2],BMLPC_ACC_LST[3][3]]
    bar5 = plt.bar(ind+width*4, MLPC_vals, width, color='c')
    
    GBC_vals = [BGBC_ACC_LST[3][0],BGBC_ACC_LST[3][1],BGBC_ACC_LST[3][2],BGBC_ACC_LST[3][3]]
    bar6 = plt.bar(ind+width*4, GBC_vals, width, color='m')
    
    ABC_vals = [BABC_ACC_LST[3][0],BABC_ACC_LST[3][1],BABC_ACC_LST[3][2],BABC_ACC_LST[3][3]]
    bar7 = plt.bar(ind+width*4, ABC_vals, width, color='k')
    
        
    plt.xlabel("Accuracy Fields")
    plt.ylabel("Accuracies")
    plt.title("Accuracies for 50-50 Split")

    plt.xticks(ind+width,['Testing Accuracy', 'Precision Score', 'Recall Score', 'F1-Score'])
    plt.legend( (bar1, bar2, bar3, bar4, bar5, bar6, bar7), ('BMNB', 'BSGDC', 'BDTC', 'BRFC', 'BMLPC', 'BGBC', 'BABC') )
    plt.show()
    
    
    
    return Result_table
Main_Bagging()

Stratified K-Fold has been applied for 5 Splits
Bagging classification of Multinomial Naive BAYES's Training and testing is running
Bagging classification of Stochastic Gradient descent training and testing is running
Bagging classification of Decision Tree Classifier Training and testing is running
Bagging classification of Random Forest Training and testing is running
Bagging classification of Multi-layer Perceptron Training and testing is running


KeyboardInterrupt: 

In [None]:
def Main_VC():
    
    SKF_Split(4)
    acc1=VotingHard_Classification()
    acc2=VotingSoft_Classification()

    
        
    Accuracy_Table=[('Voting Classification Soft',acc2[0],acc2[1],acc2[2],acc2[3]),
                ('Voting Classification Hard',acc1[0],acc1[1],acc1[2],acc1[3]),
               ]
    Result_table=pd.DataFrame(Accuracy_Table,columns=["Classifier","Testing Accuracy","Precision Score","Recall Score","F1-Score"])
    
    N = 4
    ind = np.arange(N) 
    width = 0.12

    VCH_vals = [acc1[0],acc1[1],acc1[2],acc1[3]]
    bar1 = plt.bar(ind, VCH_vals, width, color = 'r')

    VCS_vals = [acc2[0],acc2[1],acc2[2],acc2[3]]
    bar2 = plt.bar(ind+width, VCS_vals, width, color='g')
    
        
    plt.xlabel("Accuracy Fields")
    plt.ylabel("Accuracies")
    plt.title("Accuracies for 75-25 Split")

    plt.xticks(ind+width,['TA', 'PS', 'RS', 'F1-S'])
    plt.legend( (bar1, bar2), ('VCH','VCS') )
    plt.show()
    
    return Result_table
Main_VC()

In [10]:
def PSO_MNB():
    n_particles=10
    options = {'c1': 0.5, 'c2': 0.7, 'w':0.9, 'alpha': [1e-3, 1000]}
    # Call an instance of PSO
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=1, options=options)
    cost, pos=optimizer.optimize(MNB_objective_func, iters=3)
    alpha=pos[0]
    return MNB_Classification(alpha)
def MNB_objective_func(scores):
    lst=[]
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    for i in range(10):
        train_index, test_index = next(iter(skf.split(Tr_tokens,y)))
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        MNBC.fit(X_train, y_train)
        lst.append(MNBC.score(X_test, y_test))
        scores=np.array(lst)
    return scores

In [35]:
def PSO_SGDC():
    n_particles=10
    options = {'c1': 0.5, 'c2': 0.7, 'w':0.9, 'alpha':[0.0001,1000],'epsilon':[0.0001,1000],'tol':[.0001,1000]}
    # Call an instance of PSO
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=3, options=options)
    cost, pos=optimizer.optimize(SGDC_objective_func, iters=100)
    alpha=pos[0]
    epsilon=pos[1]
    tol=pos[2]
    return SGDC_Classification(alpha,epsilon,tol)
def SGDC_objective_func(scores):
    lst=[]
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    for i in range(10):
        train_index, test_index = next(iter(skf.split(Tr_tokens,y)))
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        SGDC.fit(X_train, y_train)
        lst.append(SGDC.score(X_test, y_test))
        scores=np.array(lst)
    return scores
PSO_SGDC()

2022-07-05 14:43:26,541 - pyswarms.single.global_best - INFO - Optimize for 3 iters with {'c1': 0.5, 'c2': 0.7, 'w': 0.9, 'alpha': [0.0001, 1000], 'epsilon': [0.0001, 1000], 'tol': [0.0001, 1000]}
pyswarms.single.global_best: 100%|█████████████████████████████████████████████████████████████████|3/3, best_cost=0.99
2022-07-05 14:43:42,816 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 0.9895910780669145, best pos: [0.41667681 0.37169552 1.09881713]


Stochastic Gradient descent training and testing is running


(99.69325153374233, 99.35608499678042, 99.58050984188448, 99.46817082997582)

In [41]:
def PSO_MLPC():
    n_particles=10
    options = {'c1': 0.5, 'c2': 0.7, 'w':0.9, 'hidden_layer_sizes':[5,100],'alpha':[0.0001,1000]}
    # Call an instance of PSO
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=2, options=options)
    cost, pos=optimizer.optimize(MLPC_objective_func, iters=100)
    hidden_layer_sizes=pos[0]
    alpha=pos[1]
    return MLPC_Classification(hidden_layer_sizes,alpha)
def MLPC_objective_func(scores):
    lst=[]
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    for i in range(10):
        train_index, test_index = next(iter(skf.split(Tr_tokens,y)))
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        MLPC.fit(X_train, y_train)
        lst.append(MLPC.score(X_test, y_test))
        scores=np.array(lst)
    return scores
PSO_MLPC()

2022-07-05 14:47:32,252 - pyswarms.single.global_best - INFO - Optimize for 3 iters with {'c1': 0.5, 'c2': 0.7, 'w': 0.9, 'hidden_layer_sizes': [5, 100], 'alpha': [0.0001, 1000]}
pyswarms.single.global_best: 100%|████████████████████████████████████████████████████████████████|3/3, best_cost=0.995
2022-07-05 14:59:23,573 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 0.9951672862453531, best pos: [0.50924196 0.88529553]


Multi-layer Perceptron Training and testing is running


(99.52593418851087, 99.2248062015504, 99.12875121006776, 99.1767554479419)

In [23]:
def PSO_DTC():
    n_particles=10
    options = {'c1': 0.5, 'c2': 0.7, 'w':0.9,'max_depth':[1,20],'min_samples_leaf':2}
    # Call an instance of PSO
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=2, options=options)
    cost, pos=optimizer.optimize(DTC_objective_func, iters=100)
    max_depth=pos[0]
    min_samples_leaf=pos[1]
    return DTC_Classification(max_depth,min_samples_leaf)
def DTC_objective_func(scores):
    lst=[]
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    for i in range(10):
        train_index, test_index = next(iter(skf.split(Tr_tokens,y)))
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        DTC.fit(X_train, y_train)
        lst.append(DTC.score(X_test, y_test))
        scores=np.array(lst)
    return scores
PSO_DTC()

2022-07-05 14:19:43,942 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.5, 'c2': 0.7, 'w': 0.9, 'max_depth': [1, 20], 'min_samples_leaf': 2}
pyswarms.single.global_best:   9%|█████▌                                                        |9/100, best_cost=0.996


KeyboardInterrupt: 

In [None]:
def PSO_RFC():
    n_particles=10
    options = {'c1': 0.5, 'c2': 0.7, 'w':0.9, 'n_estimators':[1,40],'max_depth':[1,20]}
    # Call an instance of PSO
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=2, options=options)
    cost, pos=optimizer.optimize(RFC_objective_func, iters=100)
    n_estimators=pos[0]
    max_depth=pos[1]
    return RFC_Classification(n_estimators,max_depth)
def RFC_objective_func(scores):
    lst=[]
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    for i in range(10):
        train_index, test_index = next(iter(skf.split(Tr_tokens,y)))
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        RFC.fit(X_train, y_train)
        lst.append(RFC.score(X_test, y_test))
        scores=np.array(lst)
    return scores
PSO_RFC()

In [None]:
'''Main PSO Functions'''
def PSO_GBC():
    n_particles=10
    options = {'c1': 0.5, 'c2': 0.7, 'w':0.9, 'n_estimators': [1, 1000], 'learning_rate': [1e-3, 100]}
    # Call an instance of PSO
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=2, options=options)
    cost, pos=optimizer.optimize(GBC_objective_func, iters=100)
    n_estimators=pos[0]
    learning_rate=pos[1]
    return GBC_Classification(n_estimators, learning_rate)
def GBC_objective_func(scores):
    lst=[]
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    for i in range(10):
        train_index, test_index = next(iter(skf.split(Tr_tokens,y)))
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        GBC.fit(X_train, y_train)
        lst.append(GBC.score(X_test, y_test))
        scores=np.array(lst)
    return scores

In [None]:
def PSO_ABC():
    n_particles=10
    options = {'c1': 0.5, 'c2': 0.7, 'w':0.9, 'n_estimators': [1, 1000], 'learning_rate': [1e-3, 100]}
    # Call an instance of PSO
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=2, options=options)
    cost, pos=optimizer.optimize(ABC_objective_func, iters=100)
    n_estimators=pos[0]
    learning_rate=pos[1]
    return ABC_Classification(n_estimators, learning_rate)
def ABC_objective_func(scores):
    lst=[]
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    for i in range(10):
        train_index, test_index = next(iter(skf.split(Tr_tokens,y)))
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        ABC.fit(X_train, y_train)
        lst.append(ABC.score(X_test, y_test))
        scores=np.array(lst)
    return scores

In [18]:
def Main_PSO():
    
    SKF_Split(4)
    acc1=PSO_MNB()
    acc2=PSO_SGDC()
    acc3=PSO_DTC()
    acc4=PSO_RFC()
    acc5=PSO_MLPC()
    acc6=PSO_GBC()
    acc7=PSO_ABC()
    
        
    Accuracy_Table=[('Stochastic Gradient Descent',acc2[0],acc2[1],acc2[2],acc2[3]),
                ('Multinomial Naive BAYES',acc1[0],acc1[1],acc1[2],acc1[3]),
                ('Random Forest',acc4[0],acc4[1],acc4[2],acc4[3]),
                ('Decision Tree',acc3[0],acc3[1],acc3[2],acc3[3]),
                ('Multi-later Perceptron',acc5[0],acc5[1],acc5[2],acc5[3]),
                ('Gradient Boost Classifier',acc6[0],acc6[1],acc6[2],acc6[3]),
                ('Adaboost Classifier',acc7[0],acc7[1],acc7[2],acc7[3])
               ]
    Result_table=pd.DataFrame(Accuracy_Table,columns=["Classifier","Testing Accuracy","Precision Score","Recall Score","F1-Score"])
    
    N = 4
    ind = np.arange(N) 
    width = 0.18

    MNB_vals = [acc1[0],acc1[1],acc1[2],acc1[3]]
    bar1 = plt.bar(ind, MNB_vals, width, color = 'r')

    SGDC_vals = [acc2[0],acc2[1],acc2[2],acc2[3]]
    bar2 = plt.bar(ind+width, SGDC_vals, width, color='g')

    DTC_vals = [acc3[0],acc3[1],acc3[2],acc3[3]]
    bar3 = plt.bar(ind+width*2, DTC_vals, width, color = 'b')

    RFC_vals = [acc4[0],acc4[1],acc4[2],acc4[3]]
    bar4 = plt.bar(ind+width*3, RFC_vals, width, color='y')
    
    MLPC_vals = [acc5[0],acc5[1],acc5[2],acc5[3]]
    bar5 = plt.bar(ind+width*4, MLPC_vals, width, color='c')
    
    GBC_vals = [acc6[0],acc6[1],acc6[2],acc6[3]]
    bar6 = plt.bar(ind+width*4, GBC_vals, width, color='m')
    
    ABC_vals = [acc7[0],acc7[1],acc7[2],acc7[3]]
    bar7 = plt.bar(ind+width*4, ABC_vals, width, color='k')
    
        
    plt.xlabel("Accuracy Fields")
    plt.ylabel("Accuracies")
    plt.title("Accuracies for 75-25 Split")

    plt.xticks(ind+width,['Testing Accuracy', 'Precision Score', 'Recall Score', 'F1-Score'])
    plt.legend( (bar1, bar2, bar3, bar4, bar5, bar6, bar7), ('MNB', 'SGDC', 'DTC', 'RFC', 'MLPC', 'GBC', 'ABC') )
    plt.show()
    
    return Result_table
Main_PSO()

2022-07-05 15:38:07,060 - pyswarms.single.global_best - INFO - Optimize for 3 iters with {'c1': 0.5, 'c2': 0.7, 'w': 0.9, 'alpha': [0.001, 1000]}
pyswarms.single.global_best: 100%|████████████████████████████████████████████████████████████████|3/3, best_cost=0.983
2022-07-05 15:38:11,031 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 0.9825278810408922, best pos: [0.10625222]


Multinomial Naive BAYES's Training and testing is running


IndexError: tuple index out of range

In [44]:
def GA_MNB():
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    #Stratified K_Fold_spliting
    for train_index, test_index in skf.split(Tr_tokens,y):
        global X_train, X_test, y_train, y_test
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]

    tpot_config = {'sklearn.naive_bayes.MultinomialNB': {'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],'fit_prior': [True, False]}}

    tpot = TPOTClassifier(generations=10, population_size=40, verbosity=2,config_dict=tpot_config,offspring_size=20)
    tpot.fit(X_train, y_train)
    pred=tpot.predict(X_test)
    return metrics.accuracy_score(pred, y_test), metrics.precision_score(y_test,pred,pos_label='Spam'), metrics.recall_score(y_test,pred,pos_label='Spam'), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)
GA_MNB()



Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9926259768396226

Generation 2 - Current best internal CV score: 0.9926259768396226

Generation 3 - Current best internal CV score: 0.9926259768396226

Best pipeline: MultinomialNB(input_matrix, alpha=0.001, fit_prior=True)


(0.9959100204498977, 0.9954574951330305, 0.9903163331181407, 99.28802588996763)

In [None]:
def GA_DTC():
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    #Stratified K_Fold_spliting
    for train_index, test_index in skf.split(Tr_tokens,y):
        global X_train, X_test, y_train, y_test
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]

    tpot_config = {'sklearn.tree.DecisionTreeClassifier': {'criterion': ["gini", "entropy"],'max_depth': range(1, 11),'min_samples_split': range(2, 21),'min_samples_leaf': range(1, 21)}}

    tpot = TPOTClassifier(generations=10, population_size=40, verbosity=2,config_dict=tpot_config,offspring_size=20)
    tpot.fit(X_train, y_train)
    pred=tpot.predict(X_test)
    return metrics.accuracy_score(pred, y_test), metrics.precision_score(y_test,pred,pos_label='Spam'), metrics.recall_score(y_test,pred,pos_label='Spam'), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)
GA_DTC()



Optimization Progress:   0%|          | 0/240 [00:00<?, ?pipeline/s]

In [None]:
def GA_RFC():
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    #Stratified K_Fold_spliting
    for train_index, test_index in skf.split(Tr_tokens,y):
        global X_train, X_test, y_train, y_test
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]

    tpot_config = {'sklearn.ensemble.RandomForestClassifier': {'n_estimators': [100],'criterion': ["gini", "entropy"],'max_features': np.arange(0.05, 1.01, 0.05),
                                                               'min_samples_split': range(2, 21),'min_samples_leaf':  range(1, 21),'bootstrap': [True, False]}}

    tpot = TPOTClassifier(generations=10, population_size=40, verbosity=2,config_dict=tpot_config,offspring_size=20)
    tpot.fit(X_train, y_train)
    pred=tpot.predict(X_test)
    return metrics.accuracy_score(pred, y_test), metrics.precision_score(y_test,pred,pos_label='Spam'), metrics.recall_score(y_test,pred,pos_label='Spam'), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)
GA_RFC()

In [None]:
def GA_SGDC():
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    #Stratified K_Fold_spliting
    for train_index, test_index in skf.split(Tr_tokens,y):
        global X_train, X_test, y_train, y_test
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]

    tpot_config = {'sklearn.linear_model.SGDClassifier': {'loss': ['log', 'hinge', 'modified_huber', 'squared_hinge', 'perceptron'],'penalty': ['elasticnet'],'alpha': [0.0, 0.01, 0.001],
                                                          'learning_rate': ['invscaling', 'constant'],'fit_intercept': [True, False],'l1_ratio': [0.25, 0.0, 1.0, 0.75, 0.5],'eta0': [0.1, 1.0, 0.01],
                                                          'power_t': [0.5, 0.0, 1.0, 0.1, 100.0, 10.0, 50.0]}}

    tpot = TPOTClassifier(generations=10, population_size=40, verbosity=2,config_dict=tpot_config,offspring_size=20)
    tpot.fit(X_train, y_train)
    pred=tpot.predict(X_test)
    return metrics.accuracy_score(pred, y_test), metrics.precision_score(y_test,pred,pos_label='Spam'), metrics.recall_score(y_test,pred,pos_label='Spam'), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)
GA_SGDC()

In [None]:
def GA_MLPC():
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    #Stratified K_Fold_spliting
    for train_index, test_index in skf.split(Tr_tokens,y):
        global X_train, X_test, y_train, y_test
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]

    tpot_config = {'sklearn.neural_network.MLPClassifier': {'alpha': [1e-4, 1e-3, 1e-2, 1e-1],'learning_rate_init': [1e-3, 1e-2, 1e-1, 0.5, 1.]}}

    tpot = TPOTClassifier(generations=10, population_size=40, verbosity=2,config_dict=tpot_config,offspring_size=20)
    tpot.fit(X_train, y_train)
    pred=tpot.predict(X_test)
    return metrics.accuracy_score(pred, y_test), metrics.precision_score(y_test,pred,pos_label='Spam'), metrics.recall_score(y_test,pred,pos_label='Spam'), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)
GA_MLPC()

In [None]:
def GA_GBC():
    SKF_Split(4)
    tpot_config = {'sklearn.ensemble.GradientBoostingClassifier': {
        'n_estimators': [100],
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'max_depth': range(1, 11),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21),
        'subsample': np.arange(0.05, 1.01, 0.05),
        'max_features': np.arange(0.05, 1.01, 0.05)}}

    tpot = TPOTClassifier(generations=10, population_size=40, verbosity=2, config_dict=tpot_config, offspring_size=20)
    tpot.fit(X_train, y_train)
    pred=tpot.predict(X_test)
    return metrics.accuracy_score(pred, y_test), metrics.precision_score(y_test,pred,pos_label='Spam'), metrics.recall_score(y_test,pred,pos_label='Spam'), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [None]:
def GA_ABC():
    SKF_Split(4)
    tpot_config = {'sklearn.ensemble.AdaBoostClassifier': {
        'n_estimators': [100],
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.0]}}
    
    tpot = TPOTClassifier(generations=10, population_size=40, verbosity=2, config_dict=tpot_config, offspring_size=20)
    tpot.fit(X_train, y_train)
    pred=tpot.predict(X_test)
    return metrics.accuracy_score(pred, y_test), metrics.precision_score(y_test,pred,pos_label='Spam'), metrics.recall_score(y_test,pred,pos_label='Spam'), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [None]:
def Main_GA():
    
    acc1=GA_MNB()
    acc2=GA_SGDC()
    acc3=GA_DTC()
    acc4=GA_RFC()
    acc5=GA_MLPC()
    acc6=GA_GBC()
    acc7=GA_ABC()
    
    
    Accuracy_Table=[('Stochastic Gradient Descent',acc2[0],acc2[1],acc2[2],acc2[3]),
            ('Multinomial Naive BAYES',acc1[0],acc1[1],acc1[2],acc1[3]),
            ('Random Forest',acc4[0],acc4[1],acc4[2],acc4[3]),
            ('Decision Tree',acc3[0],acc3[1],acc3[2],acc3[3]),
            ('Multi-later Perceptron',acc5[0],acc5[1],acc5[2],acc5[3]),
            ('Gradient Boost Classifier',acc6[0],acc6[1],acc6[2],acc6[3]),
            ('Adaboost Classifier',acc7[0],acc7[1],acc7[2],acc7[3])
           ]
    Result_table=pd.DataFrame(Accuracy_Table,columns=["Classifier","Testing Accuracy","Precision Score","Recall Score","F1-Score"])
    
    N = 4
    ind = np.arange(N) 
    width = 0.18

    MNB_vals = [acc1[0],acc1[1],acc1[2],acc1[3]]
    bar1 = plt.bar(ind, MNB_vals, width, color = 'r')

    SGDC_vals = [acc2[0],acc2[1],acc2[2],acc2[3]]
    bar2 = plt.bar(ind+width, SGDC_vals, width, color='g')

    DTC_vals = [acc3[0],acc3[1],acc3[2],acc3[3]]
    bar3 = plt.bar(ind+width*2, DTC_vals, width, color = 'b')

    RFC_vals = [acc4[0],acc4[1],acc4[2],acc4[3]]
    bar4 = plt.bar(ind+width*3, RFC_vals, width, color='y')
    
    MLPC_vals = [acc5[0],acc5[1],acc5[2],acc5[3]]
    bar5 = plt.bar(ind+width*4, MLPC_vals, width, color='c')
    
    GBC_vals = [acc6[0],acc6[1],acc6[2],acc6[3]]
    bar6 = plt.bar(ind+width*4, GBC_vals, width, color='m')
    
    ABC_vals = [acc7[0],acc7[1],acc7[2],acc7[3]]
    bar7 = plt.bar(ind+width*4, ABC_vals, width, color='k')
    
        
    plt.xlabel("Accuracy Fields")
    plt.ylabel("Accuracies")
    plt.title("Accuracies for 75-25 Split")

    plt.xticks(ind+width,['Testing Accuracy', 'Precision Score', 'Recall Score', 'F1-Score'])
    plt.legend( (bar1, bar2, bar3, bar4, bar5, bar6, bar7), ('MNB', 'SGDC', 'DTC', 'RFC', 'MLPC', 'GBC', 'ABC') )
    plt.show()
    
    return Result_table
Main_GA()


In [None]:
#Applying Stratified K_fold_Cross_Validation
def Probable_SKF_scores(x):
    skf=StratifiedKFold(n_splits=x,shuffle=False)
    #Stratified K_Fold_spliting
    for train_index, test_index in skf.split(X,y):
        #pdb.set_trace()
        #print("TRAIN:", train_index,"TEST:",test_index)
        global X_train, X_test, y_train, y_test
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for model in models:
            model.fit(X_train, y_train)
            score=model.score(X_test, y_test)
            if model==MNBC:MNBC_accu_stratified.append(score)    
            elif model==SGDC:SGDC_accu_stratified.append(score)    
            elif model==DTC:DTC_accu_stratified.append(score)
            elif model==RFC:RFC_accu_stratified.append(score)
            elif model==MLPC:MPLC_accu_stratified.append(score)
    for model in models:
        if model==MNBC:print("Possible accuracies list for {} are:{} ".format(model,MNBC_accu_stratified))
        elif model==SGDC:print("Possible accuracies list for {} are:{} ".format(model,SGDC_accu_stratified))
        elif model==DTC:print("Possible accuracies list for {} are:{} ".format(model,DTC_accu_stratified))
        elif model==RFC:print("Possible accuracies list for {} are:{} ".format(model,RFC_accu_stratified))
        elif model==MLPC:print("Possible accuracies list for {} are:{} ".format(model,MPLC_accu_stratified))
    MNBC_accu_stratified.clear()
    SGDC_accu_stratified.clear()
    DTC_accu_stratified.clear()
    RFC_accu_stratified.clear()
    MPLC_accu_stratified.clear()
    return X_train, X_test, y_train, y_test

for ele in split_list:
    print('For {} splits the possible accuracies are:'.format(ele))
    Probable_SKF_scores(ele)
    print('###################################################################################################################################')


For 5 splits the possible accuracies are:
Possible accuracies list for MultinomialNB(alpha=0.574365, fit_prior=False) are:[0.9716542750929368, 0.9941914498141264, 0.9825702997908436, 0.99070415988845, 0.976528003718336] 
Possible accuracies list for SGDClassifier() are:[0.9972118959107806, 0.9995353159851301, 1.0, 1.0, 0.9946548919358587] 
Possible accuracies list for DecisionTreeClassifier() are:[0.9990706319702602, 0.9997676579925651, 1.0, 1.0, 0.9941900999302812] 
Possible accuracies list for RandomForestClassifier() are:[1.0, 0.9997676579925651, 0.9997676039972112, 1.0, 0.9941900999302812] 
Possible accuracies list for MLPClassifier(hidden_layer_sizes=5, max_iter=10000, solver='lbfgs') are:[0.9962825278810409, 0.9990706319702602, 0.9993028119916337, 0.9995352079944225, 0.9948872879386474] 
###################################################################################################################################
For 4 splits the possible accuracies are:
Possible accuracies l