In [1]:
#Importing the libraries
import numpy as np
import pandas as pd
import os
import io
import pdb
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn import linear_model
import pyswarms as ps
from statistics import mean, stdev
from sklearn.model_selection import cross_val_score
from tpot import TPOTClassifier
import matplotlib.pyplot as plt



In [2]:
#Machine learning models
vectorizer=CountVectorizer(stop_words='english',lowercase=True)
MNBC=MultinomialNB(alpha=0.5743650,class_prior=None,fit_prior=False)    #Multinomial Naive BAYES
SGDC=SGDClassifier(loss="hinge", alpha=0.0001, max_iter=1000, tol=1e-3, epsilon=0.1)   #Stochastic Gradient Descent
DTC=tree.DecisionTreeClassifier(criterion = "entropy", splitter = "best")  #entropy can also be used  #Decision Tree
RFC=RandomForestClassifier(criterion = "entropy")  #entropy must be used after gini    #Random Forest
MLPC=MLPClassifier(hidden_layer_sizes=5,max_iter=10000,solver='lbfgs')  #Multi-layer Perceptron
ABC=AdaBoostClassifier(n_estimators=100)          #AdaBoost Classifier
GBC=GradientBoostingClassifier(n_estimators=100)        #GradientBoosting Classifier
skf=StratifiedKFold(n_splits=4,shuffle=True)

In [3]:
def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path=os.path.join(root, filename)
            lines=[]
            f=io.open(path, 'r', encoding='latin1')
            for line in f:
                lines.append(line)
            f.close
            message='\n'.join(lines)
            yield path, message

def dataFrameFromDirectory(path, classification):
    rows=[]
    index=[]
    for filename, message in readFiles(path):
        rows.append({'message':message, 'class':classification})
        index.append(filename)
    return pd.DataFrame(rows, index=index)

data=pd.DataFrame({'message':[], 'class':[]})

data=data.append(dataFrameFromDirectory('D:/Major Project/App_Data_Set/Spam_Assassin_Dataset/Spam','Spam'))
data=data.append(dataFrameFromDirectory('D:/Major Project/App_Data_Set/Spam_Assassin_Dataset/Ham','Ham'))
X=data['message']
y=data['class']


'''Pre-processing'''
IDF = TfidfVectorizer().fit_transform(X)
Tr_tokens=vectorizer.fit_transform(X)

scores=np.array

In [4]:
#Applying Stratified K_fold_Cross_Validation
def SKF_Split(x):
    skf=StratifiedKFold(n_splits=x,shuffle=True)
    #Stratified K_Fold_spliting
    for train_index, test_index in skf.split(Tr_tokens,y):
        global X_train, X_test, y_train, y_test
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
    return X_train, X_test, y_train, y_test

In [5]:
'''Objective Funtions'''
def MNB_objective_func(scores):
    lst=[]
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    for i in range(10):
        train_index, test_index = next(iter(skf.split(Tr_tokens,y)))
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        MNBC.fit(X_train, y_train)
        lst.append(MNBC.score(X_test, y_test))
        scores=np.array(lst)
    return scores


def SGDC_objective_func(scores):
    lst=[]
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    for i in range(10):
        train_index, test_index = next(iter(skf.split(Tr_tokens,y)))
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        SGDC.fit(X_train, y_train)
        lst.append(SGDC.score(X_test, y_test))
        scores=np.array(lst)
    return scores



def MLPC_objective_func(scores):
    lst=[]
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    for i in range(10):
        train_index, test_index = next(iter(skf.split(Tr_tokens,y)))
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        MLPC.fit(X_train, y_train)
        lst.append(MLPC.score(X_test, y_test))
        scores=np.array(lst)
    return scores



def DTC_objective_func(scores):
    lst=[]
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    for i in range(10):
        train_index, test_index = next(iter(skf.split(Tr_tokens,y)))
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        DTC.fit(X_train, y_train)
        lst.append(DTC.score(X_test, y_test))
        scores=np.array(lst)
    return scores



def RFC_objective_func(scores):
    lst=[]
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    for i in range(10):
        train_index, test_index = next(iter(skf.split(Tr_tokens,y)))
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        RFC.fit(X_train, y_train)
        lst.append(RFC.score(X_test, y_test))
        scores=np.array(lst)
    return scores


def GBC_objective_func(scores):
    lst=[]
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    for i in range(10):
        train_index, test_index = next(iter(skf.split(Tr_tokens,y)))
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        GBC.fit(X_train, y_train)
        lst.append(GBC.score(X_test, y_test))
        scores=np.array(lst)
    return scores


def ABC_objective_func(scores):
    lst=[]
    skf=StratifiedKFold(n_splits=4,shuffle=True)
    for i in range(10):
        train_index, test_index = next(iter(skf.split(Tr_tokens,y)))
        X_train, X_test = Tr_tokens[train_index], Tr_tokens[test_index]
        y_train, y_test = y[train_index], y[test_index]
        ABC.fit(X_train, y_train)
        lst.append(ABC.score(X_test, y_test))
        scores=np.array(lst)
    return scores

In [6]:
'''Classification functions'''
#Multinomial Naive Bayes Calssification
def MNB_Classification(alpha):
    print("Multinomial Naive BAYES's Training and testing is running")
    MNBC.fit(X_train,y_train)
    pred=MNBC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = MNBC.score(X_train,y_train)*100, MNBC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)


#Stochastic Gradient Descent (SGD) Calssification
def SGDC_Classification(alpha,epsilon,tol):
    print("Stochastic Gradient descent training and testing is running")
    SGDC.fit(X_train,y_train)
    pred=SGDC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = SGDC.score(X_train,y_train)*100, SGDC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)



#Decision Tree Classification
def DTC_Classification(max_depth,min_samples_leaf):
    print("Decision Tree Classifier Training and testing is running")
    DTC.fit(X_train,y_train)
    pred=DTC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = DTC.score(X_train,y_train)*100, DTC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)



#Random Forest Classification
def RFC_Classification(n_estimators,max_depth):
    print("Random Forest Training and testing is running")
    RFC.fit(X_train,y_train)
    pred=RFC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = RFC.score(X_train,y_train)*100, RFC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)



#Multi-layer Perceptron Calssification
def MLPC_Classification(hidden_layer_sizes,alpha):
    print("Multi-layer Perceptron Training and testing is running")
    MLPC.fit(X_train,y_train)
    pred=MLPC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = MLPC.score(X_train,y_train)*100, MLPC.score(X_test,y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)



#GradientBoost Classification
def GBC_Classification(n_estimators, learning_rate):
    print("Gradientboost Classifier Training and testing is running")
    GBC.fit(X_train, y_train)
    pred=GBC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = GBC.score(X_train, y_train)*100, GBC.score(X_test, y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)



#AdaBoost Classification
def ABC_Classification(n_estimators, learning_rate):
    print("Adaboost Classifier Training and testing is running")
    ABC.fit(X_train, y_train)
    pred=ABC.predict(X_test)
    Training_Accuracy, Testing_Accuracy = ABC.score(X_train, y_train)*100, ABC.score(X_test, y_test)*100
    return Testing_Accuracy, (metrics.precision_score(y_test,pred,pos_label='Spam')*100), (metrics.recall_score(y_test,pred,pos_label='Spam')*100), (metrics.f1_score(y_test, pred, pos_label='Spam')*100)

In [9]:
'''Main PSO Functions'''
def PSO_GBC():
    n_particles=10
    options = {'c1': 0.5, 'c2': 0.7, 'w':0.9, 'n_estimators': [1, 1000], 'learning_rate': [1e-3, 100]}
    # Call an instance of PSO
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=2, options=options)
    cost, pos=optimizer.optimize(GBC_objective_func, iters=100)
    n_estimators=pos[0]
    learning_rate=pos[1]
    return GBC_Classification(n_estimators, learning_rate)
PSO_GBC()

2022-07-22 12:31:26,119 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.5, 'c2': 0.7, 'w': 0.9, 'n_estimators': [1, 1000], 'learning_rate': [0.001, 100]}
pyswarms.single.global_best:   0%|                                                                               |0/100


KeyboardInterrupt: 

In [None]:
def PSO_ABC():
    n_particles=10
    options = {'c1': 0.5, 'c2': 0.7, 'w':0.9, 'n_estimators': [1, 1000], 'learning_rate': [1e-3, 100]}
    # Call an instance of PSO
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=2, options=options)
    cost, pos=optimizer.optimize(ABC_objective_func, iters=100)
    n_estimators=pos[0]
    learning_rate=pos[1]
    return ABC_Classification(n_estimators, learning_rate)
PSO_ABC()

In [10]:
def PSO_MNB():
    n_particles=10
    options = {'c1': 0.5, 'c2': 0.7, 'w':0.9, 'alpha': [1e-3, 1000]}
    # Call an instance of PSO
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=1, options=options)
    cost, pos=optimizer.optimize(MNB_objective_func, iters=100)
    alpha=pos[0]
    return MNB_Classification(alpha)

In [11]:
def PSO_SGDC():
    n_particles=10
    options = {'c1': 0.5, 'c2': 0.7, 'w':0.9, 'alpha':[0.0001,1000],'epsilon':[0.0001,1000],'tol':[.0001,1000]}
    # Call an instance of PSO
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=3, options=options)
    cost, pos=optimizer.optimize(SGDC_objective_func, iters=100)
    alpha=pos[0]
    epsilon=pos[1]
    tol=pos[2]
    return SGDC_Classification(alpha,epsilon,tol)

In [12]:
def PSO_MLPC():
    n_particles=10
    options = {'c1': 0.5, 'c2': 0.7, 'w':0.9, 'hidden_layer_sizes':[5,100],'alpha':[0.0001,1000]}
    # Call an instance of PSO
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=2, options=options)
    cost, pos=optimizer.optimize(MLPC_objective_func, iters=100)
    hidden_layer_sizes=pos[0]
    alpha=pos[1]
    return MLPC_Classification(hidden_layer_sizes,alpha)

In [13]:
def PSO_DTC():
    n_particles=10
    options = {'c1': 0.5, 'c2': 0.7, 'w':0.9,'max_depth':[1,20],'min_samples_leaf':2}
    # Call an instance of PSO
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=2, options=options)
    cost, pos=optimizer.optimize(DTC_objective_func, iters=100)
    max_depth=pos[0]
    min_samples_leaf=pos[1]
    return DTC_Classification(max_depth,min_samples_leaf)

In [14]:
def PSO_RFC():
    n_particles=10
    options = {'c1': 0.5, 'c2': 0.7, 'w':0.9, 'n_estimators':[1,40],'max_depth':[1,20]}
    # Call an instance of PSO
    optimizer = ps.single.GlobalBestPSO(n_particles=10, dimensions=2, options=options)
    cost, pos=optimizer.optimize(RFC_objective_func, iters=100)
    n_estimators=pos[0]
    max_depth=pos[1]
    return RFC_Classification(n_estimators,max_depth)

In [1]:
def Main_PSO():
    
    SKF_Split(4)
    acc1=PSO_MNB()
    acc2=PSO_SGDC()
    acc3=PSO_DTC()
    acc4=PSO_RFC()
    acc5=PSO_MLPC()
    acc6=PSO_GBC()
    acc7=PSO_ABC()
    
        
    Accuracy_Table=[('Stochastic Gradient Descent',acc2[0],acc2[1],acc2[2],acc2[3]),
                ('Multinomial Naive BAYES',acc1[0],acc1[1],acc1[2],acc1[3]),
                ('Random Forest',acc4[0],acc4[1],acc4[2],acc4[3]),
                ('Decision Tree',acc3[0],acc3[1],acc3[2],acc3[3]),
                ('Multi-later Perceptron',acc5[0],acc5[1],acc5[2],acc5[3]),
                ('Gradient Boost Classifier',acc6[0],acc6[1],acc6[2],acc6[3]),
                ('Adaboost Classifier',acc7[0],acc7[1],acc7[2],acc7[3])
               ]
    Result_table=pd.DataFrame(Accuracy_Table,columns=["Classifier","Testing Accuracy","Precision Score","Recall Score","F1-Score"])
    
    N = 4
    ind = np.arange(N) 
    width = 0.12

    MNB_vals = [acc1[0],acc1[1],acc1[2],acc1[3]]
    bar1 = plt.bar(ind, MNB_vals, width, color = 'r')

    SGDC_vals = [acc2[0],acc2[1],acc2[2],acc2[3]]
    bar2 = plt.bar(ind+width, SGDC_vals, width, color='g')

    DTC_vals = [acc3[0],acc3[1],acc3[2],acc3[3]]
    bar3 = plt.bar(ind+width*2, DTC_vals, width, color = 'b')

    RFC_vals = [acc4[0],acc4[1],acc4[2],acc4[3]]
    bar4 = plt.bar(ind+width*3, RFC_vals, width, color='y')
    
    MLPC_vals = [acc5[0],acc5[1],acc5[2],acc5[3]]
    bar5 = plt.bar(ind+width*4, MLPC_vals, width, color='c')
    
    GBC_vals = [acc6[0],acc6[1],acc6[2],acc6[3]]
    bar6 = plt.bar(ind+width*5, GBC_vals, width, color='m')
    
    ABC_vals = [acc7[0],acc7[1],acc7[2],acc7[3]]
    bar7 = plt.bar(ind+width*6, ABC_vals, width, color='k')
    
        
    plt.xlabel("Accuracy Fields")
    plt.ylabel("Accuracies")
    plt.title("Accuracies for 75-25 Split")

    plt.xticks(ind+width,['Testing Accuracy', 'Precision Score', 'Recall Score', 'F1-Score'])
    plt.legend( (bar1, bar2, bar3, bar4, bar5, bar6, bar7), ('MNB', 'SGDC', 'DTC', 'RFC', 'MLPC', 'GBC', 'ABC') )
    plt.show()
    
    return Result_table
Main_PSO()

NameError: name 'SKF_Split' is not defined