In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io.arff import loadarff
from sklearn.model_selection import train_test_split

%matplotlib inline
random_state = 2021

N = '1'
drop_cols = ['Attr21','Attr37', 'Attr7','Attr43','Attr62','Attr32','Attr44','Attr15','Attr19','Attr3','Attr51','Attr4','Attr49','Attr38','Attr60','Attr6']

In [2]:
arff = loadarff('1year.arff')
df = pd.DataFrame(arff[0])
df['class']= df['class'].astype('int')
df = df.drop_duplicates()

In [3]:
df = df.drop(drop_cols,axis='columns')
X = df.drop('class',axis='columns')
Y = df['class']
# (X.shape,Y.shape)

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=random_state,shuffle=True,stratify=Y)

display(Y_train.value_counts(), Y_test.value_counts())
X_train,Y_train,X_test,Y_test = X_train.to_numpy(),Y_train.to_numpy(),X_test.to_numpy(),Y_test.to_numpy()

0    5005
1     203
Name: class, dtype: int64

0    1669
1      68
Name: class, dtype: int64

In [4]:
from sklearn.impute import SimpleImputer
simple_imp = SimpleImputer(missing_values=np.nan, strategy='mean')
simple_imp = simple_imp.fit(X_train)
X_train_imp = simple_imp.transform(X_train)
X_test_imp = simple_imp.transform(X_test)

In [5]:
from sklearn.decomposition import PCA
pca = PCA(n_components=20)
pca = pca.fit(X_train_imp)

X_train_imp = pca.transform(X_train_imp)
X_test_imp = pca.transform(X_test_imp)

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train_imp)
# choose imputer <<comment blocks accordingly>>

#### for simple
scaled_already = False
# X_train_imp = simple_imp.transform(X_train)
# X_test_imp = simple_imp.transform(X_test)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier # Voting Ensemble for Classification

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV



In [9]:
def try_all_classifiers(X_train, X_test, y_train, y_test, classifiers, sampling  = None, scaler=None):
   
    accuracy = [0]*len(classifiers)
    f1 = [0]*len(classifiers)
    precision = [0]*len(classifiers)
    recall = [0]*len(classifiers)
    i = 0
    
    model_pipeline = []
    Pipeline([
        ('sampling', SMOTE()),
        ('classification', LogisticRegression())
    ])
    
    if sampling == "SMOTE":
        model_pipeline.append(('sampling', SMOTE(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = smote.fit_resample(X_train, y_train)
        print("SMOTE")
    if sampling == "RUS":
        model_pipeline.append(('sampling', RandomUnderSampler(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = rus.fit_resample(X_train, y_train)
        print("RUS")
    if sampling == "SMOTEENN":
        model_pipeline.append(('sampling', SMOTEENN(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = smoteenn.fit_resample(X_train, y_train)
        print("SMOTEENN")    

    voting_classifs = []
    models_for_voting = [0,3,5,6,7,8]
    voting_weights = [1,0.5,1,0.5,1,1]
    jj =0 
        
    for i in range(len(classifiers)):
        classif = classifiers[i][0]
        pipe_parameters = classifiers[i][2]
        y_pred = []
        print(classifiers_names[i])
        pipeline = Pipeline(model_pipeline+[('classifier',classif)])
        
        if classifiers[i][1] and not scaled_already:
            print("\t- Requires scaling and not scaled. Doing it now...")
            grid = GridSearchCV(pipeline, pipe_parameters, cv=2, scoring="f1",n_jobs=-1,verbose=1)
            #grid = grid.fit(X_train, y_train)
            grid = grid.fit(scaler.transform(X_train), y_train)

            #classif.fit(scaler.transform(X_train), y_train)
            #y_pred = classif.predict(scaler.transform(X_test))
            
            display(grid.best_params_)
            classif = grid.best_estimator_
            y_pred = grid.predict(scaler.transform(X_test))
            #classif.fit(scaler.transform(), )
            #y_pred = classif.predict(scaler.transform(X_test))
            
        else:
            grid = GridSearchCV(pipeline, pipe_parameters, cv=2, scoring="f1",n_jobs=-1,verbose=1)
            grid.fit(X_train, y_train)
            display(grid.best_params_)
            classif = grid.best_estimator_
            y_pred = grid.predict(X_test)
            
            #classif.fit(X_train, y_train)
            #y_pred = classif.predict(X_test)
        
        if i in models_for_voting:
            print("\t- Adding for voting with weight <"+str(voting_weights[jj])+">...")
            jj+=1
            voting_classifs.append(("mod"+str(i+1),classif))
        
        accuracy[i] = metrics.accuracy_score(y_test, y_pred)
        f1[i] = metrics.f1_score(y_test, y_pred, labels=np.unique(y_pred))
        precision[i] = metrics.precision_score(y_test, y_pred)
        recall[i] = metrics.recall_score(y_test, y_pred)
    
    print("\n\nVoting...")
    # create the ensemble model
    ensemble = VotingClassifier(voting_classifs,weights=voting_weights,n_jobs=-1,voting="hard")
    ensemble.fit(scaler.transform(X_train), y_train)
    y_pred = ensemble.predict(scaler.transform(X_test))
    
    accuracy.append(metrics.accuracy_score(y_test, y_pred))
    f1.append(metrics.f1_score(y_test, y_pred, labels=np.unique(y_pred)))
    precision.append(metrics.precision_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))

    print("Done")
    return accuracy,f1,precision,recall

In [10]:
classifiers_names = ["LR", "LDA", "KNN-5", "KNN-10", "GNB", "DT", "SVC", "RFC", "XGB","Voting"]

classifiers = [(LogisticRegression(max_iter=2048,random_state=random_state),True,
                   [
                       #{
                       # 'classifier__penalty' : ['l1', 'l2'],
                       # 'classifier__C' : np.logspace(-8, 4, 16),
                       # 'classifier__solver' : ['liblinear']
                       # },
                        {
                        'classifier__penalty' : ['l2','none'],
                        'classifier__C' : np.logspace(-8, 4, 16)
                        }
                   ]),
                (LinearDiscriminantAnalysis(),True,
                    { 'classifier__solver' : ['svd', 'lsqr', 'eigen'] }),
                (KNeighborsClassifier(n_neighbors=5),True,
                    {'classifier__weights' : ['uniform','distance'], 'classifier__metric' : ['euclidean', 'manhattan']}), 
                (KNeighborsClassifier(n_neighbors=10),True,
                    {'classifier__weights' : ['uniform','distance'], 'classifier__metric' : ['euclidean', 'manhattan']}),
                (GaussianNB(),True,
                    {'classifier__var_smoothing': np.logspace(0,-9, num=100)}),
                (DecisionTreeClassifier(random_state=random_state),False,
                    { 'classifier__criterion':['gini','entropy'],'classifier__max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]} ),
                (SVC(random_state=random_state),True,
                    [
                        {'classifier__C': [ 0.05, 0.1, 1], 
                         'classifier__gamma': [0.0001, 1],
                         'classifier__kernel': ['rbf']},
                        {'classifier__C': [ 0.05, 0.1, 1],
                         'classifier__kernel': ['linear']}
                    ]),
                (RandomForestClassifier(random_state=random_state),False,
                     { 'classifier__n_estimators': [int(x) for x in np.linspace(start = 128, stop = 512, num = 4)],
                       'classifier__max_features': ['auto'],
                       'classifier__max_depth':  [int(x) for x in np.linspace(10, 100, num = 2)]+[None],
                       'classifier__min_samples_leaf':  [1, 4],
                       'classifier__bootstrap': [False]
                     }),
                (XGBClassifier(use_label_encoder=False),False,
                    {
                        'classifier__gamma': [0.5, 1, 2, 5],
                        'classifier__colsample_bytree': [0.6, 0.8, 1.0],
                        'classifier__max_depth': [3, 6]
                    }) ]



In [11]:
accuracy,f1,precision,recall = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, scaler=scaler)

LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:    9.1s finished


{'classifier__C': 6.309573444801943, 'classifier__penalty': 'l2'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.1s finished


{'classifier__solver': 'svd'}

KNN-5
	- Requires scaling and not scaled. Doing it now...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.9s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.0s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'uniform'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.0s finished


{'classifier__var_smoothing': 0.23101297000831597}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    3.1s finished


{'classifier__criterion': 'gini', 'classifier__max_depth': 7}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    5.0s finished


{'classifier__C': 0.05,
 'classifier__gamma': 0.0001,
 'classifier__kernel': 'rbf'}

  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  2.2min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 10,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 1,
 'classifier__n_estimators': 128}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   19.1s finished


{'classifier__colsample_bytree': 1.0,
 'classifier__gamma': 2,
 'classifier__max_depth': 6}

	- Adding for voting with weight <1>...


Voting...
Done


In [12]:
accuracy_sm,f1_sm,precision_sm,recall_sm = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "SMOTE", scaler=scaler)


SMOTE
LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:   10.7s finished


{'classifier__C': 10000.0, 'classifier__penalty': 'l2'}

	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.2s finished


{'classifier__solver': 'svd'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.4s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.6s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'distance'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    3.3s finished


{'classifier__var_smoothing': 3.5111917342151273e-09}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    6.2s finished


{'classifier__criterion': 'entropy', 'classifier__max_depth': 5}

	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   14.8s finished


{'classifier__C': 1, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.3min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 100,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 4,
 'classifier__n_estimators': 128}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   31.1s finished


{'classifier__colsample_bytree': 0.8,
 'classifier__gamma': 2,
 'classifier__max_depth': 6}

	- Adding for voting with weight <1>...


Voting...
Done


In [13]:
accuracy_rus,f1_rus,precision_rus,recall_rus = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "RUS", scaler=scaler)


RUS
LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:    8.1s finished


{'classifier__C': 1584.8931924611175, 'classifier__penalty': 'l2'}

	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.1s finished


{'classifier__solver': 'svd'}

KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.7s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.7s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.3s finished


{'classifier__var_smoothing': 1.232846739442066e-08}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.7s finished


{'classifier__criterion': 'entropy', 'classifier__max_depth': 4}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.6s finished


{'classifier__C': 1, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   25.5s finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 10,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 4,
 'classifier__n_estimators': 128}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    2.9s finished


{'classifier__colsample_bytree': 0.6,
 'classifier__gamma': 5,
 'classifier__max_depth': 3}

	- Adding for voting with weight <1>...


Voting...
Done


In [14]:
accuracy_smoteenn,f1_smoteenn,precision_smoteenn,recall_smoteenn = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "SMOTEENN",scaler=scaler)


SMOTEENN
LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:   25.5s finished


{'classifier__C': 10000.0, 'classifier__penalty': 'l2'}

	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    1.6s finished


{'classifier__solver': 'lsqr'}

KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.0s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.1s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'distance'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   49.5s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   50.5s finished


{'classifier__var_smoothing': 1.0}

DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   12.2s finished


{'classifier__criterion': 'gini', 'classifier__max_depth': 30}

	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   14.6s finished


{'classifier__C': 1, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  2.4min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 100,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 1,
 'classifier__n_estimators': 256}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   28.1s finished


{'classifier__colsample_bytree': 0.8,
 'classifier__gamma': 0.5,
 'classifier__max_depth': 6}

	- Adding for voting with weight <1>...


Voting...
Done


In [15]:
print("Imputer & Sampling & Metric & ",end = "")
print(*classifiers_names,sep = " & ", end = " \\\\\n")
print("\\hline \\hline")
print("Simple & No & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")

print("~ & SMOTE & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_sm],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")

print("~ & RUS & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_rus],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")


print("~ & SMOTE- & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ENN & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_smoteenn],sep=" & ", end = " \\\\\n")
print("\\hline\\hline")




Imputer & Sampling & Metric & LR & LDA & KNN-5 & KNN-10 & GNB & DT & SVC & RFC & XGB & Voting \\
\hline \hline
Simple & No & Acc & 0.96 & 0.96 & 0.96 & 0.96 & 0.08 & 0.96 & 0.96 & 0.97 & 0.97 & 0.97 \\
~ & ~ & Prec & 0.00 & 0.00 & 0.00 & 0.00 & 0.04 & 0.55 & 0.00 & 0.88 & 0.89 & 0.87 \\
~ & ~ & Rec & 0.00 & 0.00 & 0.00 & 0.00 & 0.99 & 0.38 & 0.00 & 0.32 & 0.37 & 0.29 \\
~ & ~ & F1 & 0.00 & 0.00 & 0.00 & 0.00 & 0.08 & 0.45 & 0.00 & 0.47 & 0.52 & 0.44 \\
\cline{2-13}
~ & SMOTE & Acc & 0.87 & 0.96 & 0.85 & 0.84 & 0.10 & 0.87 & 0.94 & 0.96 & 0.95 & 0.96 \\
~ & ~ & Prec & 0.10 & 0.09 & 0.09 & 0.09 & 0.04 & 0.17 & 0.33 & 0.51 & 0.36 & 0.53 \\
~ & ~ & Rec & 0.29 & 0.01 & 0.31 & 0.32 & 0.97 & 0.62 & 0.41 & 0.38 & 0.47 & 0.40 \\
~ & ~ & F1 & 0.15 & 0.03 & 0.14 & 0.14 & 0.08 & 0.27 & 0.37 & 0.44 & 0.41 & 0.45 \\
\cline{2-13}
~ & RUS & Acc & 0.88 & 0.94 & 0.79 & 0.87 & 0.09 & 0.76 & 0.94 & 0.89 & 0.90 & 0.92 \\
~ & ~ & Prec & 0.09 & 0.08 & 0.06 & 0.09 & 0.04 & 0.11 & 0.10 & 0.20 & 0.21 & 0.27 \\


In [16]:
print('''
| classifier          | Accuracy | Precision | Recall | F1 score |
| =================== | ======== | ========= | ====== | ======== |''')
for c,a,p,r,f in zip(classifiers_names,accuracy,precision,recall,f1):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
    
print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_sm,precision_sm,recall_sm,f1_sm):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")

print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_rus,precision_rus,recall_rus,f1_rus):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
  
print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_smoteenn,precision_smoteenn,recall_smoteenn,f1_smoteenn):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
 


| classifier          | Accuracy | Precision | Recall | F1 score |
LR   |   0.9585492227979274   |   0.0   |   0.0   |   0.0   |  
LDA   |   0.9579735175590098   |   0.0   |   0.0   |   0.0   |  
KNN-5   |   0.9597006332757628   |   0.0   |   0.0   |   0.0   |  
KNN-10   |   0.9608520437535981   |   0.0   |   0.0   |   0.0   |  
GNB   |   0.07829591249280368   |   0.04019196160767846   |   0.9852941176470589   |   0.07723342939481267   |  
DT   |   0.9637305699481865   |   0.5531914893617021   |   0.38235294117647056   |   0.4521739130434782   |  
SVC   |   0.9608520437535981   |   0.0   |   0.0   |   0.0   |  
RFC   |   0.971790443293034   |   0.88   |   0.3235294117647059   |   0.4731182795698925   |  
XGB   |   0.973517559009787   |   0.8928571428571429   |   0.36764705882352944   |   0.5208333333333335   |  
Voting   |   0.9706390328151986   |   0.8695652173913043   |   0.29411764705882354   |   0.4395604395604396   |  


LR   |   0.871042026482441   |   0.10204081632653061   |   