In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io.arff import loadarff
from sklearn.model_selection import train_test_split

%matplotlib inline
random_state = 2021

N = '2'
drop_cols = ['Attr37', 'Attr7','Attr43','Attr62','Attr32','Attr44','Attr15','Attr19','Attr3','Attr51','Attr4','Attr49','Attr38','Attr60','Attr6']

In [18]:
arff = loadarff('2year.arff')
df = pd.DataFrame(arff[0])
df['class']= df['class'].astype('int')
df = df.drop_duplicates()

In [19]:
df = df.drop(drop_cols,axis='columns')
X = df.drop('class',axis='columns')
Y = df['class']
# (X.shape,Y.shape)

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=random_state,shuffle=True,stratify=Y)

display(Y_train.value_counts(), Y_test.value_counts())
X_train,Y_train,X_test,Y_test = X_train.to_numpy(),Y_train.to_numpy(),X_test.to_numpy(),Y_test.to_numpy()

0    7264
1     298
Name: class, dtype: int64

0    2421
1     100
Name: class, dtype: int64

In [20]:
from sklearn.impute import SimpleImputer
simple_imp = SimpleImputer(missing_values=np.nan, strategy='mean')
simple_imp = simple_imp.fit(X_train)
X_train_imp = simple_imp.transform(X_train)
X_test_imp = simple_imp.transform(X_test)

In [21]:
from sklearn.decomposition import PCA
pca = PCA(n_components=20)
pca = pca.fit(X_train_imp)

X_train_imp = pca.transform(X_train_imp)
X_test_imp = pca.transform(X_test_imp)

In [22]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train_imp)
# choose imputer <<comment blocks accordingly>>

#### for simple
scaled_already = False
# X_train_imp = simple_imp.transform(X_train)
# X_test_imp = simple_imp.transform(X_test)


In [23]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier # Voting Ensemble for Classification

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [24]:
def try_all_classifiers(X_train, X_test, y_train, y_test, classifiers, sampling  = None, scaler=None):
   
    accuracy = [0]*len(classifiers)
    f1 = [0]*len(classifiers)
    precision = [0]*len(classifiers)
    recall = [0]*len(classifiers)
    i = 0
    
    model_pipeline = []
    Pipeline([
        ('sampling', SMOTE()),
        ('classification', LogisticRegression())
    ])
    
    if sampling == "SMOTE":
        model_pipeline.append(('sampling', SMOTE(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = smote.fit_resample(X_train, y_train)
        print("SMOTE")
    if sampling == "RUS":
        model_pipeline.append(('sampling', RandomUnderSampler(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = rus.fit_resample(X_train, y_train)
        print("RUS")
    if sampling == "SMOTEENN":
        model_pipeline.append(('sampling', SMOTEENN(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = smoteenn.fit_resample(X_train, y_train)
        print("SMOTEENN")    

    voting_classifs = []
    models_for_voting = [0,3,5,6,7,8]
    voting_weights = [1,0.5,1,0.5,1,1]
    jj =0 
        
    for i in range(len(classifiers)):
        classif = classifiers[i][0]
        pipe_parameters = classifiers[i][2]
        y_pred = []
        print(classifiers_names[i])
        pipeline = Pipeline(model_pipeline+[('classifier',classif)])
        
        if classifiers[i][1] and not scaled_already:
            print("\t- Requires scaling and not scaled. Doing it now...")
            grid = GridSearchCV(pipeline, pipe_parameters, cv=2, scoring="f1",n_jobs=-1,verbose=1)
            #grid = grid.fit(X_train, y_train)
            grid = grid.fit(scaler.transform(X_train), y_train)

            #classif.fit(scaler.transform(X_train), y_train)
            #y_pred = classif.predict(scaler.transform(X_test))
            
            display(grid.best_params_)
            classif = grid.best_estimator_
            y_pred = grid.predict(scaler.transform(X_test))
            #classif.fit(scaler.transform(), )
            #y_pred = classif.predict(scaler.transform(X_test))
            
        else:
            grid = GridSearchCV(pipeline, pipe_parameters, cv=2, scoring="f1",n_jobs=-1,verbose=1)
            grid.fit(X_train, y_train)
            display(grid.best_params_)
            classif = grid.best_estimator_
            y_pred = grid.predict(X_test)
            
            #classif.fit(X_train, y_train)
            #y_pred = classif.predict(X_test)
        
        if i in models_for_voting:
            print("\t- Adding for voting with weight <"+str(voting_weights[jj])+">...")
            jj+=1
            voting_classifs.append(("mod"+str(i+1),classif))
        
        accuracy[i] = metrics.accuracy_score(y_test, y_pred)
        f1[i] = metrics.f1_score(y_test, y_pred, labels=np.unique(y_pred))
        precision[i] = metrics.precision_score(y_test, y_pred)
        recall[i] = metrics.recall_score(y_test, y_pred)
    
    print("\n\nVoting...")
    # create the ensemble model
    ensemble = VotingClassifier(voting_classifs,weights=voting_weights,n_jobs=-1,voting="hard")
    ensemble.fit(scaler.transform(X_train), y_train)
    y_pred = ensemble.predict(scaler.transform(X_test))
    
    accuracy.append(metrics.accuracy_score(y_test, y_pred))
    f1.append(metrics.f1_score(y_test, y_pred, labels=np.unique(y_pred)))
    precision.append(metrics.precision_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))

    print("Done")
    return accuracy,f1,precision,recall

In [25]:
classifiers_names = ["LR", "LDA", "KNN-5", "KNN-10", "GNB", "DT", "SVC", "RFC", "XGB","Voting"]

classifiers = [(LogisticRegression(max_iter=2048,random_state=random_state),True,
                   [
                       #{
                       # 'classifier__penalty' : ['l1', 'l2'],
                       # 'classifier__C' : np.logspace(-8, 4, 16),
                       # 'classifier__solver' : ['liblinear']
                       # },
                        {
                        'classifier__penalty' : ['l2','none'],
                        'classifier__C' : np.logspace(-8, 4, 16)
                        }
                   ]),
                (LinearDiscriminantAnalysis(),True,
                    { 'classifier__solver' : ['svd', 'lsqr', 'eigen'] }),
                (KNeighborsClassifier(n_neighbors=5),True,
                    {'classifier__weights' : ['uniform','distance'], 'classifier__metric' : ['euclidean', 'manhattan']}), 
                (KNeighborsClassifier(n_neighbors=10),True,
                    {'classifier__weights' : ['uniform','distance'], 'classifier__metric' : ['euclidean', 'manhattan']}),
                (GaussianNB(),True,
                    {'classifier__var_smoothing': np.logspace(0,-9, num=100)}),
                (DecisionTreeClassifier(random_state=random_state),False,
                    { 'classifier__criterion':['gini','entropy'],'classifier__max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]} ),
                (SVC(random_state=random_state),True,
                    [
                        {'classifier__C': [ 0.05, 0.1, 1], 
                         'classifier__gamma': [0.0001, 1],
                         'classifier__kernel': ['rbf']},
                        {'classifier__C': [ 0.05, 0.1, 1],
                         'classifier__kernel': ['linear']}
                    ]),
                (RandomForestClassifier(random_state=random_state),False,
                     { 'classifier__n_estimators': [int(x) for x in np.linspace(start = 128, stop = 512, num = 4)],
                       'classifier__max_features': ['auto'],
                       'classifier__max_depth':  [int(x) for x in np.linspace(10, 100, num = 2)]+[None],
                       'classifier__min_samples_leaf':  [1, 4],
                       'classifier__bootstrap': [False]
                     }),
                (XGBClassifier(use_label_encoder=False),False,
                    {
                        'classifier__gamma': [0.5, 1, 2, 5],
                        'classifier__colsample_bytree': [0.6, 0.8, 1.0],
                        'classifier__max_depth': [3, 6]
                    }) ]



In [26]:
accuracy,f1,precision,recall = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, scaler=scaler)

LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:   24.8s finished


{'classifier__C': 1e-08, 'classifier__penalty': 'l2'}

  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.1s finished


{'classifier__solver': 'svd'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.4s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'distance'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.6s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'distance'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.1s finished


{'classifier__var_smoothing': 0.0012328467394420659}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    5.1s finished


{'classifier__criterion': 'entropy', 'classifier__max_depth': 8}

	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   15.2s finished


{'classifier__C': 0.05,
 'classifier__gamma': 0.0001,
 'classifier__kernel': 'rbf'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.2min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 100,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 1,
 'classifier__n_estimators': 128}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   27.5s finished


{'classifier__colsample_bytree': 0.8,
 'classifier__gamma': 0.5,
 'classifier__max_depth': 6}

	- Adding for voting with weight <1>...


Voting...
Done


In [27]:
accuracy_sm,f1_sm,precision_sm,recall_sm = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "SMOTE", scaler=scaler)


SMOTE
LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:   44.6s finished
  "Setting penalty='none' will ignore the C and l1_ratio "


{'classifier__C': 1e-08, 'classifier__penalty': 'none'}

	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.2s finished


{'classifier__solver': 'svd'}

KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.6s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'uniform'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.8s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    4.1s finished


{'classifier__var_smoothing': 0.001}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    9.4s finished


{'classifier__criterion': 'entropy', 'classifier__max_depth': 5}

	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   37.0s finished


{'classifier__C': 1, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  5.1min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 100,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 4,
 'classifier__n_estimators': 128}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   46.1s finished


{'classifier__colsample_bytree': 0.8,
 'classifier__gamma': 1,
 'classifier__max_depth': 6}

	- Adding for voting with weight <1>...


Voting...




Done


In [28]:
accuracy_rus,f1_rus,precision_rus,recall_rus = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "RUS", scaler=scaler)


RUS
LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:   18.5s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


{'classifier__C': 1e-08, 'classifier__penalty': 'none'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.1s finished


	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits




{'classifier__solver': 'svd'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.1s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.1s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.4s finished


{'classifier__var_smoothing': 0.001}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.9s finished


{'classifier__criterion': 'entropy', 'classifier__max_depth': 4}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.9s finished


{'classifier__C': 1, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   32.1s finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 100,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 4,
 'classifier__n_estimators': 512}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    4.0s finished


{'classifier__colsample_bytree': 1.0,
 'classifier__gamma': 5,
 'classifier__max_depth': 3}

	- Adding for voting with weight <1>...


Voting...
Done


In [29]:
accuracy_smoteenn,f1_smoteenn,precision_smoteenn,recall_smoteenn = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "SMOTEENN",scaler=scaler)


SMOTEENN
LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:  1.1min finished
  "Setting penalty='none' will ignore the C and l1_ratio "


{'classifier__C': 1e-08, 'classifier__penalty': 'none'}

	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    3.2s finished


{'classifier__solver': 'svd'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    8.0s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    8.3s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.7min finished


{'classifier__var_smoothing': 0.0008111308307896872}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   17.3s finished


{'classifier__criterion': 'entropy', 'classifier__max_depth': 30}

	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   33.6s finished


{'classifier__C': 1, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.7min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 100,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 4,
 'classifier__n_estimators': 512}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   40.8s finished


{'classifier__colsample_bytree': 0.8,
 'classifier__gamma': 0.5,
 'classifier__max_depth': 6}

	- Adding for voting with weight <1>...


Voting...




Done


In [30]:
print("Imputer & Sampling & Metric & ",end = "")
print(*classifiers_names,sep = " & ", end = " \\\\\n")
print("\\hline \\hline")
print("Simple & No & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")

print("~ & SMOTE & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_sm],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")

print("~ & RUS & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_rus],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")


print("~ & SMOTE- & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ENN & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_smoteenn],sep=" & ", end = " \\\\\n")
print("\\hline\\hline")




Imputer & Sampling & Metric & LR & LDA & KNN-5 & KNN-10 & GNB & DT & SVC & RFC & XGB & Voting \\
\hline \hline
Simple & No & Acc & 0.96 & 0.96 & 0.96 & 0.96 & 0.06 & 0.96 & 0.96 & 0.96 & 0.96 & 0.96 \\
~ & ~ & Prec & 0.00 & 0.00 & 0.43 & 0.00 & 0.04 & 0.38 & 0.00 & 0.61 & 0.74 & 0.80 \\
~ & ~ & Rec & 0.00 & 0.00 & 0.03 & 0.00 & 0.96 & 0.16 & 0.00 & 0.14 & 0.17 & 0.12 \\
~ & ~ & F1 & 0.00 & 0.00 & 0.06 & 0.00 & 0.08 & 0.23 & 0.00 & 0.23 & 0.28 & 0.21 \\
\cline{2-13}
~ & SMOTE & Acc & 0.95 & 0.96 & 0.83 & 0.83 & 0.07 & 0.92 & 0.94 & 0.95 & 0.93 & 0.95 \\
~ & ~ & Prec & 0.00 & 0.00 & 0.06 & 0.07 & 0.04 & 0.18 & 0.23 & 0.31 & 0.22 & 0.32 \\
~ & ~ & Rec & 0.00 & 0.00 & 0.23 & 0.25 & 0.96 & 0.30 & 0.26 & 0.23 & 0.29 & 0.16 \\
~ & ~ & F1 & 0.00 & 0.00 & 0.10 & 0.11 & 0.08 & 0.22 & 0.24 & 0.26 & 0.25 & 0.21 \\
\cline{2-13}
~ & RUS & Acc & 0.92 & 0.93 & 0.78 & 0.88 & 0.48 & 0.77 & 0.95 & 0.85 & 0.88 & 0.92 \\
~ & ~ & Prec & 0.03 & 0.06 & 0.07 & 0.11 & 0.04 & 0.07 & 0.03 & 0.13 & 0.14 & 0.18 \\


In [31]:
print('''
| classifier          | Accuracy | Precision | Recall | F1 score |
| =================== | ======== | ========= | ====== | ======== |''')
for c,a,p,r,f in zip(classifiers_names,accuracy,precision,recall,f1):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
    
print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_sm,precision_sm,recall_sm,f1_sm):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")

print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_rus,precision_rus,recall_rus,f1_rus):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
  
print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_smoteenn,precision_smoteenn,recall_smoteenn,f1_smoteenn):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
 


| classifier          | Accuracy | Precision | Recall | F1 score |
LR   |   0.9603332011106703   |   0.0   |   0.0   |   0.0   |  
LDA   |   0.9595398651328838   |   0.0   |   0.0   |   0.0   |  
KNN-5   |   0.9599365331217771   |   0.42857142857142855   |   0.03   |   0.056074766355140186   |  
KNN-10   |   0.9591431971439904   |   0.0   |   0.0   |   0.0   |  
GNB   |   0.0634668782229274   |   0.03913575214023644   |   0.96   |   0.07520564042303172   |  
DT   |   0.9563665212217374   |   0.38095238095238093   |   0.16   |   0.22535211267605634   |  
SVC   |   0.9603332011106703   |   0.0   |   0.0   |   0.0   |  
RFC   |   0.9623165410551369   |   0.6086956521739131   |   0.14   |   0.22764227642276424   |  
XGB   |   0.9646965489884967   |   0.7391304347826086   |   0.17   |   0.2764227642276423   |  
Voting   |   0.96390321301071   |   0.8   |   0.12   |   0.20869565217391303   |  


LR   |   0.9539865132883776   |   0.0   |   0.0   |   0.0   |  
LDA   |   0.9583498611662039   |