In [1]:
# !pip3 install missingpy

In [2]:
#!pip3 install scikit-learn==0.20.1  # dep for missingpy
#!pip3 install scikit-learn==0.24.1  # required for getting tree diagram

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io.arff import loadarff
from sklearn.model_selection import train_test_split

%matplotlib inline

In [21]:
random_state = 2021

## Year $N$
Run all in this section...

In [22]:
N = '1'
drop_cols = ['Attr21','Attr37', 'Attr7','Attr43','Attr62','Attr32','Attr44','Attr15','Attr19','Attr3','Attr51','Attr4','Attr49','Attr38','Attr60','Attr6']

In [24]:
arff = loadarff('1year.arff')
df = pd.DataFrame(arff[0])
df['class']= df['class'].astype('int')
df = df.drop_duplicates()
# df.head()

In [25]:
# change dropcols accordingly...
df.isnull().sum().sort_values(ascending=False).head()

Attr37    2709
Attr21    1611
Attr27     309
Attr60     131
Attr45     130
dtype: int64

In [26]:
df = df.drop(drop_cols,axis='columns')
# df.shape

### Retain NaN in test set also

In [27]:
X = df.drop('class',axis='columns')
Y = df['class']
# (X.shape,Y.shape)

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=random_state,shuffle=True,stratify=Y)

display(Y_train.value_counts(), Y_test.value_counts())

0    5005
1     203
Name: class, dtype: int64

0    1669
1      68
Name: class, dtype: int64

In [28]:
X_train,Y_train,X_test,Y_test = X_train.to_numpy(),Y_train.to_numpy(),X_test.to_numpy(),Y_test.to_numpy()

### Standardise

In [29]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)

#scaler = None

#idx = np.isnan(X_train).any(axis=1)
#scaler = StandardScaler().fit(X_train[~idx])

## Missing values

In [30]:
def build_imputer(imputer_estimator,X_train,Y_train,transform_x=True,reset_index = True,verbose=True,max_iter=10,tol=1,imputer=None,scaler=None):
    # train = pd.concat([X_train,Y_train],axis=1)
    # train['class'] = train['class'].astype('category')
    # if reset_index:
    #     train = train.reset_index(drop=True)
    
    if imputer is None:
        imputer = IterativeImputer(estimator=imputer_estimator, n_nearest_features=None, imputation_order='descending',verbose=verbose,max_iter=max_iter,tol=tol)
        imputer = imputer.fit(X_train,Y_train)
    else:
        imputer = imputer.fit(X_train,Y_train)
        
    
    if transform_x:
        X_train = imputer.transform(X_train)
        
        #if scaler is not None:
        #    X_train = scaler.transform(X_train)
        
        return imputer,X_train
    
    return imputer


ModuleNotFoundError: ignored

In [31]:
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsRegressor

# has to be scaled...
knn_imp = build_imputer( KNeighborsRegressor(n_jobs=4) ,scaler.transform(X_train),Y_train,transform_x=False,reset_index = True,verbose=True,max_iter=64,tol=0.1,scaler=scaler)

    

[IterativeImputer] Completing matrix with shape (5208, 48)
[IterativeImputer] Change: 10.071478144563422, scaled tolerance: 7.215953092496527 
[IterativeImputer] Change: 0.805656858926882, scaled tolerance: 7.215953092496527 
[IterativeImputer] Early stopping criterion reached.


In [32]:
from sklearn.impute import SimpleImputer
simple_imp = SimpleImputer(missing_values=np.nan, strategy='mean')
simple_imp = simple_imp.fit(X_train)

# Various Classification Models

## Simple Impute

In [33]:
# choose imputer <<comment blocks accordingly>>

#### for simple
scaled_already = False
X_train_imp = simple_imp.transform(X_train)
X_test_imp = simple_imp.transform(X_test)

################################# OR ################################
'''
#### for KNN
scaled_already = True
X_train_imp = knn_imp.transform(scaler.transform(X_train))
X_test_imp = knn_imp.transform(scaler.transform(X_test))
'''
################################# OR ################################
'''
#### for missf, use saved files...
scaled_already = False
X_train_imp = np.load("y"+N+"_realmissforest_train.npy")
X_test_imp = np.load("y"+N+"_realmissforest_test.npy")
'''

'\n#### for missf, use saved files...\nscaled_already = False\nX_train_imp = np.load("y"+N+"_realmissforest_train.npy")\nX_test_imp = np.load("y"+N+"_realmissforest_test.npy")\n'

In [None]:
# !pip3 install imblearn

In [34]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier # Voting Ensemble for Classification

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [35]:
def try_all_classifiers(X_train, X_test, y_train, y_test, classifiers, sampling  = None, scaler=None):
    '''
    do all imputations before passing here...
    Classifier : array of tuples (classifier,scaling required=True/False)
    '''
    accuracy = [0]*len(classifiers)
    f1 = [0]*len(classifiers)
    precision = [0]*len(classifiers)
    recall = [0]*len(classifiers)
    i = 0
    
    model_pipeline = []
    Pipeline([
        ('sampling', SMOTE()),
        ('classification', LogisticRegression())
    ])
    
    if sampling == "SMOTE":
        model_pipeline.append(('sampling', SMOTE(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = smote.fit_resample(X_train, y_train)
        print("SMOTE")
    if sampling == "RUS":
        model_pipeline.append(('sampling', RandomUnderSampler(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = rus.fit_resample(X_train, y_train)
        print("RUS")
    if sampling == "SMOTEENN":
        model_pipeline.append(('sampling', SMOTEENN(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = smoteenn.fit_resample(X_train, y_train)
        print("SMOTEENN")    

    voting_classifs = []
    models_for_voting = [0,3,5,6,7,8]
    voting_weights = [1,0.5,1,0.5,1,1]
    jj =0 
        
    for i in range(len(classifiers)):
        classif = classifiers[i][0]
        pipe_parameters = classifiers[i][2]
        y_pred = []
        print(classifiers_names[i])
        pipeline = Pipeline(model_pipeline+[('classifier',classif)])
        
        if classifiers[i][1] and not scaled_already:
            print("\t- Requires scaling and not scaled. Doing it now...")
            grid = GridSearchCV(pipeline, pipe_parameters, cv=2, scoring="f1",n_jobs=-1,verbose=1)
            #grid = grid.fit(X_train, y_train)
            grid = grid.fit(scaler.transform(X_train), y_train)

            #classif.fit(scaler.transform(X_train), y_train)
            #y_pred = classif.predict(scaler.transform(X_test))
            
            display(grid.best_params_)
            classif = grid.best_estimator_
            y_pred = grid.predict(scaler.transform(X_test))
            #classif.fit(scaler.transform(), )
            #y_pred = classif.predict(scaler.transform(X_test))
            
        else:
            grid = GridSearchCV(pipeline, pipe_parameters, cv=2, scoring="f1",n_jobs=-1,verbose=1)
            grid.fit(X_train, y_train)
            display(grid.best_params_)
            classif = grid.best_estimator_
            y_pred = grid.predict(X_test)
            
            #classif.fit(X_train, y_train)
            #y_pred = classif.predict(X_test)
        
        if i in models_for_voting:
            print("\t- Adding for voting with weight <"+str(voting_weights[jj])+">...")
            jj+=1
            voting_classifs.append(("mod"+str(i+1),classif))
        
        accuracy[i] = metrics.accuracy_score(y_test, y_pred)
        f1[i] = metrics.f1_score(y_test, y_pred, labels=np.unique(y_pred))
        precision[i] = metrics.precision_score(y_test, y_pred)
        recall[i] = metrics.recall_score(y_test, y_pred)
    
    print("\n\nVoting...")
    # create the ensemble model
    ensemble = VotingClassifier(voting_classifs,weights=voting_weights,n_jobs=-1,voting="hard")
    ensemble.fit(scaler.transform(X_train), y_train)
    y_pred = ensemble.predict(scaler.transform(X_test))
    
    accuracy.append(metrics.accuracy_score(y_test, y_pred))
    f1.append(metrics.f1_score(y_test, y_pred, labels=np.unique(y_pred)))
    precision.append(metrics.precision_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))

    print("Done")
    return accuracy,f1,precision,recall

In [36]:
# classifiers_voting = [('log',LogisticRegression(max_iter=2048)),("knn10",KNeighborsClassifier(n_neighbors=10)),("dtc",DecisionTreeClassifier()),("svm_linear",SVC(kernel='linear',random_state=random_state)),("rf",RandomForestClassifier(n_estimators=16, n_jobs=8, random_state=random_state)),("xbg",XGBClassifier(use_label_encoder=False))]
# classifiers_voting = [("dtc",DecisionTreeClassifier()),("rf",RandomForestClassifier(n_estimators=64, n_jobs=-1, random_state=random_state)),("xbg",XGBClassifier(use_label_encoder=False))]

classifiers_names = ["LR", "LDA", "KNN-5", "KNN-10", "GNB", "DT", "SVC", "RFC", "XGB","Voting"]

classifiers = [(LogisticRegression(max_iter=2048,random_state=random_state),True,
                   [
                       #{
                       # 'classifier__penalty' : ['l1', 'l2'],
                       # 'classifier__C' : np.logspace(-8, 4, 16),
                       # 'classifier__solver' : ['liblinear']
                       # },
                        {
                        'classifier__penalty' : ['l2','none'],
                        'classifier__C' : np.logspace(-8, 4, 16)
                        }
                   ]),
                (LinearDiscriminantAnalysis(),True,
                    { 'classifier__solver' : ['svd', 'lsqr', 'eigen'] }),
                (KNeighborsClassifier(n_neighbors=5),True,
                    {'classifier__weights' : ['uniform','distance'], 'classifier__metric' : ['euclidean', 'manhattan']}), 
                (KNeighborsClassifier(n_neighbors=10),True,
                    {'classifier__weights' : ['uniform','distance'], 'classifier__metric' : ['euclidean', 'manhattan']}),
                (GaussianNB(),True,
                    {'classifier__var_smoothing': np.logspace(0,-9, num=100)}),
                (DecisionTreeClassifier(random_state=random_state),False,
                    { 'classifier__criterion':['gini','entropy'],'classifier__max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]} ),
                (SVC(random_state=random_state),True,
                    [
                        {'classifier__C': [ 0.05, 0.1, 1], 
                         'classifier__gamma': [0.0001, 1],
                         'classifier__kernel': ['rbf']},
                        {'classifier__C': [ 0.05, 0.1, 1],
                         'classifier__kernel': ['linear']}
                    ]),
                (RandomForestClassifier(random_state=random_state),False,
                     { 'classifier__n_estimators': [int(x) for x in np.linspace(start = 128, stop = 512, num = 4)],
                       'classifier__max_features': ['auto'],
                       'classifier__max_depth':  [int(x) for x in np.linspace(10, 100, num = 2)]+[None],
                       'classifier__min_samples_leaf':  [1, 4],
                       'classifier__bootstrap': [False]
                     }),
                (XGBClassifier(use_label_encoder=False),False,
                    {
                        'classifier__gamma': [0.5, 1, 2, 5],
                        'classifier__colsample_bytree': [0.6, 0.8, 1.0],
                        'classifier__max_depth': [3, 6]
                    }) ]



In [37]:
accuracy,f1,precision,recall = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, scaler=scaler)


LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:   34.8s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


{'classifier__C': 1e-08, 'classifier__penalty': 'none'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.2s finished


	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


{'classifier__solver': 'svd'}

KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.7s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'uniform'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.8s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'uniform'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.4s finished


{'classifier__var_smoothing': 0.002848035868435802}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    7.4s finished


{'classifier__criterion': 'gini', 'classifier__max_depth': 5}

	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    5.1s finished


{'classifier__C': 0.05,
 'classifier__gamma': 0.0001,
 'classifier__kernel': 'rbf'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.5min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 100,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 1,
 'classifier__n_estimators': 512}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   39.5s finished


{'classifier__colsample_bytree': 0.6,
 'classifier__gamma': 1,
 'classifier__max_depth': 6}

	- Adding for voting with weight <1>...


Voting...
Done


#### Sampling

In [38]:
accuracy_sm,f1_sm,precision_sm,recall_sm = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "SMOTE", scaler=scaler)


SMOTE
LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:   50.7s finished


{'classifier__C': 6.309573444801943, 'classifier__penalty': 'l2'}

	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.3s finished


{'classifier__solver': 'lsqr'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    5.7s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'distance'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    6.1s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    3.8s finished


{'classifier__var_smoothing': 0.15199110829529336}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   12.4s finished


{'classifier__criterion': 'entropy', 'classifier__max_depth': 5}

	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   22.4s finished


{'classifier__C': 1, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  5.0min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 100,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 4,
 'classifier__n_estimators': 256}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  1.1min finished


{'classifier__colsample_bytree': 0.6,
 'classifier__gamma': 2,
 'classifier__max_depth': 6}

	- Adding for voting with weight <1>...


Voting...
Done


In [39]:
accuracy_rus,f1_rus,precision_rus,recall_rus = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "RUS", scaler=scaler)


RUS
LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:   15.6s finished


{'classifier__C': 251.18864315095823, 'classifier__penalty': 'l2'}

	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.1s finished


{'classifier__solver': 'lsqr'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.9s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'distance'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.9s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'distance'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.6s finished


{'classifier__var_smoothing': 1e-09}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    1.0s finished


{'classifier__criterion': 'gini', 'classifier__max_depth': 5}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed:    0.7s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.8s finished


{'classifier__C': 1, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   29.0s finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 10,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 4,
 'classifier__n_estimators': 256}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    5.2s finished


{'classifier__colsample_bytree': 0.6,
 'classifier__gamma': 0.5,
 'classifier__max_depth': 3}

	- Adding for voting with weight <1>...


Voting...
Done


In [40]:
accuracy_smoteenn,f1_smoteenn,precision_smoteenn,recall_smoteenn = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "SMOTEENN",scaler=scaler)


SMOTEENN
LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   58.4s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:  1.5min finished
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


{'classifier__C': 1e-08, 'classifier__penalty': 'none'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    4.6s finished


{'classifier__solver': 'svd'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   10.4s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   10.7s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.5min finished


{'classifier__var_smoothing': 0.0006579332246575676}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   25.1s finished


{'classifier__criterion': 'entropy', 'classifier__max_depth': 20}

	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   29.9s finished


{'classifier__C': 0.1, 'classifier__kernel': 'linear'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.8min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 100,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 1,
 'classifier__n_estimators': 512}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   58.0s finished


{'classifier__colsample_bytree': 0.6,
 'classifier__gamma': 0.5,
 'classifier__max_depth': 6}

	- Adding for voting with weight <1>...


Voting...
Done


In [41]:
print("Imputer & Sampling & Metric & ",end = "")
print(*classifiers_names,sep = " & ", end = " \\\\\n")
print("\\hline \\hline")
print("Simple & No & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")

print("~ & SMOTE & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_sm],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")

print("~ & RUS & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_rus],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")


print("~ & SMOTE- & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ENN & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_smoteenn],sep=" & ", end = " \\\\\n")
print("\\hline\\hline")




Imputer & Sampling & Metric & LR & LDA & KNN-5 & KNN-10 & GNB & DT & SVC & RFC & XGB & Voting \\
\hline \hline
Simple & No & Acc & 0.96 & 0.96 & 0.96 & 0.96 & 0.07 & 0.98 & 0.96 & 0.98 & 0.98 & 0.98 \\
~ & ~ & Prec & 0.65 & 0.00 & 0.33 & 0.00 & 0.04 & 0.81 & 0.00 & 0.87 & 0.95 & 0.97 \\
~ & ~ & Rec & 0.19 & 0.00 & 0.01 & 0.00 & 0.99 & 0.56 & 0.00 & 0.50 & 0.54 & 0.50 \\
~ & ~ & F1 & 0.30 & 0.00 & 0.03 & 0.00 & 0.08 & 0.66 & 0.00 & 0.64 & 0.69 & 0.66 \\
\cline{2-13}
~ & SMOTE & Acc & 0.87 & 0.94 & 0.84 & 0.85 & 0.06 & 0.85 & 0.90 & 0.98 & 0.98 & 0.98 \\
~ & ~ & Prec & 0.15 & 0.15 & 0.10 & 0.13 & 0.04 & 0.18 & 0.23 & 0.77 & 0.69 & 0.81 \\
~ & ~ & Rec & 0.47 & 0.09 & 0.40 & 0.53 & 1.00 & 0.76 & 0.60 & 0.53 & 0.66 & 0.62 \\
~ & ~ & F1 & 0.23 & 0.11 & 0.16 & 0.21 & 0.08 & 0.29 & 0.33 & 0.63 & 0.68 & 0.70 \\
\cline{2-13}
~ & RUS & Acc & 0.87 & 0.87 & 0.81 & 0.85 & 0.66 & 0.85 & 0.92 & 0.91 & 0.92 & 0.94 \\
~ & ~ & Prec & 0.15 & 0.12 & 0.10 & 0.11 & 0.06 & 0.19 & 0.10 & 0.27 & 0.29 & 0.35 \\


In [42]:
print('''
| classifier          | Accuracy | Precision | Recall | F1 score |
| =================== | ======== | ========= | ====== | ======== |''')
for c,a,p,r,f in zip(classifiers_names,accuracy,precision,recall,f1):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
    
print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_sm,precision_sm,recall_sm,f1_sm):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")

print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_rus,precision_rus,recall_rus,f1_rus):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
  
print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_smoteenn,precision_smoteenn,recall_smoteenn,f1_smoteenn):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
 


| classifier          | Accuracy | Precision | Recall | F1 score |
LR   |   0.9643062751871042   |   0.65   |   0.19117647058823528   |   0.29545454545454547   |  
LDA   |   0.9579735175590098   |   0.0   |   0.0   |   0.0   |  
KNN-5   |   0.9602763385146805   |   0.3333333333333333   |   0.014705882352941176   |   0.028169014084507043   |  
KNN-10   |   0.9608520437535981   |   0.0   |   0.0   |   0.0   |  
GNB   |   0.0736902705814623   |   0.04   |   0.9852941176470589   |   0.07687894434882386   |  
DT   |   0.9775474956822107   |   0.8085106382978723   |   0.5588235294117647   |   0.6608695652173914   |  
SVC   |   0.9608520437535981   |   0.0   |   0.0   |   0.0   |  
RFC   |   0.9775474956822107   |   0.8717948717948718   |   0.5   |   0.6355140186915887   |  
XGB   |   0.9810017271157168   |   0.9487179487179487   |   0.5441176470588235   |   0.6915887850467289   |  
Voting   |   0.9798503166378814   |   0.9714285714285714   |   0.5   |   0.6601941747572815   |  


LR   |   0

## MissForest Impute

In [43]:
# choose imputer <<comment blocks accordingly>>
'''
#### for simple
scaled_already = False
X_train_imp = simple_imp.transform(X_train)
X_test_imp = simple_imp.transform(X_test)
'''
################################# OR ################################
'''
#### for KNN
scaled_already = True
X_train_imp = knn_imp.transform(scaler.transform(X_train))
X_test_imp = knn_imp.transform(scaler.transform(X_test))
'''
################################# OR ################################

#### for missf, use saved files...
scaled_already = False
X_train_imp = np.load("y"+N+"_realmissforest_train.npy")
X_test_imp = np.load("y"+N+"_realmissforest_test.npy")


In [15]:
# !pip3 install imblearn

In [44]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier # Voting Ensemble for Classification

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [45]:
def try_all_classifiers(X_train, X_test, y_train, y_test, classifiers, sampling  = None, scaler=None):
    '''
    do all imputations before passing here...
    Classifier : array of tuples (classifier,scaling required=True/False)
    '''
    accuracy = [0]*len(classifiers)
    f1 = [0]*len(classifiers)
    precision = [0]*len(classifiers)
    recall = [0]*len(classifiers)
    i = 0
    
    model_pipeline = []
    Pipeline([
        ('sampling', SMOTE()),
        ('classification', LogisticRegression())
    ])
    
    if sampling == "SMOTE":
        model_pipeline.append(('sampling', SMOTE(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = smote.fit_resample(X_train, y_train)
        print("SMOTE")
    if sampling == "RUS":
        model_pipeline.append(('sampling', RandomUnderSampler(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = rus.fit_resample(X_train, y_train)
        print("RUS")
    if sampling == "SMOTEENN":
        model_pipeline.append(('sampling', SMOTEENN(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = smoteenn.fit_resample(X_train, y_train)
        print("SMOTEENN")    

    voting_classifs = []
    models_for_voting = [0,3,5,6,7,8]
    voting_weights = [1,0.5,1,0.5,1,1]
    jj =0 
        
    for i in range(len(classifiers)):
        classif = classifiers[i][0]
        pipe_parameters = classifiers[i][2]
        y_pred = []
        print(classifiers_names[i])
        pipeline = Pipeline(model_pipeline+[('classifier',classif)])
        
        if classifiers[i][1] and not scaled_already:
            print("\t- Requires scaling and not scaled. Doing it now...")
            grid = GridSearchCV(pipeline, pipe_parameters, cv=2, scoring="f1",n_jobs=-1,verbose=1)
            #grid = grid.fit(X_train, y_train)
            grid = grid.fit(scaler.transform(X_train), y_train)

            #classif.fit(scaler.transform(X_train), y_train)
            #y_pred = classif.predict(scaler.transform(X_test))
            
            display(grid.best_params_)
            classif = grid.best_estimator_
            y_pred = grid.predict(scaler.transform(X_test))
            #classif.fit(scaler.transform(), )
            #y_pred = classif.predict(scaler.transform(X_test))
            
        else:
            grid = GridSearchCV(pipeline, pipe_parameters, cv=2, scoring="f1",n_jobs=-1,verbose=1)
            grid.fit(X_train, y_train)
            display(grid.best_params_)
            classif = grid.best_estimator_
            y_pred = grid.predict(X_test)
            
            #classif.fit(X_train, y_train)
            #y_pred = classif.predict(X_test)
        
        if i in models_for_voting:
            print("\t- Adding for voting with weight <"+str(voting_weights[jj])+">...")
            jj+=1
            voting_classifs.append(("mod"+str(i+1),classif))
        
        accuracy[i] = metrics.accuracy_score(y_test, y_pred)
        f1[i] = metrics.f1_score(y_test, y_pred, labels=np.unique(y_pred))
        precision[i] = metrics.precision_score(y_test, y_pred)
        recall[i] = metrics.recall_score(y_test, y_pred)
    
    print("\n\nVoting...")
    # create the ensemble model
    ensemble = VotingClassifier(voting_classifs,weights=voting_weights,n_jobs=-1,voting="hard")
    ensemble.fit(scaler.transform(X_train), y_train)
    y_pred = ensemble.predict(scaler.transform(X_test))
    
    accuracy.append(metrics.accuracy_score(y_test, y_pred))
    f1.append(metrics.f1_score(y_test, y_pred, labels=np.unique(y_pred)))
    precision.append(metrics.precision_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))

    print("Done")
    return accuracy,f1,precision,recall

In [46]:
# classifiers_voting = [('log',LogisticRegression(max_iter=2048)),("knn10",KNeighborsClassifier(n_neighbors=10)),("dtc",DecisionTreeClassifier()),("svm_linear",SVC(kernel='linear',random_state=random_state)),("rf",RandomForestClassifier(n_estimators=16, n_jobs=8, random_state=random_state)),("xbg",XGBClassifier(use_label_encoder=False))]
# classifiers_voting = [("dtc",DecisionTreeClassifier()),("rf",RandomForestClassifier(n_estimators=64, n_jobs=-1, random_state=random_state)),("xbg",XGBClassifier(use_label_encoder=False))]

classifiers_names = ["LR", "LDA", "KNN-5", "KNN-10", "GNB", "DT", "SVC", "RFC", "XGB","Voting"]

classifiers = [(LogisticRegression(max_iter=2048,random_state=random_state),True,
                   [
                       #{
                       # 'classifier__penalty' : ['l1', 'l2'],
                       # 'classifier__C' : np.logspace(-8, 4, 16),
                       # 'classifier__solver' : ['liblinear']
                       # },
                        {
                        'classifier__penalty' : ['l2','none'],
                        'classifier__C' : np.logspace(-8, 4, 16)
                        }
                   ]),
                (LinearDiscriminantAnalysis(),True,
                    { 'classifier__solver' : ['svd', 'lsqr', 'eigen'] }),
                (KNeighborsClassifier(n_neighbors=5),True,
                    {'classifier__weights' : ['uniform','distance'], 'classifier__metric' : ['euclidean', 'manhattan']}), 
                (KNeighborsClassifier(n_neighbors=10),True,
                    {'classifier__weights' : ['uniform','distance'], 'classifier__metric' : ['euclidean', 'manhattan']}),
                (GaussianNB(),True,
                    {'classifier__var_smoothing': np.logspace(0,-9, num=100)}),
                (DecisionTreeClassifier(random_state=random_state),False,
                    { 'classifier__criterion':['gini','entropy'],'classifier__max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]} ),
                (SVC(random_state=random_state),True,
                    [
                        {'classifier__C': [ 0.05, 0.1, 1], 
                         'classifier__gamma': [0.0001, 1],
                         'classifier__kernel': ['rbf']},
                        {'classifier__C': [ 0.05, 0.1, 1],
                         'classifier__kernel': ['linear']}
                    ]),
                (RandomForestClassifier(random_state=random_state),False,
                     { 'classifier__n_estimators': [int(x) for x in np.linspace(start = 128, stop = 512, num = 4)],
                       'classifier__max_features': ['auto'],
                       'classifier__max_depth':  [int(x) for x in np.linspace(10, 100, num = 2)]+[None],
                       'classifier__min_samples_leaf':  [1, 4],
                       'classifier__bootstrap': [False]
                     }),
                (XGBClassifier(use_label_encoder=False),False,
                    {
                        'classifier__gamma': [0.5, 1, 2, 5],
                        'classifier__colsample_bytree': [0.6, 0.8, 1.0],
                        'classifier__max_depth': [3, 6]
                    }) ]



In [47]:
accuracy,f1,precision,recall = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, scaler=scaler)


LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:   38.6s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


{'classifier__C': 1584.8931924611175, 'classifier__penalty': 'l2'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.1s finished


	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


{'classifier__solver': 'lsqr'}

KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.8s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'distance'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.1s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'uniform'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.4s finished


{'classifier__var_smoothing': 0.1}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    8.6s finished


{'classifier__criterion': 'gini', 'classifier__max_depth': 12}

	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    5.8s finished


{'classifier__C': 0.05,
 'classifier__gamma': 0.0001,
 'classifier__kernel': 'rbf'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.8min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 10,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 1,
 'classifier__n_estimators': 384}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   39.4s finished


{'classifier__colsample_bytree': 1.0,
 'classifier__gamma': 1,
 'classifier__max_depth': 3}

	- Adding for voting with weight <1>...


Voting...
Done


#### Sampling

In [48]:
accuracy_sm,f1_sm,precision_sm,recall_sm = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "SMOTE", scaler=scaler)


SMOTE
LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:   55.2s finished


{'classifier__C': 1.0, 'classifier__penalty': 'l2'}

	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.3s finished


{'classifier__solver': 'lsqr'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    5.8s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    6.2s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    3.8s finished


{'classifier__var_smoothing': 0.0012328467394420659}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   14.3s finished


{'classifier__criterion': 'gini', 'classifier__max_depth': 9}

	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   22.8s finished


{'classifier__C': 1, 'classifier__kernel': 'linear'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  5.4min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 10,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 1,
 'classifier__n_estimators': 512}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  1.1min finished


{'classifier__colsample_bytree': 1.0,
 'classifier__gamma': 0.5,
 'classifier__max_depth': 6}

	- Adding for voting with weight <1>...


Voting...
Done


In [49]:
accuracy_rus,f1_rus,precision_rus,recall_rus = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "RUS", scaler=scaler)


RUS
LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:   16.3s finished


{'classifier__C': 1.0, 'classifier__penalty': 'l2'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.1s finished


{'classifier__solver': 'svd'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.8s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'distance'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.8s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'uniform'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.5s finished


{'classifier__var_smoothing': 0.003511191734215131}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    1.0s finished


{'classifier__criterion': 'gini', 'classifier__max_depth': 4}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.7s finished


{'classifier__C': 1, 'classifier__kernel': 'linear'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   29.7s finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 100,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 4,
 'classifier__n_estimators': 128}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    5.4s finished


{'classifier__colsample_bytree': 0.8,
 'classifier__gamma': 1,
 'classifier__max_depth': 3}

	- Adding for voting with weight <1>...


Voting...
Done


In [50]:
accuracy_smoteenn,f1_smoteenn,precision_smoteenn,recall_smoteenn = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "SMOTEENN",scaler=scaler)


SMOTEENN
LR
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:  1.6min finished


{'classifier__C': 39.810717055349855, 'classifier__penalty': 'l2'}

	- Adding for voting with weight <1>...
LDA
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    4.6s finished


{'classifier__solver': 'lsqr'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNN-5
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   10.8s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

KNN-10
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   11.1s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'uniform'}

	- Adding for voting with weight <0.5>...
GNB
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.5min finished


{'classifier__var_smoothing': 1.519911082952933e-06}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   29.5s finished


{'classifier__criterion': 'gini', 'classifier__max_depth': 10}

	- Adding for voting with weight <1>...
SVC
	- Requires scaling and not scaled. Doing it now...
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   29.4s finished


{'classifier__C': 0.05, 'classifier__kernel': 'linear'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.4min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 10,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 4,
 'classifier__n_estimators': 128}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   54.3s finished


{'classifier__colsample_bytree': 0.8,
 'classifier__gamma': 5,
 'classifier__max_depth': 3}

	- Adding for voting with weight <1>...


Voting...
Done


In [51]:
print("Imputer & Sampling & Metric & ",end = "")
print(*classifiers_names,sep = " & ", end = " \\\\\n")
print("\\hline \\hline")
print("MissForest & No & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")

print("~ & SMOTE & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_sm],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")

print("~ & RUS & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_rus],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")


print("~ & SMOTE- & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ENN & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_smoteenn],sep=" & ", end = " \\\\\n")
print("\\hline\\hline")




Imputer & Sampling & Metric & LR & LDA & KNN-5 & KNN-10 & GNB & DT & SVC & RFC & XGB & Voting \\
\hline \hline
MissForest & No & Acc & 0.96 & 0.96 & 0.96 & 0.96 & 0.95 & 0.94 & 0.96 & 0.96 & 0.96 & 0.96 \\
~ & ~ & Prec & 0.17 & 0.00 & 0.00 & 0.00 & 0.09 & 0.22 & 0.00 & 0.00 & 0.80 & 0.00 \\
~ & ~ & Rec & 0.01 & 0.00 & 0.00 & 0.00 & 0.03 & 0.21 & 0.00 & 0.00 & 0.06 & 0.00 \\
~ & ~ & F1 & 0.03 & 0.00 & 0.00 & 0.00 & 0.04 & 0.21 & 0.00 & 0.00 & 0.11 & 0.00 \\
\cline{2-13}
~ & SMOTE & Acc & 0.87 & 0.94 & 0.82 & 0.82 & 0.07 & 0.84 & 0.93 & 0.95 & 0.94 & 0.95 \\
~ & ~ & Prec & 0.13 & 0.10 & 0.11 & 0.10 & 0.04 & 0.11 & 0.18 & 0.32 & 0.31 & 0.33 \\
~ & ~ & Rec & 0.40 & 0.06 & 0.50 & 0.46 & 0.99 & 0.44 & 0.21 & 0.28 & 0.40 & 0.37 \\
~ & ~ & F1 & 0.20 & 0.07 & 0.18 & 0.17 & 0.08 & 0.17 & 0.19 & 0.30 & 0.35 & 0.35 \\
\cline{2-13}
~ & RUS & Acc & 0.92 & 0.87 & 0.79 & 0.87 & 0.91 & 0.80 & 0.95 & 0.86 & 0.85 & 0.91 \\
~ & ~ & Prec & 0.16 & 0.12 & 0.09 & 0.07 & 0.04 & 0.08 & 0.09 & 0.16 & 0.16 & 0.17

In [52]:
print('''
| classifier          | Accuracy | Precision | Recall | F1 score |
| =================== | ======== | ========= | ====== | ======== |''')
for c,a,p,r,f in zip(classifiers_names,accuracy,precision,recall,f1):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
    
print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_sm,precision_sm,recall_sm,f1_sm):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")

print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_rus,precision_rus,recall_rus,f1_rus):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
  
print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_smoteenn,precision_smoteenn,recall_smoteenn,f1_smoteenn):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
 


| classifier          | Accuracy | Precision | Recall | F1 score |
LR   |   0.9585492227979274   |   0.16666666666666666   |   0.014705882352941176   |   0.02702702702702703   |  
LDA   |   0.9556706966033391   |   0.0   |   0.0   |   0.0   |  
KNN-5   |   0.9579735175590098   |   0.0   |   0.0   |   0.0   |  
KNN-10   |   0.9608520437535981   |   0.0   |   0.0   |   0.0   |  
GNB   |   0.9504893494530801   |   0.09090909090909091   |   0.029411764705882353   |   0.04444444444444444   |  
DT   |   0.9401266551525619   |   0.21875   |   0.20588235294117646   |   0.21212121212121213   |  
SVC   |   0.9608520437535981   |   0.0   |   0.0   |   0.0   |  
RFC   |   0.9602763385146805   |   0.0   |   0.0   |   0.0   |  
XGB   |   0.9625791594703512   |   0.8   |   0.058823529411764705   |   0.1095890410958904   |  
Voting   |   0.9602763385146805   |   0.0   |   0.0   |   0.0   |  


LR   |   0.8739205526770294   |   0.13170731707317074   |   0.39705882352941174   |   0.19780219780219777   

## KNN Impute

In [53]:
# choose imputer <<comment blocks accordingly>>
'''
#### for simple
scaled_already = False
X_train_imp = simple_imp.transform(X_train)
X_test_imp = simple_imp.transform(X_test)
'''
################################# OR ################################

#### for KNN
scaled_already = True
X_train_imp = knn_imp.transform(scaler.transform(X_train))
X_test_imp = knn_imp.transform(scaler.transform(X_test))

################################# OR ################################
'''
#### for missf, use saved files...
scaled_already = False
X_train_imp = np.load("y"+N+"_realmissforest_train.npy")
X_test_imp = np.load("y"+N+"_realmissforest_test.npy")
'''

[IterativeImputer] Completing matrix with shape (5208, 48)
[IterativeImputer] Completing matrix with shape (1737, 48)


'\n#### for missf, use saved files...\nscaled_already = False\nX_train_imp = np.load("y"+N+"_realmissforest_train.npy")\nX_test_imp = np.load("y"+N+"_realmissforest_test.npy")\n'

In [None]:
# !pip3 install imblearn

In [54]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier # Voting Ensemble for Classification

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [55]:
def try_all_classifiers(X_train, X_test, y_train, y_test, classifiers, sampling  = None, scaler=None):
    '''
    do all imputations before passing here...
    Classifier : array of tuples (classifier,scaling required=True/False)
    '''
    accuracy = [0]*len(classifiers)
    f1 = [0]*len(classifiers)
    precision = [0]*len(classifiers)
    recall = [0]*len(classifiers)
    i = 0
    
    model_pipeline = []
    Pipeline([
        ('sampling', SMOTE()),
        ('classification', LogisticRegression())
    ])
    
    if sampling == "SMOTE":
        model_pipeline.append(('sampling', SMOTE(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = smote.fit_resample(X_train, y_train)
        print("SMOTE")
    if sampling == "RUS":
        model_pipeline.append(('sampling', RandomUnderSampler(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = rus.fit_resample(X_train, y_train)
        print("RUS")
    if sampling == "SMOTEENN":
        model_pipeline.append(('sampling', SMOTEENN(sampling_strategy=0.6,random_state=random_state) ))
        # X_train, y_train = smoteenn.fit_resample(X_train, y_train)
        print("SMOTEENN")    

    voting_classifs = []
    models_for_voting = [0,3,5,6,7,8]
    voting_weights = [1,0.5,1,0.5,1,1]
    jj =0 
        
    for i in range(len(classifiers)):
        classif = classifiers[i][0]
        pipe_parameters = classifiers[i][2]
        y_pred = []
        print(classifiers_names[i])
        pipeline = Pipeline(model_pipeline+[('classifier',classif)])
        
        if classifiers[i][1] and not scaled_already:
            print("\t- Requires scaling and not scaled. Doing it now...")
            grid = GridSearchCV(pipeline, pipe_parameters, cv=2, scoring="f1",n_jobs=-1,verbose=1)
            #grid = grid.fit(X_train, y_train)
            grid = grid.fit(scaler.transform(X_train), y_train)

            #classif.fit(scaler.transform(X_train), y_train)
            #y_pred = classif.predict(scaler.transform(X_test))
            
            display(grid.best_params_)
            classif = grid.best_estimator_
            y_pred = grid.predict(scaler.transform(X_test))
            #classif.fit(scaler.transform(), )
            #y_pred = classif.predict(scaler.transform(X_test))
            
        else:
            grid = GridSearchCV(pipeline, pipe_parameters, cv=2, scoring="f1",n_jobs=-1,verbose=1)
            grid.fit(X_train, y_train)
            display(grid.best_params_)
            classif = grid.best_estimator_
            y_pred = grid.predict(X_test)
            
            #classif.fit(X_train, y_train)
            #y_pred = classif.predict(X_test)
        
        if i in models_for_voting:
            print("\t- Adding for voting with weight <"+str(voting_weights[jj])+">...")
            jj+=1
            voting_classifs.append(("mod"+str(i+1),classif))
        
        accuracy[i] = metrics.accuracy_score(y_test, y_pred)
        f1[i] = metrics.f1_score(y_test, y_pred, labels=np.unique(y_pred))
        precision[i] = metrics.precision_score(y_test, y_pred)
        recall[i] = metrics.recall_score(y_test, y_pred)
    
    print("\n\nVoting...")
    # create the ensemble model
    ensemble = VotingClassifier(voting_classifs,weights=voting_weights,n_jobs=-1,voting="hard")
    ensemble.fit(scaler.transform(X_train), y_train)
    y_pred = ensemble.predict(scaler.transform(X_test))
    
    accuracy.append(metrics.accuracy_score(y_test, y_pred))
    f1.append(metrics.f1_score(y_test, y_pred, labels=np.unique(y_pred)))
    precision.append(metrics.precision_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))

    print("Done")
    return accuracy,f1,precision,recall

In [56]:
# classifiers_voting = [('log',LogisticRegression(max_iter=2048)),("knn10",KNeighborsClassifier(n_neighbors=10)),("dtc",DecisionTreeClassifier()),("svm_linear",SVC(kernel='linear',random_state=random_state)),("rf",RandomForestClassifier(n_estimators=16, n_jobs=8, random_state=random_state)),("xbg",XGBClassifier(use_label_encoder=False))]
# classifiers_voting = [("dtc",DecisionTreeClassifier()),("rf",RandomForestClassifier(n_estimators=64, n_jobs=-1, random_state=random_state)),("xbg",XGBClassifier(use_label_encoder=False))]

classifiers_names = ["LR", "LDA", "KNN-5", "KNN-10", "GNB", "DT", "SVC", "RFC", "XGB","Voting"]

classifiers = [(LogisticRegression(max_iter=2048,random_state=random_state),True,
                   [
                       #{
                       # 'classifier__penalty' : ['l1', 'l2'],
                       # 'classifier__C' : np.logspace(-8, 4, 16),
                       # 'classifier__solver' : ['liblinear']
                       # },
                        {
                        'classifier__penalty' : ['l2','none'],
                        'classifier__C' : np.logspace(-8, 4, 16)
                        }
                   ]),
                (LinearDiscriminantAnalysis(),True,
                    { 'classifier__solver' : ['svd', 'lsqr', 'eigen'] }),
                (KNeighborsClassifier(n_neighbors=5),True,
                    {'classifier__weights' : ['uniform','distance'], 'classifier__metric' : ['euclidean', 'manhattan']}), 
                (KNeighborsClassifier(n_neighbors=10),True,
                    {'classifier__weights' : ['uniform','distance'], 'classifier__metric' : ['euclidean', 'manhattan']}),
                (GaussianNB(),True,
                    {'classifier__var_smoothing': np.logspace(0,-9, num=100)}),
                (DecisionTreeClassifier(random_state=random_state),False,
                    { 'classifier__criterion':['gini','entropy'],'classifier__max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]} ),
                (SVC(random_state=random_state),True,
                    [
                        {'classifier__C': [ 0.05, 0.1, 1], 
                         'classifier__gamma': [0.0001, 1],
                         'classifier__kernel': ['rbf']},
                        {'classifier__C': [ 0.05, 0.1, 1],
                         'classifier__kernel': ['linear']}
                    ]),
                (RandomForestClassifier(random_state=random_state),False,
                     { 'classifier__n_estimators': [int(x) for x in np.linspace(start = 128, stop = 512, num = 4)],
                       'classifier__max_features': ['auto'],
                       'classifier__max_depth':  [int(x) for x in np.linspace(10, 100, num = 2)]+[None],
                       'classifier__min_samples_leaf':  [1, 4],
                       'classifier__bootstrap': [False]
                     }),
                (XGBClassifier(use_label_encoder=False),False,
                    {
                        'classifier__gamma': [0.5, 1, 2, 5],
                        'classifier__colsample_bytree': [0.6, 0.8, 1.0],
                        'classifier__max_depth': [3, 6]
                    }) ]



In [57]:
accuracy,f1,precision,recall = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, scaler=scaler)


LR
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:   25.6s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:   33.9s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


{'classifier__C': 1e-08, 'classifier__penalty': 'none'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.2s finished


	- Adding for voting with weight <1>...
LDA
Fitting 2 folds for each of 3 candidates, totalling 6 fits


{'classifier__solver': 'svd'}

KNN-5
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.7s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'distance'}

KNN-10
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.9s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'uniform'}

	- Adding for voting with weight <0.5>...
GNB
Fitting 2 folds for each of 100 candidates, totalling 200 fits


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.4s finished


{'classifier__var_smoothing': 0.0005336699231206307}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    8.3s finished


{'classifier__criterion': 'gini', 'classifier__max_depth': 20}

	- Adding for voting with weight <1>...
SVC
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    4.9s finished


{'classifier__C': 0.05,
 'classifier__gamma': 0.0001,
 'classifier__kernel': 'rbf'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.7min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 100,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 1,
 'classifier__n_estimators': 512}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   40.1s finished


{'classifier__colsample_bytree': 1.0,
 'classifier__gamma': 0.5,
 'classifier__max_depth': 6}

	- Adding for voting with weight <1>...


Voting...
Done


#### Sampling

In [58]:
accuracy_sm,f1_sm,precision_sm,recall_sm = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "SMOTE", scaler=scaler)


SMOTE
LR
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:   34.3s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:   50.9s finished


{'classifier__C': 1.0, 'classifier__penalty': 'l2'}

	- Adding for voting with weight <1>...
LDA
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.3s finished


{'classifier__solver': 'lsqr'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNN-5
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    5.5s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'distance'}

KNN-10
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    5.7s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

	- Adding for voting with weight <0.5>...
GNB
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    3.8s finished


{'classifier__var_smoothing': 1.519911082952933e-07}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   13.9s finished


{'classifier__criterion': 'gini', 'classifier__max_depth': 30}

	- Adding for voting with weight <1>...
SVC
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   22.2s finished


{'classifier__C': 1, 'classifier__kernel': 'linear'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  5.4min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 10,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 4,
 'classifier__n_estimators': 512}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  1.1min finished


{'classifier__colsample_bytree': 1.0,
 'classifier__gamma': 2,
 'classifier__max_depth': 6}

	- Adding for voting with weight <1>...


Voting...




Done


In [59]:
accuracy_rus,f1_rus,precision_rus,recall_rus = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "RUS", scaler=scaler)


RUS
LR
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:   15.5s finished


{'classifier__C': 1.0, 'classifier__penalty': 'l2'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.1s finished


	- Adding for voting with weight <1>...
LDA
Fitting 2 folds for each of 3 candidates, totalling 6 fits




{'classifier__solver': 'svd'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNN-5
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.8s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'distance'}

KNN-10
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.8s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'distance'}

	- Adding for voting with weight <0.5>...
GNB
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.5s finished


{'classifier__var_smoothing': 0.0001873817422860383}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    1.1s finished


{'classifier__criterion': 'entropy', 'classifier__max_depth': 4}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <1>...
SVC
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.7s finished


{'classifier__C': 1, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   29.6s finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 100,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 4,
 'classifier__n_estimators': 128}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    5.3s finished


{'classifier__colsample_bytree': 0.6,
 'classifier__gamma': 5,
 'classifier__max_depth': 3}

	- Adding for voting with weight <1>...


Voting...
Done


In [60]:
accuracy_smoteenn,f1_smoteenn,precision_smoteenn,recall_smoteenn = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "SMOTEENN",scaler=scaler)


SMOTEENN
LR
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   57.6s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:  1.4min finished


{'classifier__C': 0.025118864315095822, 'classifier__penalty': 'l2'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


	- Adding for voting with weight <1>...
LDA
Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    4.3s finished


{'classifier__solver': 'svd'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNN-5
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    9.7s finished


{'classifier__metric': 'manhattan', 'classifier__weights': 'uniform'}

KNN-10
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    9.9s finished


{'classifier__metric': 'euclidean', 'classifier__weights': 'uniform'}

	- Adding for voting with weight <0.5>...
GNB
Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.3min finished


{'classifier__var_smoothing': 8.111308307896856e-09}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DT
Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  1.1min finished


{'classifier__criterion': 'gini', 'classifier__max_depth': 8}

	- Adding for voting with weight <1>...
SVC
Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   30.2s finished


{'classifier__C': 0.1, 'classifier__kernel': 'linear'}

	- Adding for voting with weight <0.5>...
RFC
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  5.0min finished


{'classifier__bootstrap': False,
 'classifier__max_depth': 10,
 'classifier__max_features': 'auto',
 'classifier__min_samples_leaf': 4,
 'classifier__n_estimators': 384}

	- Adding for voting with weight <1>...
XGB
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  1.5min finished


{'classifier__colsample_bytree': 1.0,
 'classifier__gamma': 2,
 'classifier__max_depth': 6}

	- Adding for voting with weight <1>...


Voting...
Done


In [61]:
print("Imputer & Sampling & Metric & ",end = "")
print(*classifiers_names,sep = " & ", end = " \\\\\n")
print("\\hline \\hline")
print("KNN & No & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")

print("~ & SMOTE & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_sm],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_sm],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")

print("~ & RUS & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_rus],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_rus],sep=" & ", end = " \\\\\n")
print("\cline{2-13}")


print("~ & SMOTE- & Acc & ",end="")
print(*['%.2f' % elem for elem in accuracy_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ENN & Prec & ",end="")
print(*['%.2f' % elem for elem in precision_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ~ & Rec & ",end="")
print(*['%.2f' % elem for elem in recall_smoteenn],sep=" & ", end = " \\\\\n")
print("~ & ~ & F1 & ",end="")
print(*['%.2f' % elem for elem in f1_smoteenn],sep=" & ", end = " \\\\\n")
print("\\hline\\hline")




Imputer & Sampling & Metric & LR & LDA & KNN-5 & KNN-10 & GNB & DT & SVC & RFC & XGB & Voting \\
\hline \hline
KNN & No & Acc & 0.96 & 0.96 & 0.96 & 0.96 & 0.07 & 0.94 & 0.96 & 0.96 & 0.96 & 0.96 \\
~ & ~ & Prec & 0.00 & 0.00 & 0.17 & 0.00 & 0.04 & 0.25 & 0.00 & 0.17 & 0.77 & 0.00 \\
~ & ~ & Rec & 0.00 & 0.00 & 0.01 & 0.00 & 0.99 & 0.25 & 0.00 & 0.01 & 0.15 & 0.00 \\
~ & ~ & F1 & 0.00 & 0.00 & 0.03 & 0.00 & 0.08 & 0.25 & 0.00 & 0.03 & 0.25 & 0.00 \\
\cline{2-13}
~ & SMOTE & Acc & 0.87 & 0.94 & 0.83 & 0.83 & 0.09 & 0.89 & 0.93 & 0.94 & 0.95 & 0.95 \\
~ & ~ & Prec & 0.13 & 0.11 & 0.10 & 0.11 & 0.04 & 0.15 & 0.18 & 0.26 & 0.39 & 0.34 \\
~ & ~ & Rec & 0.41 & 0.07 & 0.40 & 0.50 & 0.96 & 0.40 & 0.25 & 0.32 & 0.46 & 0.24 \\
~ & ~ & F1 & 0.20 & 0.09 & 0.15 & 0.18 & 0.08 & 0.22 & 0.21 & 0.29 & 0.42 & 0.28 \\
\cline{2-13}
~ & RUS & Acc & 0.91 & 0.86 & 0.79 & 0.82 & 0.12 & 0.84 & 0.92 & 0.87 & 0.86 & 0.91 \\
~ & ~ & Prec & 0.15 & 0.12 & 0.09 & 0.09 & 0.04 & 0.12 & 0.10 & 0.18 & 0.17 & 0.18 \\
~ &

In [62]:
print('''
| classifier          | Accuracy | Precision | Recall | F1 score |
| =================== | ======== | ========= | ====== | ======== |''')
for c,a,p,r,f in zip(classifiers_names,accuracy,precision,recall,f1):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
    
print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_sm,precision_sm,recall_sm,f1_sm):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")

print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_rus,precision_rus,recall_rus,f1_rus):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
  
print("\n===============================================================\n")
for c,a,p,r,f in zip(classifiers_names,accuracy_smoteenn,precision_smoteenn,recall_smoteenn,f1_smoteenn):
    print(c,"  |  ",a,"  |  ",p,"  |  ",r,"  |  ",f,"  |  ")
 


| classifier          | Accuracy | Precision | Recall | F1 score |
LR   |   0.9573978123200921   |   0.0   |   0.0   |   0.0   |  
LDA   |   0.9579735175590098   |   0.0   |   0.0   |   0.0   |  
KNN-5   |   0.9585492227979274   |   0.16666666666666666   |   0.014705882352941176   |   0.02702702702702703   |  
KNN-10   |   0.9608520437535981   |   0.0   |   0.0   |   0.0   |  
GNB   |   0.07484168105929764   |   0.04004781829049611   |   0.9852941176470589   |   0.07696726019529006   |  
DT   |   0.9407023603914796   |   0.2463768115942029   |   0.25   |   0.24817518248175185   |  
SVC   |   0.9608520437535981   |   0.0   |   0.0   |   0.0   |  
RFC   |   0.9585492227979274   |   0.16666666666666666   |   0.014705882352941176   |   0.02702702702702703   |  
XGB   |   0.9648819804260219   |   0.7692307692307693   |   0.14705882352941177   |   0.2469135802469136   |  
Voting   |   0.9602763385146805   |   0.0   |   0.0   |   0.0   |  


LR   |   0.8721934369602763   |   0.13333333333333

# ------
# EOF
