# Model Building

Putting it all together.

## Setting up

In [39]:
import os, sys
import random
import warnings

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

#For the plots
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

from sklearn import set_config

set_config(display="diagram")

Set the random seed.

In [40]:
# Single seed shared by every stochastic step in the notebook.
random_state = 42

# Seed the global generators too: scikit-learn estimators created without an
# explicit `random_state` draw from NumPy's global RNG, so without this call
# their results change from run to run. Note that the seeding functions must be
# *called*; assigning to `np.random.seed` would replace the function itself.
random.seed(random_state)
np.random.seed(random_state)

# Dedicated Generator for code that accepts an explicit RNG object.
rng = np.random.default_rng(random_state)

In [41]:
# Read the processed train and test tables and discard their `id` column.
train = pd.read_csv('../data/processed/train.csv').drop(columns=['id'])
test = pd.read_csv('../data/processed/test.csv').drop(columns=['id'])

## Importing Data

In [42]:
# Name of the target column.
col_y = 'song_popularity'

# Separate the target from the features; `train` itself is left untouched.
y = train[col_y].copy()
X = train.drop(columns=[col_y])

In [43]:
# Columns with fewer than 15 distinct values are treated as categorical;
# every remaining column is treated as numerical.
mask = X.nunique().lt(15)
categorical_cols = X.columns[mask]
numerical_cols = X.columns[~mask].tolist()

In [44]:
# Cast `key` to a nullable integer, mark the low-cardinality columns as
# categoricals, then one-hot encode them and print a summary of the result.
X = (
    X.assign(key=X['key'].astype("Int64"))
     .astype({name: 'category' for name in categorical_cols})
     .pipe(pd.get_dummies)
)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   song_duration_ms  35899 non-null  float64
 1   acousticness      36008 non-null  float64
 2   danceability      35974 non-null  float64
 3   energy            36025 non-null  float64
 4   instrumentalness  36015 non-null  float64
 5   liveness          35914 non-null  float64
 6   loudness          36043 non-null  float64
 7   speechiness       40000 non-null  float64
 8   tempo             40000 non-null  float64
 9   audio_valence     40000 non-null  float64
 10  key_0             40000 non-null  uint8  
 11  key_1             40000 non-null  uint8  
 12  key_2             40000 non-null  uint8  
 13  key_3             40000 non-null  uint8  
 14  key_4             40000 non-null  uint8  
 15  key_5             40000 non-null  uint8  
 16  key_6             40000 non-null  uint8 

# Splitting data

In [45]:
# Hold out 20% of the rows as a test set, then split 20% of the remainder off
# as a validation set. Both splits are stratified on the target and share the
# notebook-wide seed.
from sklearn.model_selection import train_test_split

_split_options = {"test_size": 0.2, "random_state": random_state}

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, stratify=y, **_split_options
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, stratify=y_train_full, **_split_options
)

In [46]:
# Shapes of the full-train, train, validation and test feature frames.
tuple(part.shape for part in (X_train_full, X_train, X_val, X_test))

((32000, 28), (25600, 28), (6400, 28), (8000, 28))

## Importing modeling & pre/post-processing libraries

In [9]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.experimental import enable_iterative_imputer, enable_halving_search_cv
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator, IterativeImputer, MissingIndicator
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer, MinMaxScaler, PowerTransformer, RobustScaler, power_transform, minmax_scale
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.metrics import roc_auc_score, make_scorer, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_curve, auc, classification_report

# import classifier models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, LogisticRegressionCV, SGDClassifier, SGDRegressor, Ridge, LassoLarsCV, LassoLars, BayesianRidge, RidgeCV
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.svm import SVC, NuSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis


# Add the "kuma_utils/" directory to the import search path, then import
# LGBMImputer from it. The append must run before the import below.
# NOTE(review): the path is relative to the current working directory —
# presumably a kuma_utils checkout sitting next to this notebook; confirm.
sys.path.append("kuma_utils/")
from kuma_utils.preprocessing.imputer import LGBMImputer

## Building preprocessors

In [21]:
# columns used for inverse sigmoid transformation
from sklearn.preprocessing import MinMaxScaler, minmax_scale, PowerTransformer, FunctionTransformer
# from sklearn_pandas import DataFrameMapper

# Features that are rescaled into (0, 1), passed through the inverse sigmoid
# and then power-transformed by PreProcessorTransformer.
col_sig = [
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'speechiness', 'audio_valence',
]

# Features that only receive the power transform.
col_pow = ['song_duration_ms', 'tempo']

# make a custom transformer to transform the data
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.base import BaseEstimator, TransformerMixin

class PreProcessorTransformer(BaseEstimator, TransformerMixin):
    """Reshape the distribution of selected numeric columns.

    * ``col_sig`` columns are min-max scaled into ``[1e-6, 1 - 1e-6]``, mapped
      through the inverse sigmoid ``log(x / (1 - x))`` and power-transformed.
    * ``col_pow`` columns are power-transformed only.
    * the ``loudness`` column is replaced by ``log1p(-loudness)``.

    Parameters
    ----------
    cols : sequence of three items
        ``(col_sig, col_pow, loudness)``: two lists of column names followed by
        the name of the loudness column.
    show_impute : bool, default=False
        Stored on the instance; not used by this class.
    inplace : bool, default=False
        When False, ``transform`` operates on a copy of its input frame.
    """

    def __init__(self, cols, show_impute=False, inplace=False):
        self.show_impute = show_impute
        self.inplace = inplace
        # The constructor argument is stored unmodified: get_params()/clone()
        # rebuild the estimator from `self.cols`, which therefore has to be the
        # same three-item value that was passed in.
        self.cols = cols
        self.col_sig, self.col_pow, self.loudness = cols
        self.inv_sigmoid = FunctionTransformer(lambda x: np.log(x / (1-x)))
        self.loudness_transformer = FunctionTransformer(lambda x: np.log1p(-x))

    def fit(self, X, y=None):
        """Learn the scaling range and the power-transform parameters from ``X``.

        Returns
        -------
        self
        """
        power_cols = self.col_sig + self.col_pow

        # clip=True keeps values outside the fitted range inside the target
        # interval at transform time, so the inverse sigmoid never receives a
        # value <= 0 or >= 1 (which would yield NaN or +/-inf).
        self.scaler = MinMaxScaler(
            feature_range=(0+1e-6, 1-1e-6), clip=True
        ).fit(X[self.col_sig])

        # The power transform is fitted on the data as it will look at
        # transform time, i.e. after scaling and the inverse sigmoid.
        X_tmp = X[power_cols].copy()
        X_tmp[self.col_sig] = self.scaler.transform(X_tmp[self.col_sig])
        X_tmp[self.col_sig] = self.inv_sigmoid.fit_transform(X_tmp[self.col_sig])

        self.transformer = PowerTransformer().fit(X_tmp)

        return self

    def transform(self, X, y=None):
        """Apply the fitted transformations.

        Returns
        -------
        DataFrame, or ``(DataFrame, y)`` when ``y`` is given
            ``y`` is passed through unchanged.
        """
        if not self.inplace:
            X = X.copy()

        power_cols = self.col_sig + self.col_pow

        X[self.col_sig] = self.scaler.transform(X[self.col_sig])
        X[self.col_sig] = self.inv_sigmoid.fit_transform(X[self.col_sig])
        # PowerTransformer() uses its default method, Yeo-Johnson (not Box-Cox).
        X[power_cols] = self.transformer.transform(X[power_cols])
        X[self.loudness] = self.loudness_transformer.fit_transform(X[self.loudness])

        if y is None:
            return X
        return X, y

class OutlierRemover(BaseEstimator, TransformerMixin):
    """Drop the rows that an outlier detector labels ``-1``.

    Parameters
    ----------
    outlier_remover : callable
        Detector class (or factory); it is instantiated with ``**kwrgs``.
    **kwrgs
        Keyword arguments forwarded to ``outlier_remover``.

    Notes
    -----
    Unlike a regular transformer, ``transform`` returns fewer rows than it
    receives, and returns ``(X, y)`` when ``y`` is supplied.
    """

    def __init__(self, outlier_remover, **kwrgs):
        self.outlier_remover = outlier_remover(**kwrgs)

    def fit(self, X, y=None):
        """Fit the detector when it can later score unseen rows.

        Detectors exposing ``predict`` are fitted here once, so ``transform``
        does not refit them on whatever data it is handed. Detectors without
        ``predict`` can only label the data they are fitted on, so for those
        nothing is fitted here and ``transform`` keeps using ``fit_predict``.
        """
        self.uses_fitted_detector_ = hasattr(self.outlier_remover, "predict")
        if self.uses_fitted_detector_:
            self.outlier_remover.fit(X)
        return self

    def transform(self, X, y=None):
        """Return ``X`` (and ``y``, if given) without the rows labelled ``-1``."""
        # getattr default: calling transform without a prior fit still works
        # and behaves like a fit_predict on the given data.
        if getattr(self, "uses_fitted_detector_", False):
            y_hat = self.outlier_remover.predict(X)
        else:
            y_hat = self.outlier_remover.fit_predict(X)
        mask = np.asarray(y_hat) != -1

        X_kept = X.iloc[mask, :]
        if y is None:
            return X_kept

        # Filter y by position so that it stays row-aligned with X whatever
        # its index (or container type) is.
        y_kept = y.iloc[mask] if hasattr(y, "iloc") else np.asarray(y)[mask]
        return X_kept, y_kept

class Imputer(BaseEstimator, TransformerMixin):
    """Wrap an imputer so that it returns a labelled DataFrame.

    Parameters
    ----------
    imputer : callable
        Imputer class (or factory); it is instantiated with ``**kwrgs``.
    **kwrgs
        Keyword arguments forwarded to ``imputer``. When ``add_indicator`` is
        truthy, one ``_<column>_imputed_`` name is generated for every column
        that contains missing values in the data passed to ``fit``.
    """

    def __init__(self, imputer, **kwrgs):
        self.add_indicator = kwrgs.get('add_indicator', False)
        self.imputer = imputer(**kwrgs)

    def fit(self, X, y=None):
        """Fit the wrapped imputer on ``X`` and record the output column names."""
        # The imputer is fitted here, once. Refitting it inside transform()
        # would learn the imputation from validation/test data, and with
        # add_indicator the number of indicator columns would then depend on
        # the data being transformed instead of matching the names below.
        self.imputer.fit(X)

        columns = X.columns
        if self.add_indicator:
            has_missing = X.isna().any(axis=0)
            indicator_names = [
                "_"+name+"_imputed_" for name in has_missing[has_missing].index
            ]
            self.imputed_col_names = columns.tolist() + indicator_names
        else:
            self.imputed_col_names = columns

        return self

    def transform(self, X, y=None):
        """Impute ``X`` with the fitted imputer.

        Returns
        -------
        DataFrame, or ``(DataFrame, y)`` when ``y`` is given
            The frame keeps the row index of ``X`` so it stays aligned with
            ``y``; ``y`` is passed through unchanged.
        """
        imputed = self.imputer.transform(X)
        row_index = X.index
        X = pd.DataFrame(imputed, columns=self.imputed_col_names)
        # Relabel rows positionally (no reindexing) with the caller's index.
        X.index = row_index

        if y is None:
            return X
        return X, y


# The three preprocessing stages: iterative imputation with missing-value
# indicators, distribution reshaping, and local-outlier-factor row removal.
imputer = Imputer(imputer=IterativeImputer, add_indicator=True, max_iter=10)
transformer = PreProcessorTransformer(cols=[col_sig, col_pow, 'loudness'])
outlier_remover = OutlierRemover(
    outlier_remover=LocalOutlierFactor,
    n_neighbors=20,
    contamination=0.02,
)

In [22]:
# Chain the three stages in the order they must run.
_preprocessor_steps = [
    ("na-imputation", imputer),
    ("transformation", transformer),
    ('outlier remover', outlier_remover),
]
preprocessor_pipe = Pipeline(steps=_preprocessor_steps)
preprocessor_pipe

In [12]:
# Fit the preprocessing pipeline on the full training frame, transform that
# same frame and preview the first rows of the result.
X_preprocessed = preprocessor_pipe.fit(X_train_full).transform(X_train_full)
X_preprocessed.head()

Unnamed: 0,song_duration_ms,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,audio_valence,...,time_signature_3,time_signature_4,time_signature_5,_song_duration_ms_imputed_,_acousticness_imputed_,_danceability_imputed_,_energy_imputed_,_instrumentalness_imputed_,_liveness_imputed_,_loudness_imputed_
0,1.270568,-1.147494,0.528199,0.350624,-0.321015,0.802553,1.820978,0.919715,0.288723,-1.055973,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.550201,-1.022922,-1.349817,0.118358,-0.403108,-0.137582,2.413711,1.906215,-0.712463,-0.393752,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.270879,-0.687957,-1.654322,1.975529,-0.241883,-0.693959,1.812806,0.535059,0.363934,-1.347403,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,-1.001279,1.729995,-1.380073,-1.755755,-0.174952,0.49985,2.632064,-0.730326,0.397858,-0.527553,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.504512,1.150465,1.167045,-0.650929,-0.398942,1.314107,2.139182,-0.561565,-1.747482,-0.853085,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Custom transformations for each column

#### Discretizing `instrumentalness`

In [23]:
from sklearn.cluster import KMeans

# for modeling
class Descretizer(BaseEstimator, TransformerMixin):
    """Turn one numeric column into discrete labels with a fitted model.

    Parameters
    ----------
    descritizer : callable
        Model class (or factory) exposing ``fit`` and ``predict``; it is
        instantiated with ``**kwrgs``.
    variable : str
        Name of the column to discretize.
    inplace : bool, default=False
        True: the labels overwrite ``variable``. False: the labels are written
        to a new column named ``<variable>_descrete``.
    **kwrgs
        Keyword arguments forwarded to ``descritizer``.
    """

    def __init__(self, descritizer, variable, inplace=False, **kwrgs):
        self.variable = variable
        self.inplace = inplace
        self.descritizer = descritizer(**kwrgs)

    def fit(self, X, y=None):
        """Fit the wrapped model on the single column ``variable``."""
        self.descritizer.fit(X[[self.variable]])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` carrying the predicted labels.

        Returns
        -------
        DataFrame, or ``(DataFrame, y)`` when ``y`` is given
            ``y`` is passed through unchanged.
        """
        # Work on a copy: the caller's frame must not change as a side effect,
        # and the input may itself be a slice of another frame.
        X = X.copy()

        labels = self.descritizer.predict(X[[self.variable]])
        target_col = self.variable if self.inplace else self.variable+"_descrete"
        X[target_col] = labels

        if y is None:
            return X
        return X, y

In [24]:
# Replace `instrumentalness` with one of five KMeans cluster labels, and append
# that step to the preprocessing pipeline.
descretizer = Descretizer(
    descritizer=KMeans,
    variable='instrumentalness',
    inplace=True,
    n_clusters=5,
    random_state=0,
)

pipe = Pipeline(steps=[
    ('preprocessor_pipe', preprocessor_pipe),
    ('descretizer_pipe', descretizer),
])
pipe

In [26]:
# Fit the combined pipeline, then try to push X and y through it together.
# NOTE: this cell fails. `pipe.transform` accepts only X, so passing
# `y_train_full` as a second positional argument raises the TypeError shown in
# this cell's output. The following cell works around it by running the
# imputation/transformation pipeline, the outlier remover and the discretizer
# one after another.
pipe.fit(X_train_full, y_train_full)
X_train_preprocessed, y_train_preprocessed = pipe.transform(X_train_full, y_train_full)
X_train_preprocessed, y_train_preprocessed

TypeError: transform() takes 2 positional arguments but 3 were given

In [27]:
# Stage 1: imputation + transformation only. The outlier step is kept out of
# the pipeline because it has to filter X and y together.
preprocessor_pipe = Pipeline(steps=[
    ("na-imputation", imputer),
    ("transformation", transformer),
])
X_preprocessed = preprocessor_pipe.fit(X_train_full).transform(X_train_full)

# Stage 2: drop the rows flagged as outliers from the features and the target.
outlier_remover = OutlierRemover(LocalOutlierFactor, n_neighbors=20, contamination=0.02)
X_train_preprocessed, y_train_preprocessed = (
    outlier_remover
    .fit(X_preprocessed, y_train_full)
    .transform(X_preprocessed, y_train_full)
)

# Stage 3: replace `instrumentalness` with its KMeans cluster label.
descretizer = Descretizer(KMeans, 'instrumentalness', True, n_clusters=5, random_state=0)
X_train_preprocessed = (
    descretizer
    .fit(X_train_preprocessed)
    .transform(X_train_preprocessed)
)

In [47]:
# Shapes after preprocessing next to the shapes before it.
tuple(
    part.shape
    for part in (X_train_preprocessed, y_train_preprocessed, X_train_full, y_train_full)
)

((31360, 35), (31360,), (32000, 28), (32000,))

# Model selection

Brute-force search through all models: the worst possible way.

In [None]:
# !pip install lazypredict
# !pip install ml-model-selection

Collecting ml-model-selection
  Downloading ml_model_selection-1.0.0-py3-none-any.whl (14 kB)
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)




Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py): started
  Building wheel for sklearn (setup.py): finished with status 'done'
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1309 sha256=7bc0941884cc755cd76ad36ce70d04be86460bc1179dbe896f047589e64e9ef9
  Stored in directory: c:\users\akshu\appdata\local\pip\cache\wheels\e4\7b\98\b6466d71b8d738a0c547008b9eb39bf8676d1ff6ca4b22af1c
Successfully built sklearn
Installing collected packages: sklearn, ml-model-selection
Successfully installed ml-model-selection-1.0.0 sklearn-0.0


In [48]:
from model_selection import models_validation

# Candidate models for `models_validation`: one (label, estimator, param_grid)
# triple per entry.
#
# Every grid key is a constructor argument of its estimator. Keys that the
# estimator does not accept make parameter search fail with "Invalid
# parameter", so none are listed (e.g. there is no `solver` on the Lasso /
# ElasticNet / RidgeCV families, and LogisticRegressionCV takes `Cs`, not `C`).
# For the *CV estimators the whole list of alphas / Cs is ONE candidate: those
# estimators pick the best value themselves and require an array, not a scalar.
#
# NOTE(review): `normalize`, loss="squared_loss"/"log", max_features="auto",
# BayesianRidge(n_iter=...) and BaggingClassifier(base_estimator=...) follow
# the older scikit-learn API and were renamed or removed in later releases —
# confirm against the installed version.
# NOTE(review): several entries are regressors (Lasso*, ElasticNet*, Ridge,
# RidgeCV, SGDRegressor, BayesianRidge) although the target is split with
# stratification like a class label; two labels ("Lasso", "LassoCV") appear
# twice; and some grids (e.g. SGDClassifier) span far too many combinations
# for an exhaustive search, partly over switches such as `verbose`, `n_jobs`
# and `copy_X` that do not affect the fitted model. Left as-is; please review.

# Value ranges shared by the grids below.
_TENTHS = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
_CV_FOLDS = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
_MAX_ITERS = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
_TOLERANCES = [0.0001, 0.001, 0.01, 0.1, 1.0]
_BOOLS = [True, False]
_SEEDS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
_N_ESTIMATORS = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
_PENALTIES = ["l2", "l1", "elasticnet"]
_KERNELS = ["linear", "poly", "rbf", "sigmoid"]
_RIDGE_SOLVERS = ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]

# Tree-shape hyper-parameters shared by the tree-based classifiers.
_TREE_SHAPE_GRID = {
    "max_depth": [None, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
    "min_samples_split": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
    "min_samples_leaf": [1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
    "max_features": [None, "auto", "sqrt", "log2"],
}

# Hyper-parameters accepted by both SGDClassifier and SGDRegressor.
_SGD_SHARED_GRID = {
    "penalty": _PENALTIES,
    "alpha": _TENTHS,
    "l1_ratio": _TENTHS,
    "fit_intercept": _BOOLS,
    "max_iter": _MAX_ITERS,
    "tol": _TOLERANCES,
    "shuffle": _BOOLS,
    "verbose": _BOOLS,
    "epsilon": _TENTHS,
    "warm_start": _BOOLS,
    "average": _BOOLS,
}


def _base_classifiers(svc_probability=False):
    """Return fresh (name, estimator) pairs used by the ensemble entries.

    ``svc_probability`` is forwarded to ``SVC(probability=...)``; soft voting
    needs it enabled because it averages ``predict_proba`` outputs.
    """
    return [
        ('rf', RandomForestClassifier()),
        ('lr', LogisticRegression()),
        ('svm', SVC(probability=svc_probability)),
        ('dt', DecisionTreeClassifier()),
        ('knn', KNeighborsClassifier()),
        ('nb', GaussianNB()),
        ('et', ExtraTreesClassifier()),
        ('gb', GradientBoostingClassifier()),
        ('ab', AdaBoostClassifier()),
    ]


model_paramGrid_list = [
    ("NB", GaussianNB(), {}),
    ("LR", LogisticRegression(), {
        "penalty": ["l1", "l2"],
        "C": _TENTHS,
        "solver": ["liblinear", "saga"],
    }),
    ("RidgeClassifier", RidgeClassifier(), {
        "alpha": _TENTHS,
        "solver": _RIDGE_SOLVERS,
    }),
    ("RidgeClassifierCV", RidgeClassifierCV(), {
        "alphas": [_TENTHS],
        "cv": _CV_FOLDS,
    }),
    ("Lasso", Lasso(), {
        "alpha": _TENTHS,
    }),
    ("LassoCV", LassoCV(), {
        "alphas": [_TENTHS],
        "cv": _CV_FOLDS,
    }),
    ("ElasticNet", ElasticNet(), {
        "alpha": _TENTHS,
        "l1_ratio": _TENTHS,
    }),
    ("ElasticNetCV", ElasticNetCV(), {
        "alphas": [_TENTHS],
        "l1_ratio": _TENTHS,
        "cv": _CV_FOLDS,
    }),
    ("LogisticRegressionCV", LogisticRegressionCV(), {
        "penalty": ["l1", "l2"],
        "Cs": [_TENTHS],
        "cv": _CV_FOLDS,
        "solver": ["liblinear", "saga"],
    }),
    ("SGDClassifier", SGDClassifier(), {
        "loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
        **_SGD_SHARED_GRID,
        "n_jobs": [-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "class_weight": ["balanced", None],
    }),
    # SGDRegressor has neither `n_jobs` nor `class_weight`.
    ("SGDRegressor", SGDRegressor(), {
        "loss": ["squared_loss", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"],
        **_SGD_SHARED_GRID,
    }),
    ("Ridge", Ridge(), {
        "alpha": _TENTHS,
        "fit_intercept": _BOOLS,
        "normalize": _BOOLS,
        "solver": ["auto"] + _RIDGE_SOLVERS,
    }),
    ("RidgeCV", RidgeCV(), {
        "alphas": [_TENTHS],
        "fit_intercept": _BOOLS,
        "normalize": _BOOLS,
        "cv": _CV_FOLDS,
    }),
    ("Lasso", Lasso(), {
        "alpha": _TENTHS,
        "fit_intercept": _BOOLS,
        "normalize": _BOOLS,
        "precompute": _BOOLS,
        "max_iter": _MAX_ITERS,
        "tol": _TOLERANCES,
        "copy_X": _BOOLS,
        "warm_start": _BOOLS,
        "positive": _BOOLS,
        "random_state": _SEEDS,
    }),
    # LassoCV has no `warm_start`.
    ("LassoCV", LassoCV(), {
        "alphas": [_TENTHS],
        "fit_intercept": _BOOLS,
        "normalize": _BOOLS,
        "precompute": _BOOLS,
        "max_iter": _MAX_ITERS,
        "tol": _TOLERANCES,
        "copy_X": _BOOLS,
        "positive": _BOOLS,
        "random_state": _SEEDS,
        "cv": _CV_FOLDS,
    }),
    # LassoLars has neither `tol` nor `warm_start`.
    ("LassoLars", LassoLars(), {
        "alpha": _TENTHS,
        "fit_intercept": _BOOLS,
        "normalize": _BOOLS,
        "precompute": _BOOLS,
        "max_iter": _MAX_ITERS,
        "copy_X": _BOOLS,
        "positive": _BOOLS,
        "random_state": _SEEDS,
    }),
    # LassoLarsCV has no `alphas`, `tol`, `warm_start` or `random_state`.
    ("LassoLarsCV", LassoLarsCV(), {
        "fit_intercept": _BOOLS,
        "normalize": _BOOLS,
        "precompute": _BOOLS,
        "max_iter": _MAX_ITERS,
        "copy_X": _BOOLS,
        "positive": _BOOLS,
        "cv": _CV_FOLDS,
    }),
    # BayesianRidge has no `n_jobs`.
    ("BayesianRidge", BayesianRidge(), {
        "n_iter": _MAX_ITERS,
        "tol": _TOLERANCES,
        "alpha_1": _TENTHS,
        "alpha_2": _TENTHS,
        "lambda_1": _TENTHS,
        "lambda_2": _TENTHS,
        "fit_intercept": _BOOLS,
        "normalize": _BOOLS,
        "copy_X": _BOOLS,
        "verbose": _BOOLS,
    }),
    ("SVC", SVC(), {
        "kernel": _KERNELS,
        "C": _TENTHS,
        "gamma": _TENTHS,
    }),
    ("NuSVC", NuSVC(), {
        "kernel": _KERNELS,
        "nu": _TENTHS,
        "gamma": _TENTHS,
    }),
    ("KNN", KNeighborsClassifier(), {
        "n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "weights": ["uniform", "distance"],
        "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
        "p": [1, 2, 3],
    }),
    ("DT", DecisionTreeClassifier(), {
        "criterion": ["gini", "entropy"],
        **_TREE_SHAPE_GRID,
    }),
    ("RF", RandomForestClassifier(), {
        "n_estimators": _N_ESTIMATORS,
        "criterion": ["gini", "entropy"],
        **_TREE_SHAPE_GRID,
    }),
    # The ensemble ExtraTreesClassifier is used here: the single-tree
    # ExtraTreeClassifier has no `n_estimators`.
    ("ETC", ExtraTreesClassifier(), {
        "n_estimators": _N_ESTIMATORS,
        "criterion": ["gini", "entropy"],
        **_TREE_SHAPE_GRID,
    }),
    ("GB", GradientBoostingClassifier(), {
        "n_estimators": _N_ESTIMATORS,
        "learning_rate": _TENTHS,
        **_TREE_SHAPE_GRID,
    }),
    ("AB", AdaBoostClassifier(), {
        "n_estimators": _N_ESTIMATORS,
        "learning_rate": _TENTHS,
    }),
    # Soft voting averages predict_proba, which SVC only provides when it is
    # built with probability=True.
    ("VotingClassifier", VotingClassifier(
        estimators=_base_classifiers(svc_probability=True),
        voting='soft'), {}),
    ("BaggingClassifier", BaggingClassifier(), {
        "base_estimator": [estimator for _, estimator in _base_classifiers()],
    }),
    ("StackingClassifier", StackingClassifier(
        estimators=_base_classifiers()), {}),
]

In [50]:
# Run the search over every candidate on the preprocessed training data
# (with plotting enabled) and unpack the five returned values.
_validation_outcome = models_validation(
    X_train_preprocessed,
    y_train_preprocessed,
    model_paramGrid_list,
    plot=True,
)
models_train_val_score, models_best_params, best_index, test_score, ax = _validation_outcome