In [1]:
import warnings
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder,
    LabelEncoder, SplineTransformer, OrdinalEncoder
)
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn import set_config
from sklearn.model_selection import RandomizedSearchCV, cross_validate, ShuffleSplit
from sklearn.exceptions import ConvergenceWarning

In [2]:
def concatenate(*args):
    """Merge several dicts of sequences into one dict of lists.

    For every key, the values of all input dicts are appended, in argument
    order, to a single list. Keys keep the order in which they are first seen.

    Parameters
    ----------
    *args : dict
        Mappings whose values are iterables (lists, arrays, ...).

    Returns
    -------
    dict
        ``{key: [items from args[0][key], items from args[1][key], ...]}``.
        An empty dict when called without arguments. The inputs are not
        modified.
    """
    final_dict = {}
    for dictionary in args:
        for key, value in dictionary.items():
            # setdefault creates the bucket on first sight, so a key that is
            # missing from the first dict no longer raises KeyError.
            final_dict.setdefault(key, []).extend(value)
    return final_dict

In [3]:
# Fetch seaborn's 'penguins' example dataset and preview its first rows.
df = sns.load_dataset(name='penguins')
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [4]:
# Column roles: the label to predict, the categorical features (every
# categorical column except the label) and the numeric features.
target_column = 'sex'
nominal_columns = list(
    filter(lambda name: name != target_column, ('species', 'island', 'sex'))
)
quantitative_columns = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

In [5]:
# Numeric features: impute missing values (imputer defaults), then scale.
quantitative_preprocessing = Pipeline(steps=[
    ('missing', SimpleImputer()),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent category, encode as
# dense one-hot columns, then scale the encoded columns.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output`; switch
# the argument when the environment is upgraded.
nominal_preprocessing = Pipeline(steps=[
    ('missing', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse=False)),
    ('scaler', StandardScaler()),
])

# Route each group of columns through its own sub-pipeline.
preprocessing = ColumnTransformer(transformers=[
    ('nominal', nominal_preprocessing, nominal_columns),
    ('quantitative', quantitative_preprocessing, quantitative_columns),
])

In [6]:
 # Show the loaded frame; the rendered output elides the middle rows.
 df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [7]:
# Only rows with a known target are usable for supervised training.
target_transformer = LabelEncoder()

# Features: every column except the target, restricted to labelled rows.
X = df.dropna(subset=[target_column]).drop(columns=target_column)

# Labels: the non-missing target values (same rows, same order as X),
# encoded as integers by the LabelEncoder.
y = target_transformer.fit_transform(df[target_column].dropna().to_numpy())

In [8]:
# Candidate classifiers and the hyper-parameter space sampled for each one.
# Every entry holds:
#   'name'       - label used to tag the scores of this model,
#   'model'      - the unfitted estimator,
#   'parameters' - search space keyed by the estimator's own argument names
#                  (the `model__` pipeline prefix is added where the search
#                  grid is assembled).
models = [{
    'name': 'knn',
    'model': KNeighborsClassifier(),
    'parameters': {
        'n_neighbors': np.arange(3, 21, 2),  # odd values 3..19
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    }
},{
    'name': 'LR',
    # liblinear supports both the l1 and l2 penalties searched below; the
    # fixed seed keeps its internal data shuffling repeatable.
    'model': LogisticRegression(max_iter=3000, solver='liblinear', random_state=42),
    'parameters': {
        'penalty': ['l1', 'l2'],
        'C' : np.logspace(-4, 4, 10),
    }
},{
    'name': 'SVC',
    'model': SVC(max_iter=10000, gamma='auto'),
    'parameters': {
        "C": [1, 10, 100, 1e3, 1e4, 1e5]
    }
},{
    'name': 'GB',
    # `loss` is left at its default: the explicit value "deviance" was
    # deprecated in scikit-learn 1.1 and removed in 1.3, while the default
    # selects the same loss on every version. The seed makes the row/feature
    # subsampling (subsample < 1, max_features) repeatable.
    'model': GradientBoostingClassifier(random_state=42),
    'parameters': {
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "min_samples_split": np.linspace(0.1, 0.5, 6),  # fractions of the samples
        "min_samples_leaf": np.linspace(0.1, 0.5, 6),   # fractions of the samples
        "max_depth": [3, 5, 8],
        "max_features": ["log2", "sqrt"],
        "criterion": ["friedman_mse", "squared_error"],
        "subsample": [0.5, 0.8, 0.9, 1.0]
    }
}]

In [9]:
# Nested cross-validation: for each candidate model an inner randomized search
# tunes preprocessing + model hyper-parameters, and an outer shuffle-split
# estimates how well the tuned pipeline generalizes.
n_splits_cv = 30      # outer evaluation splits
n_splits_cv_gs = 5    # inner folds used by the randomized search

# A single seeded splitter shared by all models: every model is scored on
# exactly the same train/test partitions, so the comparison is paired and
# reproducible. (An unseeded ShuffleSplit draws new partitions on every call.)
outer_cv = ShuffleSplit(n_splits=n_splits_cv, test_size=.2, random_state=42)

sc = []
for model in models:
    print(f"runing {model['name']}")
    # Search space = preprocessing choices + the model's own parameters,
    # addressed through the pipeline step names.
    param_grid = {
        'preprocessing__quantitative__scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
        'preprocessing__nominal__encoder': [OneHotEncoder(sparse=False), OrdinalEncoder()],
        'preprocessing__nominal__scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
        'preprocessing__quantitative__missing__strategy': ['mean', 'median'],
        **{f"model__{key}": value for key, value in model['parameters'].items()}
    }
    approach = Pipeline([
        ('preprocessing', preprocessing),
        ('model', model['model'])
    ])
    gs = RandomizedSearchCV(
        estimator=approach,
        param_distributions=param_grid,
        scoring='accuracy',
        cv=n_splits_cv_gs,
        random_state=42
    )
    # The search object itself is the estimator being cross-validated, so the
    # tuning is redone inside every outer training split.
    scores = cross_validate(
        estimator=gs,
        X=X,
        y=y,
        cv=outer_cv,
        n_jobs=4,
        scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
    )
    # Tag each of the n_splits_cv score rows with the model that produced it.
    scores['model'] = [model['name']] * n_splits_cv
    sc.append(scores)
# Merge the per-model score dicts into one dict of flat lists.
scores = concatenate(*sc)

runing knn
runing LR
runing SVC
runing GB


In [13]:
def highlight_max(s, props=''):
    """Return per-cell CSS for one row, marking the best model's cell.

    The row's first cell is the metric name; each remaining cell is a string
    whose first whitespace-separated token is a number (e.g. "0.910 ± 0.015").
    For metrics whose name ends in 'time' the smallest number is the best one,
    for every other metric the largest.

    Parameters
    ----------
    s : object exposing ``.values`` (e.g. a pandas Series)
        One row of the results table.
    props : str
        CSS applied to the winning cell.

    Returns
    -------
    list of str
        One entry per cell: ``props`` for the winner, '' elsewhere.
    """
    means = [float(cell.split()[0]) for cell in s.values[1:]]
    styles = [''] * len(s.values)
    pick_best = np.argmin if s.values[0].endswith('time') else np.argmax
    # +1 skips the leading metric-name cell.
    styles[pick_best(means) + 1] = props
    return styles

# Summarise the cross-validation scores as a "mean ± std" table with one row
# per metric and one column per model.
results = (
    pd
    .DataFrame(scores)
    .groupby(['model'])
    .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])  # one formatted string per (model, metric)
    .transpose()  # metrics become rows, models become columns
    .reset_index()  # exposes the two row-index levels as 'level_0' / 'level_1'
    .rename(columns={"level_0": "score"})  # level_0 holds the metric name
    .drop(columns="level_1")  # level_1 only holds the aggregation label
    # 'score' stays a regular first column: highlight_max reads the metric
    # name from the first cell of each row.
)
# Blank out the name of the columns axis so it is not rendered as a header.
results.columns.name = ''
# Style the table: hide the integer index and highlight the best model per row.
results = (
    results
    .style
    .hide(axis='index')
    .apply(highlight_max, props='color:white;background-color:gray', axis=1)
)
results

score,GB,LR,SVC,knn
fit_time,1.568 ± 0.009,0.484 ± 0.005,0.671 ± 0.017,0.537 ± 0.001
score_time,0.014 ± 0.000,0.029 ± 0.002,0.020 ± 0.001,0.022 ± 0.000
test_accuracy,0.910 ± 0.015,0.910 ± 0.045,0.925 ± 0.015,0.866 ± 0.015
test_precision,0.920 ± 0.011,0.853 ± 0.053,0.898 ± 0.009,0.858 ± 0.058
test_recall,0.891 ± 0.009,0.944 ± 0.056,0.954 ± 0.013,0.863 ± 0.006
test_f1,0.905 ± 0.010,0.896 ± 0.054,0.925 ± 0.011,0.860 ± 0.032
test_roc_auc,0.963 ± 0.001,0.973 ± 0.007,0.984 ± 0.005,0.942 ± 0.012


In [11]:
# Final model: tune the selected classifier on the full labelled data and
# persist the best pipeline (preprocessing + model) to disk.
# NOTE(review): 'GB' is hard-coded here, while the results table rendered in
# this notebook shows SVC with the highest test accuracy - confirm the choice.
best_model = {
    'name': 'GB',
    # `loss` is left at its default: the explicit value "deviance" was
    # deprecated in scikit-learn 1.1 and removed in 1.3, while the default
    # selects the same loss on every version. The seed makes the fit repeatable.
    'model': GradientBoostingClassifier(random_state=42),
    'parameters': {
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "min_samples_split": np.linspace(0.1, 0.5, 6),
        "min_samples_leaf": np.linspace(0.1, 0.5, 6),
        "max_depth": [3, 5, 8],
        "max_features": ["log2", "sqrt"],
        "criterion": ["friedman_mse", "squared_error"],
        "subsample": [0.5, 0.8, 0.9, 1.0]
    }
}

approach = Pipeline([
    ('preprocessing', preprocessing),
    ('model', best_model['model'])
])

# Build the search space from best_model['parameters'] explicitly. Reusing the
# `param_grid` left over from the model-comparison loop would silently apply
# whichever model happened to be iterated last.
best_param_grid = {
    'preprocessing__quantitative__scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'preprocessing__nominal__encoder': [OneHotEncoder(sparse=False), OrdinalEncoder()],
    'preprocessing__nominal__scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'preprocessing__quantitative__missing__strategy': ['mean', 'median'],
    **{f"model__{key}": value for key, value in best_model['parameters'].items()}
}

gs = RandomizedSearchCV(
    estimator=approach,
    param_distributions=best_param_grid,
    scoring='accuracy',
    cv=n_splits_cv_gs,
    random_state=42
)

gs.fit(X, y)

# best_estimator_ is the pipeline refitted with the best sampled parameters.
model = gs.best_estimator_
joblib.dump(model, '../models/best_model.joblib')

['../models/best_model.joblib']