# Build basic ML models with advanced pipeline

In [1]:
import pandas as pd
print("pandas version: {}". format(pd.__version__))

# numpy: support for large, multi-dimensional arrays and matrices and high-level mathematical functions
import numpy as np
print("numpy version: {}". format(np.__version__))

import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedShuffleSplit, learning_curve, train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import precision_recall_curve, roc_curve, confusion_matrix, roc_auc_score, f1_score, accuracy_score
from sklearn.model_selection import cross_val_predict
print("sklearn version: {}". format(sklearn.__version__))
sklearn .set_config(display="diagram")

import xgboost
from xgboost import XGBClassifier
print("xgboost version: {}". format(xgboost.__version__))

import lightgbm
from lightgbm import LGBMClassifier
print("lightgbm version: {}". format(lightgbm.__version__))


from typing import Optional, Tuple


import optuna
print("optuna version: {}". format(optuna.__version__))

import mlflow
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
from mlflow.tracking import MlflowClient
print("mlflow version: {}". format(mlflow.__version__))

import matplotlib.pyplot as plt
import seaborn as sns

import time
import warnings
warnings.simplefilter('ignore')


import yaml
with open('ml_parameter.yaml') as file:
  config_data= yaml.safe_load(file)

pandas version: 1.4.3
numpy version: 1.21.5
sklearn version: 1.1.1
xgboost version: 1.6.1
lightgbm version: 3.3.2


  from .autonotebook import tqdm as notebook_tqdm


optuna version: 2.10.1
mlflow version: 1.28.0


In [2]:
VERSION = "3.4"

# define cross validation
cv = StratifiedShuffleSplit(
    n_splits = 10,
    test_size = 0.2,
    random_state = config_data["RANDOM_STATE"]
    )

In [3]:
client = MlflowClient()
try:
    experiment = client.create_experiment(config_data["experiment_name"])
except:
    experiment = client.get_experiment_by_name(config_data["experiment_name"]).experiment_id

In [4]:
def load_data():
    df_train = pd.read_pickle('../03_DataPreprocessing/df_train_prepared_unfinished.pkl')
    df_test = pd.read_pickle('../03_DataPreprocessing/df_test_prepared_unfinished.pkl')

    df_train.drop(['Last', 'TGroup'], axis=1, inplace=True)
    df_test.drop(['Last', 'TGroup'], axis=1, inplace=True)

    return df_train, df_test

df_train, df_test = load_data()

In [5]:
df_train.drop(['PassengerId', 'Name', 'dataset', 'First'], axis=1, inplace=True)
df_test.drop(['PassengerId', 'Name',  'dataset', 'Survived', 'First'], axis=1, inplace=True)

In [6]:
class DataSplitter:
    """The class is used to split the data"""

    def __init__(self, test_size:float):
        self.test_size = test_size

        self.X_train: pd.DataFrame = None
        self.y_train: pd.DataFrame = None
        self.X_validate: pd.DataFrame = None
        self.y_validate: pd.DataFrame = None


    def split(self, df_train: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
        """Split the data between train and test."""
        y_train = df_train['Survived']
        x_train = df_train.drop(['Survived'], axis=1)

        self.X_train, self.X_validate, self.y_train, self.y_validate = train_test_split(x_train, y_train, test_size=self.test_size)

        return self.X_train, self.y_train, self.X_validate, self.y_validate

In [7]:
data_splitter = DataSplitter(test_size=0.2)
x_train, y_train, x_validate, y_validate = data_splitter.split(df_train)

In [8]:
class BiningTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, n_bins_fare:int, n_bins_age:int):
        self.n_bins_fare = n_bins_fare
        self.n_bins_age = n_bins_age

    def fit(self, x:pd.DataFrame, y:Optional[pd.DataFrame]=None) -> "BiningTransformer":
        self.x_train = x.copy()
        return self

    def transform(self, x:pd.DataFrame) -> pd.DataFrame:

        x['Fare_bin'] = pd.qcut(x['Fare'], self.n_bins_fare, labels=False)
        x['Age_bin'] = pd.qcut(x['Age'], self.n_bins_age, labels=False)
        return x

In [9]:
class SkewedFeatureTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, transform_skewed_features_flag:bool):
        self.transform_skewed_features_flag = transform_skewed_features_flag

    def fit(self, x:pd.DataFrame, y:Optional[pd.DataFrame]=None) -> "BiningTransformer":
        self.x_train = x.copy()
        return self

    def transform(self, x:pd.DataFrame) -> pd.DataFrame:
        if self.transform_skewed_features_flag==True:
            x["Fare"] = x["Fare"].apply(np.log)
             # the not transformed data that contains 0
            # after the transformation we have -inf values that have to be replaced by 0
            x["Fare"][np.isneginf(x["Fare"])]=0

        return x

In [10]:
class OneHotEncoderTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, ohe_min_frequency:float, ohe_max_categories:float):
        self.ohe_min_frequency = ohe_min_frequency
        self.ohe_max_categories = ohe_max_categories
        self.cat_vars=None
        self.enc=None

    def fit(self, x:pd.DataFrame, y:Optional[pd.DataFrame]=None) -> "OneHotEncoderTransformer":
        enc = OneHotEncoder(handle_unknown='ignore', sparse=False, drop="if_binary",
            min_frequency=self.ohe_min_frequency, max_categories=self.ohe_max_categories)

        self.cat_vars = x.dtypes[x.dtypes == "object"].index
        self.enc = enc.fit(x[self.cat_vars])
        return self

    
    def transform(self, x:pd.DataFrame) -> pd.DataFrame:
        ohe = pd.DataFrame(self.enc.transform(x[self.cat_vars]), columns=self.enc.get_feature_names())
        
        x.reset_index(drop=True, inplace=True)
        ohe.reset_index(drop=True, inplace=True)

        x = pd.concat([x, ohe], axis=1).drop(self.cat_vars, axis=1)
        return x
    

In [11]:
class LowVarianceTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, variance_threshold:float):
        self.variance_threshold = variance_threshold
        self.sel_features=None
        self.sel=None
        self.sel_features_reduced=None

    def fit(self, x:pd.DataFrame, y:Optional[pd.DataFrame]=None) -> "LowVarianceTransformer":
        # remove all features that are either one or zero in more than 95% of the samples
        sel = VarianceThreshold(threshold=(self.variance_threshold * (1 - self.variance_threshold)))
        self.sel_features = list(x)
        # fit the VarianceThreshold object to the training data
        self.sel = sel.fit(x[self.sel_features])

        # get the column names after the variance threshold reduction
        self.sel_features_reduced = [self.sel_features[i] for i in self.sel.get_support(indices=True)]
        return self


    def transform(self, x:pd.DataFrame) -> pd.DataFrame:
        x = pd.DataFrame(self.sel.transform(x[self.sel_features]), columns=self.sel_features_reduced)
        return x

In [12]:
class CorrelationTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, correlation_threshold:float):
        self.correlation_threshold = correlation_threshold
        self.to_drop=None

    def fit(self, x:pd.DataFrame, y:Optional[pd.DataFrame]=None) -> "CorrelationTransformer":
        corr_matrix = x.corr().abs()

        # Select upper triangle of correlation matrix
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

        # Find features with correlation higher than 0.9 or lower -0.9
        self.to_drop = [column for column in upper.columns if any((upper[column] > self.correlation_threshold) | (upper[column] < -self.correlation_threshold))]
        return self


    def transform(self, x:pd.DataFrame) -> pd.DataFrame:
        x =  x.drop(self.to_drop, axis=1)
        return x

In [13]:
class ScalerTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, columnprep__transformers_num) -> None:
        self.columnprep__transformers_num = columnprep__transformers_num
        self.transformer_not_num=None
        self.transformer_num=None

        if columnprep__transformers_num == "StandardScaler":
            self.scaler = StandardScaler()
        elif columnprep__transformers_num == "MinMaxScaler":
            self.scaler = MinMaxScaler()

    def fit(self, x:pd.DataFrame, y:Optional[pd.DataFrame]=None) -> "ScalerTransformer":
        self.transformer_not_num = [col for col in list(x) if (col.startswith("x") & col[1].isnumeric())]
        self.transformer_num = [col for col in list(x) if col not in self.transformer_not_num]
        
        self.scaler.fit(x[self.transformer_num], y)
        return self


    def transform(self, x:pd.DataFrame) -> pd.DataFrame:
        x_transform = self.scaler.transform(x[self.transformer_num])
        x_transform = pd.DataFrame(x_transform, index=x.index, columns=x[self.transformer_num].columns)
        
        return pd.concat([x_transform, x[self.transformer_not_num]], axis=1)

In [14]:
def create_model(trial, model_type, child_run):

    # create parameter for data preprocessing pipeline
    n_bins_fare=trial.suggest_int('preprocessing_n_bins_fare', 5, 15)
    n_bins_age=trial.suggest_int('preprocessing_n_bins_age', 5, 15)
    transform_skewed_features_flag=trial.suggest_categorical("preprocessing_transform_skewed_features_flag", [True, False])
    ohe_min_frequency=trial.suggest_float("preprocessing_ohe_min_frequency", 0, 0.2, log=False)
    ohe_max_categories=trial.suggest_int('preprocessing_ohe_max_categories', 5, 20)
    feature_selection_low_variance_flag=trial.suggest_categorical("preprocessing_feature_selection_low_variance_flag", [True, False])
    correlation=trial.suggest_float("preprocessing_correlation", 0.7, 0.95, log=False)

    # log all parameters of the data preprocessing with mlflow
    client.log_param(child_run.info.run_id, "preprocessing_n_bins_fare", n_bins_fare)
    client.log_param(child_run.info.run_id, "preprocessing_n_bins_age", n_bins_age)
    client.log_param(child_run.info.run_id, "preprocessing_transform_skewed_features_flag", transform_skewed_features_flag)
    client.log_param(child_run.info.run_id, "preprocessing_ohe_min_frequency", ohe_min_frequency)
    client.log_param(child_run.info.run_id, "preprocessing_ohe_max_categories", ohe_max_categories)
    client.log_param(child_run.info.run_id, "preprocessing_feature_selection_low_variance_flag", feature_selection_low_variance_flag)
    client.log_param(child_run.info.run_id, "preprocessing_correlation", correlation)  


    ''' columnprep '''
    columnprep__transformers_num = trial.suggest_categorical("columnprep__transformers_num", ["StandardScaler", "MinMaxScaler"])


    ''' algo '''
    if model_type == 'SVC':
        svm_kernel = trial.suggest_categorical('svm_kernel', config_data["svm_kernel"])
        svm_C = trial.suggest_float('svm_C', config_data["svm_C"][0], config_data["svm_C"][1], log=True)
        svm_degree = trial.suggest_discrete_uniform('svm_degree', config_data["svm_degree"][0], config_data["svm_degree"][1], config_data["svm_degree"][2])
        
        model = SVC(
            kernel=svm_kernel,
            C=svm_C,
            degree=svm_degree,
            probability=True,
            random_state=config_data["RANDOM_STATE"]
        )

        client.log_param(child_run.info.run_id, "svm_kernel", svm_kernel)
        client.log_param(child_run.info.run_id, "svm_C", svm_C)
        client.log_param(child_run.info.run_id, "svm_degree", svm_degree)
    

    if model_type == 'LogisticRegression':
        lr_C = trial.suggest_float("lr_C", config_data["lr_C"][0], config_data["lr_C"][1], log=True)
        lr_penalty = trial.suggest_categorical('lr_penalty', config_data["lr_penalty"])
        if lr_penalty == 'l1':
            lr_solver = 'saga'
        else:
            lr_solver = 'lbfgs'
        
        model = LogisticRegression(
            C=lr_C,
            penalty=lr_penalty,
            solver=lr_solver,
            random_state=config_data["RANDOM_STATE"],
            n_jobs=-1
        )

        client.log_param(child_run.info.run_id, "lr_C", lr_C)
        client.log_param(child_run.info.run_id, "lr_penalty", lr_penalty)
        client.log_param(child_run.info.run_id, "lr_solver", lr_solver)


    if model_type == 'DecisionTreeClassifier':
        dt_max_depth = trial.suggest_int('dt_max_depth', config_data["dt_max_depth"][0], x_train.shape[1])
        dt_criterion = trial.suggest_categorical("dt_criterion", config_data["dt_criterion"])
        dt_max_leaf_nodes = trial.suggest_int("dt_max_leaf_nodes", config_data["dt_max_leaf_nodes"][0], config_data["dt_max_leaf_nodes"][1])
        
        model = DecisionTreeClassifier(
            max_depth=dt_max_depth,
            criterion=dt_criterion,
            max_leaf_nodes=dt_max_leaf_nodes,
            random_state=config_data["RANDOM_STATE"]
          )
    
        client.log_param(child_run.info.run_id, "dt_max_depth", dt_max_depth)
        client.log_param(child_run.info.run_id, "dt_criterion", dt_criterion)
        client.log_param(child_run.info.run_id, "dt_max_leaf_nodes", dt_max_leaf_nodes)
    


    if model_type == 'ExtraTreesClassifier':
        etc_n_estimators = trial.suggest_int('etc_n_estimators', config_data["etc_n_estimators"][0], config_data["etc_n_estimators"][1])
        etc_max_depth = trial.suggest_int('etc_max_depth', config_data["etc_max_depth"][0], x_train.shape[1])
        etc_min_samples_split = trial.suggest_float('etc_min_samples_split', config_data["etc_min_samples_split"][0], config_data["etc_min_samples_split"][1])
        etc_min_samples_leaf = trial.suggest_float('etc_min_samples_leaf', config_data["etc_min_samples_leaf"][0], config_data["etc_min_samples_leaf"][1])
        etc_criterion = trial.suggest_categorical("etc_criterion", config_data["etc_criterion"])
        etc_max_leaf_nodes = trial.suggest_int("etc_max_leaf_nodes", config_data["etc_max_leaf_nodes"][0], config_data["etc_max_leaf_nodes"][1])
        
        model = ExtraTreesClassifier(
            n_estimators=etc_n_estimators,
            max_depth=etc_max_depth,
            min_samples_split=etc_min_samples_split,
            min_samples_leaf=etc_min_samples_leaf,
            criterion=etc_criterion,
            max_leaf_nodes=etc_max_leaf_nodes,
            random_state=config_data["RANDOM_STATE"],
            n_jobs=-1
          )
    
        client.log_param(child_run.info.run_id, "etc_n_estimators", etc_n_estimators)
        client.log_param(child_run.info.run_id, "etc_max_depth", etc_max_depth)
        client.log_param(child_run.info.run_id, "etc_min_samples_split", etc_min_samples_split)
        client.log_param(child_run.info.run_id, "etc_min_samples_leaf", etc_min_samples_leaf)
        client.log_param(child_run.info.run_id, "etc_criterion", etc_criterion)
        client.log_param(child_run.info.run_id, "etc_max_leaf_nodes", etc_max_leaf_nodes)


    if model_type == 'RandomForestClassifier':
        rfc_n_estimators = trial.suggest_int('rfc_n_estimators', config_data["rfc_n_estimators"][0], config_data["rfc_n_estimators"][1])
        rfc_max_depth = trial.suggest_int('rfc_max_depth', config_data["rfc_max_depth"][0], x_train.shape[1])
        rfc_min_samples_split = trial.suggest_float('rfc_min_samples_split', config_data["rfc_min_samples_split"][0], config_data["rfc_min_samples_split"][1])
        rfc_min_samples_leaf = trial.suggest_float('rfc_min_samples_leaf', config_data["rfc_min_samples_leaf"][0], config_data["rfc_min_samples_leaf"][1])
        rfc_criterion = trial.suggest_categorical("rfc_criterion", config_data["rfc_criterion"])
        rfc_max_leaf_nodes = trial.suggest_int("rfc_max_leaf_nodes", config_data["rfc_max_leaf_nodes"][0], config_data["rfc_max_leaf_nodes"][1])
        
        model = RandomForestClassifier(
            n_estimators=rfc_n_estimators,
            max_depth=rfc_max_depth,
            min_samples_split=rfc_min_samples_split,
            min_samples_leaf=rfc_min_samples_leaf,
            criterion=rfc_criterion,
            max_leaf_nodes=rfc_max_leaf_nodes,
            random_state=config_data["RANDOM_STATE"],
            n_jobs=-1
          )
    
        client.log_param(child_run.info.run_id, "rfc_n_estimators", rfc_n_estimators)
        client.log_param(child_run.info.run_id, "rfc_max_depth", rfc_max_depth)
        client.log_param(child_run.info.run_id, "rfc_min_samples_split", rfc_min_samples_split)
        client.log_param(child_run.info.run_id, "rfc_min_samples_leaf", rfc_min_samples_leaf)
        client.log_param(child_run.info.run_id, "rfc_criterion", rfc_criterion)
        client.log_param(child_run.info.run_id, "rfc_max_leaf_nodes", rfc_max_leaf_nodes)


    if model_type == 'XGBClassifier':
        xgb_n_estimators = trial.suggest_int('xgb_n_estimators', config_data["xgb_n_estimators"][0], config_data["xgb_n_estimators"][1])
        xgb_learning_rate = trial.suggest_float("xgb_learning_rate", config_data["xgb_learning_rate"][0], config_data["xgb_learning_rate"][1], log=True)
        xgb_reg_lambda = trial.suggest_float("xgb_reg_lambda", config_data["xgb_reg_lambda"][0], config_data["xgb_reg_lambda"][1], log=True)
        xgb_reg_alpha = trial.suggest_float("xgb_reg_alpha", config_data["xgb_reg_alpha"][0], config_data["xgb_reg_alpha"][1], log=True)
        
        model = XGBClassifier(
            n_estimators=xgb_n_estimators,
            learning_rate=xgb_learning_rate,
            reg_lambda=xgb_reg_lambda,
            reg_alpha=xgb_reg_alpha,
            random_state=config_data["RANDOM_STATE"],
            n_jobs = -1
          )
    
        client.log_param(child_run.info.run_id, "xgb_n_estimators", xgb_n_estimators)
        client.log_param(child_run.info.run_id, "xgb_learning_rate", xgb_learning_rate)
        client.log_param(child_run.info.run_id, "xgb_reg_lambda", xgb_reg_lambda)
        client.log_param(child_run.info.run_id, "xgb_reg_alpha", xgb_reg_alpha)


    if model_type == 'LGBMClassifier':
        lgb_n_estimators = trial.suggest_int('lgb_n_estimators', config_data["lgb_n_estimators"][0], config_data["lgb_n_estimators"][1])
        lgb_learning_rate = trial.suggest_float("lgb_learning_rate", config_data["lgb_learning_rate"][0], config_data["lgb_learning_rate"][1], log=True)
        lgb_max_depth = trial.suggest_int('lgb_max_depth', config_data["lgb_max_depth"][0], config_data["lgb_max_depth"][1])
        lgb_num_leaves = trial.suggest_int('lgb_num_leaves', config_data["lgb_num_leaves"][0], 2^lgb_max_depth+3)
        lgb_min_data_in_leaf = trial.suggest_int('lgb_min_data_in_leaf', config_data["lgb_min_data_in_leaf"][0], config_data["lgb_min_data_in_leaf"][1])
        lgb_subsample = trial.suggest_float('lgb_subsample', config_data["lgb_subsample"][0], config_data["lgb_subsample"][1])
        lgb_feature_fraction = trial.suggest_float('lgb_feature_fraction', config_data["lgb_feature_fraction"][0], config_data["lgb_feature_fraction"][1])
        lgb_reg_lambda = trial.suggest_float("lgb_reg_lambda", config_data["lgb_reg_lambda"][0], config_data["lgb_reg_lambda"][1], log=True)
        lgb_reg_alpha = trial.suggest_float("lgb_reg_alpha", config_data["lgb_reg_alpha"][0], config_data["lgb_reg_alpha"][1], log=True)
        
        model = LGBMClassifier(
            n_estimators=lgb_n_estimators,
            learning_rate=lgb_learning_rate,
            max_depth=lgb_max_depth,
            num_leaves=lgb_num_leaves,
            min_data_in_leaf=lgb_min_data_in_leaf,
            subsample=lgb_subsample,
            feature_fraction=lgb_feature_fraction,
            reg_lambda=lgb_reg_lambda,
            reg_alpha=lgb_reg_alpha,
            # num_threads = 4,
            random_state=config_data["RANDOM_STATE"],
            n_jobs=-1
            # device_type='cuda_exp'
          )
    
        client.log_param(child_run.info.run_id, "lgb_n_estimators", lgb_n_estimators)
        client.log_param(child_run.info.run_id, "lgb_learning_rate", lgb_learning_rate)
        client.log_param(child_run.info.run_id, "lgb_max_depth", lgb_max_depth)
        client.log_param(child_run.info.run_id, "lgb_num_leaves", lgb_num_leaves)
        client.log_param(child_run.info.run_id, "lgb_min_data_in_leaf", lgb_min_data_in_leaf)
        client.log_param(child_run.info.run_id, "lgb_subsample", lgb_subsample)
        client.log_param(child_run.info.run_id, "lgb_feature_fraction", lgb_feature_fraction)
        client.log_param(child_run.info.run_id, "lgb_reg_lambda", lgb_reg_lambda)
        client.log_param(child_run.info.run_id, "lgb_reg_alpha", lgb_reg_alpha)


    if model_type == 'GradientBoostingClassifier':
        gbc_n_estimators = trial.suggest_int('gbc_n_estimators', config_data["gbc_n_estimators"][0], config_data["gbc_n_estimators"][1])
        gbc_learning_rate = trial.suggest_float("gbc_learning_rate", config_data["gbc_learning_rate"][0], config_data["gbc_learning_rate"][1], log=True)
        gbc_max_depth = trial.suggest_int('gbc_max_depth', config_data["gbc_max_depth"][0], config_data["gbc_max_depth"][1])
        gbc_min_samples_split = trial.suggest_float('gbc_min_samples_split', config_data["gbc_min_samples_split"][0], config_data["gbc_min_samples_split"][1])
        gbc_min_samples_leaf = trial.suggest_float('gbc_min_samples_leaf', config_data["gbc_min_samples_leaf"][0], config_data["gbc_min_samples_leaf"][1])
        gbc_max_leaf_nodes = trial.suggest_int("gbc_max_leaf_nodes", config_data["gbc_max_leaf_nodes"][0], config_data["gbc_max_leaf_nodes"][1])
        
        model = GradientBoostingClassifier(
            n_estimators=gbc_n_estimators,
            learning_rate=gbc_learning_rate,
            max_depth=gbc_max_depth,
            min_samples_split=gbc_min_samples_split,
            min_samples_leaf=gbc_min_samples_leaf,
            max_leaf_nodes=gbc_max_leaf_nodes,
            random_state=config_data["RANDOM_STATE"],
          )
    
        client.log_param(child_run.info.run_id, "gbc_n_estimators", gbc_n_estimators)
        client.log_param(child_run.info.run_id, "gbc_learning_rate", gbc_learning_rate)
        client.log_param(child_run.info.run_id, "gbc_max_depth", gbc_max_depth)
        client.log_param(child_run.info.run_id, "gbc_min_samples_split", gbc_min_samples_split)
        client.log_param(child_run.info.run_id, "gbc_min_samples_leaf", gbc_min_samples_leaf)
        client.log_param(child_run.info.run_id, "gbc_max_leaf_nodes", gbc_max_leaf_nodes)



    client.log_param(child_run.info.run_id, "algo", model.__class__.__name__)


    pipeline = make_pipeline(
        BiningTransformer(n_bins_fare=n_bins_fare, n_bins_age=n_bins_age),
        SkewedFeatureTransformer(transform_skewed_features_flag=transform_skewed_features_flag),
        OneHotEncoderTransformer(ohe_min_frequency=ohe_min_frequency, ohe_max_categories=ohe_max_categories),
        LowVarianceTransformer(variance_threshold=0.95),
        CorrelationTransformer(correlation_threshold=correlation),
        ScalerTransformer(columnprep__transformers_num=columnprep__transformers_num),
        model,
        verbose=True
    )
            
    return pipeline

In [15]:
def evaluate_model(x_train, y_train, y_validate, y_validate_pred, y_validate_scores, pipeline, child_run):
    """
    evaluate the classification model with
    - classification report
    - precision-recall-curve
    - ROC curve
    """

    def plot_learning_curve(pipeline, x_train, y_train):

        train_sizes, train_scores, test_scores = learning_curve(
            pipeline,
            x_train,
            y_train,
            cv=cv,
            n_jobs=-1,
            train_sizes=np.linspace(.1, 1.0, 8)
            )


        fig1, ax1 = plt.subplots()
        ax1.set_xlabel("Training examples")
        ax1.set_ylabel("Score")
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        ax1.grid()

        ax1.fill_between(train_sizes, train_scores_mean - train_scores_std,
                        train_scores_mean + train_scores_std, alpha=0.1,
                        color="r")
        ax1.fill_between(train_sizes, test_scores_mean - test_scores_std,
                        test_scores_mean + test_scores_std, alpha=0.1, color="g")
        ax1.plot(train_sizes, train_scores_mean, 'o-', color="r",
                label="Training score")
        ax1.plot(train_sizes, test_scores_mean, 'o-', color="g",
                label="Cross-validation score")

        ax1.legend(loc="best")
        ax1.set_title("Difference between training and CV: "\
            + str(round(test_scores_mean[7] / train_scores_mean[7] * 100, 2))\
            + "%")
        client.log_figure(child_run.info.run_id, fig1, 'plot_learning_curve.png')
        plt.close()


    def plot_confusion_matrix(y_validate, y_validate_pred):
        group_names = ["True Neg", "False Pos", "False Neg", "True Pos"]
        group_counts = ["{0:0.0f}".format(value) for value in
                        confusion_matrix(y_validate, y_validate_pred).flatten()]
        group_percentages = ["{0:.2%}".format(value) for value in
                             confusion_matrix(y_validate, y_validate_pred).flatten()/np.sum(confusion_matrix(y_validate, y_validate_pred))]
        labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
                  zip(group_names,group_counts,group_percentages)]
        labels = np.asarray(labels).reshape(2,2)

        fig2, ax2 = plt.subplots()
        sns.heatmap(confusion_matrix(y_validate, y_validate_pred), annot=labels, fmt="", cmap='Blues')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        client.log_figure(child_run.info.run_id, fig2, 'plot_confusion_matrix.png')
        plt.close()


    def plot_precision_recall_vs_threshold(y_validate, y_scores, child_run):
        precisions, recalls, thresholds = precision_recall_curve(y_validate, y_scores)

        # convert to f score
        fscore = (2 * precisions * recalls) / (precisions + recalls)
        # locate the index of the largest f score
        ix = np.argmax(fscore)
        
        client.log_metric(child_run.info.run_id, "f1_score", round(fscore[ix], 5))

        fig3, ax3 = plt.subplots()
        ax3.plot(thresholds, precisions[:-1], "b", label="Precision")
        ax3.plot(thresholds, recalls[:-1], "g", label="Recall")
        ax3.plot(thresholds, fscore[:-1], "r", label="F1 Score")
        ax3.axvline(x=thresholds[ix], color='red', linestyle='--')
        plt.axhline(y=precisions[ix], color='b', linestyle='--')
        plt.axhline(y=recalls[ix], color='g', linestyle='--')
        ax3.set_xlabel("Threshold")
        ax3.legend(loc="upper left")
        ax3.set_ylim([0,1])
        client.log_figure(child_run.info.run_id, fig3, 'plot_f1.png')
        plt.close()

        fig4, ax4 = plt.subplots()
        ax4.plot(recalls, precisions, marker='.', label='Logistic')
        ax4.scatter(recalls[ix], precisions[ix], 200, marker='o', color='red', label='Best')
        ax4.set_xlabel('Recall')
        ax4.set_ylabel('Precision')
        client.log_figure(child_run.info.run_id, fig4, 'plot_precision_recall.png')
        plt.close()
        

    def plot_roc_curve(y_validate, y_scores, child_run):
        fpr, tpr, thresholds = roc_curve(y_validate, y_scores)

        roc_auc = round(roc_auc_score(y_validate, y_scores), 3)
        
        optimal_idx = np.argmax(tpr - fpr)

        fig5, ax5 = plt.subplots()
        ax5.plot(fpr, tpr, linewidth=2)
        ax5.plot([0,1], [0,1], 'k--')
        ax5.axis([0,1,0,1])
        ax5.scatter(fpr[optimal_idx], tpr[optimal_idx], 200, marker='o', color='red', label='Best')
        ax5.set_xlabel('False Positive Rate')
        ax5.set_ylabel('True Positive Rate')
        client.log_figure(child_run.info.run_id, fig5, 'plot_roc_curve.png')
        plt.close()

        client.log_metric(child_run.info.run_id, "roc_auc", roc_auc)

        

    plot_confusion_matrix(y_validate, y_validate_pred)
    plot_precision_recall_vs_threshold(y_validate, y_validate_scores, child_run)
    plot_roc_curve(y_validate, y_validate_scores, child_run)
    plot_learning_curve(pipeline, x_train, y_train)

In [16]:
def create_submission(best_model, x_test, parent_run):
    # predict the test values with the training classification model
    y_pred = best_model.predict(x_test).astype(int)
    
    df_submission = pd.read_csv("../01_RawData/gender_submission.csv")
    df_submission['Survived'] = y_pred
    
    df_submission.to_csv('submissions/%s.csv'%parent_run.info.run_id, index=False)


In [17]:
class Objective:
    
    def __init__(self, model_type, parent_run):
        self.best_model = None
        self._model = None
        
        self.model_type = model_type
        self.parent_run = parent_run

    
    def __call__(self, trial):
        start_time = time.time()

        child_run = client.create_run(
            experiment_id=experiment,
            tags={
                MLFLOW_PARENT_RUN_ID: self.parent_run.info.run_id
            }
        )

        """ Machine Learning Model Creation """
        pipeline = create_model(trial, self.model_type, child_run)
        self._model = pipeline



        # predict probabilities
        y_train_proba = cross_val_predict(pipeline, x_train, y_train, cv=5, n_jobs=-1, method='predict_proba')
        # keep probabilities for the positive outcome only
        y_train_scores = y_train_proba[:, 1]

        # apply threshold to positive probabilities to create labels
        def to_labels(pos_probs, threshold):
            return (pos_probs >= threshold).astype('int')

        # define thresholds
        thresholds = np.arange(0, 1, 0.001)
        # evaluate each threshold
        scores = [f1_score(y_train, to_labels(y_train_scores, t)) for t in thresholds]
        # get best threshold
        ix = np.argmax(scores)
        client.log_metric(child_run.info.run_id, "classification_threshold", ix)

        y_train_pred = (y_train_scores >= thresholds[ix]).astype(bool)

        score = accuracy_score(y_train, y_train_pred)
        client.log_metric(child_run.info.run_id, "cv_score", score)

        client.log_metric(child_run.info.run_id, "runtime", time.time() - start_time)

        return score



    def callback(self, study, trial):
        if study.best_trial == trial:
            self.best_model = self._model


In [18]:
def create_new_mlrun(model_type):

  parent_run = client.create_run(experiment_id=experiment, tags={"Version": VERSION})

  objective = Objective(model_type, parent_run)

  study = optuna.create_study(
    sampler = optuna.samplers.TPESampler(),
    direction="maximize"
    )

  study.optimize(
    objective,
    n_trials=config_data["N_TRAILS"],
    timeout=config_data["TIMEOUT"],
    n_jobs=-1,
    callbacks=[objective.callback]
    )

  print("Best trial:")
  print(study.best_value)
  print(study.best_params)

  client.log_metric(parent_run.info.run_id, "best_cv_score", round(study.best_value, 3))
  # client.log_param(parent_run.info.run_id, "transformer_num", str(transformer_num))

  client.log_param(parent_run.info.run_id, "cv_n_splits", cv.n_splits)
  client.log_param(parent_run.info.run_id, "cv_train_size", cv.train_size)
  client.log_param(parent_run.info.run_id, "cv_test_size", cv.test_size)
  client.log_param(parent_run.info.run_id, "cv_random_state", cv.random_state)

  print("Log best parameters")
  for param in study.best_params:
    client.log_param(parent_run.info.run_id, param, study.best_params[param])


  print("Save best model")
  # save the best model as file
  best_model = objective.best_model
  mlflow.sklearn.save_model(best_model, "models/%s/"%parent_run.info.run_id)

  print("Fit best model")
  # fit the pipeline with the total training dataset to compute the validation results
  best_model.fit(x_train, y_train)

  print("Create submission")
  # create submission of best model
  create_submission(best_model, df_test, parent_run)

  print("Predict training outcome")
  # predict the training outcome
  y_validate_pred = best_model.predict(x_validate)

  # predict probabilities
  y_validate_proba = best_model.predict_proba(x_validate)
  # keep probabilities for the positive outcome only
  y_validate_scores = y_validate_proba[:, 1]

  print("Evaluate model performance")
  evaluate_model(x_train, y_train, y_validate, y_validate_pred, y_validate_scores, best_model, parent_run)

  mlflow.end_run()

  return study

In [19]:
# study_lr = create_new_mlrun(model_type='LogisticRegression')
# optuna.visualization.plot_optimization_history(study_lr)

In [20]:
study_dt = create_new_mlrun(model_type='DecisionTreeClassifier')
optuna.visualization.plot_optimization_history(study_dt)

[32m[I 2022-12-07 17:28:05,424][0m A new study created in memory with name: no-name-81d3df98-3788-49d6-b9dc-9e14f782a876[0m
[32m[I 2022-12-07 17:28:25,306][0m Trial 2 finished with value: 0.8061797752808989 and parameters: {'preprocessing_n_bins_fare': 7, 'preprocessing_n_bins_age': 13, 'preprocessing_transform_skewed_features_flag': False, 'preprocessing_ohe_min_frequency': 0.04217310261610688, 'preprocessing_ohe_max_categories': 12, 'preprocessing_feature_selection_low_variance_flag': False, 'preprocessing_correlation': 0.9488803072246007, 'columnprep__transformers_num': 'MinMaxScaler', 'dt_max_depth': 5, 'dt_criterion': 'gini', 'dt_max_leaf_nodes': 8}. Best is trial 2 with value: 0.8061797752808989.[0m
[32m[I 2022-12-07 17:28:25,336][0m Trial 3 finished with value: 0.7865168539325843 and parameters: {'preprocessing_n_bins_fare': 8, 'preprocessing_n_bins_age': 14, 'preprocessing_transform_skewed_features_flag': False, 'preprocessing_ohe_min_frequency': 0.1548123481484011, 'pr

Best trial:
0.8202247191011236
{'preprocessing_n_bins_fare': 13, 'preprocessing_n_bins_age': 13, 'preprocessing_transform_skewed_features_flag': True, 'preprocessing_ohe_min_frequency': 0.1663534506888884, 'preprocessing_ohe_max_categories': 5, 'preprocessing_feature_selection_low_variance_flag': False, 'preprocessing_correlation': 0.9490172774762956, 'columnprep__transformers_num': 'MinMaxScaler', 'dt_max_depth': 13, 'dt_criterion': 'gini', 'dt_max_leaf_nodes': 10}
Log best parameters
Save best model
Fit best model
[Pipeline] . (step 1 of 7) Processing biningtransformer, total=   0.0s
[Pipeline]  (step 2 of 7) Processing skewedfeaturetransformer, total=   0.0s
[Pipeline]  (step 3 of 7) Processing onehotencodertransformer, total=   0.0s
[Pipeline]  (step 4 of 7) Processing lowvariancetransformer, total=   0.0s
[Pipeline]  (step 5 of 7) Processing correlationtransformer, total=   0.0s
[Pipeline] . (step 6 of 7) Processing scalertransformer, total=   0.0s
[Pipeline]  (step 7 of 7) Proces

In [21]:
# study_svm = create_new_mlrun(model_type='SVC')
# optuna.visualization.plot_optimization_history(study_svm)

In [22]:
# study_svm = create_new_mlrun(model_type='ExtraTreesClassifier')
# optuna.visualization.plot_optimization_history(study_svm)

In [23]:
# study_svm = create_new_mlrun(model_type='GradientBoostingClassifier')
# optuna.visualization.plot_optimization_history(study_svm)

In [24]:
# study_svm = create_new_mlrun(model_type='RandomForestClassifier')
# optuna.visualization.plot_optimization_history(study_svm)

In [25]:
mlflow.end_run()

In [26]:
def add_Kaggle_score(run_id, kaggle_score):
    # show if kaggle_score is already present
    if "kaggle_score" not in dict(dict(mlflow.get_run(run_id))["data"])["metrics"].keys():
        # if no kaggle_score is present, start run and write kaggle_score
        with mlflow.start_run(run_id=run_id):
            mlflow.log_metric("kaggle_score", kaggle_score)

In [27]:
add_Kaggle_score(run_id="9de764cc084145ecb78ac80ac69c8e16", kaggle_score=0.77272) # SCV
add_Kaggle_score(run_id="bb4cc975082b4558a5aa36fa5f10caa7", kaggle_score=0.78229) # DecisionTreeClassifier
add_Kaggle_score(run_id="9ce44a56971045598cee8afcf4c098f1", kaggle_score=0.76315) # DecisionTreeClassifier
add_Kaggle_score(run_id="0f17ada2ca224d1ebd2ceea939c90e0b", kaggle_score=0.78708) # LogisticRegression
add_Kaggle_score(run_id="7c40bca8da2140ac9090118d4b3da91a", kaggle_score=0.78229) # LogisticRegression

In [28]:
add_Kaggle_score(run_id="e17cba6520e746beba20beed0f3b1fbc", kaggle_score=0.78229) # GradientBoostingClassifier
add_Kaggle_score(run_id="b000d21026a545188146ac997a60e209", kaggle_score=0.77033) # ExtraTreesClassifier
add_Kaggle_score(run_id="8195b99b2326474e911326557e29db42", kaggle_score=0.76555) # RandomForestClassifier
add_Kaggle_score(run_id="e51353c56ece42c7a508e27ab50daa8c", kaggle_score=0.77751) # RandomForestClassifier

In [29]:
df = mlflow.search_runs(experiment_names=["Titanic"])
df = df[df["run_id"] == "bb4cc975082b4558a5aa36fa5f10caa7"]
df.head().T

Unnamed: 0,963
run_id,bb4cc975082b4558a5aa36fa5f10caa7
experiment_id,1
status,FINISHED
artifact_uri,file:///E:/Data%20Science%20Projects/Kaggle/ti...
start_time,2022-11-21 19:19:21.252000+00:00
end_time,2022-11-21 19:25:36.869000+00:00
metrics.runtime,
metrics.cv_score,
metrics.classification_threshold,
metrics.roc_auc,0.805
