# Build basic ML models incl. pipeline and validation strategy
Build, test, and evaluate several basic ML models in order to answer the following questions:
- which kinds of models have the best performance?
- which metrics are requested by the Kaggle competition?
- which settings in the data preprocessing stage increase the model performance?
- which validation strategy works best?

In [1]:
import pandas as pd
print("pandas version: {}". format(pd.__version__))

# numpy: support for large, multi-dimensional arrays and matrices and high-level mathematical functions
import numpy as np
print("numpy version: {}". format(np.__version__))

import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit, train_test_split, cross_val_score, learning_curve
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import precision_recall_curve, roc_curve, confusion_matrix, roc_auc_score
print("sklearn version: {}". format(sklearn.__version__))

import optuna
print("optuna version: {}". format(optuna.__version__))

import mlflow
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
from mlflow.tracking import MlflowClient
print("mlflow version: {}". format(mlflow.__version__))

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
import warnings
warnings.simplefilter('ignore')


import yaml
# Read the run settings (experiment name, random state, search ranges, ...)
# from ml_parameter.yaml in the current working directory.
with open('ml_parameter.yaml') as config_file:
    config_data = yaml.safe_load(config_file)

pandas version: 1.4.3
numpy version: 1.21.5
sklearn version: 1.1.1


  from .autonotebook import tqdm as notebook_tqdm


optuna version: 2.10.1
mlflow version: 1.28.0


In [2]:
client = MlflowClient()

# `experiment` holds the experiment *id* used by every create_run call below.
# Look the experiment up first and only create it when it is missing; this
# avoids a bare `except:` that would hide unrelated failures (bad tracking
# URI, missing config key, ...) behind an AttributeError on `None`.
_existing_experiment = client.get_experiment_by_name(config_data["experiment_name"])
if _existing_experiment is None:
    experiment = client.create_experiment(config_data["experiment_name"])
else:
    experiment = _existing_experiment.experiment_id

In [3]:
class DataPreprocessing(BaseEstimator, TransformerMixin):
    """Turn the pickled, partly prepared train/test frames into model-ready frames.

    ``transform`` ignores its ``X``/``y`` arguments: it loads both frames from
    disk itself and returns the pair ``(df_train, df_test)``. The steps are
    quantile binning of ``Fare``/``Age``, an optional log transform of
    ``Fare``, removal of identifier columns, one-hot encoding of the
    ``object`` columns, an optional low-variance filter and a
    pairwise-correlation filter.

    Parameters
    ----------
    n_bins_fare, n_bins_age :
        Number of quantile bins used for ``Fare_bin`` / ``Age_bin``.
    transform_skewed_features_flag :
        When truthy, ``Fare`` is replaced by its natural logarithm.
    ohe_min_frequency, ohe_max_categories :
        Forwarded to ``OneHotEncoder(min_frequency=..., max_categories=...)``.
    feature_selection_low_variance_flag :
        When truthy, near-constant features are removed.
    correlation :
        Absolute-correlation threshold above which the later column of a
        feature pair is dropped.
    """

    def __init__(self, n_bins_fare, n_bins_age, transform_skewed_features_flag, ohe_min_frequency, ohe_max_categories, feature_selection_low_variance_flag, correlation):
        self.n_bins_fare = n_bins_fare
        self.n_bins_age = n_bins_age
        self.transform_skewed_features_flag = transform_skewed_features_flag
        self.ohe_min_frequency = ohe_min_frequency
        self.ohe_max_categories = ohe_max_categories
        self.feature_selection_low_variance_flag = feature_selection_low_variance_flag
        self.correlation = correlation

    def fit(self, X, y=None):
        """No-op; present only to satisfy the estimator interface."""
        return self

    def load_data(self):
        """Load the train and test frames from their pickle files."""
        df_train = pd.read_pickle('../03_DataPreprocessing/df_train_prepared_unfinished.pkl')
        df_test = pd.read_pickle('../03_DataPreprocessing/df_test_prepared_unfinished.pkl')
        return df_train, df_test

    @staticmethod
    def _bin_like_train(train_values, test_values, n_bins):
        """Quantile-bin the training values and apply the same edges to the test values."""
        train_codes, edges = pd.qcut(train_values, n_bins, labels=False, retbins=True)
        edges = np.asarray(edges, dtype=float).copy()
        # Open the outermost bins so test values outside the training range
        # still land in the first/last bin instead of becoming NaN.
        edges[0], edges[-1] = -np.inf, np.inf
        test_codes = pd.cut(test_values, bins=edges, labels=False)
        return train_codes, test_codes

    def bining(self, df_train, df_test):
        """Add ``Fare_bin`` and ``Age_bin`` columns to both frames.

        Bin edges are the quantiles of the *training* column and are reused
        for the test column, so a given bin code means the same value range
        in both frames.
        """
        df_train['Fare_bin'], df_test['Fare_bin'] = self._bin_like_train(
            df_train['Fare'], df_test['Fare'], self.n_bins_fare
        )
        df_train['Age_bin'], df_test['Age_bin'] = self._bin_like_train(
            df_train['Age'], df_test['Age'], self.n_bins_age
        )
        return df_train, df_test

    def transform_skewed_features(self, df_train, df_test):
        """Replace ``Fare`` by ``log(Fare)`` in both frames, mapping ``log(0)`` to 0."""
        for frame in (df_train, df_test):
            with np.errstate(divide="ignore", invalid="ignore"):
                logged = np.log(frame["Fare"])
            # A fare of 0 turns into -inf; store 0 instead. Assigning the
            # whole column avoids chained-indexing assignment on a copy.
            frame["Fare"] = logged.replace(-np.inf, 0.0)
        return df_train, df_test

    def transform_categoric_OneHotEncoder(self, df_train, df_test):
        """One-hot encode every ``object`` column, fitting on train only.

        The generated columns are named ``x<i>_<category>`` where ``<i>`` is
        the position of the source column among the encoded columns.
        """
        from sklearn.preprocessing import OneHotEncoder

        cat_vars = df_train.columns[df_train.dtypes == "object"]
        if len(cat_vars) == 0:
            return df_train, df_test

        # NOTE(review): `sparse` was renamed `sparse_output` in scikit-learn
        # 1.2; it is kept because this notebook ran on 1.1.1 - adjust when
        # upgrading.
        enc = OneHotEncoder(
            handle_unknown='ignore',
            sparse=False,
            drop="if_binary",
            min_frequency=self.ohe_min_frequency,
            max_categories=self.ohe_max_categories,
        )

        # Fit on plain arrays so that get_feature_names_out() generates the
        # positional `x<i>_...` names (the removed get_feature_names() did the
        # same) instead of names derived from the source columns.
        train_encoded = enc.fit_transform(df_train[cat_vars].to_numpy())
        ohe_columns = enc.get_feature_names_out()
        test_encoded = enc.transform(df_test[cat_vars].to_numpy())

        # Reuse the source index so concat aligns row by row even when the
        # frames do not carry a default RangeIndex.
        ohe_train = pd.DataFrame(train_encoded, columns=ohe_columns, index=df_train.index)
        ohe_test = pd.DataFrame(test_encoded, columns=ohe_columns, index=df_test.index)

        df_train = pd.concat([df_train, ohe_train], axis=1).drop(cat_vars, axis=1)
        df_test = pd.concat([df_test, ohe_test], axis=1).drop(cat_vars, axis=1)

        return df_train, df_test

    def feature_selection_low_variance(self, df_train, df_test):
        """Drop features whose training variance is at most ``0.95 * (1 - 0.95)``."""
        from sklearn.feature_selection import VarianceThreshold

        # For a 0/1 feature this removes columns holding the same value in
        # more than 95% of the samples.
        sel = VarianceThreshold(threshold=(.95 * (1 - .95)))

        # The label must not take part in the selection.
        sel_features = list(df_train)
        sel_features.remove("Survived")

        sel.fit(df_train[sel_features])
        kept = [sel_features[i] for i in sel.get_support(indices=True)]

        reduced_train = pd.DataFrame(
            sel.transform(df_train[sel_features]), columns=kept, index=df_train.index
        )
        reduced_test = pd.DataFrame(
            sel.transform(df_test[sel_features]), columns=kept, index=df_test.index
        )

        # Join the label back onto the reduced training features.
        df_train = pd.concat([reduced_train, df_train["Survived"]], axis=1)

        return df_train, reduced_test

    def feature_selection_correlation(self, df_train, df_test):
        """Drop, in place, the later column of every highly correlated feature pair.

        Correlations are computed on the training features only; the
        ``Survived`` label is excluded so it can neither be dropped itself nor
        cause a predictive feature to be dropped.
        """
        features = df_train.drop(columns=["Survived"], errors="ignore")
        corr_matrix = features.corr().abs()

        # Keep only the strict upper triangle so each pair is examined once.
        upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
        upper = corr_matrix.where(upper_mask)

        to_drop = [column for column in upper.columns if (upper[column] > self.correlation).any()]

        df_train.drop(to_drop, axis=1, inplace=True)
        df_test.drop(to_drop, axis=1, inplace=True)

        return df_train, df_test

    def transform(self, X=None, y=None):
        """Run the full preprocessing chain and return ``(df_train, df_test)``.

        ``X`` and ``y`` are ignored; the data is loaded from disk.
        """
        df_train, df_test = self.load_data()

        df_train, df_test = self.bining(df_train, df_test)

        if self.transform_skewed_features_flag:
            df_train, df_test = self.transform_skewed_features(df_train, df_test)

        df_train.drop(['PassengerId', 'Name', 'dataset', 'First'], axis=1, inplace=True)
        df_test.drop(['PassengerId', 'Name',  'dataset', 'Survived', 'First'], axis=1, inplace=True)

        df_train, df_test = self.transform_categoric_OneHotEncoder(df_train, df_test)

        if self.feature_selection_low_variance_flag:
            df_train, df_test = self.feature_selection_low_variance(df_train, df_test)

        df_train, df_test = self.feature_selection_correlation(df_train, df_test)

        return df_train, df_test


In [4]:
# Cross-validation scheme shared by all trials: 10 random splits, each holding
# out 20% of the rows, seeded with the RANDOM_STATE entry of the configuration.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=config_data["RANDOM_STATE"])

In [5]:
def create_model(trial, model_type, parent_run, x_train):
    """Build the scaler + classifier pipeline for one Optuna trial.

    A child MLflow run (tagged with ``parent_run`` as its parent) is created
    and the sampled algorithm hyper-parameters are logged to it.

    Parameters
    ----------
    trial :
        Optuna trial used to sample the scaler and the hyper-parameters.
    model_type :
        One of ``'svm'``, ``'logistic-regression'`` or ``'decision-tree'``.
    parent_run :
        MLflow run whose id is stored as the parent-run tag of the child run.
    x_train :
        Training features; only the column names and the column count are used.

    Returns
    -------
    tuple
        ``(pipeline, child_run)`` - the unfitted pipeline and the child run.

    Raises
    ------
    ValueError
        If ``model_type`` is not supported. Raised before any MLflow run is
        created so no orphan run is left behind.
    """
    supported_models = ('svm', 'logistic-regression', 'decision-tree')
    if model_type not in supported_models:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported_models}"
        )

    child_run = client.create_run(
        experiment_id=experiment,
        tags={
            MLFLOW_PARENT_RUN_ID: parent_run.info.run_id
        }
    )
    run_id = child_run.info.run_id

    # --- column preprocessing ---
    columnprep__transformers_num = trial.suggest_categorical("columnprep__transformers_num", ["StandardScaler", "MinMaxScaler"])

    # One-hot columns are named "x<digit>..."; they are passed through
    # unscaled. The slice keeps one-character column names from raising.
    transformer_not_num = [
        col for col in list(x_train)
        if col.startswith("x") and col[1:2].isnumeric()
    ]
    transformer_num = [col for col in list(x_train) if col not in transformer_not_num]

    if columnprep__transformers_num == "StandardScaler":
        scaler = StandardScaler()
    else:
        scaler = MinMaxScaler()

    col_transform = ColumnTransformer(
        transformers=[
            ('num', scaler, transformer_num)
        ], remainder='passthrough'
    )

    # --- algorithm ---
    if model_type == 'svm':
        svm_kernel = trial.suggest_categorical('svm_kernel', config_data["svm_kernel"])
        svm_C = trial.suggest_float('svm_C', config_data["svm_C"][0], config_data["svm_C"][1], log=True)
        # suggest_discrete_uniform yields a float, but SVC expects an integer degree.
        svm_degree = int(trial.suggest_discrete_uniform('svm_degree', config_data["svm_degree"][0], config_data["svm_degree"][1], config_data["svm_degree"][2]))

        model = SVC(
            kernel=svm_kernel,
            C=svm_C,
            degree=svm_degree,
            probability=True,
            random_state=config_data["RANDOM_STATE"]
        )

        client.log_param(run_id, "svm_kernel", svm_kernel)
        client.log_param(run_id, "svm_C", svm_C)
        client.log_param(run_id, "svm_degree", svm_degree)

    elif model_type == 'logistic-regression':
        lr_C = trial.suggest_float("lr_C", config_data["lr_C"][0], config_data["lr_C"][1], log=True)
        lr_penalty = trial.suggest_categorical('lr_penalty', config_data["lr_penalty"])
        # 'saga' is used for the l1 penalty; everything else goes to 'lbfgs'.
        lr_solver = 'saga' if lr_penalty == 'l1' else 'lbfgs'

        model = LogisticRegression(
            C=lr_C,
            penalty=lr_penalty,
            solver=lr_solver,
            random_state=config_data["RANDOM_STATE"],
            n_jobs=-1
        )

        client.log_param(run_id, "lr_C", lr_C)
        client.log_param(run_id, "lr_penalty", lr_penalty)
        client.log_param(run_id, "lr_solver", lr_solver)

    else:  # 'decision-tree'
        # The upper bound of the depth search is the number of features.
        dt_max_depth = trial.suggest_int('dt_max_depth', config_data["dt_max_depth"][0], x_train.shape[1])
        dt_criterion = trial.suggest_categorical("dt_criterion", config_data["dt_criterion"])
        dt_max_leaf_nodes = trial.suggest_int("dt_max_leaf_nodes", config_data["dt_max_leaf_nodes"][0], config_data["dt_max_leaf_nodes"][1])

        model = DecisionTreeClassifier(
            max_depth=dt_max_depth,
            criterion=dt_criterion,
            max_leaf_nodes=dt_max_leaf_nodes,
            random_state=config_data["RANDOM_STATE"]
        )

        client.log_param(run_id, "dt_max_depth", dt_max_depth)
        client.log_param(run_id, "dt_criterion", dt_criterion)
        client.log_param(run_id, "dt_max_leaf_nodes", dt_max_leaf_nodes)

    client.log_param(run_id, "algo", model.__class__.__name__)

    pipeline = Pipeline(steps=[
        ('columnprep', col_transform),
        ('algo', model)
    ])

    return pipeline, child_run

In [6]:
def evaluate_model(x_train, y_train, y_validate, y_validate_pred, y_validate_scores, pipeline, child_run):
    """
    Log diagnostic figures and metrics for a binary classifier to MLflow.

    Figures logged to the run ``child_run``:
    - confusion matrix
    - precision / recall / F1 versus decision threshold
    - precision-recall curve
    - ROC curve
    - learning curve (5-fold cross-validation on the training data)

    Metrics logged: ``f1_score`` (best F1 over all thresholds) and ``roc_auc``.

    Parameters
    ----------
    x_train, y_train :
        Training data, used only for the learning curve.
    y_validate :
        True labels of the validation split.
    y_validate_pred :
        Predicted labels for the validation split.
    y_validate_scores :
        Scores of the positive class for the validation split.
    pipeline :
        Estimator handed to ``learning_curve``.
    child_run :
        MLflow run that receives the figures and metrics.
    """
    run_id = child_run.info.run_id

    def plot_learning_curve(pipeline, x_train, y_train):
        train_sizes, train_scores, test_scores = learning_curve(
            pipeline,
            x_train,
            y_train,
            cv=5,
            n_jobs=-1,
            train_sizes=np.linspace(.1, 1.0, 8)
        )

        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)

        fig, ax = plt.subplots()
        ax.set_xlabel("Training examples")
        ax.set_ylabel("Score")
        ax.grid()

        # Shaded bands show +/- one standard deviation across the folds.
        ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                        train_scores_mean + train_scores_std, alpha=0.1,
                        color="r")
        ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                        test_scores_mean + test_scores_std, alpha=0.1, color="g")
        ax.plot(train_sizes, train_scores_mean, 'o-', color="r",
                label="Training score")
        ax.plot(train_sizes, test_scores_mean, 'o-', color="g",
                label="Cross-validation score")

        ax.legend(loc="best")
        # Ratio of CV score to training score at the largest training size.
        ratio = round(test_scores_mean[-1] / train_scores_mean[-1] * 100, 2)
        ax.set_title("Difference between training and CV: " + str(ratio) + "%")
        client.log_figure(run_id, fig, 'plot_learning_curve.png')
        plt.close(fig)

    def plot_confusion_matrix(y_validate, y_validate_pred):
        cm = confusion_matrix(y_validate, y_validate_pred)
        flat = cm.flatten()

        group_names = ["True Neg", "False Pos", "False Neg", "True Pos"]
        group_counts = ["{0:0.0f}".format(value) for value in flat]
        group_percentages = ["{0:.2%}".format(value) for value in flat / np.sum(cm)]
        labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
                  zip(group_names, group_counts, group_percentages)]
        labels = np.asarray(labels).reshape(2, 2)

        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=labels, fmt="", cmap='Blues', ax=ax)
        ax.set_xlabel('Predicted Label')
        ax.set_ylabel('True Label')
        client.log_figure(run_id, fig, 'plot_confusion_matrix.png')
        plt.close(fig)

    def plot_precision_recall_vs_threshold(y_validate, y_scores):
        precisions, recalls, thresholds = precision_recall_curve(y_validate, y_scores)

        # F1 per point; points with precision + recall == 0 get F1 = 0
        # instead of NaN, which np.argmax would otherwise select.
        denominator = precisions + recalls
        fscore = np.divide(
            2 * precisions * recalls,
            denominator,
            out=np.zeros_like(denominator, dtype=float),
            where=denominator > 0,
        )
        # The final (precision, recall) point has no threshold, so it is
        # excluded from the search for the best threshold.
        ix = int(np.argmax(fscore[:-1]))

        client.log_metric(run_id, "f1_score", round(float(fscore[ix]), 5))

        fig, ax = plt.subplots()
        ax.plot(thresholds, precisions[:-1], "b", label="Precision")
        ax.plot(thresholds, recalls[:-1], "g", label="Recall")
        ax.plot(thresholds, fscore[:-1], "r", label="F1 Score")
        ax.axvline(x=thresholds[ix], color='red', linestyle='--')
        ax.axhline(y=precisions[ix], color='b', linestyle='--')
        ax.axhline(y=recalls[ix], color='g', linestyle='--')
        ax.set_xlabel("Threshold")
        ax.legend(loc="upper left")
        ax.set_ylim([0, 1])
        client.log_figure(run_id, fig, 'plot_f1.png')
        plt.close(fig)

        fig, ax = plt.subplots()
        ax.plot(recalls, precisions, marker='.', label='Logistic')
        ax.scatter(recalls[ix], precisions[ix], 200, marker='o', color='red', label='Best')
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        client.log_figure(run_id, fig, 'plot_precision_recall.png')
        plt.close(fig)

    def plot_roc_curve(y_validate, y_scores):
        fpr, tpr, _ = roc_curve(y_validate, y_scores)
        roc_auc = round(roc_auc_score(y_validate, y_scores), 3)

        # Point with the largest distance above the diagonal (max tpr - fpr).
        optimal_idx = np.argmax(tpr - fpr)

        fig, ax = plt.subplots()
        ax.plot(fpr, tpr, linewidth=2)
        ax.plot([0, 1], [0, 1], 'k--')
        ax.axis([0, 1, 0, 1])
        ax.scatter(fpr[optimal_idx], tpr[optimal_idx], 200, marker='o', color='red', label='Best')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        client.log_figure(run_id, fig, 'plot_roc_curve.png')
        plt.close(fig)

        client.log_metric(run_id, "roc_auc", roc_auc)

    plot_confusion_matrix(y_validate, y_validate_pred)
    plot_precision_recall_vs_threshold(y_validate, y_validate_scores)
    plot_roc_curve(y_validate, y_validate_scores)
    plot_learning_curve(pipeline, x_train, y_train)

In [12]:
class Objective:
    """Optuna objective that also remembers the artefacts of the best trial.

    Calling the instance runs one trial: it samples preprocessing settings,
    prepares the data, builds a pipeline through ``create_model`` and returns
    the mean cross-validated accuracy. ``callback`` must be registered with
    ``study.optimize`` so that the pipeline and the data splits of the best
    trial end up in the ``best_*`` attributes.

    Trials may run concurrently (``study.optimize(..., n_jobs=...)``), so the
    artefacts of each trial are stored under its trial number instead of in
    shared attributes, and promotion to ``best_*`` happens under a lock.
    """

    def __init__(self, model_type, parent_run):
        import threading

        self.best_model = None
        self.best_x_train = None
        self.best_x_validate = None
        self.best_y_train = None
        self.best_y_validate = None
        self.best_x_test = None

        # trial number -> (model, x_train, x_validate, y_train, y_validate, x_test)
        self._pending = {}
        self._lock = threading.Lock()

        self.model_type = model_type
        self.parent_run = parent_run

    def __call__(self, trial):
        """Run one trial and return its mean cross-validation accuracy."""
        n_bins_fare = trial.suggest_int('preprocessing_n_bins_fare', 5, 15)
        n_bins_age = trial.suggest_int('preprocessing_n_bins_age', 5, 15)
        transform_skewed_features_flag = trial.suggest_categorical("preprocessing_transform_skewed_features_flag", [True, False])
        ohe_min_frequency = trial.suggest_float("preprocessing_ohe_min_frequency", 0, 0.2, log=False)
        ohe_max_categories = trial.suggest_int('preprocessing_ohe_max_categories', 20, 100)
        feature_selection_low_variance_flag = trial.suggest_categorical("preprocessing_feature_selection_low_variance_flag", [True, False])
        correlation = trial.suggest_float("preprocessing_correlation", 0.7, 0.95, log=False)

        datapreprocessing = DataPreprocessing(
            n_bins_fare=n_bins_fare,
            n_bins_age=n_bins_age,
            transform_skewed_features_flag=transform_skewed_features_flag,
            ohe_min_frequency=ohe_min_frequency,
            ohe_max_categories=ohe_max_categories,
            feature_selection_low_variance_flag=feature_selection_low_variance_flag,
            correlation=correlation
        )

        df_train, df_test = datapreprocessing.transform()

        # Separate the label from the features; the test frame has no label.
        y_train = df_train['Survived']
        x_train = df_train.drop(['Survived'], axis=1)
        x_test = df_test

        x_train, x_validate, y_train, y_validate = train_test_split(x_train, y_train, test_size=0.2, random_state=config_data["RANDOM_STATE"])

        pipeline, child_run = create_model(trial, self.model_type, self.parent_run, x_train)

        try:
            score = cross_val_score(
                pipeline,
                x_train,
                y_train,
                cv=cv,
                scoring="accuracy",
                n_jobs=-1
            ).mean()
        except Exception:
            # Do not leave the child run in the RUNNING state.
            client.set_terminated(child_run.info.run_id, status="FAILED")
            raise

        client.log_metric(child_run.info.run_id, "cv_score", score)
        client.set_terminated(child_run.info.run_id)

        # Hand the artefacts of this trial over to `callback`.
        self._pending[trial.number] = (pipeline, x_train, x_validate, y_train, y_validate, x_test)

        return score

    def callback(self, study, trial):
        """Promote the finished trial's artefacts if it is the study's best trial."""
        artefacts = self._pending.pop(trial.number, None)
        if artefacts is None:
            # The trial did not finish its objective call successfully.
            return

        with self._lock:
            try:
                best_number = study.best_trial.number
            except ValueError:
                # No trial has completed yet.
                return

            if best_number != trial.number:
                return

            (
                self.best_model,
                self.best_x_train,
                self.best_x_validate,
                self.best_y_train,
                self.best_y_validate,
                self.best_x_test,
            ) = artefacts


In [13]:
def create_submission(best_model, x_test, parent_run):
    """Write the Kaggle submission file for ``best_model``.

    The predictions for ``x_test`` replace the ``Survived`` column of the
    sample submission, and the result is saved as
    ``submissions/<parent run id>.csv``.

    Parameters
    ----------
    best_model :
        Fitted estimator exposing ``predict``.
    x_test :
        Test features, in the same row order as the sample submission.
    parent_run :
        MLflow run whose id names the output file.
    """
    import os

    # Predict the test labels; the submission format expects integers.
    y_pred = best_model.predict(x_test).astype(int)

    df_submission = pd.read_csv("../01_RawData/gender_submission.csv")
    df_submission['Survived'] = y_pred

    # to_csv does not create missing directories.
    os.makedirs('submissions', exist_ok=True)
    df_submission.to_csv(f'submissions/{parent_run.info.run_id}.csv', index=False)


In [14]:
# Hyper-parameter search for one model family, followed by evaluation of the
# best pipeline and creation of the Kaggle submission.
model_type = 'logistic-regression'

parent_run = client.create_run(experiment_id=experiment)

objective = Objective(model_type, parent_run)

study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(),
    direction="maximize"
)

try:
    study.optimize(
        objective,
        n_trials=config_data["N_TRAILS"],
        timeout=config_data["TIMEOUT"],
        n_jobs=-1,
        callbacks=[objective.callback]
    )

    print("Best trial:")
    print(study.best_value)
    print(study.best_params)

    client.log_metric(parent_run.info.run_id, "best_cv_score", round(study.best_value, 3))

    client.log_param(parent_run.info.run_id, "cv_n_splits", cv.n_splits)
    client.log_param(parent_run.info.run_id, "cv_train_size", cv.train_size)
    client.log_param(parent_run.info.run_id, "cv_test_size", cv.test_size)
    client.log_param(parent_run.info.run_id, "cv_random_state", cv.random_state)

    print("Log best parameters")
    for param, value in study.best_params.items():
        client.log_param(parent_run.info.run_id, param, value)

    best_model = objective.best_model
    if best_model is None:
        raise RuntimeError("The study finished without capturing a best model")

    x_train = objective.best_x_train
    y_train = objective.best_y_train
    x_validate = objective.best_x_validate
    y_validate = objective.best_y_validate
    x_test = objective.best_x_test

    print("Fit best model")
    # The pipeline kept by the objective is unfitted (cross_val_score works on
    # clones), so fit it before it is saved or used for prediction.
    best_model.fit(x_train, y_train)

    print("Save best model")
    # Persist the *fitted* pipeline so the saved artefact can predict directly.
    mlflow.sklearn.save_model(best_model, f"models/{parent_run.info.run_id}/")

    print("Create submission")
    create_submission(best_model, x_test, parent_run)

    print("Predict training outcome")
    # Class predictions for the held-out validation split.
    y_validate_pred = best_model.predict(x_validate)

    # Probabilities of the positive class only.
    y_validate_proba = best_model.predict_proba(x_validate)
    y_validate_scores = y_validate_proba[:, 1]

    print("Evaluate model performance")
    evaluate_model(x_train, y_train, y_validate, y_validate_pred, y_validate_scores, best_model, parent_run)
except Exception:
    client.set_terminated(parent_run.info.run_id, status="FAILED")
    raise
else:
    # Runs created through MlflowClient are not the fluent "active run", so
    # mlflow.end_run() alone would leave the parent run in the RUNNING state.
    client.set_terminated(parent_run.info.run_id)

mlflow.end_run()

[32m[I 2022-10-27 21:11:33,075][0m A new study created in memory with name: no-name-f01eb2c2-acb1-4599-866c-493b9a7e780a[0m
[32m[I 2022-10-27 21:11:33,995][0m Trial 3 finished with value: 0.8335664335664337 and parameters: {'preprocessing_n_bins_fare': 7, 'preprocessing_n_bins_age': 13, 'preprocessing_transform_skewed_features_flag': False, 'preprocessing_ohe_min_frequency': 0.04371747120300762, 'preprocessing_ohe_max_categories': 51, 'preprocessing_feature_selection_low_variance_flag': False, 'preprocessing_correlation': 0.8866289499066307, 'columnprep__transformers_num': 'StandardScaler', 'lr_C': 1.8815313242511988, 'lr_penalty': 'l1'}. Best is trial 3 with value: 0.8335664335664337.[0m
[32m[I 2022-10-27 21:11:34,070][0m Trial 1 finished with value: 0.8244755244755245 and parameters: {'preprocessing_n_bins_fare': 13, 'preprocessing_n_bins_age': 8, 'preprocessing_transform_skewed_features_flag': False, 'preprocessing_ohe_min_frequency': 0.12903945108420858, 'preprocessing_ohe_

Best trial:
0.8363636363636363
{'preprocessing_n_bins_fare': 7, 'preprocessing_n_bins_age': 7, 'preprocessing_transform_skewed_features_flag': False, 'preprocessing_ohe_min_frequency': 0.03344515812898148, 'preprocessing_ohe_max_categories': 80, 'preprocessing_feature_selection_low_variance_flag': False, 'preprocessing_correlation': 0.8559154711071972, 'columnprep__transformers_num': 'MinMaxScaler', 'lr_C': 18.04245328784567, 'lr_penalty': 'l1'}
Log best parameters
Save best model
Fit best model
Create submission
Predict training outcome
Evaluate model performance


In [15]:
# Show the feature columns of the training split kept from the best trial.
list(x_train)

['Pclass',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'TNlen',
 'LeadingDigit',
 'Fare_bin',
 'x0_male',
 'x1_C',
 'x1_Q',
 'x1_S',
 'x2_adult',
 'x2_children',
 'x2_senior',
 'x3_Alone',
 'x3_Medium',
 'x3_infrequent_sklearn',
 'x4_infrequent_sklearn',
 'x5_Master',
 'x5_Miss',
 'x5_Mrs',
 'x5_infrequent_sklearn',
 'x6_NoPrefix',
 'x6_ca',
 'x6_pc',
 'x6_infrequent_sklearn',
 'x7_infrequent_sklearn',
 'x8_outlier']

In [11]:
# End the active fluent MLflow run, if one is open.
mlflow.end_run()