# Basic ML Models

In [1]:
import pandas as pd
print("pandas version: {}". format(pd.__version__))

# numpy: support for large, multi-dimensional arrays and matrices and high-level mathematical functions
import numpy as np
print("numpy version: {}". format(np.__version__))

import sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit, train_test_split, cross_val_score, learning_curve
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import precision_recall_curve, roc_curve, confusion_matrix, roc_auc_score
print("sklearn version: {}". format(sklearn.__version__))

import optuna
print("optuna version: {}". format(optuna.__version__))

import mlflow
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
from mlflow.tracking import MlflowClient
print("mlflow version: {}". format(mlflow.__version__))

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
import warnings
warnings.simplefilter('ignore')


import yaml
# Load the notebook configuration (data paths, random state, Optuna search
# ranges, MLflow experiment name) that the cells below read from `config_data`.
with open('ml_parameter.yaml') as file:
  config_data= yaml.safe_load(file)

pandas version: 1.4.3
numpy version: 1.21.5
sklearn version: 1.1.1


  from .autonotebook import tqdm as notebook_tqdm


optuna version: 2.10.1
mlflow version: 1.28.0


In [2]:
# Resolve the id of the MLflow experiment that all runs of this notebook use.
client = MlflowClient()

# Look the experiment up first instead of wrapping create_experiment in a bare
# `except:`; the bare handler also swallowed unrelated failures (missing config
# key, tracking-server errors, KeyboardInterrupt) and then failed later with a
# confusing AttributeError on None.
_existing_experiment = client.get_experiment_by_name(config_data["experiment_name"])
if _existing_experiment is None:
    # create_experiment returns the id of the newly created experiment
    experiment = client.create_experiment(config_data["experiment_name"])
else:
    experiment = _existing_experiment.experiment_id

In [3]:
def load_data():
    """Read the prepared datasets and split off a validation set.

    The training frame is separated into the 'Survived' target and the
    remaining feature columns; 20% of it is held out for validation using the
    configured random state. The test frame is returned unchanged.

    Returns:
        tuple: (x_train, y_train, x_validate, y_validate, x_test)
    """
    train_frame = pd.read_pickle(config_data['path_df_train'])
    x_test = pd.read_pickle(config_data['path_df_test'])

    # separate the survival class from the input features
    target = train_frame['Survived']
    features = train_frame.drop(columns=['Survived'])

    x_fit, x_validate, y_fit, y_validate = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=config_data["RANDOM_STATE"],
    )

    return x_fit, y_fit, x_validate, y_validate, x_test


x_train, y_train, x_validate, y_validate, x_test = load_data()

In [4]:
# display the names of the feature columns in the training split
list(x_train)

['Pclass',
 'SibSp',
 'Parch',
 'Family',
 'TNlen',
 'LeadingDigit',
 'Fare_bin',
 'Age_bin',
 'x0_male',
 'x1_C',
 'x1_Q',
 'x1_S',
 'x2_adult',
 'x2_children',
 'x3_Alone',
 'x3_Small',
 'x5_Miss',
 'x5_Mr',
 'x5_Mrs',
 'x6_NoPrefix',
 'x6_pc',
 'x8_outlier']

In [14]:
# Split the feature columns into encoder-generated columns (named "x<digit>...",
# e.g. "x0_male") and the remaining columns.
# `and` short-circuits, unlike the bitwise `&` used before, which always
# evaluated x[1] and raised IndexError for column names shorter than two
# characters; the explicit length check covers a column literally named "x".
transformer_not_num = [
    col for col in list(x_train)
    if len(col) > 1 and col.startswith("x") and col[1].isnumeric()
]
print(transformer_not_num)

transformer_num = [col for col in list(x_train) if col not in transformer_not_num]
print(transformer_num)


['x0_male', 'x1_C', 'x1_Q', 'x1_S', 'x2_adult', 'x2_children', 'x3_Alone', 'x3_Small', 'x5_Miss', 'x5_Mr', 'x5_Mrs', 'x6_NoPrefix', 'x6_pc', 'x8_outlier']
['Pclass', 'SibSp', 'Parch', 'Family', 'TNlen', 'LeadingDigit', 'Fare_bin', 'Age_bin']


In [4]:
# Columns handed to the scaler; the one-hot-encoded [0,1] columns are left out
# on purpose and pass through the ColumnTransformer unchanged.
transformer_num = ["Pclass", "SibSp", "Parch", "Family", "TNlen", "LeadingDigit", "Fare_bin", "Age_bin"]

# Cross-validation scheme shared by all trials: 10 random splits with 20% held
# out each time, seeded from the configuration.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=config_data["RANDOM_STATE"])

In [5]:
def create_model(trial, model_type, parent_run):
    """Build an unfitted preprocessing + classifier pipeline for one Optuna trial.

    The scaler for the numeric columns and the hyperparameters of the requested
    algorithm are sampled from ``trial``. Every sampled value is logged to a new
    MLflow run that is tagged as a child of ``parent_run``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        model_type: 'svm', 'logistic-regression' or 'decision-tree'.
        parent_run: MLflow run the new child run is attached to.

    Returns:
        tuple: ``(pipeline, child_run)`` - the sklearn Pipeline with the steps
        'columnprep' and 'algo', and the MLflow child run created for the trial.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    supported_model_types = ('svm', 'logistic-regression', 'decision-tree')
    # Validate before creating the MLflow run: an unknown type used to fall
    # through all branches and fail with UnboundLocalError on `model`, leaving
    # an orphaned child run behind.
    if model_type not in supported_model_types:
        raise ValueError(
            "Unsupported model_type %r, expected one of %s" % (model_type, supported_model_types)
        )

    child_run = client.create_run(
        experiment_id=experiment,
        tags={
            MLFLOW_PARENT_RUN_ID: parent_run.info.run_id
        }
    )
    run_id = child_run.info.run_id

    # --- columnprep: scale the numeric columns, pass the others through ---
    columnprep__transformers_num = trial.suggest_categorical("columnprep__transformers_num", ["StandardScaler", "MinMaxScaler"])
    scaler_classes = {
        "StandardScaler": StandardScaler,
        "MinMaxScaler": MinMaxScaler,
    }
    col_transform = ColumnTransformer(
        transformers=[
            ('num', scaler_classes[columnprep__transformers_num](), transformer_num)
        ], remainder='passthrough'
    )
    # logged like every other sampled parameter so the child runs are comparable
    client.log_param(run_id, "columnprep__transformers_num", columnprep__transformers_num)

    # --- algo: sample and log the hyperparameters of the requested model ---
    if model_type == 'svm':
        svm_kernel = trial.suggest_categorical('svm_kernel', config_data["svm_kernel"])
        svm_C = trial.suggest_float('svm_C', config_data["svm_C"][0], config_data["svm_C"][1], log=True)
        svm_degree = trial.suggest_discrete_uniform('svm_degree', config_data["svm_degree"][0], config_data["svm_degree"][1], config_data["svm_degree"][2])

        model = SVC(
            kernel=svm_kernel,
            C=svm_C,
            # suggest_discrete_uniform yields a float, SVC documents degree as int
            degree=int(svm_degree),
            probability=True,
            random_state=config_data["RANDOM_STATE"]
        )

        client.log_param(run_id, "svm_kernel", svm_kernel)
        client.log_param(run_id, "svm_C", svm_C)
        client.log_param(run_id, "svm_degree", svm_degree)

    elif model_type == 'logistic-regression':
        lr_C = trial.suggest_float("lr_C", config_data["lr_C"][0], config_data["lr_C"][1], log=True)
        lr_penalty = trial.suggest_categorical('lr_penalty', config_data["lr_penalty"])
        # the solver is derived from the penalty: 'saga' for l1, 'lbfgs' otherwise
        if lr_penalty == 'l1':
            lr_solver = 'saga'
        else:
            lr_solver = 'lbfgs'

        model = LogisticRegression(
            C=lr_C,
            penalty=lr_penalty,
            solver=lr_solver,
            random_state=config_data["RANDOM_STATE"],
            n_jobs=-1
        )

        client.log_param(run_id, "lr_C", lr_C)
        client.log_param(run_id, "lr_penalty", lr_penalty)
        client.log_param(run_id, "lr_solver", lr_solver)

    else:  # 'decision-tree' (guaranteed by the validation above)
        # the upper bound of the depth search is the number of feature columns
        dt_max_depth = trial.suggest_int('dt_max_depth', config_data["dt_max_depth"][0], x_train.shape[1])
        dt_criterion = trial.suggest_categorical("dt_criterion", config_data["dt_criterion"])
        dt_max_leaf_nodes = trial.suggest_int("dt_max_leaf_nodes", config_data["dt_max_leaf_nodes"][0], config_data["dt_max_leaf_nodes"][1])

        model = DecisionTreeClassifier(
            max_depth=dt_max_depth,
            criterion=dt_criterion,
            max_leaf_nodes=dt_max_leaf_nodes,
            random_state=config_data["RANDOM_STATE"]
        )

        client.log_param(run_id, "dt_max_depth", dt_max_depth)
        client.log_param(run_id, "dt_criterion", dt_criterion)
        client.log_param(run_id, "dt_max_leaf_nodes", dt_max_leaf_nodes)

    client.log_param(run_id, "algo", model.__class__.__name__)

    pipeline = Pipeline(steps=[
        ('columnprep', col_transform),
        ('algo', model)
    ])

    return pipeline, child_run

In [6]:
def evaluate_model(x_train, y_train, y_validate, y_validate_pred, y_validate_scores, pipeline, child_run):
    """
    Evaluate a classification model on the validation data and log the results
    to the MLflow run ``child_run``.

    Logged artifacts and metrics:
    - confusion matrix (figure)
    - precision / recall / F1 over the decision threshold and the
      precision-recall curve (figures), best F1 as metric "f1_score"
    - ROC curve (figure), ROC AUC as metric "roc_auc"
    - learning curve (figure)

    Args:
        x_train: Training features, passed to ``learning_curve``.
        y_train: Training labels, passed to ``learning_curve``.
        y_validate: True labels of the validation set.
        y_validate_pred: Predicted labels for the validation set.
        y_validate_scores: Scores for the positive class on the validation set.
        pipeline: Estimator passed to ``learning_curve``.
        child_run: MLflow run that receives the figures and metrics.
    """
    run_id = child_run.info.run_id

    def plot_learning_curve(pipeline, x_train, y_train):
        train_sizes, train_scores, test_scores = learning_curve(
            pipeline,
            x_train,
            y_train,
            cv=5,
            n_jobs=-1,
            train_sizes=np.linspace(.1, 1.0, 8)
            )

        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)

        fig1, ax1 = plt.subplots()
        ax1.set_xlabel("Training examples")
        ax1.set_ylabel("Score")
        ax1.grid()

        ax1.fill_between(train_sizes, train_scores_mean - train_scores_std,
                        train_scores_mean + train_scores_std, alpha=0.1,
                        color="r")
        ax1.fill_between(train_sizes, test_scores_mean - test_scores_std,
                        test_scores_mean + test_scores_std, alpha=0.1, color="g")
        ax1.plot(train_sizes, train_scores_mean, 'o-', color="r",
                label="Training score")
        ax1.plot(train_sizes, test_scores_mean, 'o-', color="g",
                label="Cross-validation score")

        ax1.legend(loc="best")
        # Use the last (largest) training size instead of the hard-coded index
        # 7: learning_curve may return fewer sizes than requested.
        # The title shows the CV score as a percentage of the training score.
        ax1.set_title("Difference between training and CV: "\
            + str(round(test_scores_mean[-1] / train_scores_mean[-1] * 100, 2))\
            + "%")
        client.log_figure(run_id, fig1, 'plot_learning_curve.png')
        plt.close(fig1)


    def plot_confusion_matrix(y_validate, y_validate_pred):
        # compute the matrix once and reuse it for counts, percentages and plot
        cm = confusion_matrix(y_validate, y_validate_pred)

        group_names = ["True Neg", "False Pos", "False Neg", "True Pos"]
        group_counts = ["{0:0.0f}".format(value) for value in cm.flatten()]
        group_percentages = ["{0:.2%}".format(value) for value in cm.flatten()/np.sum(cm)]
        labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
                  zip(group_names,group_counts,group_percentages)]
        labels = np.asarray(labels).reshape(2,2)

        fig2, ax2 = plt.subplots()
        sns.heatmap(cm, annot=labels, fmt="", cmap='Blues', ax=ax2)
        ax2.set_xlabel('Predicted Label')
        ax2.set_ylabel('True Label')
        client.log_figure(run_id, fig2, 'plot_confusion_matrix.png')
        plt.close(fig2)


    def plot_precision_recall_vs_threshold(y_validate, y_scores):
        precisions, recalls, thresholds = precision_recall_curve(y_validate, y_scores)

        # F1 per threshold. Where precision + recall == 0 the plain division
        # produced NaN, and np.argmax then selected that NaN entry; treat F1
        # as 0 there instead.
        pr_sum = precisions + recalls
        fscore = np.divide(
            2 * precisions * recalls,
            pr_sum,
            out=np.zeros_like(pr_sum, dtype=float),
            where=pr_sum > 0,
        )
        # The final precision/recall pair has no threshold (thresholds is one
        # element shorter), so it is excluded from the search for the best F1.
        ix = int(np.argmax(fscore[:-1]))

        client.log_metric(run_id, "f1_score", round(float(fscore[ix]), 5))

        fig3, ax3 = plt.subplots()
        ax3.plot(thresholds, precisions[:-1], "b", label="Precision")
        ax3.plot(thresholds, recalls[:-1], "g", label="Recall")
        ax3.plot(thresholds, fscore[:-1], "r", label="F1 Score")
        ax3.axvline(x=thresholds[ix], color='red', linestyle='--')
        ax3.axhline(y=precisions[ix], color='b', linestyle='--')
        ax3.axhline(y=recalls[ix], color='g', linestyle='--')
        ax3.set_xlabel("Threshold")
        ax3.legend(loc="upper left")
        ax3.set_ylim([0,1])
        client.log_figure(run_id, fig3, 'plot_f1.png')
        plt.close(fig3)

        fig4, ax4 = plt.subplots()
        ax4.plot(recalls, precisions, marker='.', label='Logistic')
        ax4.scatter(recalls[ix], precisions[ix], 200, marker='o', color='red', label='Best')
        ax4.set_xlabel('Recall')
        ax4.set_ylabel('Precision')
        client.log_figure(run_id, fig4, 'plot_precision_recall.png')
        plt.close(fig4)


    def plot_roc_curve(y_validate, y_scores):
        fpr, tpr, thresholds = roc_curve(y_validate, y_scores)

        roc_auc = round(roc_auc_score(y_validate, y_scores), 3)

        # point with the largest distance to the diagonal (max of tpr - fpr)
        optimal_idx = np.argmax(tpr - fpr)

        fig5, ax5 = plt.subplots()
        ax5.plot(fpr, tpr, linewidth=2)
        ax5.plot([0,1], [0,1], 'k--')
        ax5.axis([0,1,0,1])
        ax5.scatter(fpr[optimal_idx], tpr[optimal_idx], 200, marker='o', color='red', label='Best')
        ax5.set_xlabel('False Positive Rate')
        ax5.set_ylabel('True Positive Rate')
        client.log_figure(run_id, fig5, 'plot_roc_curve.png')
        plt.close(fig5)

        client.log_metric(run_id, "roc_auc", roc_auc)


    plot_confusion_matrix(y_validate, y_validate_pred)
    plot_precision_recall_vs_threshold(y_validate, y_validate_scores)
    plot_roc_curve(y_validate, y_validate_scores)
    plot_learning_curve(pipeline, x_train, y_train)

In [7]:
class Objective:
    """Optuna objective that scores one candidate pipeline per trial.

    Calling the instance builds a pipeline via ``create_model``, scores it with
    cross-validated accuracy on the notebook-level training data and logs the
    score to the trial's MLflow child run. ``callback`` is meant to be passed
    to ``study.optimize(callbacks=[...])`` and keeps the (unfitted) pipeline of
    the best trial in ``best_model``.

    Args:
        model_type: Algorithm identifier forwarded to ``create_model``.
        parent_run: MLflow run the per-trial child runs are attached to.
    """

    def __init__(self, model_type, parent_run):
        # local import: only needed for the lock that guards the shared state
        import threading

        self.best_model = None
        self._model = None  # most recently created pipeline
        # Pipelines of the trials in flight, keyed by trial number. With
        # study.optimize(n_jobs=-1) several trials run at once on this same
        # instance, so a single shared `_model` slot gets overwritten by other
        # trials before the callback reads it.
        self._models = {}
        self._lock = threading.Lock()
        self.model_type = model_type
        self.parent_run = parent_run


    def __call__(self, trial):
        """Create, cross-validate and log the pipeline for ``trial``; return the mean accuracy."""
        pipeline, child_run = create_model(trial, self.model_type, self.parent_run)
        with self._lock:
            self._model = pipeline
            self._models[trial.number] = pipeline

        try:
            score = cross_val_score(
                pipeline,
                x_train,
                y_train,
                cv=cv,
                scoring="accuracy",
                n_jobs=-1
            ).mean()
        except Exception:
            # do not leave the child run open when the evaluation fails
            client.set_terminated(child_run.info.run_id, status="FAILED")
            raise

        client.log_metric(child_run.info.run_id, "cv_score", score)
        # runs created through MlflowClient have to be closed explicitly
        client.set_terminated(child_run.info.run_id)

        return score

    def callback(self, study, trial):
        """Remember the pipeline of ``trial`` if it is the study's best trial."""
        with self._lock:
            # look the pipeline up by trial number so that a concurrently
            # running trial cannot be mistaken for the finished one
            pipeline = self._models.pop(trial.number, None)
            if pipeline is not None and study.best_trial.number == trial.number:
                self.best_model = pipeline


In [8]:
def create_submission(best_model, x_test, parent_run):
    """Write the test-set predictions of ``best_model`` as a submission CSV.

    The sample submission file is used as template; its 'Survived' column is
    replaced by the integer predictions and the result is written to
    ``submissions/<run_id>.csv``.

    Args:
        best_model: Fitted estimator providing ``predict``.
        x_test: Test features to predict on.
        parent_run: MLflow run whose run id names the output file.
    """
    import os  # local import: only needed to create the output directory

    # predict the test values with the training classification model
    y_pred = best_model.predict(x_test).astype(int)

    df_submission = pd.read_csv("../01_RawData/gender_submission.csv")
    df_submission['Survived'] = y_pred

    # to_csv does not create missing directories
    os.makedirs('submissions', exist_ok=True)
    df_submission.to_csv('submissions/%s.csv'%parent_run.info.run_id, index=False)


In [9]:
def create_new_mlrun(model_type):
    """Run one hyperparameter search for ``model_type`` and log it to MLflow.

    Creates a parent MLflow run, optimises the pipeline with Optuna (each trial
    is logged as a child run), then fits the best pipeline on the training
    data, saves it, writes a submission file and logs the validation plots and
    metrics to the parent run.

    Args:
        model_type: Algorithm identifier forwarded to ``Objective``.

    Returns:
        The finished Optuna study.
    """
    parent_run = client.create_run(experiment_id=experiment)
    parent_run_id = parent_run.info.run_id
    run_status = "FAILED"

    try:
        objective = Objective(model_type, parent_run)

        study = optuna.create_study(
            sampler=optuna.samplers.TPESampler(),
            direction="maximize"
        )

        study.optimize(
            objective,
            n_trials=config_data["N_TRAILS"],
            timeout=config_data["TIMEOUT"],
            n_jobs=-1,
            callbacks=[objective.callback]
        )

        best_params = study.best_params

        print("Best trial:")
        print(study.best_value)
        print(best_params)

        client.log_metric(parent_run_id, "best_cv_score", round(study.best_value, 3))
        client.log_param(parent_run_id, "transformer_num", str(transformer_num))

        client.log_param(parent_run_id, "cv_n_splits", cv.n_splits)
        client.log_param(parent_run_id, "cv_train_size", cv.train_size)
        client.log_param(parent_run_id, "cv_test_size", cv.test_size)
        client.log_param(parent_run_id, "cv_random_state", cv.random_state)

        for param_name, param_value in best_params.items():
            client.log_param(parent_run_id, param_name, param_value)

        # The pipeline kept by the objective is still unfitted, so it has to be
        # fitted BEFORE it is saved; saving first persisted an unusable model.
        best_model = objective.best_model
        best_model.fit(x_train, y_train)

        # save the fitted best model as file
        mlflow.sklearn.save_model(best_model, "models/%s/"%parent_run_id)

        # create submission of best model
        create_submission(best_model, x_test, parent_run)

        # predict the validation outcome
        y_validate_pred = best_model.predict(x_validate)

        # predict probabilities
        y_validate_proba = best_model.predict_proba(x_validate)
        # keep probabilities for the positive outcome only
        y_validate_scores = y_validate_proba[:, 1]

        evaluate_model(x_train, y_train, y_validate, y_validate_pred, y_validate_scores, best_model, parent_run)

        run_status = "FINISHED"
    finally:
        # The parent run was created through MlflowClient, so it has to be
        # closed through the client as well (also when a step above failed).
        client.set_terminated(parent_run_id, status=run_status)
        # kept from the original: ends a fluent-API run should one be active
        mlflow.end_run()

    return study

In [10]:
# hyperparameter search for logistic regression; the last line displays the
# optimization history of the returned study
study_lr = create_new_mlrun(model_type='logistic-regression')
optuna.visualization.plot_optimization_history(study_lr)

[32m[I 2022-10-19 20:36:13,208][0m A new study created in memory with name: no-name-bbdd6b86-bd56-47fe-bc7d-a4cd3798a3a4[0m
[32m[I 2022-10-19 20:36:16,758][0m Trial 0 finished with value: 0.8244755244755245 and parameters: {'columnprep__transformers_num': 'StandardScaler', 'lr_C': 2.840989527345764, 'lr_penalty': 'l2'}. Best is trial 0 with value: 0.8244755244755245.[0m
[32m[I 2022-10-19 20:36:16,846][0m Trial 2 finished with value: 0.8244755244755245 and parameters: {'columnprep__transformers_num': 'StandardScaler', 'lr_C': 84.2330153610002, 'lr_penalty': 'l2'}. Best is trial 0 with value: 0.8244755244755245.[0m
[32m[I 2022-10-19 20:36:16,950][0m Trial 3 finished with value: 0.8258741258741258 and parameters: {'columnprep__transformers_num': 'MinMaxScaler', 'lr_C': 10.900449044464558, 'lr_penalty': 'l1'}. Best is trial 3 with value: 0.8258741258741258.[0m
[32m[I 2022-10-19 20:36:16,963][0m Trial 1 finished with value: 0.8244755244755245 and parameters: {'columnprep__tran

Best trial:
0.8293706293706293
{'columnprep__transformers_num': 'MinMaxScaler', 'lr_C': 3.5558573518232306, 'lr_penalty': 'l2'}


In [11]:
# hyperparameter search for the support vector machine; the last line displays
# the optimization history of the returned study
study_svm = create_new_mlrun(model_type='svm')
optuna.visualization.plot_optimization_history(study_svm)

[32m[I 2022-10-19 20:36:29,984][0m A new study created in memory with name: no-name-0d5b980a-f66c-43a5-929b-e21558dbde11[0m
[32m[I 2022-10-19 20:36:30,340][0m Trial 1 finished with value: 0.6685314685314685 and parameters: {'columnprep__transformers_num': 'StandardScaler', 'svm_kernel': 'sigmoid', 'svm_C': 71.55526942442215, 'svm_degree': 4.0}. Best is trial 1 with value: 0.6685314685314685.[0m
[32m[I 2022-10-19 20:36:32,613][0m Trial 3 finished with value: 0.7909090909090909 and parameters: {'columnprep__transformers_num': 'StandardScaler', 'svm_kernel': 'rbf', 'svm_C': 33.35709610622676, 'svm_degree': 2.0}. Best is trial 3 with value: 0.7909090909090909.[0m
[32m[I 2022-10-19 20:36:32,696][0m Trial 0 finished with value: 0.806993006993007 and parameters: {'columnprep__transformers_num': 'StandardScaler', 'svm_kernel': 'rbf', 'svm_C': 19.478800324737588, 'svm_degree': 1.0}. Best is trial 0 with value: 0.806993006993007.[0m
[32m[I 2022-10-19 20:36:32,876][0m Trial 4 finish

Best trial:
0.8307692307692308
{'columnprep__transformers_num': 'MinMaxScaler', 'svm_kernel': 'rbf', 'svm_C': 1.9756400270883048, 'svm_degree': 1.0}


In [12]:
# hyperparameter search for the decision tree; the last line displays the
# optimization history of the returned study
study_dt = create_new_mlrun(model_type='decision-tree')
optuna.visualization.plot_optimization_history(study_dt)

[32m[I 2022-10-19 20:39:55,458][0m A new study created in memory with name: no-name-7e78166f-dc6c-4040-83ec-a667efded12c[0m
[32m[I 2022-10-19 20:39:55,888][0m Trial 2 finished with value: 0.813986013986014 and parameters: {'columnprep__transformers_num': 'StandardScaler', 'dt_max_depth': 14, 'dt_criterion': 'entropy', 'dt_max_leaf_nodes': 5}. Best is trial 2 with value: 0.813986013986014.[0m
[32m[I 2022-10-19 20:39:55,955][0m Trial 1 finished with value: 0.8111888111888111 and parameters: {'columnprep__transformers_num': 'StandardScaler', 'dt_max_depth': 18, 'dt_criterion': 'gini', 'dt_max_leaf_nodes': 5}. Best is trial 2 with value: 0.813986013986014.[0m
[32m[I 2022-10-19 20:39:55,975][0m Trial 3 finished with value: 0.7895104895104895 and parameters: {'columnprep__transformers_num': 'StandardScaler', 'dt_max_depth': 7, 'dt_criterion': 'entropy', 'dt_max_leaf_nodes': 3}. Best is trial 2 with value: 0.813986013986014.[0m
[32m[I 2022-10-19 20:39:55,993][0m Trial 0 finished

Best trial:
0.83006993006993
{'columnprep__transformers_num': 'MinMaxScaler', 'dt_max_depth': 8, 'dt_criterion': 'entropy', 'dt_max_leaf_nodes': 7}


In [13]:
# End the active fluent-API MLflow run, if any.
# NOTE(review): the runs in this notebook are created via MlflowClient.create_run,
# so this presumably does not close them - confirm.
mlflow.end_run()