# Basic ML Models for the Kaggle House Prices Dataset

In [1]:
import pandas as pd
print("pandas version: {}". format(pd.__version__))

# numpy: support for large, multi-dimensional arrays and matrices and high-level mathematical functions
import numpy as np
print("numpy version: {}". format(np.__version__))

import sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, Ridge, BayesianRidge, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import ShuffleSplit, train_test_split, cross_val_score, learning_curve
print("sklearn version: {}". format(sklearn.__version__))

import optuna
print("optuna version: {}". format(optuna.__version__))

import mlflow
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
from mlflow.tracking import MlflowClient
print("mlflow version: {}". format(mlflow.__version__))

import matplotlib.pyplot as plt
import seaborn as sns

import os
from datetime import datetime
import warnings
warnings.simplefilter('ignore')

import yaml
# Load the run configuration from YAML. Later cells read the "N_TRAILS" and
# "TIMEOUT" keys for the Optuna study and pass the whole dict to create_algorithm.
with open('ml_parameter.yaml') as file:
  config_data= yaml.safe_load(file)


from create_algorithm import create_algorithm

pandas version: 1.5.2
numpy version: 1.23.5
sklearn version: 1.2.0


  from .autonotebook import tqdm as notebook_tqdm


optuna version: 2.10.1
mlflow version: 2.1.1


In [2]:
# Selects which preprocessed pickles load_data() reads. With "log" the "*_log.pkl"
# files are used and predictions are mapped back with np.expm1 further down.
prepared_data = "log"
# prepared_data = "no log"

VERSION = 0.3
SCRIPT = "houseprices_BasicModels_mlflow_optuna"

client = MlflowClient()

# `experiment` holds the MLflow experiment *id* (not the Experiment entity); it is
# passed as `experiment_id` whenever a run is created.
# Look the experiment up first instead of wrapping create_experiment() in a bare
# `except:`, which would also hide unrelated failures (tracking server
# unreachable, permission problems, KeyboardInterrupt, ...).
experiment_name = f"{SCRIPT}_{VERSION}"
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment = client.create_experiment(experiment_name)
else:
    experiment = existing_experiment.experiment_id

In [3]:
def load_data(prepared_data):
    """Read the prepared pickles and carve a validation set out of the training data.

    Parameters
    ----------
    prepared_data : str
        "log" selects the "*_log.pkl" files; any other value selects the
        files without the "_log" suffix.

    Returns
    -------
    tuple
        (x_train, y_train, x_validate, y_validate, x_test). The validation
        part is a fixed 20 % hold-out (random_state=42) of the training frame.
    """
    if prepared_data == "log":
        train_path = "../03_DataPreprocessing/df_train_prepared_reduced_log.pkl"
        test_path = "../03_DataPreprocessing/df_test_prepared_reduced_log.pkl"
    else:
        train_path = "../03_DataPreprocessing/df_train_prepared_reduced.pkl"
        test_path = "../03_DataPreprocessing/df_test_prepared_reduced.pkl"

    train_frame = pd.read_pickle(train_path)
    test_frame = pd.read_pickle(test_path)

    # Separate the target column (SalePrice) from the input features.
    target = train_frame['SalePrice']
    features = train_frame.drop(columns=['SalePrice'])

    # Hold out 20 % of the labelled rows for validation.
    features_fit, features_val, target_fit, target_val = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    return features_fit, target_fit, features_val, target_val, test_frame

x_train, y_train, x_validate, y_validate, x_test = load_data(prepared_data)

In [4]:
# define cross validation
cv = ShuffleSplit(
    n_splits = 5,
    test_size = 0.2,
    random_state = 42
    )


def _is_not_numeric_feature(column_name):
    """Return True for columns that must bypass numeric scaling.

    A column is excluded when its name is "x" followed by a digit
    (presumably encoder-generated columns -- TODO confirm against the
    preprocessing notebook) or when it ends with "_bin".

    Uses short-circuiting `and`/`or` and a slice instead of `&`/`|` with
    `name[1]`, so one-character column names no longer raise IndexError.
    """
    starts_with_x_digit = column_name.startswith("x") and column_name[1:2].isnumeric()
    return starts_with_x_digit or column_name.endswith("_bin")


# Columns passed through unchanged vs. columns handed to the scaler in create_model().
transformer_not_num = [col for col in list(x_train) if _is_not_numeric_feature(col)]
transformer_num = [col for col in list(x_train) if col not in transformer_not_num]

In [5]:
def create_model(self, trial):
    """Assemble the pipeline for one Optuna trial and open its MLflow child run.

    Parameters
    ----------
    self : object
        Provides `parent_run` (the MLflow run the child is attached to) and
        `model_type` (forwarded to create_algorithm).
    trial : optuna trial
        Used to sample the scaler choice and the PCA `n_components` value;
        also forwarded to create_algorithm.

    Returns
    -------
    tuple
        (pipeline, child_run) -- the unfitted scikit-learn Pipeline
        (scaling -> PCA -> estimator) and the MLflow run created for the trial.
    """
    # Every trial gets its own run, linked to the parent through the parent-run tag.
    run_tags = {MLFLOW_PARENT_RUN_ID: self.parent_run.info.run_id}
    child_run = client.create_run(experiment_id=experiment, tags=run_tags)

    def _scale_numeric_columns(scaler):
        # Scale only the columns in transformer_num; everything else passes through.
        return ColumnTransformer(
            transformers=[('num', scaler, transformer_num)],
            remainder='passthrough',
        )

    # columnprep: which scaler to apply to the numeric columns
    scaler_choice = trial.suggest_categorical(
        "columnprep__transformers_num", ["StandardScaler", "MinMaxScaler"])

    if scaler_choice == "StandardScaler":
        col_transform = _scale_numeric_columns(StandardScaler())
    elif scaler_choice == "MinMaxScaler":
        col_transform = _scale_numeric_columns(MinMaxScaler())

    # The estimator and its hyper-parameters come from the project helper.
    model = create_algorithm(self.model_type, trial, client, child_run, config_data)

    # PCA n_components is sampled as a float between 0 and 1 and logged on the child run.
    pca_n_components = trial.suggest_float('pca_n_components', 0, 1)
    client.log_param(child_run.info.run_id, "pca_n_components", pca_n_components)

    pipeline_steps = [
        ('columnprep', col_transform),
        ('reduce_dim', PCA(pca_n_components)),
        ('algo', model),
    ]
    pipeline = Pipeline(steps=pipeline_steps)

    return pipeline, child_run

In [6]:
def create_submission(best_model, x_test, parent_run):
    """Predict the test set with the fitted model and write a Kaggle submission CSV.

    Parameters
    ----------
    best_model : fitted estimator/pipeline
        Must expose `predict`.
    x_test : DataFrame
        Test features passed to `best_model.predict`.
    parent_run : MLflow run
        Its run id becomes the file name of the submission.

    The file is written to `submissions/<SCRIPT>/<VERSION>/<run_id>.csv`, using
    `../01_RawData/sample_submission.csv` as the template whose second column
    is overwritten with the predictions.
    """
    # exist_ok=True replaces the separate exists() check (no check-then-create gap).
    submission_dir = f'submissions/{SCRIPT}/{VERSION}'
    os.makedirs(submission_dir, exist_ok=True)

    # predict the test values with the trained regression model
    y_pred = best_model.predict(x_test)
    if prepared_data == "log":
        # the model was trained on the log-prepared target; undo it with expm1
        y_pred = np.expm1(y_pred)

    df_submission = pd.read_csv("../01_RawData/sample_submission.csv")
    df_submission.iloc[:, 1] = y_pred

    df_submission.to_csv(f'{submission_dir}/{parent_run.info.run_id}.csv', index=False)

In [7]:
def evaluate_model(x_train, y_train, y_validate, y_validate_pred, pipeline, parent_run):
    """Log a learning-curve plot and a predicted-vs-actual scatter plot to MLflow.

    Parameters
    ----------
    x_train, y_train
        Training data used to compute the learning curve.
    y_validate : Series
        True target values of the validation set (its `.values` are plotted).
    y_validate_pred : array-like
        Predictions for the validation set.
    pipeline : estimator
        Estimator/pipeline evaluated by the learning curve.
    parent_run : MLflow run
        Run that receives 'plot_learning_curve.png' and 'plot_regression.png'.
    """

    def plot_learning_curve(estimator, X, y, scoring, cv=None, train_sizes=np.linspace(.1, 1.0, 8)):
        # The arguments are now actually used. Previously the helper silently
        # read `pipeline`, `x_train`, `y_train` from the enclosing scope, ignored
        # `train_sizes` and never forwarded `scoring`, so the curve was drawn
        # with the estimator's default score instead of the requested metric.
        train_sizes, train_scores, test_scores = learning_curve(
            estimator,
            X,
            y,
            cv=cv,
            scoring=scoring,
            n_jobs=-1,
            train_sizes=train_sizes
            )

        fig1, ax1 = plt.subplots()
        ax1.set_xlabel("Training examples")
        ax1.set_ylabel("Score")
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        ax1.grid()

        # Shaded bands: mean +/- one standard deviation across the CV splits.
        ax1.fill_between(train_sizes, train_scores_mean - train_scores_std,
                        train_scores_mean + train_scores_std, alpha=0.1,
                        color="r")
        ax1.fill_between(train_sizes, test_scores_mean - test_scores_std,
                        test_scores_mean + test_scores_std, alpha=0.1, color="g")
        ax1.plot(train_sizes, train_scores_mean, 'o-', color="r",
                label="Training score")
        ax1.plot(train_sizes, test_scores_mean, 'o-', color="g",
                label="Cross-validation score")

        ax1.legend(loc="best")
        # CV score relative to training score at the largest training size.
        # Index -1 instead of the hard-coded 7 keeps this valid for any
        # number of train sizes.
        ax1.set_title("Difference between training and CV: "\
            + str(round(test_scores_mean[-1] / train_scores_mean[-1] * 100, 2))\
            + "%")
        client.log_figure(parent_run.info.run_id, fig1, 'plot_learning_curve.png')
        # Close this specific figure rather than whatever is "current".
        plt.close(fig1)

    plot_learning_curve(pipeline, x_train, y_train, "neg_mean_squared_error", cv=cv)


    def plot_scatter(y_true, y_pred):
        fig2, ax2 = plt.subplots()
        ax2.scatter(y_pred, y_true.values)
        # Red diagonal = perfect prediction (actual == predicted).
        ax2.plot([min(y_pred), max(y_pred)], [min(y_pred), max(y_pred)], c="red")
        ax2.set_xlabel("Predicted")
        ax2.set_ylabel("Actual")
        client.log_figure(parent_run.info.run_id, fig2, 'plot_regression.png')
        plt.close(fig2)

    plot_scatter(y_validate, y_validate_pred)

In [8]:
class Objective:
    """Optuna objective that cross-validates one pipeline per trial.

    Instances are passed to `study.optimize` as the objective, and
    `callback` is registered as a study callback so that `best_model`
    ends up holding the (unfitted) pipeline of the best trial.

    Attributes
    ----------
    best_model : Pipeline or None
        Pipeline built for the study's best trial so far; None until the
        first trial has completed.
    model_type : str
        Forwarded to create_algorithm via create_model.
    parent_run : MLflow run
        Parent run under which every trial's child run is nested.
    """

    def __init__(self, model_type, parent_run):
        self.best_model = None
        self._model = None  # most recently built pipeline (kept for compatibility)
        # Pipelines keyed by trial number. The study is run with n_jobs=-1, so
        # several trials overlap; a single shared `_model` slot could be
        # overwritten by another trial before the callback reads it.
        self._models = {}
        self.model_type = model_type
        self.parent_run = parent_run


    def __call__(self, trial):
        """Build the trial's pipeline, score it and return the mean CV RMSE."""
        pipeline, child_run = create_model(self, trial)
        self._model = pipeline
        self._models[trial.number] = pipeline

        try:
            # Mean over the CV splits of the per-split RMSE.
            rmse = np.sqrt(
                -cross_val_score(
                    pipeline,
                    x_train, y_train,
                    cv=cv,
                    scoring="neg_mean_squared_error",
                    n_jobs=-1
                    )
                ).mean()
        except Exception:
            # Runs created through MlflowClient are not closed automatically.
            client.set_terminated(child_run.info.run_id, status="FAILED")
            raise

        client.log_metric(child_run.info.run_id, "cv_rmse", rmse)
        # Mark the child run as finished; otherwise it stays in RUNNING state.
        client.set_terminated(child_run.info.run_id)

        return rmse

    def callback(self, study, trial):
        """Remember the pipeline that belongs to the study's current best trial."""
        try:
            best_number = study.best_trial.number
        except ValueError:
            # Optuna raises ValueError when no trial has completed yet.
            return

        # Look the pipeline up by the *best* trial's number rather than assuming
        # the just-finished trial (or the last-built pipeline) is the best one.
        best_pipeline = self._models.get(best_number)
        if best_pipeline is not None:
            self.best_model = best_pipeline

In [9]:
# Choose the algorithm to tune (exactly one line active).
# model_type='Lasso'
# model_type='Ridge'
# model_type='BayesianRidge'
# model_type='ElasticNet'
model_type='GradientBoostingRegressor'
# model_type='RandomForestRegressor'


# Parent run: collects the summary of the study; each trial logs to a child run.
parent_run = client.create_run(
    experiment_id=experiment, 
    tags={"script_name": SCRIPT, "script_version": VERSION, "prepared_data": prepared_data}
    )

objective = Objective(model_type, parent_run)

study = optuna.create_study(direction="minimize")

study.optimize(
    objective,
    n_trials=config_data["N_TRAILS"],
    timeout=config_data["TIMEOUT"],
    n_jobs=-1,
    callbacks=[objective.callback]
    )

print("Best trial:")
print(study.best_value)
print(study.best_params)

print("Log CV Params")
client.log_metric(parent_run.info.run_id, "best_cv_score", round(study.best_value, 3))
# client.log_param(parent_run.info.run_id, "transformer_num", str(transformer_num))

client.log_param(parent_run.info.run_id, "cv_n_splits", cv.n_splits)
client.log_param(parent_run.info.run_id, "cv_train_size", cv.train_size)
client.log_param(parent_run.info.run_id, "cv_test_size", cv.test_size)
client.log_param(parent_run.info.run_id, "cv_random_state", cv.random_state)

# for param in study.best_params:
#     client.log_param(parent_run.info.run_id, param, study.best_params[param])

best_model = objective.best_model
if best_model is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError("The study finished without recording a best model.")
# The last pipeline step is the estimator; log its class name.
client.log_param(parent_run.info.run_id, "algo", best_model.steps[-1][1].__class__.__name__)


# fit the pipeline to compute the validation results
print("Fit Best Model")
best_model.fit(x_train, y_train)

# predict the validation outcome
print("Predict Best Model")
y_validate_pred = best_model.predict(x_validate)
if prepared_data == "log":
    # Undo the log preparation for both truth and prediction. The result is
    # stored under a new name so that re-running this cell does not apply
    # expm1 to `y_validate` a second time.
    y_validate_eval = np.expm1(y_validate)
    y_validate_pred = np.expm1(y_validate_pred)
else:
    y_validate_eval = y_validate


# evaluate model
print("Evaluate Best Model")
evaluate_model(x_train, y_train, y_validate_eval, y_validate_pred, best_model, parent_run)

# create submission of best model
print("Create submission")
create_submission(best_model, x_test, parent_run)

# The parent run was created through MlflowClient, so it is not the fluent API's
# active run and mlflow.end_run() alone would leave it in RUNNING state.
client.set_terminated(parent_run.info.run_id)
mlflow.end_run()

[32m[I 2023-02-01 21:03:14,815][0m A new study created in memory with name: no-name-db1a97c9-74f6-4b30-a017-1271302a9559[0m
[32m[I 2023-02-01 21:03:24,585][0m Trial 2 finished with value: 0.3888811471869183 and parameters: {'columnprep__transformers_num': 'MinMaxScaler', 'gbr_n_estimators': 468, 'gbr_learning_rate': 0.0993583162333292, 'gbr_subsample': 0.2538781777214185, 'gbr_min_samples_split': 0.44399884706980475, 'gbr_min_samples_leaf': 0.42478737359929797, 'gbr_max_depth': 6, 'gbr_max_features': 0.44057711698236246, 'gbr_alpha': 0.31710591160448487, 'pca_n_components': 0.7677125917615424}. Best is trial 2 with value: 0.3888811471869183.[0m
[32m[I 2023-02-01 21:03:26,169][0m Trial 3 finished with value: 3.9869887343340773e+46 and parameters: {'columnprep__transformers_num': 'MinMaxScaler', 'gbr_n_estimators': 293, 'gbr_learning_rate': 2.4636596556747667, 'gbr_subsample': 0.47682189681772313, 'gbr_min_samples_split': 0.21614986658222352, 'gbr_min_samples_leaf': 0.23300816751

Best trial:
0.13372467647392397
{'columnprep__transformers_num': 'StandardScaler', 'gbr_n_estimators': 4992, 'gbr_learning_rate': 0.014624294461265161, 'gbr_subsample': 0.5725439124153987, 'gbr_min_samples_split': 0.15660656059382191, 'gbr_min_samples_leaf': 0.010315523103051434, 'gbr_max_depth': 2, 'gbr_max_features': 0.24496863721031364, 'gbr_alpha': 0.009949518624652084, 'pca_n_components': 0.9898532490959658}
Log CV Params
Fit Best Model
Predict Best Model
Evaluate Best Model
Create submission


In [10]:
def add_Kaggle_score(run_id, kaggle_score):
    """Attach a Kaggle leaderboard score to an existing MLflow run, once.

    Parameters
    ----------
    run_id : str
        Id of the MLflow run to annotate.
    kaggle_score : float
        Value stored under the metric name "kaggle_score".

    Nothing is written when the run already has a "kaggle_score" metric.
    """
    logged_metrics = mlflow.get_run(run_id).data.metrics

    # Guard clause: never overwrite a score that is already recorded.
    if "kaggle_score" in logged_metrics.keys():
        return

    # Re-open the run just long enough to log the metric.
    with mlflow.start_run(run_id=run_id):
        mlflow.log_metric("kaggle_score", kaggle_score)

In [12]:
# Kaggle leaderboard scores, recorded by hand after uploading each submission.
# Each entry is (MLflow run id, Kaggle score).
KAGGLE_SCORES = [
    ("eab21fd5a7f94493925c987347293087", 0.13808),  # Ridge
    ("6f25f31ec3e146818f7279cb16cccaa6", 0.13304),  # Ridge -> Position 1207
    ("3e6489469e424c9fbba08a232940e2c3", 0.26041),  # Lasso
    ("3724cc35ec0b440d89b307e39c302261", 0.13313),  # BayesianRidge
    ("90e0b665fb8c4367ae99691fb788eabc", 0.16332),  # ElasticNet
    ("da0a81cd09fe4ddab4924118843fab6a", 0),  # GradientBoostingRegressor -> inf
    ("467208197c064ef292167f0ad24f4484", 0.16307),  # GradientBoostingRegressor
    ("a4941fe4f93846fd8cd559a94291532a", 0.27496),  # RandomForestRegressor
]

for scored_run_id, scored_value in KAGGLE_SCORES:
    add_Kaggle_score(run_id=scored_run_id, kaggle_score=scored_value)