# HOUSE PRICES TRAINER 

In [1]:
!python --version

Python 3.9.6


## DATA AND GLOBAL PARAMETERS

### Imports

In [2]:
# LIRBARIES ------
import warnings

warnings.filterwarnings("ignore")

import multiprocessing

# Misc
import os
import shutil
import sys
from datetime import datetime

# Combinatorics
from itertools import product
from pickle import dump

# Matrices
import numpy as np

# DF
import pandas as pd

# Boosting machine
import xgboost as xgb
from IPython.display import Markdown as md
from IPython.display import display as printmd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.metrics import (
    make_scorer,
    mean_absolute_error,
    mean_squared_error,
    mean_squared_log_error,
)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC, SVR

# SKLEARN ---
# * metrics ---

# * Preprocess ---

# * Imputation

# * CV ---

# Regression


# utils 
base_path = "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire"  # laptop : /home/jovyan/work/CM_ML/TP3_TP4 other tower: /home/jovyan/work/TP3_TP4
os.chdir(base_path)
sys.path.append(base_path)
from utils.utils import rmsle_scorer

In [3]:
# START TIME ------
s = datetime.now()
# time as str
init_time = s.strftime("%d-%m-%Y_%H:%M:%S")
print(f"Starting time and time used for files {init_time}")
pd.set_option("display.float_format", lambda x: "%.5f" % x)

Starting time and time used for files 07-02-2022_10:08:23


### Helper functions 

In [4]:
# FROM DATA TO MODEL DATA -----
def preprocess_data_as_model(data, dependent, categorical):
    # * Divide X,y ---
    data_subset = data.drop(dependent, axis=1)
    y = data[dependent].values

    # handle categorical variables ---
    cols_beginning = data_subset.columns.values

    all_cols = cols_beginning
    numeric_cols = data_subset.select_dtypes(["number"]).columns
    numeric_cols_index = np.in1d(all_cols, numeric_cols)

    categorical_cols = data_subset.select_dtypes(["object"]).columns
    categorical_cols_index = np.in1d(all_cols, categorical_cols)

    # Transformations ---
    transformations = []

    if categorical:
        pass  # todo
    else:
        # Training cols ---
        data_subset = data_subset.select_dtypes(["number"])  # drop categorical
        all_cols = cols_beginning[numeric_cols_index]  # only numerical cols ....
        numeric_cols_index = (
            data_subset.columns.values != None
        )  # apply transformation to all columns
    return (
        data_subset,
        y,
        all_cols,
        numeric_cols,
        numeric_cols_index,
        categorical_cols,
        categorical_cols_index,
    )

In [5]:
# MODEL TRAINERS ------


def train_mlp(data=None, best_params=None):
    # * Init xgb instance ---
    mlp = MLPRegressor(max_iter=3000, random_state=123, early_stopping=True)
    # * Set best params, normally found by a previous Grid search ---
    if best_params:
        print(f"Setting best parameters: {best_params}")
        mlp.set_params(**best_params)
    if data:
        # * Fit mlp: ((x_train,y_train)) ---
        mlp.fit(X=data[0], y=data[1])
    return mlp


def train_svr(data=None, best_params=None):
    # * Init xgb instance ---
    svr = SVR()
    # * Set best params, normally found by a previous Grid search ---
    if best_params:
        print(f"Setting best parameters: {best_params}")
        svr.set_params(**best_params)
    if data:
        # * Fit svr: ((x_train,y_train)) ---
        svr.fit(X=data[0], y=data[1])
    return svr


def train_rf(data=None, best_params=None):
    # * Init xgb instance ---
    rf = RandomForestRegressor()

    # * Set best params, normally found by a previous Grid search ---
    if best_params:
        print(f"Setting best parameters: {best_params}")
        rf.set_params(**best_params)
    if data:
        # * Fit rf: ((x_train,y_train)) ---
        rf.fit(X=data[0], y=data[1])
    return rf


def train_xgb(params, data=None, best_params=None):
    # * Init xgb instance ---
    boost = xgb.XGBRegressor(**params)

    # * Set best params, normally found by a previous Grid search ---
    if best_params:
        print(f"Setting best parameters: {best_params}")
        boost.set_params(**best_params)

    # * Fit booster: ((x_train,y_train), (x_test,y_test)) ---
    if data:
        boost.fit(
            X=data[0][0],
            y=data[0][1],
            eval_set=data,  # Validation set for early stopping (validates on test -- last data tuple)
            early_stopping_rounds=10,
            verbose=0,
            eval_metric=["mae", "rmse", "rmsle"],
        )
    return boost

In [6]:
def train_mlp_classification(data=None, best_params=None):
    # * Init xgb instance ---
    mlp = MLPClassifier(max_iter=3000, random_state=123, early_stopping=True)
    # * Set best params, normally found by a previous Grid search ---
    if best_params:
        print(f"Setting best parameters: {best_params}")
        mlp.set_params(**best_params)
    if data:
        # * Fit mlp: ((x_train,y_train)) ---
        mlp.fit(X=data[0], y=data[1])
    return mlp


def train_svr_classification(data=None, best_params=None):
    # * Init xgb instance ---
    svr = SVC()
    # * Set best params, normally found by a previous Grid search ---
    if best_params:
        print(f"Setting best parameters: {best_params}")
        svr.set_params(**best_params)
    if data:
        # * Fit svr: ((x_train,y_train)) ---
        svr.fit(X=data[0], y=data[1])
    return svr


def train_rf_classification(data=None, best_params=None):
    # * Init xgb instance ---
    rf = RandomForestClassifier()

    # * Set best params, normally found by a previous Grid search ---
    if best_params:
        print(f"Setting best parameters: {best_params}")
        rf.set_params(**best_params)
    if data:
        # * Fit rf: ((x_train,y_train)) ---
        rf.fit(X=data[0], y=data[1])
    return rf


def train_xgb_classification(params, data=None, best_params=None):
    # * Init xgb instance ---
    boost = xgb.XGBClassifier(**params)

    # * Set best params, normally found by a previous Grid search ---
    if best_params:
        print(f"Setting best parameters: {best_params}")
        boost.set_params(**best_params)

    # * Fit booster: ((x_train,y_train), (x_test,y_test)) ---
    if data:
        boost.fit(
            X=data[0][0],
            y=data[0][1],
            eval_set=data,  # Validation set for early stopping (validates on test -- last data tuple)
            early_stopping_rounds=10,
            verbose=0,
            eval_metric=["mae", "rmse", "rmsle"],
        )
    return boost

In [7]:
# MODEL SAVERS ------


def save_model(
    model,
    model_name,
    dataset,
    kind,
    objective,
    toScale,
    best_score,
    init_time,
    columns_used,
    Preprocess,
    preprocessing_path,
    categorical,
    scores,
) -> str:
    print("\n")
    print("--- Saving model ---")
    model_path = f"model_dump/{model_name}_{dataset}_{kind}_{objective}_{toScale}_{np.round(best_score,5)}_{init_time}.pkl"

    dump(
        {  # options
            "scaled": toScale,
            "categorical": categorical,
            # processing and model
            "columns_used": columns_used,
            "preprocess": Preprocess,
            "model": model,
            "preprocessing_path": preprocessing_path,
            # outcome
            "scores": scores,
        },
        open(model_path, "wb"),
    )

    print(f"--- {model_name} model saved to {model_path}---")
    return model_path


def save_grid(
    model_path,
    model_name,
    dataset,
    init_time,
    preprocessing_path,
    toScale,
    categorical,
    search,
):

    res = pd.DataFrame(search.cv_results_)

    # * Save estimator ---
    res["model_path"] = model_path

    # * Did we scale ?
    if preprocessing_path:
        res["preprocessing_path"] = preprocessing_path
    else:
        res["preprocessing_path"] = "no preprocess"

    res["categorical"] = categorical

    # * To csv ---
    grid_name = f"grid_search/Grid_{model_name}_{dataset}_{init_time}.csv"
    res.to_csv(grid_name)
    print(f"--- Grid search results saved to {grid_name}---")


def score_model(model, data):
    """
    data = ((xtrain, ytrain), (x_val, y_val), (x_test, y_test))
    """
    splits = ["train", "val", "test"]
    scores = {}
    for i, split in enumerate(splits):
        metrics = {}
        metrics["rmsle"] = rmsle(model, data[i][0], data[i][1])
        metrics["rmse"] = mean_squared_error(model.predict(data[i][0]), data[i][1])
        metrics["mae"] = mean_absolute_error(model.predict(data[i][0]), data[i][1])
        scores[split] = metrics
    return scores


def scores_to_df(scores, verbose=True):
    scores_df = pd.DataFrame(scores)
    scores_df["difference"] = scores_df["train"] - scores_df["val"]
    if verbose:
        print("---- BEST SCORES ---")
        print(scores_df)
    return scores_df

### FITTERS 

In [8]:
def get_results(model, data):
    # PRINT BEST RESULTS ------
    scores = score_model(model, data)
    scores_df = scores_to_df(scores)

    best_params = model.get_params()
    printmd(md(f"Paramètres du modèle: {best_params}"))
    return scores_df

In [9]:
def fit_baseline_model(trainer, data, pass_val=False, params=False, **kwargs):
    # --- --- --- BASELINE --- --- ---
    kind = "baseline"

    # * Fit if requested ---
    if pass_val and params:
        model = trainer(params, data, best_params=None)
    else:
        model = trainer((data[0][0], data[0][1]), best_params=None)

    # Best rmsle ---
    best_score = rmsle(estimator=model, X=data[1][0], y_true=data[1][1])

    return (model, kind, best_score)

In [10]:
def fit_grid_model(
    trainer,
    data,
    grid_params,
    grid_search_kwargs,
    pass_val=False,
    params=False,
    verbose=True,
):
    # --- --- --- BASELINE --- --- ---
    kind = "grid"

    if params:
        # * Init instance ---
        model = trainer(params=params)
    else:
        model = trainer()

    # * GridSearchCV ---
    search = GridSearchCV(model, param_grid=grid_params, **grid_search_kwargs)

    # * Fit if requested ---
    if pass_val:
        search.fit(
            data[0][0],
            data[0][1],
            eval_set=[data[1]],  # Validation set for early stopping
            early_stopping_rounds=15,
            verbose=0,
        )
    else:
        search.fit(data[0][0], data[0][1])

    # PRINT BEST RESULTS ------
    if verbose:
        best_score = search.best_score_
        printmd(
            md(
                f"Le meilleur score obtenu par notre grid search (à savoir, le score est le RMSLE): {best_score}"
            )
        )

        best_params = search.best_params_
        printmd(md(f"Le meilleurs paramètres: {best_params}"))

    return search

In [11]:
def refit_with_params(trainer, best_params, data, pass_val=False):
    # * Fit if requested ---
    if pass_val and params:
        model = trainer(params, data, best_params=best_params)
    else:
        model = trainer((data[0][0], data[0][1]), best_params=best_params)

    return model

In [12]:
def refit_with_selection(
    trainer,
    data,
    model_with_importances,
    all_cols,
    grid_params,
    grid_search_kwargs,
    pass_val,
    params,
    best_params,
    th=0.001,
    verbose=False,
):
    # which features ---
    subset = model_with_importances.feature_importances_ > th
    important_cols = all_cols[subset].tolist()

    if verbose:
        print(f"IMPORTANT COLUMNS KEPT FOR RETRAINING {important_cols}")

    data_th = [(data[0][0][:, subset], data[0][1]), (data[1][0][:, subset], data[1][1])]
    search = fit_grid_model(
        trainer, data_th, grid_params, grid_search_kwargs, pass_val, params, verbose
    )

    model = refit_with_params(
        trainer, search.best_params_, data=data_th, pass_val=pass_val
    )

    best_score = search.best_score_
    return model, important_cols, subset, best_score

### Data Paths 

In [13]:
# --- --- --- FOLDERS AND FILES --- --- ---

# * Wine dataset paths ---
data_path = "data/ordinalEncoder_imputed_knn/"
knnImputed_path= os.path.join(base_path, data_path, "knn_imputed.csv")

# init scaler
scaler_name = None
Preprocess = None

In [14]:
# --- --- --- MODEL FITTING PARAMETERS --- --- ---

# FIT OR LOAD MODEL ------
toFitXGB = True
toFitRF = True
toFitSVM = True
toFitMLP = True

# * Fitting options ---
toScale = True

# *Handle categorical vars
categorical = True

# DEFAULT STORAGE PARAMETERS ------
to_rm_storage = True
### DATA PARAMETERS ---
training_data_path = None
dataset = None
dependent = None

In [15]:
# Parameters
training_data_path = "knnImputed_path"
dataset = "knnImputed"
dependent = "PAT_AGE"
categorical = False


### Folders 

In [16]:
%%time
# FOLDERS TO STORE ------
paths_to_create = [
    os.path.join(base_path, "scale_dump"),
    os.path.join(base_path, "model_dump"),
    os.path.join(base_path, "grid_search"),
]

# REMOVE PREVIOUS STORAGE IF NEEDED ------
if to_rm_storage:
    print("--- Removing previous saved models, grids and scalers ---")
    for folder in paths_to_create:
        shutil.rmtree(folder, ignore_errors=True)

# (RE)CREATE STORAGE FOLDERS ------
for folder in paths_to_create:
    os.makedirs(folder, exist_ok=True)

--- Removing previous saved models, grids and scalers ---
CPU times: user 1.44 ms, sys: 0 ns, total: 1.44 ms
Wall time: 1.07 ms


## MODEL PARAMETERS


### MODEL PARAMETERS 

In [17]:
# Multiprocessing ---
cpus = multiprocessing.cpu_count()
cpu_ratio = 0.8
cpus_to_use = int(cpus * cpu_ratio)
print(f"CPUS DETECTED : {cpus}")
print(f"CPUS TO USE : {cpus_to_use}")

CPUS DETECTED : 8
CPUS TO USE : 6


### CV PARAMETERS

In [18]:
# --- --- --- CV PARAMS --- --- ---
folds = 3
rstate = 123

# * K Fold ---
skf = KFold(n_splits=folds, shuffle=True, random_state=rstate)

In [19]:
# --- --- --- XGB HYPER PARAMS --- --- ---
# * Grid for xgb ---
xgb_params = {
    "min_child_weight": [1, 5, 10],
    "gamma": [0.001, 0.02, 0.04, 0.08, 0.1, 0.5],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "max_depth": [1, 2, 4, 5],
    "lambda": [0, 0.01, 0.02, 0.5, 1],  # no much pbs of overfting
}
objective = "reg:squaredlogerror"  # metric RMSLE
objective = "reg:squarederror"  # RMSLE STUCKED ...
metric = "mae"
rmsle = rmsle_scorer()

In [20]:
# --- --- --- RANDOM FOREST HYPER PARAMS --- --- ---
max_depth_val = 18
max_depth = np.arange(1, max_depth_val, 1)
max_leaf_nodes = np.arange(
    2, max_depth_val * max_depth_val, 1
)  # si on ajoute un max_depth x il faut au plus accepter x*x feuilles...

params_combinations = list(product(max_depth, max_leaf_nodes))
print("RF Original Number of combinations : %s" % len(params_combinations))

# REMOVE ILOGIC COMBINATIONS cf.TP2 ------
correct_params = []
incorrect_params = []
for depth, leaves in params_combinations:
    max_feuilles = depth * depth
    if leaves > max_feuilles or leaves < np.round(0.5 * max_feuilles):  # critères
        incorrect_params.append((depth, leaves))
    else:
        correct_params.append((depth, leaves))

rf_params = []
for depth, leaves in correct_params:
    grille = {
        "max_depth": [depth],  # wrap in one list element
        "max_leaf_nodes": [leaves],
    }
    rf_params.append(grille)

print("RF Number of correct combinations : %s" % len(correct_params))

RF Original Number of combinations : 5474
RF Number of correct combinations : 912


In [21]:
# --- --- --- SVM HYPER PARAMS --- --- ---

# possibilities ---
Cs = np.linspace(0.5, 40, 7)
KERNELS = ["linear", "poly", "rbf", "sigmoid"]
# grid ---
svm_params = {"C": Cs, "kernel": KERNELS}

In [22]:
# --- --- --- MLP HYPER PARAMS --- --- ---

mlp_params = {
    "hidden_layer_sizes": [(50, 50, 50), (50, 100, 50), (100,)],
    "activation": ["tanh", "relu"],
    "solver": ["sgd", "adam"],
    "alpha": [0.0001, 0.05],
    "learning_rate": ["constant", "adaptive"],
}

## IMPORT DATA 

### House 

In [23]:
%%time
# --- --- --- DATA --- --- ---
# * Load data ---
data = pd.read_csv(eval(training_data_path), sep=",")

(
    data_subset,
    y,
    all_cols,
    numeric_cols,
    numeric_cols_index,
    categorical_cols,
    categorical_cols_index,
) = preprocess_data_as_model(data, dependent, categorical)
cols_beginning = data_subset.columns.values

# DEFINE TRANSFORMATION BASED ON OPTIONS ------
# Transformations ---
transformations = []
if categorical:
    # One hot encoder ---
    encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
    transformer = ("cat_cols", encoder, categorical_cols_index)  # on cat cols
    transformations.append(transformer)

else:
    # Training cols ---
    data_subset = data_subset.select_dtypes(["number"])  # drop categorical
    all_cols = cols_beginning[numeric_cols_index]  # only numerical cols ....
    numeric_cols_index = (
        data_subset.columns.values != None
    )  # apply transformation to all columns


# * Optional Scaling ---
if toScale:
    # unit variance scaler ---
    scaler = StandardScaler()
    transformer = ("num_cols", scaler, numeric_cols_index)  # on num cols
    transformations.append(transformer)

# * TRAIN VAL TEST SPLIT ---
# train test
X_train, XHold_test, y_train, yHold_test = train_test_split(
    data_subset.values, y, test_size=0.15, random_state=rstate
)
# train validation
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train, y_train, test_size=0.15, random_state=rstate
)

#

if toScale or categorical:
    print(
        f"--- Column Transformation. Scaling:{toScale} , OneHotEncoder : {categorical}  ---"
    )
    Preprocess = ColumnTransformer(
        transformations, n_jobs=cpus_to_use, remainder="passthrough"
    )
    # fit ---
    X_train = Preprocess.fit_transform(X_train)
    X_validation = Preprocess.transform(X_validation)
    XHold_test = Preprocess.transform(XHold_test)

    # get names ---
    all_cols = Preprocess.get_feature_names_out()

    # dump to reuse
    scaler_name = f"scale_dump/ColumnTransformer_{dataset}_{init_time}.pkl"
    dump(Preprocess, open(scaler_name, "wb"))

# TRAIN VAL TEST SPLIT ------
data_splits = [
    (X_train, y_train),
    (X_validation, y_validation),
    (XHold_test, yHold_test),
]

--- Column Transformation. Scaling:True , OneHotEncoder : False  ---


CPU times: user 24.3 ms, sys: 52.1 ms, total: 76.4 ms
Wall time: 907 ms


In [24]:
grid_search_kwargs = dict(
    scoring=rmsle, n_jobs=cpus_to_use, refit=True, cv=skf, verbose=1
)

## XGBOOST 

In [25]:
saver_params = dict(
    dataset=dataset,
    model_name="xgb",
    objective=objective,
    toScale=toScale,
    init_time=init_time,
    columns_used=all_cols,
    Preprocess=Preprocess,
    preprocessing_path=scaler_name,
    categorical=categorical,
)

saver_grid_params = {}
for i in [
    "model_name",
    "dataset",
    "init_time",
    "preprocessing_path",
    "toScale",
    "categorical",
]:
    saver_grid_params[i] = saver_params.get(i)

params = {
    "objective": objective,
    "learning_rate": 0.02,
    "n_estimators": 700,
    "n_jobs": 1,  # if not defaults to -1...
}

### BASIC MODEL (NO HPT)

In [26]:
%%time
# --- --- --- XGBOOST --- --- ---
if toFitXGB:
    model, kind, best_score = fit_baseline_model(
        train_xgb,
        data=[(X_train, y_train), (X_validation, y_validation)],
        pass_val=True,
        params=params,
    )

    scores_df = get_results(model, data_splits)
    # SAVE RESULTS ------
    save_model(
        model=model, kind=kind, best_score=best_score, scores=scores_df, **saver_params
    )
    # boost.get_booster().feature_names= important_cols

---- BEST SCORES ---
         train      val     test  difference
rmsle -0.01049 -0.01909 -0.01410     0.00860
rmse   1.98536  3.76963  2.41602    -1.78427
mae    0.95977  1.24324  1.07212    -0.28347


Paramètres du modèle: {'objective': 'reg:squarederror', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'enable_categorical': False, 'gamma': 0, 'gpu_id': -1, 'importance_type': None, 'interaction_constraints': '', 'learning_rate': 0.02, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': '()', 'n_estimators': 700, 'n_jobs': 1, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'subsample': 1, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None}



--- Saving model ---
--- xgb model saved to model_dump/xgb_knnImputed_baseline_reg:squarederror_True_-0.01909_07-02-2022_10:08:23.pkl---
CPU times: user 1.52 s, sys: 3.56 ms, total: 1.52 s
Wall time: 1.52 s


### GRID SEARCH

In [27]:
%%time
# --- --- --- XGBOOST GRID SEARCH --- --- ---
if toFitXGB:
    kind = "grid"

    search = fit_grid_model(
        train_xgb,
        data=[(X_train, y_train), (X_validation, y_validation)],
        grid_params=xgb_params,
        grid_search_kwargs=grid_search_kwargs,
        pass_val=True,
        params=params,
        verbose=True,
    )

    # REFIT TO GET TRAINING ERROR AT EACH STEP ------

    model = refit_with_params(
        train_xgb,
        search.best_estimator_.get_xgb_params(),
        data=[(X_train, y_train), (X_validation, y_validation)],
        pass_val=True,
    )
    # Score model ---
    scores_df = get_results(model, data_splits)
    best_score = search.best_score_
    # Save model ---
    model_path = save_model(
        model=model, kind=kind, best_score=best_score, scores=scores_df, **saver_params
    )
    # Save grid search results ---
    save_grid(model_path=model_path, search=search, **saver_grid_params)
    # boost.get_booster().feature_names= important_cols

Fitting 3 folds for each of 3240 candidates, totalling 9720 fits


Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Le meilleur score obtenu par notre grid search (à savoir, le score est le RMSLE): -0.01784645917233836

Le meilleurs paramètres: {'colsample_bytree': 1.0, 'gamma': 0.5, 'lambda': 0.5, 'max_depth': 5, 'min_child_weight': 5, 'subsample': 1.0}

Setting best parameters: {'objective': 'reg:squarederror', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1.0, 'gamma': 0.5, 'gpu_id': -1, 'interaction_constraints': '', 'learning_rate': 0.02, 'max_delta_step': 0, 'max_depth': 5, 'min_child_weight': 5, 'monotone_constraints': '()', 'n_jobs': 1, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 0.5, 'scale_pos_weight': 1, 'subsample': 1.0, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None, 'lambda': 0.5}


---- BEST SCORES ---
         train      val     test  difference
rmsle -0.01288 -0.01871 -0.01343     0.00583
rmse   2.25100  3.71635  2.37214    -1.46535
mae    0.98972  1.21245  1.06662    -0.22273


Paramètres du modèle: {'objective': 'reg:squarederror', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1.0, 'enable_categorical': False, 'gamma': 0.5, 'gpu_id': -1, 'importance_type': None, 'interaction_constraints': '', 'learning_rate': 0.02, 'max_delta_step': 0, 'max_depth': 5, 'min_child_weight': 5, 'missing': nan, 'monotone_constraints': '()', 'n_estimators': 700, 'n_jobs': 1, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 0.5, 'scale_pos_weight': 1, 'subsample': 1.0, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None, 'lambda': 0.5}



--- Saving model ---
--- xgb model saved to model_dump/xgb_knnImputed_grid_reg:squarederror_True_-0.01785_07-02-2022_10:08:23.pkl---
--- Grid search results saved to grid_search/Grid_xgb_knnImputed_07-02-2022_10:08:23.csv---
CPU times: user 24 s, sys: 1.8 s, total: 25.8 s
Wall time: 1h 40min 31s


### Refit on feature selection 

In [28]:
if toFitXGB:
    model, important_cols, subset, best_score = refit_with_selection(
        train_xgb,
        [(X_train, y_train), (X_validation, y_validation)],
        model_with_importances=model,
        all_cols=all_cols,
        grid_params=xgb_params,
        grid_search_kwargs=grid_search_kwargs,
        pass_val=True,
        params=params,
        best_params=search.best_estimator_.get_xgb_params(),
        th=0.001,
        verbose=False,
    )

    model.get_booster().feature_names = important_cols

    data_splits_th = [
        (data_splits[0][0][:, subset], data_splits[0][1]),
        (data_splits[1][0][:, subset], data_splits[1][1]),
        (data_splits[2][0][:, subset], data_splits[2][1]),
    ]

    scores_df = get_results(model, data_splits_th)
    # Save model ---
    kind = "refit"
    model_path = save_model(
        model=model, kind=kind, best_score=best_score, scores=scores_df, **saver_params
    )

Fitting 3 folds for each of 3240 candidates, totalling 9720 fits


Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/jovyan/work/CM_ML/TP5/Maturite_dentaire/utils/utils.py", line 23, in rmsle
    loss=mean_squared_log_error(y_true, y_pred,squared=True) # MSLE
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Setting best parameters: {'colsample_bytree': 1.0, 'gamma': 0.5, 'lambda': 0.5, 'max_depth': 5, 'min_child_weight': 5, 'subsample': 1.0}


---- BEST SCORES ---
         train      val     test  difference
rmsle -0.01288 -0.01871 -0.01343     0.00583
rmse   2.25100  3.71635  2.37214    -1.46535
mae    0.98972  1.21245  1.06662    -0.22273


Paramètres du modèle: {'objective': 'reg:squarederror', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1.0, 'enable_categorical': False, 'gamma': 0.5, 'gpu_id': -1, 'importance_type': None, 'interaction_constraints': '', 'learning_rate': 0.02, 'max_delta_step': 0, 'max_depth': 5, 'min_child_weight': 5, 'missing': nan, 'monotone_constraints': '()', 'n_estimators': 700, 'n_jobs': 1, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 0.5, 'scale_pos_weight': 1, 'subsample': 1.0, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None, 'lambda': 0.5}



--- Saving model ---
--- xgb model saved to model_dump/xgb_knnImputed_refit_reg:squarederror_True_-0.01785_07-02-2022_10:08:23.pkl---


## RF 

In [29]:
# * Split for learning ---
# train test
if toFitRF or toFitMLP or toFitSVM:

    X_train, XHold_test, y_train, yHold_test = train_test_split(
        data_subset.values, y, test_size=0.15, random_state=rstate
    )
    # train validation
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_train, y_train, test_size=0.15, random_state=rstate
    )

    knnI = KNNImputer()
    #
    if toScale or categorical:
        print(
            f"--- Column Transformation. Scaling:{toScale} , OneHotEncoder : {categorical}, Imputer : knnImputer  ---"
        )
        Preprocess = ColumnTransformer(
            transformations, n_jobs=cpus_to_use, remainder="passthrough"
        )

        Preprocess = Pipeline([("col_trans", Preprocess), ("imputer", knnI)])
    else:
        Preprocess = Pipeline([("imputer", knnI)])

    # fit ---
    Preprocess.fit(X_train)
    X_train = Preprocess.transform(X_train)
    X_validation = Preprocess.transform(X_validation)
    XHold_test = Preprocess.transform(XHold_test)

    # get names ---
    if toScale or categorical:

        all_cols = Preprocess[0].get_feature_names_out()

    # dump to reuse
    preprocessing_name = f"scale_dump/ColumnTransformer__rf_{init_time}.pkl"
    dump(Preprocess, open(preprocessing_name, "wb"))

# TRAIN VAL TEST SPLIT ------
data_splits = [
    (X_train, y_train),
    (X_validation, y_validation),
    (XHold_test, yHold_test),
]

--- Column Transformation. Scaling:True , OneHotEncoder : False, Imputer : knnImputer  ---


### BASIC MODEL (NO HPT)

In [30]:
%%time

saver_params = dict(
    dataset=dataset,
    model_name="rf",
    objective="squared_error",
    toScale=toScale,
    init_time=init_time,
    columns_used=all_cols,
    Preprocess=Preprocess,
    preprocessing_path=scaler_name,
    categorical=categorical,
)
# --- --- --- BASELINE RF --- --- ---
if toFitRF:
    model, kind, best_score = fit_baseline_model(
        train_rf,
        data=[(X_train, y_train), (X_validation, y_validation)],
        pass_val=False,
        params=None,
    )

    scores_df = get_results(model, data_splits)
    # SAVE RESULTS ------
    save_model(
        model=model, kind=kind, best_score=best_score, scores=scores_df, **saver_params
    )

---- BEST SCORES ---
         train      val     test  difference
rmsle -0.00384 -0.01955 -0.01798     0.01571
rmse   0.58154  3.84069  3.02403    -3.25915
mae    0.45849  1.20548  1.14821    -0.74699


Paramètres du modèle: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}



--- Saving model ---
--- rf model saved to model_dump/rf_knnImputed_baseline_squared_error_True_-0.01955_07-02-2022_10:08:23.pkl---
CPU times: user 605 ms, sys: 19.6 ms, total: 625 ms
Wall time: 624 ms


### Grid search

In [31]:
%%time
# --- --- --- RF GRID SEARCH --- --- ---
if toFitRF:
    kind = "grid"

    search = fit_grid_model(
        train_rf,
        data=[(X_train, y_train), (X_validation, y_validation)],
        grid_params=rf_params,
        grid_search_kwargs=grid_search_kwargs,
        pass_val=False,
        params=None,
        verbose=True,
    )

    # REFIT TO GET TRAINING ERROR AT EACH STEP ------

    model = refit_with_params(
        train_rf,
        search.best_params_,
        data=[(X_train, y_train), (X_validation, y_validation)],
        pass_val=False,
    )
    # Score model ---
    scores_df = get_results(model, data_splits)
    best_score = search.best_score_
    # Save model ---
    model_path = save_model(
        model=model, kind=kind, best_score=best_score, scores=scores_df, **saver_params
    )
    # Save grid search results ---
    save_grid(model_path=model_path, search=search, **saver_grid_params)
    # boost.get_booster().feature_names= important_cols

Fitting 3 folds for each of 912 candidates, totalling 2736 fits


Le meilleur score obtenu par notre grid search (à savoir, le score est le RMSLE): -0.019474025651396994

Le meilleurs paramètres: {'max_depth': 5, 'max_leaf_nodes': 25}

Setting best parameters: {'max_depth': 5, 'max_leaf_nodes': 25}


---- BEST SCORES ---
         train      val     test  difference
rmsle -0.01638 -0.01951 -0.01338     0.00313
rmse   2.74508  3.78490  2.34551    -1.03981
mae    1.12566  1.25610  1.06956    -0.13044


Paramètres du modèle: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 5, 'max_features': 'auto', 'max_leaf_nodes': 25, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}



--- Saving model ---
--- rf model saved to model_dump/rf_knnImputed_grid_squared_error_True_-0.01947_07-02-2022_10:08:23.pkl---
--- Grid search results saved to grid_search/Grid_xgb_knnImputed_07-02-2022_10:08:23.csv---
CPU times: user 5.22 s, sys: 486 ms, total: 5.7 s
Wall time: 2min 47s


### REFIT on subsample of cols

In [32]:
if toFitRF:
    model, important_cols, subset, best_score = refit_with_selection(
        train_rf,
        [(X_train, y_train), (X_validation, y_validation)],
        model_with_importances=model,
        all_cols=all_cols,
        grid_params=rf_params,
        grid_search_kwargs=grid_search_kwargs,
        pass_val=False,
        params=None,
        best_params=search.best_params_,
        th=0.001,
        verbose=False,
    )
    data_splits_th = [
        (data_splits[0][0][:, subset], data_splits[0][1]),
        (data_splits[1][0][:, subset], data_splits[1][1]),
        (data_splits[2][0][:, subset], data_splits[2][1]),
    ]

    scores_df = get_results(model, data_splits_th)
    # Save model ---
    kind = "refit"
    model_path = save_model(
        model=model, kind=kind, best_score=best_score, scores=scores_df, **saver_params
    )
    # Save grid search results ---
    # save_grid(model_path=model_path, search=search, **saver_grid_params)

Fitting 3 folds for each of 912 candidates, totalling 2736 fits


## SVM 

### BASELINE SVM 

In [None]:
%%time

saver_params = dict(
    dataset=dataset,
    model_name="svm",
    objective="misclassification",
    toScale=toScale,
    init_time=init_time,
    columns_used=all_cols,
    Preprocess=Preprocess,
    preprocessing_path=scaler_name,
    categorical=categorical,
)

# --- --- --- BASELINE RF --- --- ---
if toFitSVM:
    model, kind, best_score = fit_baseline_model(
        train_svr,
        data=[(X_train, y_train), (X_validation, y_validation)],
        pass_val=False,
        params=None,
    )

    scores_df = get_results(model, data_splits)
    # SAVE RESULTS ------
    save_model(
        model=model, kind=kind, best_score=best_score, scores=scores_df, **saver_params
    )

### GRID SEARCH

In [None]:
%%time
# --- --- --- SVM GRID SEARCH --- --- ---
if toFitSVM:
    kind = "grid"

    search = fit_grid_model(
        train_svr,
        data=[(X_train, y_train), (X_validation, y_validation)],
        grid_params=svm_params,
        grid_search_kwargs=grid_search_kwargs,
        pass_val=False,
        params=None,
        verbose=True,
    )

    # REFIT TO GET TRAINING ERROR AT EACH STEP ------

    model = search.best_estimator_
    # Score model ---
    scores_df = get_results(model, data_splits)
    best_score = search.best_score_
    # Save model ---
    model_path = save_model(
        model=model, kind=kind, best_score=best_score, scores=scores_df, **saver_params
    )
    # Save grid search results ---
    save_grid(model_path=model_path, search=search, **saver_grid_params)

## MLP

### BASELINE MLP

In [None]:
grid_search_kwargs = dict(
    scoring="neg_mean_absolute_error", n_jobs=cpus_to_use, refit=True, cv=skf, verbose=1
)

In [None]:
%%time

saver_params = dict(
    dataset=dataset,
    model_name="mlp",
    objective="mae",
    toScale=toScale,
    init_time=init_time,
    columns_used=all_cols,
    Preprocess=Preprocess,
    preprocessing_path=scaler_name,
    categorical=categorical,
)

# --- --- --- BASELINE MLP --- --- ---
if toFitMLP:
    model, kind, best_score = fit_baseline_model(
        train_mlp,
        data=[(X_train, y_train), (X_validation, y_validation)],
        pass_val=False,
        params=None,
    )

    scores_df = get_results(model, data_splits)
    # SAVE RESULTS ------
    save_model(
        model=model, kind=kind, best_score=best_score, scores=scores_df, **saver_params
    )

### GRID SEARCH

In [None]:
%%time
# --- --- --- SVM GRID SEARCH --- --- ---
if toFitMLP:
    kind = "grid"

    search = fit_grid_model(
        train_mlp,
        data=[(X_train, y_train), (X_validation, y_validation)],
        grid_params=mlp_params,
        grid_search_kwargs=grid_search_kwargs,
        pass_val=False,
        params=None,
        verbose=True,
    )

    # REFIT TO GET TRAINING ERROR AT EACH STEP ------
    model = search.best_estimator_
    # Score model ---
    scores_df = get_results(model, data_splits)
    best_score = search.best_score_
    # Save model ---
    model_path = save_model(
        model=model, kind=kind, best_score=best_score, scores=scores_df, **saver_params
    )
    # Save grid search results ---
    save_grid(model_path=model_path, search=search, **saver_grid_params)

## END 

In [None]:
# PRINT EXECUTION TIME ------
e = datetime.now()  # end time
delta = e - s  # timedelta
# extract ---
days = delta.days
seconds = delta.seconds
# calcultate hours, minutes
hours = seconds // 3600
minutes = (seconds // 60) % 60
print("------ EXECUTION TIME ------")
print("days:", days, "hours:", hours, "minutes:", minutes)