In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import make_scorer

In [2]:
# Evaluation set: read by prediction() to build the predictions file.
path_eval = "/kaggle/input/dataprojet/evaluation.csv"
# Development set handed to multi_pipeline_train(); it must contain the target
# columns "x" and "y". Judging by the file name, noise columns, RMS columns and
# outliers have already been removed -- TODO confirm against the cleaning notebook.
path_clean = "/kaggle/input/df-no-out-norms/development_noNoise_noRMS_noOutliers.csv"

In [3]:
def prediction(eval_path, results_path, model):
    """Predict (x, y) for the evaluation CSV and write an ``Id,Predicted`` CSV.

    Parameters
    ----------
    eval_path : str or path-like
        CSV with an "Id" column plus the raw per-channel feature columns.
    results_path : str or path-like
        Destination CSV; it receives the columns "Id" and "Predicted", where
        "Predicted" is formatted as "<x>|<y>".
    model : object
        Fitted estimator whose ``predict`` returns two values per row.

    Returns
    -------
    None
    """
    noisy_channels = (0, 7, 12, 15, 16, 17)

    # All 18 rms columns are discarded; the remaining feature families are
    # discarded only for the channels listed above.
    dropped = [f"rms[{ch}]" for ch in range(18)]
    for family in ("pmax", "negpmax", "tmax", "area"):
        dropped.extend(f"{family}[{ch}]" for ch in noisy_channels)

    df_test = pd.read_csv(eval_path).drop(columns=dropped)

    # "Id" identifies the row and is not a feature, so it is withheld from the model.
    features = df_test.drop(columns=["Id"])
    df_test[["x", "y"]] = model.predict(features)

    x_text = df_test["x"].astype(str)
    y_text = df_test["y"].astype(str)
    df_test["Predicted"] = x_text + "|" + y_text

    submission = df_test.loc[:, ["Id", "Predicted"]]
    submission.to_csv(results_path, index=False)

In [4]:
## Metric to be used to evaluate the performances of models 
def distance(y_true, y_pred):
    """Mean Euclidean distance between true and predicted positions.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_outputs)
        Ground-truth and predicted coordinates. DataFrames, arrays and nested
        lists are accepted; rows are paired by position, never by index label.

    Returns
    -------
    float
        Average over the samples of the per-row Euclidean norm of the error.

    Raises
    ------
    ValueError
        If the inputs are not 2-D, differ in shape, or contain no samples.
    """
    # Coerce to plain arrays first: subtracting two pandas objects aligns them
    # on index/column labels, which produces NaNs (and a silently wrong score)
    # whenever the labels differ, e.g. a shuffled y_true vs. a fresh y_pred.
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)

    if true_arr.shape != pred_arr.shape:
        # Without this check numpy would broadcast and return a meaningless value.
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {true_arr.shape} and {pred_arr.shape}"
        )
    if true_arr.ndim != 2:
        raise ValueError(f"expected 2-D inputs of shape (n_samples, n_outputs), got {true_arr.ndim}-D")
    if true_arr.shape[0] == 0:
        raise ValueError("cannot compute a mean distance over zero samples")

    per_sample = np.sqrt(np.sum((true_arr - pred_arr) ** 2, axis=1))
    return float(np.mean(per_sample))
# distance is an error (lower is better). greater_is_better=False makes the
# scorer report the negated distance, so a model search that maximises the
# score ends up minimising the distance; reported scores are therefore negative.
scorer = make_scorer(distance, greater_is_better=False)

In [5]:
def multi_pipeline_train(df_path, model, tuning_params=None, train_size=0.7):
    """Train a multi-output regressor on the "x"/"y" targets, optionally tuning it.

    Parameters
    ----------
    df_path : str or path-like
        CSV file holding the feature columns plus the target columns "x" and "y".
    model : estimator
        Regressor that gets wrapped in ``MultiOutputRegressor``.
    tuning_params : dict or None, default=None
        Parameter grid passed to ``GridSearchCV``. Since the model is wrapped,
        keys need the ``estimator__`` prefix. ``None`` skips tuning.
    train_size : float, default=0.7
        Fraction of the rows used for fitting. The remaining rows form the
        hold-out set on which the distance is measured, in both modes.

    Returns
    -------
    MultiOutputRegressor
        When ``tuning_params`` is None: the model fitted on the training split.
    tuple of (MultiOutputRegressor, dict)
        Otherwise: ``grid_search.best_estimator_`` and ``grid_search.best_params_``.
        With GridSearchCV's default ``refit=True`` the best estimator is refitted
        on every row of the file.
    """
    # Load the data
    df = pd.read_csv(df_path)
    np.random.seed(124)
    print("Start performing a hypeparameter Tunning for multi output regresor using ET Model")

    ## Split the dataset into features and targets
    X = df.drop(columns=["x", "y"])
    y = df[["x", "y"]]
    print("Data splited")

    multi_output_reg = MultiOutputRegressor(model)

    # Multi-output regressor without hyperparameter tuning
    if tuning_params is None:
        # The explicit hold-out split is only needed here; the tuning branch
        # lets the cross-validator draw its own split, so no copy is made there.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_size, random_state=124, shuffle=True
        )
        print("Trainig the base model")
        multi_output_reg.fit(X_train, y_train)
        y_pred = multi_output_reg.predict(X_test)
        distance_metric = distance(y_test, y_pred)
        print(f"Base Multi-output regressor using ET distance on test dataset: {distance_metric}")
        return multi_output_reg

    # Multi-output regressor with hyperparameter tuning
    print("Hyperparamter Tunning Performing start")
    # Honour train_size (it used to be ignored in favour of a hard-coded
    # test_size=0.3) so both modes score on a hold-out set of the same size.
    cvparam = ShuffleSplit(train_size=train_size, n_splits=1, random_state=124)
    grid_search = GridSearchCV(estimator=multi_output_reg, param_grid=tuning_params, cv=cvparam,
                               scoring=scorer, n_jobs=1, verbose=4)
    grid_search.fit(X, y)
    best_model = grid_search.best_estimator_
    # scorer reports the negated distance, so flip the sign back for display.
    distance_metric_tuned = -grid_search.best_score_
    print(f"Tuned Multi-output regressor using ET distance on validation split: {distance_metric_tuned}")
    print("Best Hyperparameters found:", grid_search.best_params_)
    return best_model, grid_search.best_params_

In [6]:
# Untuned Extra Trees estimator for the baseline run; the baseline training
# call below is currently disabled (commented out).
et_regressor = ExtraTreesRegressor(random_state=42)
#model_base = multi_pipeline_train(path_clean, et_regressor)

In [7]:
#result_path = "/kaggle/working/et_base_eval.csv"
#prediction(path_eval, result_path, model_base)
#print("Base model evaluation saved on", result_path)

In [8]:
# Extra Trees regressor tuned with a 3 x 3 grid search (9 candidates).
# Grid keys carry the "estimator__" prefix because multi_pipeline_train wraps
# the regressor in MultiOutputRegressor before searching.
et_regressor = ExtraTreesRegressor(random_state=42)
et_param_grid = dict(
    estimator__n_estimators=[200, 300, 800],
    estimator__max_depth=[None, 15, 30],
)

best_model_et, hyperparameters_et = multi_pipeline_train(
    path_clean,
    et_regressor,
    tuning_params=et_param_grid,
)

Start performing a hypeparameter Tunning for multi output regresor using ET Model
Data splited
Hyperparamter Tunning Performing start
Fitting 1 folds for each of 9 candidates, totalling 9 fits
[CV 1/1] END estimator__max_depth=None, estimator__n_estimators=200;, score=-4.205 total time=19.6min
[CV 1/1] END estimator__max_depth=None, estimator__n_estimators=300;, score=-4.193 total time=29.6min
[CV 1/1] END estimator__max_depth=None, estimator__n_estimators=800;, score=-4.181 total time=83.4min
[CV 1/1] END estimator__max_depth=15, estimator__n_estimators=200;, score=-4.943 total time=14.5min
[CV 1/1] END estimator__max_depth=15, estimator__n_estimators=300;, score=-4.935 total time=20.4min
[CV 1/1] END estimator__max_depth=15, estimator__n_estimators=800;, score=-4.942 total time=53.1min
[CV 1/1] END estimator__max_depth=30, estimator__n_estimators=200;, score=-4.204 total time=21.5min
[CV 1/1] END estimator__max_depth=30, estimator__n_estimators=300;, score=-4.193 total time=32.2min
[

In [9]:
# Write the predictions of the tuned model selected by the grid search.
result_path = "/kaggle/working/et_best_eval.csv"
prediction(path_eval, result_path, best_model_et)
# This file holds the tuned model's predictions, so the message must not
# claim it is the base model.
print("Best model evaluation saved on", result_path)

Base model evaluation saved on /kaggle/working/et_best_eval.csv
