In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import make_scorer
from sklearn.neural_network import MLPRegressor

In [None]:
# Kaggle input files: `path_clean` is the CSV the models are trained on,
# `path_eval` is the CSV predictions are generated for.
path_clean = (
    "/kaggle/input/df-no-out-norms/"
    "development_noNoise_noRMS_noOutliers.csv"
)
path_eval = "/kaggle/input/dataprojet/evaluation.csv"

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

def prediction(eval_path, results_path, model, scaler=None):
    """Predict (x, y) for the evaluation set and write a submission CSV.

    The evaluation CSV is read, the noise columns are dropped, the remaining
    features are scaled and passed to ``model.predict``. The output file has
    two columns: ``Id`` and ``Predicted``, the latter formatted as
    ``"<x>|<y>"``.

    Parameters
    ----------
    eval_path : str
        Path of the evaluation CSV. It must contain an ``Id`` column and
        every noise column removed below.
    results_path : str
        Destination path of the submission CSV.
    model : estimator
        Fitted model whose ``predict`` returns one (x, y) pair per row.
    scaler : fitted transformer, optional
        Already-fitted scaler to apply to the evaluation features. When
        omitted, ``model.scaler_`` is used if the model carries one.

    Notes
    -----
    Only when no fitted scaler is available is a fresh ``MinMaxScaler``
    fitted on the evaluation data itself (the historical behaviour). That
    fallback scales features with the evaluation min/max instead of the
    training ones, so the model receives inputs on a different scale than
    it was trained on whenever the two ranges differ.
    """
    df_test = pd.read_csv(eval_path)

    # Remove noise columns: every rms[*] feature, plus the pmax/negpmax/
    # tmax/area features for the indices listed in `noisy_idx`.
    noisy_idx = [0, 7, 12, 15, 16, 17]
    noise = [f"rms[{i}]" for i in range(18)]
    for feature in ("pmax", "negpmax", "tmax", "area"):
        noise += [f"{feature}[{i}]" for i in noisy_idx]

    X_eval = df_test.drop(columns=noise + ["Id"])

    # Prefer a scaler fitted on the training data so evaluation features are
    # mapped with the same min/max the model was trained with.
    if scaler is None:
        scaler = getattr(model, "scaler_", None)
    if scaler is None:
        # Legacy fallback: no training-time scaler available.
        X_eval_scaled = MinMaxScaler().fit_transform(X_eval)
    else:
        # A scaler fitted on a DataFrame checks column names here, which
        # also catches a feature mismatch between training and evaluation.
        X_eval_scaled = scaler.transform(X_eval)

    y_pred_eval = model.predict(X_eval_scaled)

    # Build the submission in a separate frame rather than mutating df_test.
    submission = df_test[["Id"]].copy()
    submission[["x", "y"]] = y_pred_eval
    submission["Predicted"] = (
        submission["x"].astype(str) + "|" + submission["y"].astype(str)
    )
    submission[["Id", "Predicted"]].to_csv(results_path, index=False)


In [None]:
## Metric to be used to evaluate the performances of models
def distance(y_true, y_pred):
    """Return the mean Euclidean distance between true and predicted points.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_coords)
        True and predicted coordinates. DataFrames, arrays and nested lists
        are accepted; rows are compared by position.

    Returns
    -------
    float
        Average over the rows of the per-row Euclidean distance.
    """
    # Work on plain arrays: subtracting two pandas objects aligns on index
    # labels, not on position. With mismatched indices (e.g. a shuffled
    # y_true against freshly indexed predictions) that yields all-NaN rows
    # and the metric silently comes out as 0.
    true_xy = np.asarray(y_true, dtype=float)
    pred_xy = np.asarray(y_pred, dtype=float)
    per_sample = np.sqrt(np.sum((true_xy - pred_xy) ** 2, axis=1))
    return np.sum(per_sample) / len(true_xy)
# `distance` is an error: lower is better. GridSearchCV picks the candidate
# with the HIGHEST score, so the scorer must negate the metric
# (greater_is_better=False); with the default it would select the
# hyperparameters giving the largest mean distance. Scores reported by the
# search are therefore negative distances.
scorer = make_scorer(distance, greater_is_better=False)

In [None]:
def multi_pipeline_train(df_path, model, tuning_params=None, train_size=0.7):
    """Train a multi-output regressor for (x, y), optionally grid-searching.

    The CSV at ``df_path`` is split into train/test parts, the features are
    min-max scaled, and ``model`` is wrapped in a ``MultiOutputRegressor``.
    The mean Euclidean distance on the held-out part is printed.

    Parameters
    ----------
    df_path : str
        Path of the training CSV; it must contain the target columns ``x``
        and ``y``. Every other column is used as a feature.
    model : estimator
        Single-output regressor to wrap in ``MultiOutputRegressor``.
    tuning_params : dict, optional
        Parameter grid for ``GridSearchCV`` (keys refer to the
        ``MultiOutputRegressor``, e.g. ``estimator__alpha``). When None, the
        model is fitted as-is with no search.
    train_size : float, default 0.7
        Fraction of rows used for training; the rest is the hold-out set.

    Returns
    -------
    estimator
        When ``tuning_params`` is None: the fitted ``MultiOutputRegressor``.
    tuple (estimator, dict)
        Otherwise: the best estimator found and its parameters.

    In both cases the fitted ``MinMaxScaler`` is attached to the returned
    estimator as ``scaler_`` so the identical transform can be applied to
    new data before calling ``predict``.
    """
    # Load the data
    df = pd.read_csv(df_path)
    # Seed NumPy's global RNG, which scikit-learn estimators fall back to
    # when they are created with random_state=None.
    np.random.seed(124)
    print("Start performing hyperparameter tuning for Multi-Output Regressor using MLP Model")

    # Split the dataset BEFORE scaling so the scaler never sees hold-out
    # rows (fitting it on the full data leaks the test min/max).
    X = df.drop(columns=["x", "y"])
    y = df[["x", "y"]]
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=124, shuffle=True
    )
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    print("Data split")

    # Multi-output regressor without hyperparameter tuning
    if tuning_params is None:
        print("Training the base model")
        multi_output_reg = MultiOutputRegressor(model)
        multi_output_reg.fit(X_train, y_train)
        y_pred = multi_output_reg.predict(X_test)
        distance_metric = distance(y_test, y_pred)
        print(f"Base Multi-Output Regressor using MLP distance on test dataset: {distance_metric}")
        # Keep the training-time scaler with the model for later inference.
        multi_output_reg.scaler_ = scaler
        return multi_output_reg

    # Multi-output regressor with hyperparameter tuning
    # The search uses ONE shuffled 80/20 split of the training part, not
    # k-fold cross-validation; the message below states that.
    print("Hyperparameter Tuning Performed using a single 80/20 shuffle-split validation")
    multi_output_reg = MultiOutputRegressor(model)
    cvparam = ShuffleSplit(test_size=0.2, n_splits=1, random_state=0)
    # NOTE(review): the scaler was fitted on all of X_train, so the inner
    # validation rows influenced the scaling; moving the scaler inside the
    # searched estimator would change the expected param_grid keys.
    grid_search = GridSearchCV(estimator=multi_output_reg, param_grid=tuning_params, cv=cvparam,
                               scoring=scorer, n_jobs=-1, verbose=4)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred_best = best_model.predict(X_test)
    distance_metric_tuned = distance(y_test, y_pred_best)
    print(f"Tuned Multi-Output Regressor using MLP distance on test dataset: {distance_metric_tuned}")
    print("Best Hyperparameters found:", grid_search.best_params_)
    # Keep the training-time scaler with the model for later inference.
    best_model.scaler_ = scaler
    return best_model, grid_search.best_params_

In [None]:
# Baseline: an MLP with default settings, trained without any grid search.
mlp_regressor = MLPRegressor()
model_base_mlp = multi_pipeline_train(df_path=path_clean, model=mlp_regressor)

In [None]:
# Write the baseline model's predictions for the evaluation set.
result_path = "/kaggle/working/mlp_base_eval.csv"
prediction(
    eval_path=path_eval,
    results_path=result_path,
    model=model_base_mlp,
)
print("Base model evaluation saved on", result_path)

In [None]:
# Grid searched for the MLP. The `estimator__` prefix addresses the
# regressor wrapped inside MultiOutputRegressor.
mlp_param_grid = dict(
    estimator__hidden_layer_sizes=[(50,), (100,), (50, 50)],
    estimator__activation=['relu', 'tanh'],
    estimator__alpha=[0.0001, 0.001, 0.01],
    estimator__max_iter=[100, 200, 300],
)
best_model_mlp, hyperparameters_mlp = multi_pipeline_train(
    df_path=path_clean,
    model=MLPRegressor(),
    tuning_params=mlp_param_grid,
)

In [None]:
# Write the tuned (best) model's predictions for the evaluation set.
result_path = "/kaggle/working/mlp_best_eval.csv"
prediction(path_eval, result_path, best_model_mlp)
# The message previously said "Base model", copied from the baseline cell,
# although this file holds the tuned model's predictions.
print("Best model evaluation saved on", result_path)