### <span style="color:#00aba1;"> Requirements </span>


In [None]:
# Data handling, warnings control and scikit-learn building blocks.
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler
# Silence FutureWarning messages for the remainder of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import LeaveOneOut, train_test_split
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import sys
from numpy import shape
# Extend the module search path; presumably this is where the project-local
# modules `functions` and `ANN` imported below live.
# NOTE(review): absolute, machine-specific path - the notebook will not run
# on another machine without editing it.
sys.path.append('C:\\Users\\Admin\\Desktop\\Tese\\Code')
import functions as fc
import importlib
# Re-import the module so that edits to its source are picked up without
# restarting the kernel.
importlib.reload(fc)
import ANN
import csv
# Same reload treatment for the ANN module.
importlib.reload(ANN)
import joblib

### <span style="color:#00aba1;"> Data preprocessing </span>


#### Loading data:

In [71]:
# Directory holding the pre-split train/test CSV files of this experiment.
# Defined once so the data location can be changed in a single place instead
# of in four separate string literals.
CASE_DIR = "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2"

# Load the training set; the "ID" column becomes the row index.
TRAIN_X = pd.read_csv(CASE_DIR + "\\x_train.csv", index_col="ID")
TRAIN_Y = pd.read_csv(CASE_DIR + "\\y_train.csv", index_col="ID")

# Load the testing set, indexed the same way.
TEST_X = pd.read_csv(CASE_DIR + "\\x_test.csv", index_col="ID")
TEST_Y = pd.read_csv(CASE_DIR + "\\y_test.csv", index_col="ID")


In [72]:
# Remember the row index and column labels of every frame so that plain
# arrays (e.g. scaler output) can be wrapped back into labelled DataFrames.
train_x_index, train_x_columns = TRAIN_X.index, TRAIN_X.columns
train_y_index, train_y_columns = TRAIN_Y.index, TRAIN_Y.columns

test_x_index, test_x_columns = TEST_X.index, TEST_X.columns
test_y_index, test_y_columns = TEST_Y.index, TEST_Y.columns

#### Scaling training data:

In [73]:
# Feature scaler: fitted on the training features only, then applied to them.
# `fit` returns the scaler itself, so construction and fitting can be chained.
scaler_x = StandardScaler().fit(TRAIN_X)
train_x_scaled = pd.DataFrame(
    data=scaler_x.transform(TRAIN_X),
    index=train_x_index,
    columns=train_x_columns,
)

# Target scaler: fitted on the training targets only. It is kept around so
# that scaled predictions can later be mapped back with `inverse_transform`.
scaler_y = StandardScaler().fit(TRAIN_Y)
train_y_scaled = pd.DataFrame(
    data=scaler_y.transform(TRAIN_Y),
    index=train_y_index,
    columns=train_y_columns,
)

#### Scaling testing data:

In [74]:
# Apply the scalers fitted on the training data to the test data
# (transform only - the test set is never used for fitting).
test_x_scaled = pd.DataFrame(
    data=scaler_x.transform(TEST_X),
    index=test_x_index,
    columns=test_x_columns,
)

test_y_scaled = pd.DataFrame(
    data=scaler_y.transform(TEST_Y),
    index=test_y_index,
    columns=test_y_columns,
)

#### Transforming data into arrays:

In [75]:
# Targets flattened to 1-D arrays: raw values first, then standardised ones.
train_y_as_array, test_y_as_array = TRAIN_Y.values.ravel(), TEST_Y.values.ravel()
train_y_scaled_as_array, test_y_scaled_as_array = (
    train_y_scaled.values.ravel(),
    test_y_scaled.values.ravel(),
)

# Features flattened the same way.
# NOTE(review): ravel() collapses the 2-D feature matrix into one 1-D vector,
# discarding the sample/feature layout - confirm this is what is wanted.
train_x_as_array, test_x_as_array = TRAIN_X.values.ravel(), TEST_X.values.ravel()
train_x_scaled_as_array, test_x_scaled_as_array = (
    train_x_scaled.values.ravel(),
    test_x_scaled.values.ravel(),
)

### <span style="color:#00aba1;"> Feature Selection </span>


#### Creating subsets:

In [None]:
# Build one candidate feature subset per selection technique using the
# project helpers in `fc`. Each result is later accessed through `.columns`,
# so the helpers presumably return DataFrame-like objects (verify in fc).

# Principal component analysis, 10 components.
features_pca = fc.create_pca_features(
    n_components=10,
    train_x_scaled=train_x_scaled,
    columns=train_x_columns,
    train_x=TRAIN_X,
)

# ANOVA, 26 features.
features_anova = fc.create_anova_features(
    k_features=26,
    train_x=TRAIN_X,
    train_y=TRAIN_Y,
    train_x_columns=train_x_columns,
)

# LASSO with 5-fold cross-validation and up to 10000 iterations.
features_lasso = fc.create_lasso_features(
    cv=5,
    max_iter=10000,
    train_x_scaled=train_x_scaled,
    train_y_array=train_y_as_array,
    train_x_columns=train_x_columns,
    train_x=TRAIN_X,
)

# Correlation-based selection with a 0.6 threshold.
features_correlation = fc.create_correlation_features(
    train_x=TRAIN_X,
    correlation_threshold=0.6,
)

# Recursive feature elimination, 26 features.
features_rfe = fc.create_RFE_features(
    n_features_to_select=26,
    train_x_scaled=train_x_scaled,
    train_y_scaled_array=train_y_scaled_as_array,
    train_x=TRAIN_X,
)

#### Figures for feature selection:

In [77]:
# # PCA
# fc.plot_of_the_cumulative_sum_of_eigenvalues(train_x_scaled=train_x_scaled,
#                                              type_of_experience="Regression",
#                                              number_of_case=2,
#                                              save="y")

# # ANOVA
# fc.plot_ANOVA_F_values(train_x=TRAIN_X,
#                        train_y_array=train_y_as_array,
#                        train_x_columns=train_x_columns,
#                        type_of_experience="Regression",
#                        number_of_case=2,
#                        save="y")

# # LASSO
# fc.plot_lasso_coef_values(cv=5,
#                           max_iter=10000,
#                           train_x_scaled=train_x_scaled,
#                           train_y_array=train_y_as_array,
#                           train_x_columns=train_x_columns,
#                           type_of_experience="Regression",
#                           number_of_case=2,
#                           save="y")

# #RFE
# fc.plot_RFE_ranking(train_x_columns=train_x_columns,
#                     n_features_to_select=26,
#                     train_x_scaled=train_x_scaled,
#                     train_y_scaled_array=train_y_scaled_as_array,
#                     type_of_experience="Regression",
#                     number_of_case=2,
#                     save="y"
#                     )

#### Initial Conditions:

In [78]:
# Leave-one-out cross-validation splitter.
cv = LeaveOneOut()

# Metric label -> scikit-learn metric callable, called as metric(y_true, y_pred).
scoring = dict(MSE=mean_squared_error, MAE=mean_absolute_error)

# Subset label -> frame whose `.columns` names the features of that subset.
# "all_features" uses the complete training frame, i.e. no selection.
features = dict(
    features_pca=features_pca,
    features_anova=features_anova,
    features_lasso=features_lasso,
    features_correlation=features_correlation,
    features_rfe=features_rfe,
    all_features=TRAIN_X,
)

### <span style="color:#00aba1;"> Grid Search </span>


#### Validation split:

In [79]:
# Hold out 18% of the training data for validation. All four frames (raw and
# scaled features/targets) go through a single call, so every frame is split
# on exactly the same rows; the fixed seed keeps the split reproducible.
(
    x_val_train, x_val_test,
    y_val_train, y_val_test,
    x_val_train_scaled, x_val_test_scaled,
    y_val_train_scaled, y_val_test_scaled,
) = train_test_split(
    TRAIN_X,
    TRAIN_Y,
    train_x_scaled,
    train_y_scaled,
    test_size=0.18,
    random_state=42,
)

In [80]:
# Validation-split targets as 1-D arrays: raw values, then standardised ones.
y_val_train_as_array, y_val_test_as_array = (
    y_val_train.values.ravel(),
    y_val_test.values.ravel(),
)
y_val_train_scaled_as_array, y_val_test_scaled_as_array = (
    y_val_train_scaled.values.ravel(),
    y_val_test_scaled.values.ravel(),
)

# Validation-split features flattened the same way.
# NOTE(review): ravel() turns each 2-D feature matrix into a single 1-D
# vector - confirm this is intended.
x_val_train_as_array, x_val_test_as_array = (
    x_val_train.values.ravel(),
    x_val_test.values.ravel(),
)
x_val_train_scaled_as_array, x_val_test_scaled_as_array = (
    x_val_train_scaled.values.ravel(),
    x_val_test_scaled.values.ravel(),
)


#### Grid Search RF and SVR:

In [None]:
# Grid search for the Random Forest regressor: one search per feature subset,
# run on the unscaled validation split, with the results dumped to a CSV.

for subset, features_selected in features.items():

    # Restrict both halves of the validation split to this subset's columns.
    x = x_val_train[features_selected.columns]
    x_val_test_for_search = x_val_test[features_selected.columns]

    # NOTE(review): the full training index/columns are passed here, whereas
    # the SVR search passes those of y_val_test - confirm which one
    # fc.search_ML expects.
    dict_search = fc.search_ML(
        dict_param=fc.RFR_param,
        algorithm=RandomForestRegressor,
        x_val_train=x,
        y_val_train=y_val_train_as_array,
        x_val_test=x_val_test_for_search,
        y_val_test=y_val_test,
        train_y_index=train_y_index,
        train_y_columns=train_y_columns,
        scaler=scaler_y,
        train_y_as_array=y_val_train_as_array,
        scoring=scoring,
        mode="Regression",
        subset=subset,
        case=2,
        regressor="Random Forest",
        scale=None,
        checkpoint_file=f'checkpoint_{subset}.pkl',
    )

    output_file_path = f"C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Random Forest\\{subset}.csv"

    # Nothing came back for this subset: report it and move on.
    if not dict_search:
        print(f"No data to write for subset {subset}")
        continue

    # One CSV row per entry of the returned mapping.
    print(f"Writing results to {output_file_path}")
    with open(output_file_path, "w", newline='') as f:
        w = csv.writer(f)
        w.writerow(["Parameters", "Scores"])
        for key, values in dict_search.items():
            w.writerow([key, values])

In [None]:
# Grid search for SVR: one search per feature subset and kernel, run on the
# scaled validation split, with the results of all kernels written to one CSV.

for subset in features:
    features_selected = features[subset]
    x = x_val_train_scaled[features_selected.columns]
    x_val_test_for_search = x_val_test_scaled[features_selected.columns]

    # Results of every kernel for this subset, merged into a single mapping.
    # Without this, `dict_search` is overwritten on each kernel iteration and
    # the CSV written after the loop holds only the last kernel's return value.
    subset_results = {}

    # Iterate over each kernel and accumulate the results
    for kernel in ["poly", "rbf", "sigmoid", "linear"]:
        dict_search = fc.search_ML(
            dict_param=fc.SVR_param[kernel],
            algorithm=SVR,
            x_val_train=x,
            y_val_train=y_val_train_scaled_as_array, 
            x_val_test = x_val_test_for_search,
            y_val_test = y_val_test_scaled,
            train_y_index=y_val_test.index,
            train_y_columns=y_val_test.columns,
            scaler=scaler_y,
            train_y_as_array=y_val_train_as_array,
            scoring=scoring,
            mode="Regression",
            subset = subset,
            case = 2,
            regressor = "Support vector",
            scale="y",
            checkpoint_file = f'checkpoint_{subset}_SVM.pkl'
        )
        # A falsy return (None / empty) contributes nothing. If search_ML
        # already returns accumulated results, update() is a no-op merge.
        if dict_search:
            subset_results.update(dict_search)

    output_file_path = f"C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Support vector\\{subset}.csv"
    if subset_results:
        print(f"Writing results to {output_file_path}")
        with open(output_file_path, "w", newline='') as f:
            w = csv.writer(f)
            w.writerow(["Parameters", "Scores"])  
            for key, values in subset_results.items():
                w.writerow([key, values])
    else:
        print(f"No data to write for subset {subset}")

#### Grid Search NN:

In [None]:
## Grid Search ANN
# Hyper-parameter grid: every combination of the listed values is one experiment.
NN_param = {
    "optimizer": ["adam"],
    "loss" : ["mae","mse"],
    "activation":["relu"],
    "layers_vector": [[64,32],
                      [128,64],
                      [128,64,32],
                      [64,32,16],
                      [128,64,32,16],
                      [256,128,64,32],
                      [64,32,16,4]],
    "output_layer": [1,2,4,8],
    "batch_size": [2,4,8]

}

for subset in features:

    features_selected = features[subset]

    # Restrict the scaled validation split to this subset's columns.
    x = x_val_train_scaled[features_selected.columns]
    x_val_test_for_search = x_val_test_scaled[features_selected.columns]

    grid = fc.stage_experiments(NN_param)

    # One checkpoint path for BOTH loading and saving. Loading from
    # "checkpoint_file_{subset}.pkl" while saving to
    # "checkpoint_file_{subset}_NN.pkl" meant finished experiments were never
    # found again, so an interrupted search restarted from scratch.
    checkpoint_path = f"checkpoint_file_{subset}_NN.pkl"
    store = fc.load_checkpoint(checkpoint_path)

    print("Initial checkpoint data:", store)

    count = 0

    for experiment in grid:
        # `experiment` holds, per hyper-parameter, the position of the value to use.
        args = [v[experiment[i]] for i, v in enumerate(NN_param.values())]
        count += 1
        print(f"Experiment {count}/{len(grid)}: {args}")
        args_dict = dict(zip(NN_param, args))
        # The string form of the configuration identifies it in the checkpoint.
        store_key = str(args_dict)

        if store_key in store:
            print(f"Skipping experiment {count}/{len(grid)} (already completed)")
            continue

        try:
            model = ANN.NeuralNetworkModel(input_shape=[x.shape[1]],
                                           **args_dict)
            model.train_model(train_x=x,
                              train_y=y_val_train_scaled,
                              validation_split=0.1,
                              verbose=0,
                              epochs=100)

            model.load_model()
            args_str = fc.sanitize_args_dict(args_dict)
            model.save_model(case=2, subset=subset, args=args_str)

            # Predict in scaled space, then map back to the original target
            # units so the metrics are comparable with the unscaled models.
            y_val_test_pred_scaled = model.predict(test_x=x_val_test_for_search, verbose=1)
            y_val_test_pred_scaled = pd.DataFrame(y_val_test_pred_scaled)
            y_val_test_pred = pd.DataFrame(scaler_y.inverse_transform(y_val_test_pred_scaled))

            # Scores are appended in the iteration order of `scoring`.
            for metric in scoring:
                value = scoring[metric](y_val_test, y_val_test_pred)
                if store_key not in store:
                    store[store_key] = []
                store[store_key].append(value)
            print(f"Results for experiment {count}: {store[store_key]}")
            fc.save_checkpoint(store, checkpoint_path)
        except Exception as e:
            # Best effort: record the failed configuration with an empty score
            # list (so it is skipped on resume) and carry on with the grid.
            store[store_key] = []
            fc.save_checkpoint(store, checkpoint_path)
            print(f"Error encountered with args: {args_dict}")
            print(f"Exception: {e}")
            continue

    output_file_path = f"C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Neural Network\\{subset}.csv"
    if store:
        print(f"Writing results to {output_file_path}")
        with open(output_file_path, "w", newline='') as f:
            w = csv.writer(f)
            w.writerow(["Parameters", "Scores"])  
            for key, values in store.items():
                w.writerow([key, values])
    else:
        print(f"No data to write for subset {subset}")

### <span style="color:#00aba1;"> Testing  </span>


#### Results:

In [None]:
# Search results
# Read one grid-search CSV back into a dict and show its best configuration.
# The directory is spelled "Support vector" (lower-case "v") to match the
# path the SVR grid search writes to; the previous "Support Vector" spelling
# only resolved on case-insensitive filesystems.
results = fc.transform_csv_into_dict("C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Support vector\\all_features.csv")

fc.get_best_param(results)

In [116]:
# Saved-model file of the chosen configuration, per feature subset.
# Plain string literals: the paths contain no placeholders, so no f-prefix.

# Neural networks (.keras files).
neural_network = {
    "features_pca": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Neural Network\\features_pca_adam_mse_relu_[64, 32, 16, 4]_8_4.keras",
    "features_anova": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Neural Network\\features_anova_adam_mse_relu_[128, 64]_4_4.keras",
    "features_lasso": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Neural Network\\features_lasso_adam_mae_relu_[64, 32, 16]_1_2.keras",
    "features_correlation": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Neural Network\\features_correlation_adam_mae_relu_[64, 32, 16]_2_4.keras",
    "features_rfe": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Neural Network\\features_rfe_adam_mae_relu_[128, 64, 32]_2_4.keras",
    "all_features": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Neural Network\\all_features_adam_mae_relu_[256, 128, 64, 32]_1_4.keras",
}

# Random forests (.pkl files).
random_forest = {
    "features_pca": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Random Forest\\features_pca_1_absolute_error_2_7_log2_False.pkl",
    "features_anova": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Random Forest\\features_anova_1_absolute_error_None_5_sqrt_True.pkl",
    "features_lasso": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Random Forest\\features_lasso_1_squared_error_None_3_sqrt_True.pkl",
    "features_correlation": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Random Forest\\features_correlation_1_absolute_error_2_7_log2_False.pkl",
    "features_rfe": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Random Forest\\features_rfe_1_absolute_error_2_2_sqrt_True.pkl",
    "all_features": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Random Forest\\all_features_1_absolute_error_2_7_log2_False.pkl",
}

# Support vector regressors (.pkl files).
support_vector = {
    "features_pca": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Support vector\\features_pca_poly_2_0.001_0.001_75.pkl",
    "features_anova": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Support vector\\features_anova_poly_2_0.1_0.001_75.pkl",
    "features_lasso": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Support vector\\features_lasso_poly_4_0.001_auto_50.pkl",
    "features_correlation": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Support vector\\features_correlation_poly_3_0.1_0.01_1.pkl",
    "features_rfe": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Support vector\\features_rfe_linear_0.1_0.0001_10.pkl",
    "all_features": "C:\\Users\\Admin\\Desktop\\Tese\\47\\Experiences\\Regression\\Case_2\\GridSearch\\Support vector\\all_features_linear_0.001_0.1_10.pkl",
}



In [None]:
# Test-set metrics per model family, keyed "<subset>_<metric>".
results_test_RF, results_test_ANN, results_test_SVR = {}, {}, {}

# Test-set predictions per model family, keyed by subset name.
y_pred_RF_test, y_pred_ANN_test, y_pred_SVR_test = {}, {}, {}

# --- Support vector regression: scaled features in, scaled predictions out ---
for subset, features_selected in features.items():
    x_test = test_x_scaled[features_selected.columns]

    model = fc.load_model(support_vector[subset])
    y_pred_scaled = pd.DataFrame(
        model.predict(x_test),
        index=test_y_index,
        columns=test_y_columns,
    )

    # Map the predictions back to the original target units, then flatten.
    y_pred = pd.DataFrame(
        scaler_y.inverse_transform(y_pred_scaled),
        index=test_y_index,
        columns=test_y_columns,
    ).values.ravel()
    y_pred_SVR_test[subset] = y_pred

    for metric, metric_fn in scoring.items():
        results_test_SVR[f"{subset}_{metric}"] = fc.get_scores(
            y_true=test_y_as_array, y_pred=y_pred, metric=metric_fn
        )

# --- Random forest: unscaled features, predictions already in target units ---
for subset, features_selected in features.items():
    x_test = TEST_X[features_selected.columns]

    model = fc.load_model(random_forest[subset])
    y_pred = model.predict(x_test)
    y_pred_RF_test[subset] = y_pred

    for metric, metric_fn in scoring.items():
        results_test_RF[f"{subset}_{metric}"] = fc.get_scores(
            y_true=test_y_as_array, y_pred=y_pred, metric=metric_fn
        )

# --- Neural network: scaled features in, scaled predictions out ---
for subset, features_selected in features.items():
    x_test = test_x_scaled[features_selected.columns]

    # NOTE(review): these constructor hyper-parameters are the same for every
    # subset and differ from the configurations named in the saved files;
    # presumably load_model() below replaces the network - confirm in ANN.
    model = ANN.NeuralNetworkModel(
        input_shape=[x_test.shape[1]],
        optimizer="adam",
        loss="mse",
        activation="relu",
        layers_vector=[128, 64, 32, 16],
        output_layer=10,
        batch_size=5,
    )
    model.load_model(checkpoint_path=neural_network[subset])

    y_pred_scaled = pd.DataFrame(model.predict(x_test))
    # Back to original target units, then flatten.
    y_pred = pd.DataFrame(scaler_y.inverse_transform(y_pred_scaled)).values.ravel()
    y_pred_ANN_test[subset] = y_pred

    for metric, metric_fn in scoring.items():
        results_test_ANN[f"{subset}_{metric}"] = fc.get_scores(
            y_true=test_y_as_array, y_pred=y_pred, metric=metric_fn
        )

#### Graphics:

In [122]:
# def generate_plots(
#     results_test_dict, 
#     true_y, 
#     target, 
#     pred_y, 
#     model, 
#     save=False, 
#     plots_to_generate=None, 
#     xlim=None, 
#     ylim=None
# ):
#     """
#     Generate selected plots for model evaluation with customizable options.

#     Parameters:
#     - results_test_dict (dict): Dictionary containing test metrics (MSE, MAE, etc.) per feature subset.
#     - true_y (array-like): Ground truth values.
#     - target (str): Target variable name (e.g., 'Mel_4').
#     - pred_y (array-like): Predicted values.
#     - model (str): Name of the model (e.g., 'SVR', 'ANN', 'RF').
#     - save (bool): Whether to save the plots.
#     - plots_to_generate (list): List of plots to generate, options are: 
#         ['bar_chart', 'true_vs_pred', 'percentage_histogram', 'residual_histogram'].
#     - xlim (tuple): Limits for x-axis in residual histogram (default: None).
#     - ylim (tuple): Limits for y-axis in residual histogram (default: None).

#     Returns:
#     - None
#     """

#     if plots_to_generate is None:
#         plots_to_generate = ['bar_chart', 'true_vs_pred', 'percentage_histogram', 'residual_histogram']

#     # Prepare metrics for bar chart if included
#     if 'bar_chart' in plots_to_generate:
#         results = {}
#         filtered_metrics_dict = {k: v for k, v in results_test_dict.items() if 'explained_variance' not in k}

#         for key, value in filtered_metrics_dict.items():
#             feature, metric = key.rsplit("_", 1)
#             if feature not in results:
#                 results[feature] = [None, None]
#             if metric == "MSE":
#                 results[feature][0] = value
#             elif metric == "MAE":
#                 results[feature][1] = value

#         fc.bar_for_metrics(
#             dictionary=results, 
#             first_label="MSE", 
#             second_label="MAE", 
#             y_label=f"Feature Subsets for {model}", 
#             save=save
#         )

#     # Generate true vs predicted plot if included
#     if 'true_vs_pred' in plots_to_generate:
#         fc.plot_y_true_pred(
#             true_y=true_y, 
#             target=target, 
#             pred_y=pred_y, 
#             model=model, 
#             save=save
#         )

#     # Generate percentage histogram if included
#     if 'percentage_histogram' in plots_to_generate:
#         fc.percentage_histogram(
#             true_y=true_y, 
#             target=target, 
#             pred_y=pred_y, 
#             model=model, 
#             save=save
#         )

#     # Generate residual histogram if included
#     if 'residual_histogram' in plots_to_generate:
#         fc.residual_histogram(
#             true_y=true_y, 
#             target=target, 
#             pred_y=pred_y, 
#             model=model, 
#             xlim=xlim, 
#             ylim=ylim, 
#             save=save
#         )


# generate_plots(
#     results_test_dict=results_test_SVR, 
#     true_y=TEST_Y, 
#     target="Mel_7", 
#     pred_y=y_pred_SVR_test, 
#     model="SVR", 
#     save="", 
#     plots_to_generate=['bar_chart', 'true_vs_pred', 'residual_histogram',"percentage_histogram"], 
#     xlim=(-7, 10), 
#     ylim=(0, 6)
# )

In [None]:

# Display the per-subset test predictions of one model family as the cell
# output; swap which line is uncommented to inspect another family.
# y_pred_RF_test 
y_pred_ANN_test 
# y_pred_SVR_test 

In [None]:
# Plot label -> ten predicted values for one selected model per family.
# NOTE(review): the numbers are hard-coded, presumably pasted from an earlier
# cell output; they will not follow a re-run of the models - confirm/refresh.
y_pred_diff_models = {
    "Random forest using LASSO features": [
        9.75337768, 1.47899637, -0.5765469, -2.32778455, 4.36842594,
        5.73526332, 3.99796232, 35.69278525, 11.6508348, 19.24383646,
    ],
    "Neural network using ANOVA features": [
        15.644586, 3.6246595, 4.7745075, 6.7107615, 11.799844,
        9.249467, 10.7425995, 20.756308, 8.67581, 14.93825,
    ],
    "Support vector machine using PCA features": [
        4.30016082, 7.02508091, 6.96744992, 5.27361968, 6.89364922,
        5.79509224, 6.71045266, 6.85101742, 6.77127458, 5.01115146,
    ],
}

In [None]:
# Plot the true test targets against the predictions of the selected models.
fc.plot_y_true_pred(
    true_y=TEST_Y,
    target="Mel_7",
    pred_y=y_pred_diff_models,
    model="",
    save="",
)