# XGBoost Regressor Models

## Libraries

In [10]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8')

import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

from scipy.stats import pearsonr


In [None]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Paths

In [2]:
DATA_ROOT = "./gaitrec_data"
# Train set
UNBALANCED_TRAIN_SET_METADATA_PATH = os.path.join(DATA_ROOT, "unbalanced_train_set_metadata.csv")
POSTPROCESSED_TRAIN_FOLDER = os.path.join(DATA_ROOT, 'postprocessed_train')
GROUPS_SESSIONS_DATA_TRAIN = os.path.join(DATA_ROOT, 'groups_sessions_data_train')
GRF_F_AP_PRO_DATA_TRAIN = os.path.join(DATA_ROOT, 'GRF_F_AP_PRO_data_train')
# Test set
TEST_SET_METADATA_PATH = os.path.join(DATA_ROOT, "test_set_metadata.csv")
TEST_FOLDER = os.path.join(DATA_ROOT, "test_set")
GROUPS_SESSIONS_DATA_TEST = os.path.join(DATA_ROOT, 'groups_sessions_data_test')
GRF_F_AP_PRO_DATA_TEST = os.path.join(DATA_ROOT, 'GRF_F_AP_PRO_data_test')

## Separar las features del target

- Separar las features (valores de la primera sesión) y el target (valores de la última sesión).
- Los valores de la primera sesión corresponden a las filas impares y los valores de la última sesión corresponden a las filas pares. 

In [3]:
def split_df_train_test(df_train: pd.DataFrame, df_test: pd.DataFrame):
    # Select only the columns of interest
    columns_of_interest = df_train.columns[3:]  # Exclude SUBJECT_ID, SESSION_ID, TRIAL_ID
    # Create X_train, y_train, X_test, y_test:
        # Extract the odd rows into X
        # Extract the even rows into y
    X_train = df_train.iloc[::2][columns_of_interest].values
    y_train = df_train.iloc[1::2][columns_of_interest].values
    X_test = df_test.iloc[::2][columns_of_interest].values
    y_test = df_test.iloc[1::2][columns_of_interest].values

    return X_train, y_train, X_test, y_test

## Load preprocessed data
- ANKLE data - 2 sessions

In [4]:
# train df
combined_legs_2sessions_A_train = pd.read_csv(os.path.join(GRF_F_AP_PRO_DATA_TRAIN, 
                              'groups_2sessions_data_train/A_data/combined.csv'))
# test df
combined_legs_2sessions_A_test = pd.read_csv(os.path.join(GRF_F_AP_PRO_DATA_TEST, 
                              'groups_2sessions_data_test/A_data/combined.csv'))

## Train - Test split

In [5]:
# train - test split 
X_train_A2, y_train_A2, X_test_A2, y_test_A2 = split_df_train_test(df_train = combined_legs_2sessions_A_train, 
                                                                   df_test = combined_legs_2sessions_A_test)

## Random Forest Regressor

### Training

In [27]:
def train_and_evaluate_xgboost(X_train, y_train):
    """
    Train and evaluate an XGBoost Regressor with hyperparameter tuning.

    Parameters:
        X_train (np.ndarray): Training data with shape (num_samples, num_features).
        y_train (np.ndarray): Training target data with shape (num_samples, num_targets).

    Returns:
        xgb.XGBRegressor: Fitted XGBoost model.
        float: Root mean square error (RMSE) score.
        float: R-squared score.
        float: Correlation coefficient (Pearson's r).
    """
    # Initialize an XGBoost model
    xgboost = xgb.XGBRegressor(random_state=42)

    # Define a range of hyperparameters for tuning
    param_dist = {
        'n_estimators': [10, 100, 300, 500, 800, 1000, 2000, 3000, 4000, 5000], # int ranging from 10 to 5000
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 1.0], # real ranging from 0.01 to 1.0
        'min_child_weight': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], # int between 1 and 10
        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50], # int between 1 and 50 
        'max_delta_step': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20], # int between 0 and 20
        'subsample': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], # real from 0.1 to 1.0
        'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], # real from 0.1 to 1.0
        'colsample_bylevel': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], # real from 0.1 to 1.0
        'reg_lambda': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0], # real between 1e-9 and 100.0
        'reg_alpha': [1e-9, 0.1, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0], # real between 1e-9 and 100.0
        'gamma': [1e-9, 0.1, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0], # real between 1e-9 and 100.0
        'scale_pos_weight': [1e-6, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 200.0, 300.0, 400.0, 500.0] # real between 1e-6 and 500.0
    }

    # Create RandomizedSearchCV object with 10-fold cross-validation
    random_search = RandomizedSearchCV(estimator=xgboost,
                                       param_distributions=param_dist,
                                       cv=5,
                                       n_iter=10,
                                       scoring=make_scorer(mean_squared_error, squared=False, greater_is_better=False),
                                       n_jobs=-1)
    
    # Fit the RandomizedSearchCV object on the training data
    random_search.fit(X_train, y_train)

    # Get the best hyperparameters
    best_params = random_search.best_params_
    print("Best Hyperparameters:", best_params)

    # Train the XGBoost model with the best hyperparameters
    best_xgboost = xgb.XGBRegressor(**best_params, random_state=42)
    best_xgboost.fit(X_train, y_train)

    # Perform cross-validation and get predicted values
    predicted_y = cross_val_predict(best_xgboost, X_train, y_train, cv=10)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_train, predicted_y))

    # Calculate R2 score
    r2 = r2_score(y_train, predicted_y)

    # Calculate correlation coefficient
    corr_coeff, _ = pearsonr(y_train.flatten(), predicted_y.flatten())

    # Print the results
    print("Results on Training Set:")
    print(f" RMSE: {rmse:.3f}")
    print(f" R2 Score: {r2:.3f}")
    print(f" Correlation coefficient: {corr_coeff:.3f}")

    return best_xgboost, rmse, r2, corr_coeff

### Training

In [28]:
# Train and evaluate the XGBoost Regressor (model 14)
xgb_model, rmse_xgb, r2_xgb, corr_xgb = train_and_evaluate_xgboost(X_train = X_train_A2, 
                                                                   y_train = y_train_A2)



Best Hyperparameters: {'subsample': 0.8, 'scale_pos_weight': 80.0, 'reg_lambda': 1e-06, 'reg_alpha': 0.8, 'n_estimators': 4000, 'min_child_weight': 8, 'max_depth': 30, 'max_delta_step': 2, 'learning_rate': 0.05, 'gamma': 1e-09, 'colsample_bytree': 0.2, 'colsample_bylevel': 0.9}
Results on Training Set:
 RMSE: 0.018
 R2 Score: 0.336
 Correlation coefficient: 0.982


Best Hyperparameters: {'subsample': 0.5, 'scale_pos_weight': 90.0, 'reg_lambda': 80.0, 'reg_alpha': 0.1, 'n_estimators': 1000, 'min_child_weight': 5, 'max_depth': 1, 'max_delta_step': 3, 'learning_rate': 0.01, 'gamma': 1.0, 'colsample_bytree': 0.1, 'colsample_bylevel': 0.1}
Results on Training Set:
 RMSE: 0.026
 R2 Score: -0.078
 Correlation coefficient: 0.960

