# Optimization

In the following, I ran hyperparameter optimization with `GridSearchCV` for the best-performing models:

- `Random Forest` (all-features data set only)
- `XGBoost`
- `CatBoost`

Since the full data set and the Select-K-Best data set performed similarly, I ran the grid search on both data sets.

## Loading libraries

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
import joblib
import seaborn as sns

# for wrapper around XGBoostRegressor
from sklearn.base import BaseEstimator, RegressorMixin

# Explainer Dashboard
from explainerdashboard import RegressionExplainer, ExplainerDashboard

In [24]:
# User functions
# Prepare markdown table
def scores_to_markdown(scores):
    """Render model scores as a markdown table.

    Parameters
    ----------
    scores : dict
        Maps a model name to a dict holding the numeric entries
        ``'MAE'``, ``'MSE'``, ``'RMSE'`` and ``'R2'``.

    Returns
    -------
    str
        A markdown table with a header, a separator line and one row per
        model, every value formatted with four decimals. The string ends
        with a newline.

    Raises
    ------
    KeyError
        If a model entry lacks one of the four metrics.
    """
    metric_keys = ('MAE', 'MSE', 'RMSE', 'R2')

    lines = [
        '| Model | MAE | MSE | RMSE | R2 |',
        '|-------|-----|-----|------|-----|',
    ]
    for model_name, model_scores in scores.items():
        # Format outside of a nested-quote f-string: reusing the same quote
        # character inside an f-string is a SyntaxError before Python 3.12.
        cells = ' | '.join(f'{model_scores[key]:.4f}' for key in metric_keys)
        lines.append(f'| {model_name} | {cells} |')

    return '\n'.join(lines) + '\n'

# Fixed issues with __sklearn_tags__
class SklearnXGBRegressor(BaseEstimator, RegressorMixin):
    """Wrapper exposing ``XGBRegressor`` through the scikit-learn estimator API.

    All keyword arguments are stored in ``self.kwargs`` and forwarded
    unchanged to an internal ``XGBRegressor`` held in ``self.model``;
    ``fit`` and ``predict`` delegate to that model.
    """

    def __init__(self, **kwargs):
        self.kwargs = kwargs  # Store the parameters for later use
        self.model = XGBRegressor(**self.kwargs)  # Pass them to XGBRegressor

    def fit(self, X, y):
        """Fit the wrapped model on ``X``/``y`` and return ``self``."""
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self.model.predict(X)

    def get_params(self, deep=True):
        """Return the stored constructor parameters.

        ``deep`` is accepted for API compatibility and is not used.
        """
        # Hand out a shallow copy so callers that modify the returned dict
        # cannot silently change this estimator's stored parameters.
        return dict(self.kwargs)

    def set_params(self, **params):
        """Merge ``params`` into the stored parameters and return ``self``."""
        self.kwargs.update(params)  # Update the parameters
        # Recreate the model so the new parameters take effect; any
        # previously fitted state of the wrapped model is discarded.
        self.model = XGBRegressor(**self.kwargs)
        return self

# No selection data set

In [19]:
# Read the engineered training set, then separate features from the target.
# NOTE(review): the K-selection CSVs further down are read with index_col=0
# while this one is not — confirm this file has no saved index column.
train = pd.read_csv('data/processed/train_eng.csv')
X_train = train.drop(columns=['severity_score'])
y_train = train['severity_score']

## Grid Search across models

In [None]:
# Grid search over the candidate models on the current X_train / y_train.
# For each model: tune with GridSearchCV (selected on R2), save the refitted
# best estimator, then re-score it with 5-fold cross-validation on all metrics.

# Define models
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': SklearnXGBRegressor(random_state=42),
    'CatBoost': CatBoostRegressor(verbose=200)
}

names = ['Mean Prediction', 'Random Forest', 'XGBoost', 'CatBoost']

# Dictionary to store scores
metrics = ['MAE', 'MSE', 'RMSE', 'R2']
scores = {model_name: {metric: [] for metric in metrics} for model_name in names}

# Error scorers are negated by make_scorer (greater_is_better=False) so that
# "higher is better" holds for every scorer; they are flipped back below.
scoring = {
    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
    'MSE': make_scorer(mean_squared_error, greater_is_better=False),
    'R2': 'r2'
}

# Mean prediction as baseline model
mean_value = np.mean(y_train)
# Build the constant prediction as float explicitly: np.full_like would
# inherit y_train's dtype and truncate the mean if the target is integer.
mean_predictions = np.full(len(y_train), mean_value, dtype=float)

# Store baseline model metrics (computed on the training targets themselves)
scores['Mean Prediction']['MAE'] = mean_absolute_error(y_train, mean_predictions)
scores['Mean Prediction']['MSE'] = mean_squared_error(y_train, mean_predictions)
scores['Mean Prediction']['RMSE'] = np.sqrt(scores['Mean Prediction']['MSE'])
scores['Mean Prediction']['R2'] = r2_score(y_train, mean_predictions)

# Hyperparameter grids for GridSearchCV
param_grids = {
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    'XGBoost': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 6, 10],
        'subsample': [0.7, 0.8, 1.0]
    },
    'CatBoost': {
        'iterations': [500, 1000],
        'learning_rate': [0.05, 0.1],
        'depth': [6, 10, 12],
        'l2_leaf_reg': [1, 3, 5],
    }
}

# Perform Grid Search and Cross-Validation for each model
for model_name, model in models.items():
    print(f'\nPerforming Grid Search for {model_name}...')

    # Set the parameter grid for the model
    param_grid = param_grids[model_name]

    # Grid Search with Cross-Validation
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring['R2'], cv=5, n_jobs=-1, verbose=1, refit=True)

    # Fit GridSearchCV to the training data
    grid_search.fit(X_train, y_train)

    # Best estimator after grid search (refit on the full training data)
    best_model = grid_search.best_estimator_

    # Save the best model using joblib
    joblib.dump(best_model, f'output/grid_{model_name}_best_model.pkl')
    print(f'Best model for {model_name} saved as grid_{model_name}_best_model.pkl')

    # Get the best score and parameters
    best_score = grid_search.best_score_
    best_params = grid_search.best_params_

    print(f"Best R2 score for {model_name}: {best_score:.4f}")
    print(f"Best parameters for {model_name}: {best_params}")

    # Get the cross-validation results for the best model
    cv_results = cross_validate(best_model, X_train, y_train, cv=5, scoring=scoring, return_train_score=False)

    # Store the scores for this model
    scores[model_name]['MAE'] = -np.mean(cv_results['test_MAE'])  # Negate for negative MAE
    scores[model_name]['MSE'] = -np.mean(cv_results['test_MSE'])  # Negate for negative MSE
    scores[model_name]['RMSE'] = np.sqrt(scores[model_name]['MSE'])
    scores[model_name]['R2'] = np.mean(cv_results['test_R2'])

# Print scores
# The inner dict gets its own name so the `metrics` list above is not rebound.
for model_name, model_scores in scores.items():
    print(f'\n{model_name}:')
    for metric, value in model_scores.items():
        print(f'{metric}: {value:.4f}')

In [5]:
# Render the collected scores as a markdown table and persist it to disk
markdown_table = scores_to_markdown(scores)
with open('output/gridsearch_model_scores.md', 'w') as f:
    print(markdown_table, end='', file=f)

In [None]:
# Line plot of every metric across the evaluated models, saved as a PNG.

# Prepare data for plotting
# Explicit metric list so this cell does not depend on whatever `metrics`
# happens to be bound to after earlier cells ran.
metric_names = ['MAE', 'MSE', 'RMSE', 'R2']
plot_data = {metric: [scores[model][metric] for model in names] for metric in metric_names}

# Plot
plt.figure(figsize=(10, 6))
for metric, values in plot_data.items():
    plt.plot(names, values, label=metric)

plt.xlabel('Model')
plt.ylabel('Score')
plt.title('Regression Model Performance Metrics')
plt.legend()

# Save before showing: once plt.show() has run, the current figure may
# already be closed, and a later savefig would write an empty canvas.
plt.savefig('output/gridsearch-all-performance.png', dpi=300)
plt.show()

## Test best model

In [20]:
# Read the engineered hold-out set, then separate features from the target
test = pd.read_csv('data/processed/test_eng.csv')
X_test = test.drop(columns=['severity_score'])
y_test = test['severity_score']

In [None]:
# Fit XGBoost with fixed hyperparameters on the complete training set
final = SklearnXGBRegressor(
    learning_rate=0.1,
    max_depth=6,
    n_estimators=200,
    subsample=1.0,
)
final.fit(X_train, y_train)
y_pred = final.predict(X_test)

# Hold-out metrics; RMSE is derived from the MSE computed just above
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("R2 Score:", r2)

# Markdown table (header, separator and a single result row)
header = (
    '| Model | MAE | MSE | RMSE | R2 |\n'
    '|-------|-----|-----|------|-----|\n'
    f'| XGBoost | {mae} | {mse} | {rmse} | {r2}| \n'
)

print(header)

In [None]:
# Predicted values on the x-axis against the observed test targets on the y-axis
sns.scatterplot(x=y_pred, y=y_test)
plt.gca().set_xlabel('Predicted')
plt.gca().set_ylabel('Value')

## Building Explainer Dashboard

In [None]:
# Fit a plain XGBRegressor with the same fixed hyperparameters
model = XGBRegressor(
    learning_rate=0.1,
    max_depth=6,
    n_estimators=200,
    subsample=1.0,
)
model.fit(X_train, y_train)

# Build the explainer on the test split and launch the dashboard
explainer = RegressionExplainer(model, X_test, y_test)

ExplainerDashboard(explainer).run()

# K Selection data set

In [31]:
# Read the Select-K-Best training set (first CSV column is the index),
# then separate features from the target
train = pd.read_csv('data/processed/train_ksel.csv', index_col=0)
X_train = train.drop(columns=['severity_score'])
y_train = train['severity_score']

## Grid Search

In [None]:
# Grid search over the candidate models on the current X_train / y_train.
# For each model: tune with GridSearchCV (selected on R2), save the refitted
# best estimator, then re-score it with 5-fold cross-validation on all metrics.

# Define models
models = {
    'XGBoost': SklearnXGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    'CatBoost': CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=10, verbose=200)
}

names = ['Mean Prediction', 'XGBoost', 'CatBoost']

# Dictionary to store scores
metrics = ['MAE', 'MSE', 'RMSE', 'R2']
scores_ksel = {model_name: {metric: [] for metric in metrics} for model_name in names}

# Error scorers are negated by make_scorer (greater_is_better=False) so that
# "higher is better" holds for every scorer; they are flipped back below.
scoring = {
    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
    'MSE': make_scorer(mean_squared_error, greater_is_better=False),
    'R2': 'r2'
}

# Mean prediction as baseline model
mean_value = np.mean(y_train)
# Build the constant prediction as float explicitly: np.full_like would
# inherit y_train's dtype and truncate the mean if the target is integer.
mean_predictions = np.full(len(y_train), mean_value, dtype=float)

# Store baseline model metrics (computed on the training targets themselves)
scores_ksel['Mean Prediction']['MAE'] = mean_absolute_error(y_train, mean_predictions)
scores_ksel['Mean Prediction']['MSE'] = mean_squared_error(y_train, mean_predictions)
scores_ksel['Mean Prediction']['RMSE'] = np.sqrt(scores_ksel['Mean Prediction']['MSE'])
scores_ksel['Mean Prediction']['R2'] = r2_score(y_train, mean_predictions)

# Hyperparameter grids for GridSearchCV
param_grids = {
    'XGBoost': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 6, 10],
        'subsample': [0.7, 0.8, 1.0]
    },
    'CatBoost': {
        'iterations': [500, 1000],
        'learning_rate': [0.05, 0.1],
        'depth': [6, 10, 12],
        'l2_leaf_reg': [1, 3, 5],
    }
}

# Perform Grid Search and Cross-Validation for each model
for model_name, model in models.items():
    print(f'\nPerforming Grid Search for {model_name}...')

    # Set the parameter grid for the model
    param_grid = param_grids[model_name]

    # Grid Search with Cross-Validation
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring['R2'], cv=5, n_jobs=-1, verbose=1, refit=True)

    # Fit GridSearchCV to the training data
    grid_search.fit(X_train, y_train)

    # Best estimator after grid search (refit on the full training data)
    best_model = grid_search.best_estimator_

    # Save the best model using joblib
    joblib.dump(best_model, f'output/grid_{model_name}_best_model_ksel.pkl')
    print(f'Best model for {model_name} saved as grid_{model_name}_best_model_ksel.pkl')

    # Get the best score and parameters
    best_score = grid_search.best_score_
    best_params = grid_search.best_params_

    print(f"Best R2 score for {model_name}: {best_score:.4f}")
    print(f"Best parameters for {model_name}: {best_params}")

    # Get the cross-validation results for the best model
    cv_results = cross_validate(best_model, X_train, y_train, cv=5, scoring=scoring, return_train_score=False)

    # Store the scores for this model
    scores_ksel[model_name]['MAE'] = -np.mean(cv_results['test_MAE'])  # Negate for negative MAE
    scores_ksel[model_name]['MSE'] = -np.mean(cv_results['test_MSE'])  # Negate for negative MSE
    # RMSE must come from this run's MSE in scores_ksel, not from the
    # separate `scores` dict of the all-features run.
    scores_ksel[model_name]['RMSE'] = np.sqrt(scores_ksel[model_name]['MSE'])
    scores_ksel[model_name]['R2'] = np.mean(cv_results['test_R2'])

# Print scores
# The inner dict gets its own name so the `metrics` list above is not rebound.
for model_name, model_scores in scores_ksel.items():
    print(f'\n{model_name}:')
    for metric, value in model_scores.items():
        print(f'{metric}: {value:.4f}')

In [45]:
# Render the K-selection scores as a markdown table and persist it to disk
markdown_table = scores_to_markdown(scores_ksel)
with open('output/gridsearch_model_scores_ksel.md', 'w') as f:
    print(markdown_table, end='', file=f)

In [None]:
# Line plot of every metric across the evaluated models, saved as a PNG.

# Prepare data for plotting
# Explicit metric list so this cell does not depend on whatever `metrics`
# happens to be bound to after earlier cells ran.
metric_names = ['MAE', 'MSE', 'RMSE', 'R2']
plot_data = {metric: [scores_ksel[model][metric] for model in names] for metric in metric_names}

# Plot
plt.figure(figsize=(10, 6))
for metric, values in plot_data.items():
    plt.plot(names, values, label=metric)

plt.xlabel('Model')
plt.ylabel('Score')
plt.title('Regression Model Performance Metrics')
plt.legend()

# Save before showing: once plt.show() has run, the current figure may
# already be closed, and a later savefig would write an empty canvas.
plt.savefig('output/gridsearch-ksel-performance.png', dpi=300)
plt.show()

## Test best model

In [30]:
# Read the Select-K-Best hold-out set (first CSV column is the index),
# then separate features from the target
test = pd.read_csv('data/processed/test_ksel.csv', index_col=0)
X_test = test.drop(columns=['severity_score'])
y_test = test['severity_score']

In [None]:
# Fit CatBoost with fixed hyperparameters on the complete training set
final = CatBoostRegressor(
    learning_rate=0.1,
    depth=6,
    n_estimators=1000,
    l2_leaf_reg=5,
)
final.fit(X_train, y_train)
y_pred = final.predict(X_test)

# Hold-out metrics; RMSE is derived from the MSE computed just above
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("R2 Score:", r2)

# Markdown table (header, separator and a single result row)
header = (
    '| Model | MAE | MSE | RMSE | R2 |\n'
    '|-------|-----|-----|------|-----|\n'
    f'| CatBoost | {mae} | {mse} | {rmse} | {r2}| \n'
)

print(header)

In [None]:
# Predicted values on the x-axis against the observed test targets on the y-axis
sns.scatterplot(x=y_pred, y=y_test)
plt.gca().set_xlabel('Predicted')
plt.gca().set_ylabel('Value')

## Building Explainer Dashboard

In [None]:
# Fit a CatBoostRegressor with the same fixed hyperparameters
model = CatBoostRegressor(
    learning_rate=0.1,
    depth=6,
    n_estimators=1000,
    l2_leaf_reg=5,
)
model.fit(X_train, y_train)

# Build the explainer on the test split and launch the dashboard
explainer = RegressionExplainer(model, X_test, y_test)

ExplainerDashboard(explainer).run()