In [1]:
#1. Importing necessary libraries.
import cudf
import cupy as cp
import cuml
from cuml.linear_model import Ridge, Lasso
from cuml.svm import SVR
from cuml.tree import DecisionTreeRegressor
from cuml.ensemble import RandomForestRegressor, GradientBoostingRegressor
from cuml.neighbors import KNeighborsRegressor
from cuml.neural_network import MLPRegressor
from cuml.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from cuml.preprocessing import StandardScaler
from cuml.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

from xgboost import XGBRegressor
from catboost import CatBoostRegressor

ModuleNotFoundError: No module named 'cudf'

In [4]:
#2. Loading and splitting the data.
# NOTE(review): `load_diabetes` is not imported anywhere in this notebook (the
# recorded run of this cell fails with NameError). It presumably refers to
# sklearn.datasets.load_diabetes -- add that import to the import cell.
diabetes = load_diabetes()
X = cp.asarray(diabetes.data)
y = cp.asarray(diabetes.target)

# Hold out a test split. The following cells read X_train / X_test / y_train,
# which were never defined before this call was added. The seed matches the one
# used for the K-fold splitter so the whole notebook is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=345)

NameError: name 'load_diabetes' is not defined

In [None]:
#3. Data standardization.
# The scaler is fitted on the training split only and then reused for the test
# split. Both names are overwritten, so re-running this cell rescales data that
# has already been scaled.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#4. List of algorithms.
# Display name -> estimator with default hyperparameters (apart from the few
# arguments set explicitly below). Insertion order is the evaluation order.
# NOTE(review): XGBoost 2.0 deprecates tree_method='gpu_hist' in favour of
# tree_method='hist' with device='cuda' -- confirm against the installed version.
model_registry = {
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'SVR': SVR(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'k-NN': KNeighborsRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Neural Network': MLPRegressor(max_iter=10000),
    'XGBoost': XGBRegressor(objective='reg:squarederror', tree_method='gpu_hist'),
    'CatBoost': CatBoostRegressor(verbose=0, task_type='GPU'),
}

# Downstream cells iterate over (name, estimator) pairs.
models = list(model_registry.items())

In [None]:
#5. Cross-validation parameters.
# One shuffled 5-fold splitter with a fixed seed, passed to every evaluation
# and grid search in this notebook.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=345,
)

In [None]:
#6. Functions to generate model metrics.
def evaluate_model(name, model, X_train, y_train, cv):
    """Cross-validate ``model`` under four scorers and summarise the fold means.

    ``cross_val_score`` is called once per scorer, so the model goes through
    four separate cross-validation runs.

    Args:
        name: Label stored under the ``'Model'`` key of the result.
        model: Estimator forwarded to ``cross_val_score``.
        X_train: Feature data forwarded to ``cross_val_score``.
        y_train: Target data forwarded to ``cross_val_score``.
        cv: Cross-validation strategy forwarded to ``cross_val_score``.

    Returns:
        A dict with the keys ``'Model'``, ``'MSE'``, ``'MAE'``, ``'R2'`` and
        ``'Explained Variance'``. Each metric is the mean of the per-fold
        scores; the two negated error scorers are sign-flipped back.
    """
    # (result key, scorer name, whether the scorer reports a negated error)
    metric_specs = (
        ('MSE', 'neg_mean_squared_error', True),
        ('MAE', 'neg_mean_absolute_error', True),
        ('R2', 'r2', False),
        ('Explained Variance', 'explained_variance', False),
    )

    summary = {'Model': name}
    for result_key, scorer, is_negated in metric_specs:
        fold_mean = cross_val_score(model, X_train, y_train, cv=cv, scoring=scorer).mean()
        summary[result_key] = -fold_mean if is_negated else fold_mean
    return summary

def evaluate_models(models, X_train, y_train, cv):
    """Run ``evaluate_model`` on every ``(name, estimator)`` pair.

    Args:
        models: Iterable of ``(name, estimator)`` pairs.
        X_train: Feature data forwarded to ``evaluate_model``.
        y_train: Target data forwarded to ``evaluate_model``.
        cv: Cross-validation strategy forwarded to ``evaluate_model``.

    Returns:
        A list with one ``evaluate_model`` result per pair, in input order.
    """
    summaries = []
    for label, estimator in models:
        summaries.append(evaluate_model(label, estimator, X_train, y_train, cv))
    return summaries


In [None]:
#7. Generating metrics for default hyperparameters.
# One result dict per model, collected into a table for display further down.
default_results = evaluate_models(models=models, X_train=X_train, y_train=y_train, cv=cv)
df_default = cudf.DataFrame(data=default_results)

In [None]:
#8. Defining hyperparameter grids for each algorithm.
# Keys match the display names in `models`. Each value is passed to GridSearchCV
# as `param_grid`: a single dict is searched as a full Cartesian product, a list
# of dicts is searched as independent sub-grids.

param_grids = {
    'Ridge Regression': {
        'alpha': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]
    },
    'Lasso Regression': {
        'alpha': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]
    },
    # `gamma` is a coefficient of the RBF kernel and is not used by the linear
    # kernel, so crossing the two refits the same linear model once per gamma
    # value. Two sub-grids give 9*6 + 9 = 63 candidates instead of 9*6*2 = 108.
    'SVR': [
        {
            'kernel': ['rbf'],
            'C': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0],
            'gamma': [0.001, 0.01, 0.1, 0.2, 0.5, 1]
        },
        {
            'kernel': ['linear'],
            'C': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]
        }
    ],
    'Decision Tree': {
        'max_depth': [None, 2, 5, 10, 20, 30, 40, 50, 100],
        'min_samples_split': [2, 4, 5, 6, 8, 10, 12, 15],
        'min_samples_leaf': [1, 2, 4, 5, 8, 10]
    },
    'Random Forest': {
        'n_estimators': [10, 25, 40, 50, 80, 100],
        'max_depth': [None, 5, 10, 20, 25, 30, 40],
        'min_samples_split': [2, 5, 8, 10],
        'min_samples_leaf': [1, 2, 4, 5, 8, 10, 15]
    },
    # NOTE(review): 'ball_tree' and 'kd_tree' are scikit-learn algorithm names;
    # confirm that the KNeighborsRegressor imported from cuML accepts them.
    'k-NN': {
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    },
    'Gradient Boosting': {
        'n_estimators': [10, 20, 25, 30, 40, 50, 80, 100, 150, 200],
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5, 1],
        'max_depth': [3, 5, 7, 9, 11]
    },
    # NOTE(review): in scikit-learn's MLPRegressor the `learning_rate` schedule
    # is only used by solver='sgd', which is not part of this grid -- confirm
    # whether this axis has any effect before paying for the doubled search.
    'Neural Network': {
        'hidden_layer_sizes': [(50,), (100,)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'lbfgs'],
        'alpha': [0.0001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },
    'XGBoost': {
        'n_estimators': [50, 75, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'max_depth': [3, 5, 7, 9]
    },
    'CatBoost': {
        'iterations': [100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'depth': [3, 5, 7, 9]
    }
}

In [None]:
#9. Generating best hyperparameter sets based on specified metric using grid search.
# For every model: search its grid from `param_grids`, scored by negated MSE on
# the shared `cv` splitter, and keep (name, best estimator, best parameters).
best_models = []
for name, model in models:
    print(f"Tuning {name}...")
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        scoring='neg_mean_squared_error',
        cv=cv,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)
    tuned_entry = (name, grid_search.best_estimator_, grid_search.best_params_)
    best_models.append(tuned_entry)
    print(f"Best parameters for {name}: {grid_search.best_params_}")

In [None]:
#10. Generating additional metrics using the best hyperparameter sets.
# Drop the parameter dict from each (name, estimator, params) entry, then score
# the tuned estimators with the same routine used for the defaults.
tuned_models = [(tuned_name, tuned_model) for tuned_name, tuned_model, _ in best_models]
tuned_results = evaluate_models(tuned_models, X_train, y_train, cv)
df_tuned = cudf.DataFrame(tuned_results)


In [None]:
#11. Formatting tables.
def styled_df(df):
    """Colour-code a metrics table so that better scores appear greener.

    Args:
        df: Table with the columns ``'R2'``, ``'Explained Variance'``, ``'MSE'``
            and ``'MAE'``. Either a frame that already exposes ``.style`` or
            one that offers ``to_pandas()`` (such as the cuDF frames built in
            this notebook).

    Returns:
        The styler obtained from the frame's ``.style`` accessor with two
        background gradients applied.
    """
    # The tables in this notebook are cuDF frames, which do not provide the
    # `.style` accessor; move them to host memory first. Frames without
    # `to_pandas` are used as they are.
    host_df = df.to_pandas() if hasattr(df, 'to_pandas') else df

    styler = host_df.style.background_gradient(subset=['R2', 'Explained Variance'], cmap='RdYlGn')
    # Reverse colour map for the metrics that should be minimised.
    return styler.background_gradient(subset=['MSE', 'MAE'], cmap='RdYlGn_r')


In [None]:
#12.1. Displaying the styled metrics table for the default hyperparameters.
styled_df(df_default)

In [None]:
#12.2. Displaying the styled metrics table for the tuned hyperparameters.
styled_df(df_tuned)