# Grid Search for SVR Hyperparameters
- `cuml` doesn't natively support scikit-learn's `GridSearchCV` or `KFold` cross-validation
    + This notebook therefore uses its own implementation of both

In [3]:
# Backend selection. `device` is a manual switch: nothing in this cell probes for CUDA,
# so the messages printed below only reflect the value assigned here.
device = "CPU" # CPU or GPU (must be set manually; the author knew of no automatic detection in RAPIDS 22.10)
if device == "GPU":
    print("CUDA is available: using GPU")
    # GPU path: cudf and cupy are aliased to `pd` and `np`, so the rest of the
    # notebook uses the same two names on either backend.
    import cudf as pd
    import cupy as np
    import cuml
    from cuml.svm import SVR
    from cuml.model_selection import train_test_split
    from cuml.preprocessing import StandardScaler
    # NOTE: unlike the CPU branch, r2_score and mean_absolute_percentage_error
    # are not imported on this path.
    from cuml.metrics import mean_squared_error
else:
    print("CUDA not available: using CPU")
    # CPU path: plain numpy / pandas / scikit-learn.
    import numpy as np
    import pandas as pd
    from sklearn.svm import SVR
    from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
# Backend-independent imports.
from tqdm.auto import tqdm, trange
import itertools
import time
# NOTE: this name is rebound further down by the notebook's own `def GridSearchCV`.
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from datetime import datetime
import os
import csv

CUDA not available: using CPU


In [None]:
# ADJUST: level of gaussian noise added to outputs (also selects which data file is read)
noise = 10
mod_type = 'svr'

# Run label built from the model type and the noise level
description = ''.join([mod_type, '_noise-', str(noise)])

# CHANGE TO DESIRED DATA FILE
filename = ''.join(['../datasets/fuchs_v3-2_seed-5_points_25000_noise_', str(noise), '.csv'])
df = pd.read_csv(filename)

In [5]:
# Independent variables (model inputs)
input_list = ['Intensity_(W_cm2)', 'Target_Thickness (um)', 'Focal_Distance_(um)']
# Output columns: the first three are used as training targets below,
# the last three ("Exact") as the held-out test targets
output_list = [
    'Max_Proton_Energy_(MeV)', 'Total_Proton_Energy_(MeV)', 'Avg_Proton_Energy_(MeV)',
    'Max_Proton_Energy_Exact_(MeV)', 'Total_Proton_Energy_Exact_(MeV)', 'Avg_Proton_Energy_Exact_(MeV)',
]

X = df[input_list].copy()
y = df[output_list].copy()

# Log-scale the intensity (first input column) and every energy column
intensity_col = X.columns[0]
X[intensity_col] = np.log(X[intensity_col])
for col in y.columns:
    y[col] = np.log(y[col])

dataType = 'float32'

# Convert the frames to backend arrays (cupy on GPU, numpy on CPU) in the working dtype
if device == "GPU":
    X, y = X.to_cupy().astype(dtype=dataType), y.to_cupy().astype(dtype=dataType)
else:
    X, y = X.to_numpy().astype(dtype=dataType), y.to_numpy().astype(dtype=dataType)

# Ordered (unshuffled) 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=False)
y_train = y_train[:, 0:3]
y_test = y_test[:, 3:6]  # Not used for grid search

# Keep only the first `pct` percent of the training split for training/validation
# (5,000 of 20,000 points for the 25,000-point data file)
pct = 25
len_df = int(len(X_train)*(pct/100))
X_train, y_train = X_train[:len_df], y_train[:len_df]

# z-score normalization; both scalers are fitted on the reduced training data only
ss_in = StandardScaler()
ss_in.fit(X_train)
X_train_norm, X_test_norm = ss_in.transform(X_train), ss_in.transform(X_test)

ss_out = StandardScaler()
ss_out.fit(y_train)
y_train_norm = ss_out.transform(y_train)

In [1]:
# Candidate values for each SVR hyperparameter explored by the grid search
param_grid = dict(
    C=[0.1, 0.25, 1, 2.5, 10, 25],
    epsilon=[1e-2, 1e-3, 1e-4],
    tol=[1e-2, 1e-3, 1e-4],
)

# One list of candidates per hyperparameter, in the grid's insertion order (C, epsilon, tol)
param_nested_list = list(param_grid.values())
# Cartesian product: every (C, epsilon, tol) combination as a tuple
param_list = list(itertools.product(*param_nested_list))

def k_fold_split(X, k=5):
    """Split ``X`` into ``k`` contiguous, order-preserving folds.

    Each of the first ``k - 1`` folds holds ``len(X) // k`` items; the last fold
    additionally absorbs the remainder, so every item lands in exactly one fold.
    No shuffling is performed.

    Parameters
    ----------
    X : sliceable sequence
        Anything supporting ``len()`` and ``X[start:end]`` slicing along its
        first axis.
    k : int, default 5
        Number of folds. Must satisfy ``1 <= k <= len(X)``.

    Returns
    -------
    list
        The ``k`` slices of ``X`` in their original order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than ``len(X)``. Either case would
        otherwise silently produce no folds or empty folds.
    """
    n_samples = len(X)
    if k < 1:
        raise ValueError('k must be at least 1, got k={}'.format(k))
    if k > n_samples:
        raise ValueError(
            'cannot split {} samples into k={} non-empty folds'.format(n_samples, k)
        )

    # Calculate the size of each (non-final) fold
    fold_size = n_samples // k

    folds = []
    for i in range(k):
        start = i * fold_size
        # The last fold runs to the end so the remainder is not dropped
        end = start + fold_size if i < k - 1 else n_samples
        folds.append(X[start:end])

    return folds
    
def k_fold_cv(cv=5, C=1, epsilon=1e-3, tol=1e-4):
    """Cross-validate a chain of SVRs on the normalized training data.

    Reads the module-level ``X_train_norm`` and ``y_train_norm`` arrays, cuts
    them into ``cv`` contiguous folds with ``k_fold_split`` and, for every
    held-out fold, fits one ``SVR`` per output column. The models are chained:
    once an output's model is fitted, its predictions are appended as an extra
    feature column for the next output's model (on both the training and the
    validation features).

    Parameters
    ----------
    cv : int, default 5
        Number of folds.
    C, epsilon, tol :
        Passed straight through to every ``SVR`` constructed.

    Returns
    -------
    list
        ``[mean, std]`` of the per-fold validation MSE, where each fold's MSE
        is first averaged over the output columns.
    """
    print('starting CV for C={}, eps={}, tol={}'.format(C, epsilon, tol))
    x_folds = k_fold_split(X_train_norm, k=cv)
    y_folds = k_fold_split(y_train_norm, k=cv)
    n_targets = len(y_train_norm[0])
    fold_mse = np.zeros((cv, n_targets))

    for val_fold in trange(cv, desc='CV'):
        # Every fold except the held-out one goes into the training set
        train_folds = [f for f in range(cv) if f != val_fold]

        features_train = np.concatenate([x_folds[f] for f in train_folds], axis=0)
        targets_train = np.concatenate([y_folds[f] for f in train_folds], axis=0)
        features_val = x_folds[val_fold]
        targets_val = y_folds[val_fold]

        for target in range(n_targets):
            model = SVR(C=C, epsilon=epsilon, tol=tol)
            model.fit(features_train, targets_train[:, target])
            pred_train = model.predict(features_train)
            pred_val = model.predict(features_val)
            fold_mse[val_fold, target] = mean_squared_error(targets_val[:, target], pred_val)
            # Chain: this output's predictions become a feature for the next output
            features_train = np.concatenate([features_train, pred_train.reshape(-1, 1)], axis=1)
            features_val = np.concatenate([features_val, pred_val.reshape(-1, 1)], axis=1)

    # Average over outputs first, then summarise across folds
    per_fold_mse = np.mean(fold_mse, axis=1)
    return [np.mean(per_fold_mse), np.std(per_fold_mse)]

def GridSearchCV(param_list, cv=5):
    """Exhaustively score every hyperparameter combination with ``k_fold_cv``.

    Note: this function reuses the name of scikit-learn's ``GridSearchCV``,
    which the notebook imports earlier, and therefore rebinds it.

    Parameters
    ----------
    param_list : sequence of tuples
        Combinations to try; items 0, 1 and 2 of each tuple are used as
        ``C``, ``epsilon`` and ``tol``.
    cv : int, default 5
        Number of folds forwarded to ``k_fold_cv``.

    Returns
    -------
    DataFrame
        One row per combination with its mean CV MSE and the standard
        deviation across folds. The best (lowest-MSE) combination is also
        printed.
    """
    n_combos = len(param_list)
    mse_list = np.zeros(n_combos)
    std_list = np.zeros(n_combos)

    for row, combo in enumerate(param_list):
        mse_list[row], std_list[row] = k_fold_cv(
            cv=cv, C=combo[0], epsilon=combo[1], tol=combo[2]
        )

    best_idx = np.argmin(mse_list)
    print('best (lowest) mse: ', mse_list[best_idx], ' with σ=', std_list[best_idx])
    best_combo = param_list[int(best_idx)]
    print('with params C={}, ϵ={}, tol={}'.format(best_combo[0], best_combo[1], best_combo[2]))

    results = {
        'Params (C, ϵ, tol)': param_list,
        'Mean Squared Error': mse_list,
        'Standard Deviation': std_list,
    }
    return pd.DataFrame(results)
        
        
# Run the grid search over every (C, epsilon, tol) combination with the default 5-fold CV
output_df = GridSearchCV(param_list)

NameError: name 'itertools' is not defined

In [7]:
# Render the per-combination cross-validation results table
display(output_df)

Unnamed: 0,"Params (C, ϵ, tol)",Mean Squared Error,Standard Deviation
0,"(0.1, 0.01, 0.01)",0.005913,0.000299
1,"(0.1, 0.01, 0.001)",0.00591,0.000303
2,"(0.1, 0.01, 0.0001)",0.00591,0.000302
3,"(0.1, 0.001, 0.01)",0.005893,0.000293
4,"(0.1, 0.001, 0.001)",0.005885,0.000295
5,"(0.1, 0.001, 0.0001)",0.005886,0.000295
6,"(0.1, 0.0001, 0.01)",0.005899,0.000297
7,"(0.1, 0.0001, 0.001)",0.005889,0.000297
8,"(0.1, 0.0001, 0.0001)",0.005887,0.000297
9,"(0.25, 0.01, 0.01)",0.005021,0.000242


In [None]:
import os

# Make sure the results directory exists (no error if it is already there)
results_dir = 'svr_cv_results'
os.makedirs(results_dir, exist_ok=True)

# Write the grid-search table to CSV without the DataFrame index column
output_df.to_csv('svr_cv_results/grid_search.csv', index=False)