In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Import and process data

In [None]:
# Load the pre-split train/test features and targets from pickled files.
# NOTE(review): the four paths are identical placeholders ('...') in this copy of
# the notebook — confirm the real file names before running.
# pd.read_pickle can execute arbitrary code while loading; only use trusted files.
X_train, y_train, X_test, y_test = (
    pd.read_pickle('../../data/Gelderman_SOD_cohort/...'),
    pd.read_pickle('../../data/Gelderman_SOD_cohort/...'),
    pd.read_pickle('../../data/Gelderman_SOD_cohort/...'),
    pd.read_pickle('../../data/Gelderman_SOD_cohort/...'),
)

In [10]:
# Sanity check: number of rows in each split (features and targets should match).
print(*map(len, (X_train, y_train, X_test, y_test)))

121 121 53 53


### Scale data
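Since SVR is a distance-based algorithm, scaling is an important preprocessing step that can improve the accuracy and stability of the model. Specifically, all continuous features will be standardized (zero mean, unit variance), using statistics computed from the training set only and then applied to both the training and test sets.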

In [None]:
# Standardize on copies so the unscaled frames stay available.
X_train_stand, X_test_stand = X_train.copy(), X_test.copy()

# Columns treated as continuous features (all other columns are left unscaled).
num_cols = [
    'age_at_death',
    'temp_1_3_mean', 'temp_1_3_std', 'hum_1_3_mean', 'hum_1_3_std',
    'temp_4_7_mean', 'temp_4_7_std', 'hum_4_7_mean', 'hum_4_7_std',
    'temp_8_21_mean', 'temp_8_21_std', 'hum_8_21_mean', 'hum_8_21_std',
    'temp_22_56_mean', 'temp_22_56_std', 'hum_22_56_mean', 'hum_22_56_std',
    'temp_57_154_mean', 'temp_57_154_std', 'hum_57_154_mean', 'hum_57_154_std',
]

# One scaler per column: fit on the training column only, then apply the same
# transform to both splits so no test-set statistics enter the scaling.
for col in num_cols:
    scaler = StandardScaler()
    scaler.fit(X_train_stand[[col]])

    X_train_stand[col] = scaler.transform(X_train_stand[[col]])
    X_test_stand[col] = scaler.transform(X_test_stand[[col]])

# Train model

In [19]:
# define hyperparameter grid
# `gamma` is the kernel coefficient for the 'rbf' and 'poly' kernels only; the
# linear kernel ignores it. Using one sub-grid per case avoids refitting the
# same linear model once per gamma value (75 candidates instead of 105).
param_grid = [
    {'kernel': ['linear'],
     'C': [0.1, 1, 10, 100, 1000]},
    {'kernel': ['rbf', 'poly'],
     'gamma': ['scale', 'auto', 1, 0.1, 0.01, 0.001, 0.0001],
     'C': [0.1, 1, 10, 100, 1000]},
]

In [None]:
# initialize model
# SVR does not accept a `random_state` argument, so passing one raises a
# TypeError at construction; only `verbose` is kept.
svr = SVR(verbose=1)

In [None]:
# Set up an exhaustive search over `param_grid` with 5-fold cross-validation,
# running the candidate fits in parallel on all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated search on the standardized training data.
grid_search.fit(X=X_train_stand, y=y_train)

In [None]:
# Pull the winning parameter combination and the estimator refit with it.
best_svr = grid_search.best_estimator_
best_params = grid_search.best_params_

# Report the selected parameters and their mean cross-validated score.
print(f'Best hyperparameters are: {best_params!s}')
print(f'Best score is: {grid_search.best_score_!s}')

# Test model

In [16]:
def evaluate(model, X_test, y_test):
    """Print hold-out regression metrics for a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    X_test : feature matrix passed to ``model.predict``.
    y_test : true target values, aligned row for row with ``X_test``.

    Prints the mean squared error, its square root and R-squared. An
    out-of-bag score is printed only if the model exposes ``oob_score_``.
    Returns None.
    """
    # predict data
    predictions = model.predict(X_test)

    # calculate evaluation metrics
    # `oob_score_` is not available on every estimator (an SVR has none), so
    # report it only when present instead of raising AttributeError.
    oob_score = getattr(model, 'oob_score_', None)
    if oob_score is not None:
        print(f'Out-of-Bag Score: {oob_score}')

    mse = mean_squared_error(y_test, predictions)
    print(f'Mean Squared Error: {mse}')

    rmse = np.sqrt(mse)
    print("Root Mean Squared Error (RMSE):", rmse)

    # R-squared compares predictions against the true targets, not the features.
    r2 = r2_score(y_test, predictions)
    print(f'R-squared: {r2}')

In [None]:
# Score the tuned estimator on the standardized hold-out set.
evaluate(model=best_svr, X_test=X_test_stand, y_test=y_test)