# 3b. Support Vector Regressor

We can now load either our PCA data or our UMAP data and use supervised learning algorithms to predict our outcome variable from the scaled and reduced data. I have created two separate files for this step, which differ in the algorithm they use (Random Forest Regressor vs. Support Vector Regressor). They are designed to work completely independently of one another; only one of the two needs to be run, but it is helpful to run both and compare the results.

In [21]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

In [22]:
class SVregressor:
    """Cross-validated Support Vector Regression on a CSV dataset.

    Attributes:
        df: DataFrame read from ``datafile``.
        X: array of every column of ``df`` except those named in ``IDs``.
        y: array of the values of the ``outcome`` column.
        Xdf, ydf: DataFrame versions of ``X`` and ``y``.
        svr_model, predictions, scores: set by :meth:`regress`.
    """

    def __init__(self, datafile, outcome, IDs=None):
        """Load the data and split it into predictors and outcome.

        This __init__ function is slightly different from the ones used in
        KNN Imputation, PCA, and UMAP: in addition to "IDs", it takes an
        outcome variable to predict.

        Args:
            datafile: path (or file-like object) handed to ``pd.read_csv``.
            outcome: name of the column to predict.
            IDs: column label(s) to leave out of ``X``. This should still
                include ``outcome``; otherwise the outcome column remains
                among the predictors. Defaults to dropping nothing.
        """
        # None instead of a shared mutable [] default.
        id_columns = [] if IDs is None else IDs
        self.df = pd.read_csv(datafile)
        # Use ``columns=`` rather than a positional axis: ``drop(labels, 1)``
        # was deprecated in pandas 1.3 and raises TypeError from pandas 2.0.
        self.X = np.array(self.df.drop(columns=id_columns))
        self.y = np.array(self.df[outcome])
        self.Xdf = pd.DataFrame(self.X)
        self.ydf = pd.DataFrame(self.y)

    def regress(self, kernel, scoring, cv=None):
        """Fit an SVR under cross-validation and print the scores.

        Args:
            kernel: kernel passed to ``SVR`` (e.g. 'linear' or 'rbf').
            scoring: scoring method passed to ``cross_val_score``
                (e.g. 'r2' or 'neg_root_mean_squared_error').
            cv: number of folds (or any other cross-validation spec that
                scikit-learn accepts). ``None`` defers to scikit-learn's
                default splitter; the previous default of ``[]`` was not
                a usable value.

        Side effects:
            Sets ``self.svr_model``, ``self.predictions`` (output of
            ``cross_val_predict``) and ``self.scores`` (output of
            ``cross_val_score``), then prints ``self.scores``.
        """
        self.svr_model = SVR(kernel=kernel)
        self.predictions = cross_val_predict(self.svr_model, self.X, self.y, cv=cv)
        self.scores = cross_val_score(self.svr_model, self.X, self.y, cv=cv, scoring=scoring)
        print(self.scores)

In [40]:
def avg(lst):
    """Return the arithmetic mean of the values in ``lst``."""
    total = sum(lst)
    count = len(lst)
    return total / count

In [23]:
#Model that will be fitted with the linear kernel
#This example reads the PCA output
#'presence' is the outcome; 'labvisitid' and 'presence' are both excluded from the predictors
linear = SVregressor(
    datafile="SCALED_PCA_DATA.csv",
    outcome='presence',
    IDs=['labvisitid', 'presence'],
)

In [37]:
#Score with (negative) root mean squared error, 5-fold cross-validation
#sklearn scorers follow a "higher is better" convention, so error metrics are negated...
#...hence the name 'neg_root_mean_squared_error'; values closer to 0 are better
linear.regress(kernel='linear', scoring='neg_root_mean_squared_error', cv=5)
#regress() rebinds linear.scores on every call, so keep a reference to this run's scores
linear_RMSE = linear.scores

[-0.75750606 -0.93750831 -0.70761028 -0.81994488 -0.76403244]


In [29]:
#Model that will be fitted with the RBF kernel (same data, outcome and excluded columns)
rbf = SVregressor(
    datafile="SCALED_PCA_DATA.csv",
    outcome='presence',
    IDs=['labvisitid', 'presence'],
)

In [36]:
#Score with (negative) root mean squared error, 5-fold cross-validation
rbf.regress(kernel='rbf', scoring='neg_root_mean_squared_error', cv=5)
#Keep a reference to this run's scores before regress() is called again
rbf_RMSE = rbf.scores

[-0.79357557 -0.83623336 -0.70359135 -0.74795484 -0.75423522]


In [31]:
#Per-fold (negative) root mean squared error for both kernels, side by side
#Values closer to 0 are better; the RBF kernel performs better on average
for label, scores in (("Linear:", linear.scores), ("RBF:", rbf.scores)):
    print(label, scores)

Linear: [-0.75750606 -0.93750831 -0.70761028 -0.81994488 -0.76403244]
RBF: [-0.79357557 -0.83623336 -0.70359135 -0.74795484 -0.75423522]


In [38]:
#Repeat both fits, this time scored with R^2
#Each regress() call rebinds .scores, so save a reference right after each call
linear.regress(kernel='linear', scoring='r2', cv=5)
linear_R2 = linear.scores
rbf.regress(kernel='rbf', scoring='r2', cv=5)
rbf_R2 = rbf.scores

[-0.32383179 -0.70649201 -0.04977441 -0.07504883 -0.2217531 ]
[-0.45290479 -0.35771615 -0.03788374  0.10543968 -0.1906208 ]


In [35]:
#Per-fold R^2 for both kernels, side by side
#Higher is better; the RBF kernel performs better on average
for label, scores in (("Linear:", linear.scores), ("RBF:", rbf.scores)):
    print(label, scores)

Linear: [-0.32383179 -0.70649201 -0.04977441 -0.07504883 -0.2217531 ]
RBF: [-0.45290479 -0.35771615 -0.03788374  0.10543968 -0.1906208 ]


In [39]:
#Show the saved per-fold scores for every kernel/metric combination
for label, scores in (
    ("Linear RMSE:", linear_RMSE),
    ("RBF RMSE:", rbf_RMSE),
    ("Linear R^2:", linear_R2),
    ("RBF R^2:", rbf_R2),
):
    print(label, scores)

Linear RMSE: [-0.75750606 -0.93750831 -0.70761028 -0.81994488 -0.76403244]
RBF RMSE: [-0.79357557 -0.83623336 -0.70359135 -0.74795484 -0.75423522]
Linear R^2: [-0.32383179 -0.70649201 -0.04977441 -0.07504883 -0.2217531 ]
RBF R^2: [-0.45290479 -0.35771615 -0.03788374  0.10543968 -0.1906208 ]


In [41]:
#Mean score across folds for every kernel/metric combination
linRMSEavg, rbfRMSEavg, linR2avg, rbfR2avg = map(
    avg, (linear_RMSE, rbf_RMSE, linear_R2, rbf_R2)
)

In [42]:
#Show the fold-averaged scores, grouped by kernel
for label, value in (
    ("Average linear RMSE:", linRMSEavg),
    ("Average linear R^2:", linR2avg),
    ("Average RBF RMSE:", rbfRMSEavg),
    ("Average RBF R^2:", rbfR2avg),
):
    print(label, value)

Average linear RMSE: -0.7973203947974418
Average linear R^2: -0.27538002924021543
Average RBF RMSE: -0.7671180684617258
Average RBF R^2: -0.18673716034307558
