In [14]:
#IMPORTS
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SequentialFeatureSelector, VarianceThreshold
from sklearn.ensemble import BaggingRegressor,RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.svm import SVR

In [15]:
#Load data
# Forward slashes keep these relative paths portable (they are accepted on
# Windows as well) and avoid backslash sequences such as "\d" and "\p", which
# are invalid string escapes in a normal (non-raw) Python literal.
# The first CSV column is used as the row index for every frame.

X_train = pd.read_csv('../data/processed/X_train.csv', index_col=0)
y_train = pd.read_csv('../data/processed/y_train.csv', index_col=0)

X_test = pd.read_csv('../data/processed/X_test.csv', index_col=0)
y_test = pd.read_csv('../data/processed/y_test.csv', index_col=0)

In [4]:
# Disabled experiment: the code is wrapped in a bare string literal, so none of
# it executes. If enabled it would build the absolute correlation matrix of
# X_train, look only at the strict upper triangle (k=1), and drop every column
# whose correlation with an earlier column exceeds 0.95, producing
# new_X_train / new_X_test.
'''cor_matrix = X_train.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]

new_X_train = X_train.drop(to_drop, axis=1)
new_X_test = X_test.drop(to_drop, axis=1)'''

In [16]:
#train SVM model
# Standardise the features: the scaler is fitted on the training split only,
# and the fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train, y_train)

X_train_SVM = scaler.transform(X_train)
X_test_SVM = scaler.transform(X_test)

In [8]:
#Best model so far

#SVM with bagging 0.7988249929422202
# (presumably an earlier recorded test score for this configuration - confirm)
modelBag = BaggingRegressor(
    estimator=SVR(kernel='rbf', C=185000, gamma=0.0126),
    n_estimators=90,
    max_features=0.65,
    random_state=42,
)
# The target frame is flattened to a 1-D array before fitting.
modelBag.fit(X_train_SVM, y_train.to_numpy().ravel())

# NOTE(review): score() on a scikit-learn regressor reports R^2, so the values
# labelled "Accuracy" below are R^2 values rather than a classification accuracy.
print("SVM with bagging Stats:")
print(f"Test Accuracy: {modelBag.score(X_test_SVM, y_test)}")
print(f"Train Accuracy: {modelBag.score(X_train_SVM, y_train)}")


SVM with bagging Stats:
Test Accuracy: 0.7961273603274456
Train Accuracy: 0.9395726435969665


In [10]:

#SVM with bagging 0.7988249929422202
# Same bagged SVR setup, but each base estimator sees half of the features.
modelBag = BaggingRegressor(
    estimator=SVR(kernel='rbf', C=185000, gamma=0.0126),
    n_estimators=90,
    max_features=0.5,
    random_state=42,
)
# The target frame is flattened to a 1-D array before fitting.
modelBag.fit(X_train_SVM, y_train.to_numpy().ravel())

# NOTE(review): score() on a scikit-learn regressor reports R^2, so the values
# labelled "Accuracy" below are R^2 values rather than a classification accuracy.
print("SVM with bagging Stats:")
print(f"Test Accuracy: {modelBag.score(X_test_SVM, y_test)}")
print(f"Train Accuracy: {modelBag.score(X_train_SVM, y_train)}")


SVM with bagging Stats:
Test Accuracy: 0.7919687763621599
Train Accuracy: 0.9103374946991729


In [68]:
# NOTE(review): max_features is not referenced again in this cell; the grid
# below defines its own 'max_features' candidates.
max_features = [0.54]

# Candidate settings for the bagging ensemble; random_state is a single-value
# entry so every candidate is fitted with the same seed.
param_grid = dict(
    n_estimators=[40, 50, 60, 70, 90],
    max_features=[0.5, 0.6, 0.7, 0.8, 0.9],
    random_state=[42],
)
model = BaggingRegressor(
    estimator=SVR(kernel='rbf', C=180000, gamma=0.0125),
)

# Exhaustive search over param_grid with 2-fold cross-validation on the scaled
# training data; the target frame is flattened to a 1-D array.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=2,
).fit(X_train_SVM, y_train.to_numpy().ravel())

In [69]:
# Take the best estimator found by the grid search, show its full parameter set,
# then report its score on the held-out and training splits.
best_model = grid.best_estimator_

print(best_model.get_params())

print(f"Test Accuracy: {best_model.score(X_test_SVM, y_test)}")
print(f"Train Accuracy: {best_model.score(X_train_SVM, y_train)}")

{'base_estimator': 'deprecated', 'bootstrap': True, 'bootstrap_features': False, 'estimator__C': 180000, 'estimator__cache_size': 200, 'estimator__coef0': 0.0, 'estimator__degree': 3, 'estimator__epsilon': 0.1, 'estimator__gamma': 0.0125, 'estimator__kernel': 'rbf', 'estimator__max_iter': -1, 'estimator__shrinking': True, 'estimator__tol': 0.001, 'estimator__verbose': False, 'estimator': SVR(C=180000, gamma=0.0125), 'max_features': 0.6, 'max_samples': 1.0, 'n_estimators': 90, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
Test Accuracy: 0.7956509560486852
Train Accuracy: 0.9327432745007836


In [6]:
# 10-fold cross-validation of the bagged SVR on the scaled training data.
# random_state is fixed, as for the other BaggingRegressor instances in this
# notebook, so the bootstrap and feature sampling, and therefore the fold
# scores, are reproducible across runs.
scores = cross_val_score(
    estimator=BaggingRegressor(
        estimator=SVR(kernel='rbf', C=180000, gamma=0.013),
        n_estimators=90,
        max_features=0.70,
        random_state=42,
    ),
    X=X_train_SVM,
    y=y_train.to_numpy().ravel(),
    cv=10,
)

In [10]:
# Show the per-fold scores returned by cross_val_score (one value per fold).
print(scores)

[0.78723692 0.79676134 0.78766412 0.82800363 0.72965792 0.84578952
 0.78077894 0.84442538 0.77618615 0.75882597]


In [68]:
#SVM Feature selection and then bagging
# Sequential feature selection driven by an RBF-kernel SVR, keeping 10 features
# of the scaled training matrix.
sfs = SequentialFeatureSelector(
    estimator=SVR(kernel='rbf', C=97000, gamma=0.01),
    n_features_to_select=10,
)
sfs.fit(X_train_SVM, y_train.to_numpy().ravel())

In [73]:
# Reduce both scaled splits to the features kept by the fitted selector
# (train first, then test).
X_train_new, X_test_new = (
    sfs.transform(split) for split in (X_train_SVM, X_test_SVM)
)

In [115]:
# Bagged SVR trained only on the features kept by the sequential selector.
modelBag = BaggingRegressor(
    estimator=SVR(kernel='rbf', C=97000, gamma=0.01),
    n_estimators=10,
    random_state=42,
)
# The target frame is flattened to a 1-D array before fitting.
modelBag.fit(X_train_new, y_train.to_numpy().ravel())

# NOTE(review): score() on a scikit-learn regressor reports R^2, so the values
# labelled "Accuracy" below are R^2 values rather than a classification accuracy.
print("SVM with bagging Stats:")
print(f"Test Accuracy: {modelBag.score(X_test_new, y_test)}")
print(f"Train Accuracy: {modelBag.score(X_train_new, y_train)}")

SVM with bagging Stats:
Test Accuracy: 0.6876600912932719
Train Accuracy: 0.7430295156858931


In [108]:
#find the 10 features selected
# get_support() describes which input features the fitted selector kept.
selected_features = sfs.get_support()

# The selector was fitted on the scaled array, so the result is mapped back onto
# the original DataFrame's column labels to recover the feature names.
X_train.columns[selected_features]

Index(['CCBASIC', 'UGDS_ASIAN', 'INEXPFTE', 'PCTFLOAN', 'RPY_7YR_RT',
       'MALE_DEBT_N', 'MD_FAMINC', 'FAMINC_IND', 'PCT_WHITE', 'UGDS_MEN'],
      dtype='object')