In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import scale, StandardScaler
from sklearn import model_selection
from sklearn.svm import SVR

In [2]:
from warnings import filterwarnings
# Suppress every warning for the rest of the session: no category or message
# filter is given, so this also hides deprecation notices raised by the
# libraries used in this notebook.
filterwarnings("ignore")

In [3]:
# Data
# Load the CSV and discard every row that contains a missing value.
df = pd.read_csv("Hitters.csv").dropna()

# One-hot encode the three categorical columns.
dms = pd.get_dummies(df[["League", "Division", "NewLeague"]])

# Target is the salary; the predictors are the remaining numeric columns
# (cast to float64) plus a single indicator column per categorical variable.
y = df["Salary"]
X_ = df.drop(columns=["Salary", "League", "Division", "NewLeague"]).astype("float64")
X = pd.concat([X_, dms[["League_N", "Division_W", "NewLeague_N"]]], axis=1)

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [4]:
# Model & Prediction
# Fit a linear-kernel support vector regressor on the training split.
# `kernel` is passed by keyword: scikit-learn deprecated positional constructor
# arguments in 0.23 and rejects them with a TypeError from 1.0 onward, while
# the keyword form works on every version.
svr_model = SVR(kernel="linear").fit(X_train, y_train)
svr_model

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [6]:
# Preview the first ten test-set predictions.
svr_model.predict(X_test)[:10]

array([679.14754685, 633.72883069, 925.68640849, 270.28463621,
       530.26659184, 272.22606026, 549.4423173 , 446.55263946,
       892.83096429, 677.96855527])

In [7]:
# Intercept (bias) term learned by the fitted linear-kernel SVR.
svr_model.intercept_

array([-80.15196151])

In [8]:
# Feature weights of the fitted model; scikit-learn exposes coef_ only when
# the kernel is linear.
svr_model.coef_

array([[ -1.21839037,   6.09602969,  -3.67574533,   0.14217075,
          0.51435919,   1.28388986,  12.55922537,  -0.08693755,
          0.46597184,   2.98259944,   0.52944523,  -0.79820799,
         -0.16015534,   0.30872794,   0.28842348,  -1.79560067,
          6.41868985, -10.74313783,   1.33374317]])

In [9]:
# Test Error: predict on the held-out test split so the error can be measured
y_pred = svr_model.predict(X_test)

In [10]:
# RMSE: square root of the mean squared error on the test set
np.sqrt(mean_squared_error(y_test, y_pred))

370.04084185624924

In [11]:
# Model Tuning
# Unfitted linear-kernel SVR that serves as the base estimator for the
# hyper-parameter search. `kernel` is passed by keyword because scikit-learn
# >= 1.0 no longer accepts positional constructor arguments.
svr_model = SVR(kernel="linear")

In [14]:
# Candidate values of the regularisation parameter C for the grid search.
svr_params = dict(C=[0.1, 0.5, 1, 3])

In [20]:
# Exhaustive search over svr_params with 5-fold cross-validation on the
# training split, using all available cores and logging progress.
svr_cv_model = GridSearchCV(
    estimator=svr_model,
    param_grid=svr_params,
    cv=5,
    verbose=2,
    n_jobs=-1,
).fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.4min finished


In [21]:
# Parameter setting that achieved the best mean cross-validated score.
svr_cv_model.best_params_

{'C': 0.5}

In [22]:
# Refit a linear-kernel SVR on the full training split with the
# hyper-parameters selected by the grid search. Reading them from
# `best_params_` (rather than hard-coding the value seen in an earlier run)
# keeps this cell correct if the data or the parameter grid changes.
# `kernel` is passed by keyword because scikit-learn >= 1.0 rejects
# positional constructor arguments.
svr_tuned = SVR(kernel="linear", **svr_cv_model.best_params_).fit(X_train, y_train)

In [23]:
# Test-set predictions from the tuned model (rebinds the earlier y_pred).
y_pred = svr_tuned.predict(X_test)

In [24]:
# RMSE of the tuned model on the test set.
np.sqrt(mean_squared_error(y_test, y_pred))

367.9874739022889