# Support Vector Machines (SVM)

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

# switch the working directory; the relative CSV path used when loading the data is resolved against it
# NOTE(review): hard-coded absolute Windows path -- the notebook will not run on another machine without editing this line
%cd "G:/Archive"

G:\Archive


## Data Prep

In [2]:
# read the red-wine table; the first row of the file holds the column names
data = pd.read_csv("data/wine quality red.csv", header=0)

# turn the quality score into a two-class label: scores above 5 are "above_avg",
# everything else is "below_avg"
is_above_avg = data["quality"] > 5
data["quality"] = pd.Categorical(np.where(is_above_avg, "above_avg", "below_avg"))

# first 11 columns are the predictors, the column at position 11 is the target
# (presumably the recoded quality label -- confirm against the CSV column order)
X, y = data.iloc[:, :11], data.iloc[:, 11]

# hold out 20% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

## Searching Optimal Parameters
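Standardize the features first, fitting the scaler on the training set only and applying the same scaling to the test set.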

In [4]:
# learn the scaling parameters from the training split only,
# then apply that one fitted scaler to both splits
scale1 = StandardScaler()
X_train_scale1 = scale1.fit(X_train).transform(X_train)
X_test_scale1 = scale1.transform(X_test)

In [8]:
# hyper-parameters to tune: kernel function, gamma and cost C
# gamma and C are both searched over the log-spaced values 0.1, 1, 10, 100
grid1 = {
    "kernel": ["rbf", "linear"],
    "gamma": 0.1 * np.power(10, range(4)),
    "C": 0.1 * np.power(10, range(4)),
}

# exhaustive grid search, scored by accuracy under 10-fold cross-validation
model1 = GridSearchCV(estimator=SVC(), param_grid=grid1, scoring="accuracy", cv=10)
model1.fit(X_train_scale1, y_train)

# report the winning configuration
print("Best score: ", model1.best_score_)
print("Corresponding parms: ", model1.best_params_)
print("Model description: ", model1.best_estimator_)

# predict the held-out test set with the refitted best estimator
pred1 = model1.predict(X_test_scale1)

# test-set performance; "above_avg" is treated as the positive class
binary_kwargs = {"pos_label": "above_avg", "average": "binary"}
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, pred1)))
print("Recall: {0:.4f}".format(metrics.recall_score(y_test, pred1, **binary_kwargs)))
print("Precision: {0:.4f}".format(metrics.precision_score(y_test, pred1, **binary_kwargs)))
print("F1 score: {0:.4f}".format(metrics.f1_score(y_test, pred1, **binary_kwargs)))

Best score:  0.7599409448818898
Corresponding parms:  {'C': 10.0, 'gamma': 1.0, 'kernel': 'rbf'}
Model description:  SVC(C=10.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1.0, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
Accuracy: 0.7656
Recall: 0.8547
Precision: 0.7462
F1 score: 0.7967


## Visualization