# Module 6: Linear Support Vector Machine - Practice

In this session, you will practice using a linear SVM on the **red wine** dataset
with the typical train/validate workflow.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import os, sys
import itertools
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Set TensorFlow's logging threshold to ERROR to keep cell output quiet
tf.logging.set_verbosity(tf.logging.ERROR)

## Load dataset

In the following cell, **print out the class distribution** both before and after the labels are binarized.

In [2]:
# Dataset location
DATASET = '/dsa/data/all_datasets/wine-quality/winequality-red.csv'
assert os.path.exists(DATASET), 'Dataset not found: ' + DATASET

# Fixed seed so the shuffle produces the same row order on every
# Restart & Run All (an unseeded shuffle changes all downstream results)
SHUFFLE_SEED = 42

# Load the semicolon-separated file, shuffle all rows, and renumber the index
dataset = (
    pd.read_csv(DATASET, sep=';')
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

In [3]:
# Pull features and labels
# Positional column indices of the features used by the model
selected_features = [1,6,9,10]
# Standardize the selected feature columns with sklearn's scale()
X = scale(np.array(dataset.iloc[:, selected_features]))
y = np.array(dataset.quality)

# Complete code below this comment  (Question #P6001)
# ----------------------------------
# np.unique(..., return_counts=True) yields (values, counts); zip them into a dict
print('Class distribution (before binarization):', dict(zip(*np.unique(y, return_counts=True))))

# Binarize labels: quality >= 6 becomes True, everything else False
y = y>=6
print('Class distribution (after binarization):', dict(zip(*np.unique(y, return_counts=True))))
# ----------------------------------

# Create training/validation split (25% held out). A fixed random_state
# makes the split repeatable for a given row order of X and y.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SPLIT_SEED)

dataset.describe()

Class distribution (before binarization): {3: 10, 4: 53, 5: 681, 6: 638, 7: 199, 8: 18}
Class distribution (after binarization): {False: 744, True: 855}


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


## Create linear SVM model

Collect names for all selected feature columns.

In [4]:
# Collect the labels of the selected columns, replacing spaces with
# underscores (presumably so the names are usable as TensorFlow column
# names -- TODO confirm).
feature_names = []
for column_index in selected_features:
    column_label = dataset.columns[column_index]
    feature_names.append(column_label.replace('\x20', '_'))
print(feature_names)

['volatile_acidity', 'total_sulfur_dioxide', 'sulphates', 'alcohol']


Create feature column placeholders for the TensorFlow SVM.

In [5]:
# Complete code below this comment  (Question #P6002)
# ----------------------------------
# One real-valued feature column per selected feature name
feature_columns = list(map(tf.contrib.layers.real_valued_column, feature_names))

Create a linear classifier.

In [6]:
# Complete code below this comment  (Question #P6003)
# ----------------------------------
# Linear SVM estimator; 'example_id' names the input column that carries
# the per-example ids built by the input functions.
classifier = tf.contrib.learn.SVM(
    example_id_column='example_id',
    feature_columns=feature_columns,
    l2_regularization=1.0
)

## Training and preparation

Create an input_fn() to supply training data to the linear SVM.

In [7]:
# Complete code below this comment  (Question #P6004)
# ----------------------------------
def my_input_fn():
    """Build the training inputs from X_train and y_train.

    Returns:
        A (columns, labels) pair. ``columns`` maps each name in
        ``feature_names`` to a constant tensor holding the matching column
        of ``X_train`` with a trailing axis of length 1, plus an
        'example_id' entry holding the string ids '1' .. str(len(X_train)).
        ``labels`` is a constant tensor built from ``y_train``.
    """
    columns = {}
    for col_idx, feature_name in enumerate(feature_names):
        # Take column col_idx of X_train and add a trailing axis
        column_values = np.expand_dims(X_train[:, col_idx], 1)
        columns[feature_name] = tf.constant(column_values)

    # String ids, one per training row, numbered from 1
    example_ids = [str(row_number) for row_number in range(1, len(X_train) + 1)]
    columns['example_id'] = tf.constant(example_ids)

    return columns, tf.constant(y_train)


Train SVM.

In [8]:
# Add code below this comment  (Question #P6005)
# ----------------------------------
# Train for a fixed number of steps; the value returned by fit() is the cell output
TRAIN_STEPS = 30
classifier.fit(input_fn=my_input_fn, steps=TRAIN_STEPS)


SVM(params={'optimizer': <tensorflow.contrib.linear_optimizer.python.sdca_optimizer.SDCAOptimizer object at 0x7f7e1aaf3710>, 'head': <tensorflow.contrib.learn.python.learn.estimators.head._BinarySvmHead object at 0x7f7e1aaf3748>, 'feature_columns': [_RealValuedColumn(column_name='volatile_acidity', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='total_sulfur_dioxide', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='sulphates', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='alcohol', dimension=1, default_value=None, dtype=tf.float32, normalizer=None)], 'weight_column_name': None, 'update_weights_hook': <tensorflow.contrib.learn.python.learn.estimators.linear._SdcaUpdateWeightsHook object at 0x7f7e1aaf3898>})

## Evaluation

Create a predict_fn() to supply data to make predictions.  
Then call classifier.predict() to create y_pred.

In [9]:
# Complete code below this comment  (Question #P6006)
# ----------------------------------
def predict_fn():
    """Build the prediction inputs from X_test (no labels).

    Returns:
        A dict mapping each name in ``feature_names`` to a constant tensor
        holding the matching column of ``X_test`` with a trailing axis of
        length 1, plus an 'example_id' entry holding the string ids
        '1' .. str(len(X_test)).
    """
    columns = {}
    for col_idx, feature_name in enumerate(feature_names):
        # Take column col_idx of X_test and add a trailing axis
        column_values = np.expand_dims(X_test[:, col_idx], 1)
        columns[feature_name] = tf.constant(column_values)

    # String ids, one per test row, numbered from 1
    example_ids = [str(row_number) for row_number in range(1, len(X_test) + 1)]
    columns['example_id'] = tf.constant(example_ids)

    return columns

Then call classifier.predict() to create **y_pred** as predictions.

**Hint**: See LinearSVM lab.

In [12]:
# Add code below this comment  (Question #P6007)
# ----------------------------------
# Request predictions for the inputs supplied by predict_fn
y_pred = classifier.predict(input_fn=predict_fn)



Feed predictions **y_pred** along with ground truth **y_test** to confusion_matrix() to create a confusion matrix.

In [13]:
# Add code below this comment  (Question #P6008)
# ----------------------------------
# Keep only the 'classes' entry of each prediction
y_pred = [prediction['classes'] for prediction in y_pred]
# Ground truth first, predictions second
confusion_matrix(y_test, y_pred)


array([[145,  44],
       [ 62, 149]])

# Save your notebook!