In [1]:
# We will first do the digits example again ...


from sklearn.datasets import load_digits
digits = load_digits()

In [4]:
# Here are the libraries we will need ...
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd

In [3]:
x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.25, random_state=0)

In [5]:
# We start with a simple linear classifier
svc_linear = svm.SVC(kernel='linear', C=1)
svc_linear.fit(x_train, y_train)
predicted= svc_linear.predict(x_test)
cnf_matrix = confusion_matrix(y_test, predicted)
print(cnf_matrix)

[[37  0  0  0  0  0  0  0  0  0]
 [ 0 42  0  0  0  0  0  0  1  0]
 [ 0  0 44  0  0  0  0  0  0  0]
 [ 0  0  0 43  0  0  0  0  1  1]
 [ 0  0  0  0 38  0  0  0  0  0]
 [ 0  0  0  0  0 47  0  0  0  1]
 [ 0  1  0  0  0  0 51  0  0  0]
 [ 0  0  0  0  1  0  0 47  0  0]
 [ 0  3  1  0  0  0  0  0 44  0]
 [ 0  0  0  1  0  1  0  0  1 44]]


In [6]:
# Pretty good!

In [2]:
# Next we apply the method to the Abalone data set ...

data = pd.read_csv('PythonData/abalone.data', header=0)

In [10]:
X = data.iloc[:,1:]
y = data.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [13]:
# Notice we don't even change the target to numbers!
svc_linear.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [14]:
predicted= svc_linear.predict(X_test)
cnf_matrix = confusion_matrix(y_test, predicted)
print(cnf_matrix)

[[  0  57 259]
 [  2 294  63]
 [  1  87 282]]


In [15]:
# OK, not great. Let's try to change the parameter C ...
svc_linear = svm.SVC(kernel='linear', C=100)
svc_linear.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
predicted= svc_linear.predict(X_test)
cnf_matrix = confusion_matrix(y_test, predicted)
print(cnf_matrix)

[[ 57  56 203]
 [  7 295  57]
 [ 46  90 234]]


In [12]:
# Worse - we need a different model. Not linearly separable. 
# We'll try one more dataset - wifi_localization. This dataset gives various wifi readings
# a location value. The goal is predict location based upon wifi reading.

data = pd.read_csv('PythonData/wifi_localization.txt', sep='\t', header=None)

In [13]:
data.shape

(2000, 8)

In [14]:
data[0:1]

Unnamed: 0,0,1,2,3,4,5,6,7
0,-64,-56,-61,-66,-71,-82,-81,1


In [16]:
# We'll create our train and test sets
x_train, x_test, y_train, y_test = train_test_split(data.iloc[:,0:6], data.iloc[:,7])

In [17]:
# We can now build various models and see how they do ...
svc_linear = svm.SVC(kernel='linear', C=1)
svc_linear.fit(x_train, y_train)
predicted= svc_linear.predict(x_test)
cnf_matrix = confusion_matrix(y_test, predicted)
print(cnf_matrix)

[[126   0   3   1]
 [  0 114   2   0]
 [  1   1 118   1]
 [  1   0   0 132]]


In [None]:
# Pretty good.