# Estimator Usage & Data Preparation

In [2]:
from sklearn import svm
from sklearn import datasets

In [4]:
clf = svm.SVC()   # every classifier/predictor is a plain object created via its constructor
# cf. a regressor is created the same way: regr = linear_model.LinearRegression()

In [5]:
# Load the bundled iris dataset and pull out the two arrays an estimator needs.
iris = datasets.load_iris()
X = iris.data      # input vectors x, one row per sample
y = iris.target    # labelling: one class label per sample

In [14]:
# fit() is the training step (equivalent to train()); the cell's value is the
# estimator repr displayed below.
clf.fit(X,y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

**alternatively, we can output predictions as class labels**

In [16]:
# Fit a *separate* classifier on the species names. Refitting `clf` in place
# would overwrite the integer-label model trained above, and the cells that
# follow (pickling `clf`, predicting array([0])) rely on that model.
clf_names = svm.SVC()
clf_names.fit(iris.data, iris.target_names[iris.target])
list(clf_names.predict(iris.data[0:3]))   # predict the first 3 samples

['setosa', 'setosa', 'setosa']

# Store model as a variable

In [7]:
import pickle

In [13]:
# Serialise the current model into an in-memory byte string kept in `spare` ...
spare = pickle.dumps(clf)
# ... and rebuild an estimator object from those bytes.
clf2 = pickle.loads(spare)
# Use the restored classifier to predict the first sample x;
# evaluate y[0] to verify the prediction.
clf2.predict(X[:1])

array([0])

# Side notes

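+ all inputs are cast to float64 unless otherwise specified; regression results (targets) are cast to float64 as well
+ persisting sklearn models with pickle has security & compatibility issues (never unpickle untrusted data; a pickle may not load under a different scikit-learn version)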

# Update of hyperparameters
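+ set_params() (available on every estimator, e.g. sklearn.pipeline.Pipeline.set_params()) allows tuning of hyperparameters, including those of transformation functions, even after the model is built; call fit() again for the change to take effect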

In [19]:
# Update the hyperparameter first, then refit on the species names.
# set_params() hands back the estimator (the original chained .fit() onto it),
# so doing it in two steps is equivalent; fit() as the last expression still
# displays the estimator repr.
clf2.set_params(kernel='rbf')
clf2.fit(iris.data, iris.target_names[iris.target])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [20]:
# clf2 was refitted on species names, so predictions for the first three
# samples come back as strings.
clf2.predict(iris.data[:3])

array(['setosa', 'setosa', 'setosa'],
      dtype='<U10')

# Multiclass vs. multilabel fitting

In [22]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
# turn multiclass labels into binary representations

In [30]:
# Toy problem: five 2-D samples, each with a single class label (1-D target).
X = [[1, 2], [2, 4], [4, 5], [3, 2], [3, 1]]
y = [0, 0, 1, 1, 2]
classif = OneVsRestClassifier(estimator=SVC(random_state=0))
# Train, then predict on the training samples. The original chained these two
# calls (method chaining on the object returned by fit(), not a pipeline).
classif.fit(X, y)
classif.predict(X)

array([0, 0, 1, 1, 2])

In [29]:
# Show the binary (indicator) representation of the labels without modifying
# `y`. Printing `y` directly only produced this matrix when the cells were
# executed out of order; on a top-to-bottom run `y` is still the 1-D list here.
print(LabelBinarizer().fit_transform(y))

[[1 0 0]
 [1 0 0]
 [0 1 0]
 [0 1 0]
 [0 0 1]]


In [31]:
# Binarize into a new name instead of overwriting `y`: the 1-D label list stays
# available and the cell gives the same result however often it is re-run.
# Fitting on the 2-D indicator target yields indicator-style predictions.
y_indicator = LabelBinarizer().fit_transform(y)
classif.fit(X, y_indicator).predict(X)

array([[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [None]:
# http://scikit-learn.org/stable/tutorial/basic/tutorial.html#machine-learning-the-problem-setting