In [1]:
from sklearn import datasets

# Load both bundled example datasets up front; later cells reuse these objects.
iris, digits = datasets.load_iris(), datasets.load_digits()

In [2]:
# The digits dataset: print the feature matrix, one row per sample.
# NumPy abbreviates the large array with "..." in the printed output.
print(digits.data)

[[  0.   0.   5. ...,   0.   0.   0.]
 [  0.   0.   0. ...,  10.   0.   0.]
 [  0.   0.   0. ...,  16.   9.   0.]
 ..., 
 [  0.   0.   1. ...,   6.   0.   0.]
 [  0.   0.   2. ...,  12.   0.   0.]
 [  0.   0.  10. ...,  12.   1.   0.]]


In [3]:
# Class label of every sample (shown as the cell's displayed value).
digits.target

array([0, 1, 2, ..., 8, 9, 8])

In [4]:
# First entry of digits.images: a 2-D array (8 x 8 in the output below).
digits.images[0]

array([[  0.,   0.,   5.,  13.,   9.,   1.,   0.,   0.],
       [  0.,   0.,  13.,  15.,  10.,  15.,   5.,   0.],
       [  0.,   3.,  15.,   2.,   0.,  11.,   8.,   0.],
       [  0.,   4.,  12.,   0.,   0.,   8.,   8.,   0.],
       [  0.,   5.,   8.,   0.,   0.,   9.,   8.,   0.],
       [  0.,   4.,  11.,   0.,   1.,  12.,   7.,   0.],
       [  0.,   2.,  14.,   5.,  10.,  12.,   0.,   0.],
       [  0.,   0.,   6.,  13.,  10.,   0.,   0.,   0.]])

In [5]:
# A scikit-learn classifier is a Python object exposing fit(X, y) and predict(T).
# See http://scikit-learn.org/stable/tutorial/basic/tutorial.html
from sklearn import svm

# Hold out the final sample: train on every digit except the last one.
clf = svm.SVC(C=100., gamma=0.001)
clf.fit(X=digits.data[:-1], y=digits.target[:-1])

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [6]:
# Predict the held-out last digit, which was excluded from training.
# The [-1:] slice keeps the input 2-D (a single-row matrix).
clf.predict(digits.data[-1:])

array([8])

In [7]:
# The iris dataset: show only the first ten feature rows.
print(iris.data[:10])

[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]
 [ 5.4  3.9  1.7  0.4]
 [ 4.6  3.4  1.4  0.3]
 [ 5.   3.4  1.5  0.2]
 [ 4.4  2.9  1.4  0.2]
 [ 4.9  3.1  1.5  0.1]]


In [8]:
# A fitted model can be persisted with Python's built-in pickle module.
# `svm` and `iris` are reused from the earlier cells.
import pickle

X, y = iris.data, iris.target
clf = svm.SVC().fit(X, y)

# Round-trip the fitted classifier through an in-memory byte string.
s = pickle.dumps(clf)
clf2 = pickle.loads(s)
clf2.predict(X[:1])  # prediction made by the unpickled copy

array([0])

In [9]:
# True label of the first iris sample, for comparison with the prediction above.
y[0]

0

In [10]:
# For scikit-learn models, joblib's replacement for pickle (joblib.dump /
# joblib.load) can be more efficient on big data; here it persists the
# classifier to a file on disk instead of to an in-memory string.
#
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone `joblib` package and fall back to the vendored
# copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(clf, 'filename.pkl')
clf3 = joblib.load('filename.pkl')
clf3.predict(X[0:1])  # Predicts using the classifier reloaded from disk

array([0])

In [11]:
#scikit-learn estimators follow certain rules to make their behavior more predictable.
#Unless otherwise specified, input will be cast to float64:
import numpy as np
from sklearn import random_projection

# Draw a reproducible 10 x 2000 sample and store it as float32.
rng = np.random.RandomState(0)
X = rng.rand(10, 2000).astype('float32')
X.dtype

dtype('float32')

In [12]:
# In this example, X is float32, which is cast to float64 by fit_transform(X)
# (see the dtype displayed below).
# NOTE(review): this cast is version-dependent — newer scikit-learn releases may
# preserve float32 here; confirm against the installed version.
transformer = random_projection.GaussianRandomProjection()
X_new = transformer.fit_transform(X)
X_new.dtype

dtype('float64')

In [13]:
# Regression targets are cast to float64, whereas classification targets keep
# their original type. Fitting on iris.target (an integer array) therefore
# makes predict() return integer labels. The next cell refits on the string
# names from iris.target_names and gets strings back instead.
from sklearn import datasets
from sklearn.svm import SVC

iris = datasets.load_iris()
clf = SVC().fit(iris.data, iris.target)
list(clf.predict(iris.data[:3]))

[0, 0, 0]

In [14]:
# Refit the same estimator, this time with the species names as class labels;
# predict() then returns those strings instead of integers.
clf.fit(iris.data, iris.target_names[iris.target])
[name for name in clf.predict(iris.data[:3])]

['setosa', 'setosa', 'setosa']

In [15]:
# Hyper-parameters of an estimator can be changed after construction through
# its set_params() method. Calling fit() again overwrites whatever was learned
# by any previous fit().
import numpy as np
from sklearn.svm import SVC

# Seeded generator; the draws below must stay in this order to reproduce the data.
rng = np.random.RandomState(0)
X = rng.rand(100, 10)
y = rng.binomial(1, 0.5, 100)
X_test = rng.rand(5, 10)

# set_params() and fit() both return the estimator, so the calls can be chained.
clf = SVC().set_params(kernel='linear').fit(X, y)

clf.predict(X_test)

array([1, 0, 1, 1, 0])

In [16]:
# Switch the same estimator to an RBF kernel and refit; the refit replaces the
# previously learned linear model before predicting.
clf.set_params(kernel='rbf').fit(X, y).predict(X_test)

array([0, 0, 0, 1, 0])

In [17]:
# For multiclass classifiers, the learning and prediction task performed
# depends on the format of the target data passed to fit().
# Here the classifier is fit on a 1d array of multiclass labels, so predict()
# returns the corresponding multiclass predictions. It is also possible to fit
# on a 2d array of binary label indicators.
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer

X = [[1, 2], [2, 4], [4, 5], [3, 2], [3, 1]]
y = [0, 0, 1, 1, 2]

classif = OneVsRestClassifier(SVC(random_state=0))
classif.fit(X, y)
classif.predict(X)

array([0, 0, 1, 1, 2])

In [18]:
# Fit on a 2d binary (one-hot) representation of y produced by LabelBinarizer;
# predict() then returns a 2d array of multilabel predictions in the same
# encoding.
# In the output below the fourth and fifth rows are all zeroes, indicating that
# those instances matched none of the three labels fit upon.
# y is overwritten here, so re-run the previous cell before re-running this one.
y = LabelBinarizer().fit_transform(y)
print(y)
classif.fit(X, y)
classif.predict(X)

[[1 0 0]
 [1 0 0]
 [0 1 0]
 [0 1 0]
 [0 0 1]]


array([[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [19]:
# With multilabel outputs, an instance can be assigned several labels at once.
# MultiLabelBinarizer converts the per-instance label lists into a 2d binary
# indicator array to fit upon, so predict() returns a 2d array that may contain
# multiple predicted labels for each instance.
from sklearn.preprocessing import MultiLabelBinarizer

y = [[0, 1], [0, 2], [1, 3], [0, 2, 3], [2, 4]]
y = MultiLabelBinarizer().fit_transform(y)
print(y)
classif.fit(X, y).predict(X)

[[1 1 0 0 0]
 [1 0 1 0 0]
 [0 1 0 1 0]
 [1 0 1 1 0]
 [0 0 1 0 1]]


array([[1, 1, 0, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0],
       [1, 0, 1, 0, 0],
       [1, 0, 1, 0, 0]])