# 2.1 An introduction to machine learning with scikit-learn

In [17]:
from sklearn import datasets

## 2.1.2 加载 sklearn 自带的数据集

In [18]:
# Load two of the datasets that ship with scikit-learn; both objects are
# reused by later cells.
iris = datasets.load_iris()
digits = datasets.load_digits()

## 2.1.3 Learning and predicting


下面应用 SVM 模型，利用 digits 数据集进行训练。
训练完成后，该模型可以对手写数字图像进行分类。

In [20]:
from sklearn import svm
# Support-vector classifier whose gamma and C are set by hand; no parameter
# search is performed in this notebook section.
clf = svm.SVC(gamma=0.001, C=100)

In [21]:
# Inspect the dimensions of the feature array before training.
digits.data.shape

(1797, 64)

In [22]:
# Train on every sample except the last one ([:-1]); the final sample is left
# out so it can be used as an unseen input for predict().
clf.fit(digits.data[:-1], digits.target[:-1])

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [25]:
# Classify the single sample that was excluded from training. The [-1:] slice
# (rather than [-1]) keeps the input two-dimensional.
clf.predict(digits.data[-1:])

array([8])

In [26]:
# Show the last entry of digits.images (the same index that was predicted
# above); slicing with [-1:] keeps the leading axis, so a 1-element stack is
# displayed.
digits.images[-1:]

array([[[ 0.,  0., 10., 14.,  8.,  1.,  0.,  0.],
        [ 0.,  2., 16., 14.,  6.,  1.,  0.,  0.],
        [ 0.,  0., 15., 15.,  8., 15.,  0.,  0.],
        [ 0.,  0.,  5., 16., 16., 10.,  0.,  0.],
        [ 0.,  0., 12., 15., 15., 12.,  0.,  0.],
        [ 0.,  4., 16.,  6.,  4., 16.,  6.,  0.],
        [ 0.,  8., 16., 10.,  8., 16.,  8.,  0.],
        [ 0.,  1.,  8., 12., 14., 12.,  1.,  0.]]])

## 2.1.4 Model persistence

训练好的模型可以通过 pickle 进行持久化保存。

In [27]:
# Setup for the persistence demo: fit a default-parameter SVC on the iris data.
from sklearn import datasets, svm

iris = datasets.load_iris()
X = iris.data
y = iris.target

clf = svm.SVC()
# Keep fit() as the final expression so the fitted estimator is displayed.
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [28]:
import pickle
# Serialize the fitted classifier to an in-memory byte string ...
s = pickle.dumps(clf)
# ... and rebuild a second estimator from those bytes. Unpickling is only
# acceptable here because the bytes were produced one line above; never call
# pickle.loads on data from an untrusted source.
clf2 = pickle.loads(s)
# Predict the first sample with the restored model (sliced as X[0:1] so the
# input stays two-dimensional).
clf2.predict(X[0:1])

array([0])

In [29]:
# Label of the first sample, for comparison with the restored model's
# prediction in the previous cell.
y[0]

0

# 2.2 A tutorial on statistical-learning for scientific data processing

## 2.2.1 Statistical learning: the setting and the estimator object in scikit-learn

In [32]:
# Reload the digits dataset and inspect the dimensions of its image array.
digits = datasets.load_digits()
digits.images.shape

(1797, 8, 8)

In [34]:
import matplotlib.pyplot as plt
# Render the last image of the dataset using the gray_r colormap
# (matplotlib's reversed grayscale).
plt.imshow(digits.images[-1], cmap=plt.cm.gray_r)

<matplotlib.image.AxesImage at 0x16766a38828>

## 2.2.2 Supervised learning: predicting an output variable from high-dimensional observations

In [1]:
# Setup for this section: NumPy plus the iris dataset.
import numpy as np
from sklearn import datasets

iris = datasets.load_iris()
# Pull out the data array and the target array in a single unpacking step.
iris_X, iris_Y = iris.data, iris.target

In [2]:
# List the distinct values that occur in the target array.
np.unique(iris_Y)

array([0, 1, 2])

In [4]:
# Number of rows (samples) in the iris data array.
len(iris_X)

150

In [9]:
# Seed NumPy's global random generator so that the permutation drawn in the
# following cell is reproducible across kernel restarts.
# Note: this cell intentionally does nothing else. An IPython help lookup
# (`name?`) only opens the docstring pager, assigns nothing, and is a syntax
# error outside IPython, so it does not belong in a saved notebook.
np.random.seed(0)

In [10]:
# Draw a random ordering of all sample positions (one index per row of iris_X).
indices = np.random.permutation(len(iris_X))
# Training features: every shuffled sample except the last 10, which are left
# out of the training set.
# NOTE(review): the saved output under this cell shows the permutation array,
# but the cell ends in an assignment and would display nothing on re-run.
# Confirm whether a trailing `indices` expression was dropped.
iris_X_train = iris_X[indices[:-10]]


array([114,  62,  33, 107,   7, 100,  40,  86,  76,  71, 134,  51,  73,
        54,  63,  37,  78,  90,  45,  16, 121,  66,  24,   8, 126,  22,
        44,  97,  93,  26, 137,  84,  27, 127, 132,  59,  18,  83,  61,
        92, 112,   2, 141,  43,  10,  60, 116, 144, 119, 108,  69, 135,
        56,  80, 123, 133, 106, 146,  50, 147,  85,  30, 101,  94,  64,
        89,  91, 125,  48,  13, 111,  95,  20,  15,  52,   3, 149,  98,
         6,  68, 109,  96,  12, 102, 120, 104, 128,  46,  11, 110, 124,
        41, 148,   1, 113, 139,  42,   4, 129,  17,  38,   5,  53, 143,
       105,   0,  34,  28,  55,  75,  35,  23,  74,  31, 118,  57, 131,
        65,  32, 138,  14, 122,  19,  29, 130,  49, 136,  99,  82,  79,
       115, 145,  72,  77,  25,  81, 140, 142,  39,  58,  88,  70,  87,
        36,  21,   9, 103,  67, 117,  47])