In [6]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Select matplotlib's interactive "notebook" backend for this kernel.
# NOTE(review): this backend is reportedly unsupported by newer front ends
# (JupyterLab / Notebook 7+) — confirm against the environment in use.
%matplotlib notebook
# Apply seaborn's default theme to all figures drawn after this point.
sns.set()

## 5.2 Scikit-Learn 简介

### 5.2.1 Scikit-Learn 的数据表示

#### 1.数据表

In [7]:
# Load the "iris" example dataset through seaborn; the preview in the next cell
# shows four numeric measurement columns plus a `species` label column.
iris = sns.load_dataset("iris")

In [8]:
# Preview the first rows of the table (one row per flower sample).
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


#### 3.目标数组

In [9]:
# Pair-wise scatter matrix of the measurements, coloured by species.
# The facet size is passed as `height`; the older `size` keyword is no longer used.
sns.set()
sns.pairplot(data=iris, hue="species", height=1.5)

<IPython.core.display.Javascript object>

<seaborn.axisgrid.PairGrid at 0x22ce4c0eb70>

In [12]:
# Extract the feature matrix from the DataFrame.
#
# The four measurement columns are selected by name instead of dropping
# 'species': later cells add 'PCA1', 'PCA2' and 'cluster' columns to `iris`
# in place, so a drop-based selection would silently include those derived
# columns if this cell were ever re-run.

FEATURE_COLUMNS = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X_iris = iris[FEATURE_COLUMNS]
X_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [13]:
# Target array: the species label of every sample, taken as a single column.
y_iris = iris['species']
y_iris.head()

0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
Name: species, dtype: object

### 5.2.2 Scikit-Learn 的评估器API

#### 2.有监督学习示例：简单线性回归

In [16]:
# Synthetic data for the regression demo: 50 points around the line y = 2x - 1.
rng = np.random.RandomState(42)   # fixed seed so the figure is reproducible
x = rng.rand(50) * 10             # uniform draws scaled by 10
noise = rng.randn(50)             # standard-normal perturbation
y = 2 * x - 1 + noise

plt.figure()
plt.scatter(x, y)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x1bca863ceb8>

(1) 选择模型类

In [18]:
from sklearn.linear_model import LinearRegression

(2) 选择模型超参数

In [19]:
# Instantiate LinearRegression; the fit_intercept hyperparameter controls
# whether the fitted line includes an intercept term.

model = LinearRegression(fit_intercept=True)
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

(3) 将数据整理成特征矩阵和目标数组

In [20]:
# Turn the 1-D sample vector into a single-column [n_samples, 1] feature matrix.
X = x.reshape(-1, 1)
X.shape

(50, 1)

(4) 用模型拟合数据

In [21]:
# Fit the linear model to the feature matrix X and target vector y.
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [22]:
# Learned slope; the data were generated with a true slope of 2.
model.coef_

array([1.9776566])

In [23]:
# Learned intercept; the data were generated with a true intercept of -1.
model.intercept_

-0.9033107255311164

(5) 预测新数据的标签

In [24]:
# Evenly spaced x grid from -1 to 11 on which the fitted line is evaluated
# (50 points, which is also np.linspace's default count).
xfit = np.linspace(-1, 11, num=50)

In [25]:
# predict() is given a 2-D [n_samples, n_features] matrix, so the 1-D grid is
# first turned into a single-column matrix and then passed to the fitted model.

Xfit = xfit.reshape(-1, 1)
yfit = model.predict(Xfit)

In [26]:
# Finally, draw the raw data points together with the fitted line.

plt.figure()
plt.scatter(x, y)
plt.plot(Xfit, yfit)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x1bcaa7da7b8>]

#### 3.有监督学习示例：鸢尾花数据分类

In [27]:
# Split the iris data into a training set and a held-out test set.
# random_state=1 fixes the shuffle so the split is the same on every run.

from sklearn.model_selection import train_test_split

iris_split = train_test_split(X_iris, y_iris, random_state=1)
Xtrain, Xtest, ytrain, ytest = iris_split

In [28]:
# Predict species labels for the held-out samples with Gaussian naive Bayes.

from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training are chained.
model = GaussianNB().fit(Xtrain, ytrain)
# Labels predicted for data the model has not seen during training.
y_model = model.predict(Xtest)

In [29]:
from sklearn.metrics import accuracy_score

# Fraction of held-out samples whose predicted label equals the true label.
accuracy_score(y_true=ytest, y_pred=y_model)

0.9736842105263158

#### 4.无监督学习示例：鸢尾花数据降维

In [30]:
# Reduce the iris measurements to two dimensions with
# principal component analysis (PCA).

from sklearn.decomposition import PCA

# Unsupervised: only the feature matrix is passed to fit(), never the labels.
model = PCA(n_components=2).fit(X_iris)
# Project every sample onto the two fitted components.
X_2D = model.transform(X_iris)

In [31]:
# Store the two projected coordinates on the DataFrame so seaborn can plot them.
# Note: this adds columns to `iris` in place.
iris['PCA1'] = X_2D[:, 0]
iris['PCA2'] = X_2D[:, 1]
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# rejects them from 0.12 on. fit_reg=False draws a plain scatter without a
# regression line.
sns.lmplot(x="PCA1", y="PCA2", hue='species', data=iris, fit_reg=False)

<IPython.core.display.Javascript object>

<seaborn.axisgrid.FacetGrid at 0x1bcaa807898>

#### 5.无监督学习示例：鸢尾花数据聚类

In [32]:
# Cluster the iris measurements with a Gaussian mixture model.
# GaussianMixture is used because the older `GMM` class can no longer be
# imported (ImportError: cannot import name 'GMM' from 'sklearn.mixture').

from sklearn.mixture import GaussianMixture

# random_state pins the initialisation, so the cluster assignment (and the
# faceted plot built from it) is reproducible from run to run.
model = GaussianMixture(n_components=3, covariance_type='full', random_state=0)
model.fit(X_iris)               # unsupervised: no y is passed
y_gmm = model.predict(X_iris)   # cluster label assigned to each sample

In [33]:
# Attach the cluster labels (adds a column to `iris` in place), then draw one
# panel per cluster, coloured by true species, in the PCA plane.
iris['cluster'] = y_gmm
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# rejects them from 0.12 on.
sns.lmplot(x="PCA1", y="PCA2", data=iris, hue='species', col='cluster', fit_reg=False)

<IPython.core.display.Javascript object>

<seaborn.axisgrid.FacetGrid at 0x1bca9581048>

### 5.2.3 应用：手写数字探索

#### 1. 加载并可视化手写数字

In [34]:
from sklearn.datasets import load_digits

# Load the handwritten-digits dataset and inspect the shape of its image stack
# (number of samples x image height x image width).
digits = load_digits()
digits.images.shape

(1797, 8, 8)

In [35]:
# Show the first 100 digit images in a 10 x 10 grid, each annotated with its
# true label in the lower-left corner.

import matplotlib.pyplot as plt

fig, axes = plt.subplots(
    nrows=10, ncols=10, figsize=(8, 8),
    subplot_kw=dict(xticks=[], yticks=[]),          # no tick marks on any panel
    gridspec_kw={'hspace': 0.1, 'wspace': 0.1},     # tight spacing between panels
)

for idx, panel in enumerate(axes.flat):
    panel.imshow(digits.images[idx], cmap='binary', interpolation='nearest')
    label_text = str(digits.target[idx])
    # Position is given in axes-relative coordinates via transAxes.
    panel.text(0.05, 0.05, label_text, transform=panel.transAxes, color='green')

<IPython.core.display.Javascript object>

In [36]:
# Feature matrix for the digits: one flattened image per row.
# Note: this rebinds `X`, which earlier held the linear-regression features.
X = digits.data
X.shape

(1797, 64)

In [37]:
# Target array for the digits: the label of each image.
# Note: this rebinds `y`, which earlier held the linear-regression targets.
y = digits.target
y.shape

(1797,)

#### 2. 无监督学习：降维

In [38]:
# Reduce the digits data to two dimensions with Isomap, a manifold-learning
# algorithm.

from sklearn.manifold import Isomap

# fit() returns the estimator, so construction and fitting are chained.
iso = Isomap(n_components=2).fit(digits.data)
# Embed every sample into the learned 2-D space.
data_projected = iso.transform(digits.data)
data_projected.shape

(1797, 2)

In [51]:
# Scatter the 2-D Isomap embedding, coloured by the true digit label.
plt.figure()
# plt.get_cmap replaces plt.cm.get_cmap, which matplotlib deprecated in 3.7
# and removed in 3.9; the second argument resamples the map to 10 discrete
# colours, one per digit.
plt.scatter(data_projected[:, 0], data_projected[:, 1], c=digits.target,
            edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('Spectral', 10))
plt.colorbar(label='digit label', ticks=range(10))
# Colour limits at half-integers centre each tick inside its colour band.
plt.clim(-0.5, 9.5)

<IPython.core.display.Javascript object>

#### 3. 数字分类

In [47]:
# Split the digits data into training and test sets; random_state=0 fixes the split.
# Note: this rebinds Xtrain/Xtest/ytrain/ytest, which earlier held the iris split.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=0)

In [48]:
from sklearn.naive_bayes import GaussianNB

# Train Gaussian naive Bayes on the digit training set (fit() returns the
# estimator, so it is chained), then label the held-out images.
model = GaussianNB().fit(Xtrain, ytrain)
y_model = model.predict(Xtest)

In [49]:
from sklearn.metrics import accuracy_score

# Fraction of held-out digit images whose predicted label equals the true label.
accuracy_score(y_true=ytest, y_pred=y_model)

0.8333333333333334

In [52]:
# Compute the confusion matrix with scikit-learn and draw it as a heatmap.

from sklearn.metrics import confusion_matrix

# First argument is the true labels, second the predictions.
mat = confusion_matrix(ytest, y_model)

plt.figure()
# heatmap draws on the figure just created and hands back its Axes, so the
# axis labels are set on that Axes directly.
heatmap_ax = sns.heatmap(mat, square=True, annot=True, cbar=False)
heatmap_ax.set_xlabel("predicted value")
heatmap_ax.set_ylabel("true value")

<IPython.core.display.Javascript object>

Text(110.45000000000003, 0.5, 'true value')

In [57]:
# Draw the first 100 test images with the predicted label in the lower-left
# corner: green when the prediction is correct, red when it is wrong.

fig, axes = plt.subplots(
    nrows=10, ncols=10, figsize=(8, 8),
    subplot_kw=dict(xticks=[], yticks=[]),          # no tick marks on any panel
    gridspec_kw={'hspace': 0.1, 'wspace': 0.1},     # tight spacing between panels
)

# Each test row is reshaped back into an 8 x 8 image for display.
test_images = Xtest.reshape(-1, 8, 8)

for idx, panel in enumerate(axes.flat):
    predicted = y_model[idx]
    if ytest[idx] == predicted:
        label_color = 'green'
    else:
        label_color = 'red'
    panel.imshow(test_images[idx], cmap='binary', interpolation='nearest')
    panel.text(0.05, 0.05, str(predicted),
               transform=panel.transAxes,
               color=label_color)

<IPython.core.display.Javascript object>