# KNN为例的sklearn用法

In [1]:
# 导入 sklearn
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 加载数据
iris = datasets.load_iris()

# 划分训练集与测试集
X, y = iris.data[:, :2], iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

# 数据预处理
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 创建模型
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
# 模型拟合
knn.fit(X_train, y_train)

# 预测
y_pred = knn.predict(X_test)
# 评估
accuracy_score(y_test, y_pred)

0.631578947368421

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 加载数据

In [4]:
X = np.random.random((11, 5))
y = np.array(['M', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'F', 'F'])
X[X < 0.7] = 0

## 划分数据集

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# 数据预处理

## 标准化

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
standardized_X = scaler.transform(X_train)
standardized_X_test = scaler.transform(X_test)

## 归一化

In [7]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer().fit(X_train)
normalized_X = scaler.transform(X_train)
normalized_X_test = scaler.transform(X_test)

## 二值化

In [8]:
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=0.0).fit(X)
binary_X = binarizer.transform(X)

## 编码分类特征

In [9]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
y = enc.fit_transform(y)

array([1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0])

In [55]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit([1, 2, 2, 6])
LabelEncoder()
le.classes_
# array([1, 2, 6])
le.transform([1, 1, 2, 6]) 
# array([0, 0, 1, 2]...)
le.inverse_transform([0, 0, 1, 2])
# array([1, 1, 2, 6])

array([1, 1, 2, 6])

In [56]:
le = preprocessing.LabelEncoder()
le.fit(["paris", "paris", "tokyo", "amsterdam"])
list(le.classes_)
le.transform(["tokyo", "tokyo", "paris"])
list(le.inverse_transform([2, 2, 1]))

['tokyo', 'tokyo', 'paris']

## 输入缺失值

In [11]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values=0, strategy='mean', axis=0)
imp.fit_transform(X_train)



array([[0.89729752, 0.81868764, 0.82756504, 0.97681789, 0.78382666],
       [0.89043026, 0.81958818, 0.82756504, 0.9193607 , 0.71365814],
       [0.89043026, 0.81868764, 0.94185773, 0.81793079, 0.81255574],
       [0.883563  , 0.84715792, 0.71327235, 0.9193607 , 0.70225249],
       [0.89043026, 0.81868764, 0.82756504, 0.95304538, 0.78382666],
       [0.89043026, 0.81868764, 0.82756504, 0.9193607 , 0.82908093],
       [0.89043026, 0.81868764, 0.82756504, 0.92964875, 0.91528112],
       [0.89043026, 0.78931683, 0.82756504, 0.9193607 , 0.73013153]])

## 生成多项式特征

In [12]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(5)
poly.fit_transform(X)

array([[1.        , 0.        , 0.        , ..., 0.66267608, 0.65243448,
        0.64235117],
       [1.        , 0.        , 0.        , ..., 0.35891582, 0.3565572 ,
        0.35421407],
       [1.        , 0.87776512, 0.        , ..., 0.        , 0.        ,
        0.62304971],
       ...,
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.89729752, 0.        , ..., 0.        , 0.        ,
        0.        ]])

# 创建模型估计器

## 监督学习

In [13]:
# 线性回归
from sklearn.linear_model import LinearRegression
lr = LinearRegression(normalize=True)
# 支持向量机(SVM)
from sklearn.svm import SVC
svc = SVC(kernel='linear')
# 朴素贝叶斯
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
# KNN
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

## 无监督学习

In [14]:
# 主成分分析（PCA)
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)
# K Means
k_means = KMeans(n_clusters=3, random_state=0)

# 拟合数据

## 监督学习

In [15]:
lr.fit(X, y)
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

## 无监督学习

In [16]:
k_means.fit(X_train)
pca_model = pca.fit_transform(X_train)

# 预测

## 监督学习

In [22]:
# 预测标签
y_pred = svc.predict(np.random.random((2,5)))
# 预测标签
y_pred = lr.predict(X_test)
# 评估标签概率
y_pred = knn.predict_proba(X_test)

## 无监督学习

In [20]:
y_pred = k_means.predict(X_test)

# 评估模型性能

## 分类指标

In [27]:
# 准确率
knn.score(X_test, y_test)
from sklearn.metrics import accuracy_score
y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)
# 分类预估评价函数
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
# 混淆矩阵
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

ValueError: Classification metrics can't handle a mix of binary and continuous targets

## 回归指标

In [30]:
# 平均绝对误差
from sklearn.metrics import mean_absolute_error
y_true = [3, -0.5, 2]
mean_absolute_error(y_true, y_pred)
# 均方误差
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)
# R2 评分
from sklearn.metrics import r2_score
r2_score(y_true, y_pred)



TypeError: ufunc 'subtract' did not contain a loop with signature matching types dtype('<U32') dtype('<U32') dtype('<U32')

## 群集指标

In [35]:
# 调整兰德系数
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(y_true, y_pred)
# 同质性
from sklearn.metrics import homogeneity_score
homogeneity_score(y_true, y_pred)
# V-measure
from sklearn.metrics import v_measure_score
v_measure_score(y_true, y_pred)

1.0

## 交叉验证

In [37]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(knn, X_train, y_train, cv=4))
print(cross_val_score(lr, X, y, cv=2))

[0.  1.  0.5 0. ]
[ -0.51226328 -16.72583381]


# 模型调整
## 网格搜索

In [48]:
from sklearn.model_selection import GridSearchCV
params = {"n_neighbors": np.arange(1, 3),
          "metric": ["euclidean", "cityblock"]}
grid = GridSearchCV(estimator=knn,
                    param_grid=params)
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_estimator_.n_neighbors)

0.5
2




## 随机参数优化

In [54]:
from sklearn.model_selection import RandomizedSearchCV
params = {"n_neighbors": range(1, 5),
          "weights": ["uniform", "distance"]}
rsearch = RandomizedSearchCV(estimator=knn,
                             param_distributions=params,
                             random_state=5)
rsearch.fit(X_train, y_train)
print(rsearch.best_score_)

0.5


