# 将数据均分成多份进行交叉验证

In [1]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
# Load the iris dataset and pull out the feature matrix and the label vector.
iris = load_iris()
X, y = iris.data, iris.target

# Predict each sample from its 5 nearest neighbours; changing n_neighbors
# adjusts how closely the model fits the data.
knn = KNeighborsClassifier(n_neighbors=5)

# cv=5: score the model with 5-fold cross-validation over X and y, used here
# in place of a single train_test_split.
scores = cross_val_score(knn, X, y, cv=5, scoring="accuracy")

# Show the per-fold accuracies, then their mean.
print(scores)
print(scores.mean())

[0.96666667 1.         0.93333333 0.96666667 1.        ]
0.9733333333333334


# 寻找最佳n_neighbors值
n_neighbors 指定预测时要搜寻的最近邻样本的数量

In [2]:
import matplotlib.pyplot as plt
# Candidate neighbour counts to evaluate: 1 through 30.
k_range = range(1, 31)

# Mean 10-fold cross-validated accuracy for each candidate k, in k_range order.
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # Regression alternative (use a KNN regressor and track the error, where
    # lower is better):
    #     loss = -cross_val_score(knn, X, y, cv=10, scoring='neg_mean_squared_error')
    # Classification: track accuracy, where closer to 1 is better.
    scores = cross_val_score(knn, X, y, cv=10, scoring="accuracy")
    k_scores.append(scores.mean())

# Plot the mean accuracy against k.
plt.plot(k_range, k_scores)
plt.xlabel("Value of K for KNN")
plt.ylabel("Cross Validated Accuracy")
plt.show()

<Figure size 640x480 with 1 Axes>

# Learning_curve学习曲线评估误差
分别使用训练集的 10%、25%、50%、75%、100% 训练模型，并用负均方误差（neg_mean_squared_error）评估模型的拟合程度

In [None]:
from sklearn.model_selection import learning_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt 
import numpy as np
# Load the digits dataset and pull out the feature matrix and the label vector.
digits = load_digits()
X, y = digits.data, digits.target

# Run 10-fold cross-validation and score with negative mean squared error
# at 10%, 25%, 50%, 75% and 100% of the training set.
# NOTE(review): MSE treats the digit labels as numeric quantities; for a
# classifier, 'accuracy' may be the more meaningful score — confirm intent.
train_sizes, train_loss, test_loss = learning_curve(
    SVC(gamma=0.001),
    X,
    y,
    cv=10,
    scoring="neg_mean_squared_error",
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1],
)

# The scores are negated errors: average over axis 1 and flip the sign so the
# plotted value is a positive loss.
train_loss_mean = -train_loss.mean(axis=1)
test_loss_mean = -test_loss.mean(axis=1)

# Training loss in red, cross-validation loss in green, against training size.
plt.plot(train_sizes, train_loss_mean, "o-", color="r", label="Training")
plt.plot(train_sizes, test_loss_mean, "o-", color="g", label="Cross-validation")
plt.xlabel("Training examples")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()

# Validation_curve验证曲线验证模型中的参数取值
当 SVC 的 gamma 参数依次取 param_range 中的各个值时，计算对应的负均方误差（neg_mean_squared_error）

In [None]:
from sklearn.model_selection import validation_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt 
import numpy as np
# Load the digits dataset and pull out the feature matrix and the label vector.
digits = load_digits()
X = digits.data
y = digits.target

# Five candidate gamma values, log-spaced from 10**-6 to 10**-2.3.
param_range = np.logspace(-6, -2.3, 5)

# Run 10-fold cross-validation for each gamma in param_range and score with
# negative mean squared error.
# NOTE(review): MSE treats the digit labels as numeric quantities; for a
# classifier, 'accuracy' may be the more meaningful score — confirm intent.
train_loss, test_loss = validation_curve(
    SVC(),
    X,
    y,
    param_name="gamma",
    param_range=param_range,
    cv=10,
    scoring="neg_mean_squared_error",
)

# The scores are negated errors: average over axis 1 and flip the sign so the
# plotted value is a positive loss.
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)

# param_range is log-spaced, so use a logarithmic x-axis; on a linear axis the
# smallest gamma values would be squeezed together at the left edge.
plt.semilogx(param_range, train_loss_mean, "o-", color="r", label="Training")
plt.semilogx(param_range, test_loss_mean, "o-", color="g", label="Cross-validation")
plt.xlabel("gamma")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()