In [6]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
train_x, test_x, train_y, test_y = train_test_split(iris.data, iris.target, test_size = 0.25)

# KNN算法

In [20]:
from sklearn.neighbors import KNeighborsClassifier

# 实例化，并且设定参数
knn = KNeighborsClassifier()

# 传入训练集并训练
knn.fit(train_x, train_y)

# 对测试集进行预测
predict_y = knn.predict(test_x)

# 查看测试集的正确率
knn_score = knn.score(test_x, test_y)
predict_y, knn_score

(array([2, 2, 0, 1, 0, 0, 0, 2, 0, 1, 2, 1, 2, 2, 1, 1, 0, 1, 2, 1, 2, 2,
        1, 2, 1, 1, 2, 2, 2, 0, 2, 0, 2, 0, 1, 1, 1, 2]),
 0.9736842105263158)

# 朴素贝叶斯

In [13]:
from sklearn.naive_bayes import MultinomialNB
bayes = MultinomialNB(alpha = 1)
bayes.fit(train_x, train_y)
predict_y = bayes.predict(test_x)
bayes_score = bayes.score(test_x, test_y)
predict_y, bayes_score

(array([2, 2, 0, 1, 0, 0, 0, 2, 0, 1, 2, 1, 2, 2, 1, 1, 0, 1, 1, 1, 1, 2,
        1, 2, 1, 1, 1, 2, 2, 0, 2, 0, 2, 0, 1, 1, 1, 2]),
 0.9473684210526315)

# 分类模型评估
- 准确率
- 召回率
- F1-score

In [18]:
from sklearn.metrics import classification_report
result = classification_report(test_y, predict_y, target_names = iris.target_names)
print(result)

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         9
  versicolor       0.88      1.00      0.93        14
   virginica       1.00      0.87      0.93        15

    accuracy                           0.95        38
   macro avg       0.96      0.96      0.95        38
weighted avg       0.95      0.95      0.95        38



# 交叉验证与网格搜索

In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# 实例化，并且设定参数
knn = KNeighborsClassifier()

# 其中参数cv是指几折交叉验证
gc = GridSearchCV(knn,
                  param_grid = {'n_neighbors':[1,2,3,4,5]},
                  cv = 3)

# 开始交叉验证与网格搜索
gc.fit(train_x, train_y)

print('测试集上的准确率：\n', gc.score(test_x, test_y), '\n')
print('交叉验证中最好的结果：\n', gc.best_score_, '\n')
print('最好的模型参数是：\n', gc.best_params_, '\n')
print('每个超参数每次交叉验证的结果：\n', gc.cv_results_, '\n')

测试集上的准确率：
 0.9736842105263158 

交叉验证中最好的结果：
 0.9642010431484116 

最好的模型参数是：
 {'n_neighbors': 5} 

每个超参数每次交叉验证的结果：
 {'mean_fit_time': array([0.000657  , 0.00066535, 0.00033236, 0.00066527, 0.00034706]), 'std_fit_time': array([0.00046465, 0.00047047, 0.00047002, 0.00047042, 0.00049081]), 'mean_score_time': array([0.00232728, 0.00265861, 0.0016621 , 0.00232697, 0.00133093]), 'std_score_time': array([0.00046895, 0.00047036, 0.00046991, 0.0012435 , 0.00047013]), 'param_n_neighbors': masked_array(data=[1, 2, 3, 4, 5],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_neighbors': 1}, {'n_neighbors': 2}, {'n_neighbors': 3}, {'n_neighbors': 4}, {'n_neighbors': 5}], 'split0_test_score': array([0.92105263, 0.94736842, 0.97368421, 0.97368421, 0.97368421]), 'split1_test_score': array([0.91891892, 0.91891892, 0.89189189, 0.89189189, 0.91891892]), 'split2_test_score': array([1.        , 0.97297297, 1.        , 1.        , 1.       

# 决策树

In [38]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(train_x, train_y)
predict_y = tree.predict(test_x)
score = tree.score(test_x, test_y)
predict_y, score

(array([2, 2, 0, 1, 0, 0, 0, 2, 0, 1, 2, 1, 2, 2, 1, 1, 0, 1, 1, 1, 2, 2,
        1, 2, 1, 1, 2, 2, 2, 0, 2, 0, 2, 0, 1, 1, 1, 2]),
 0.9473684210526315)

# 随机森林
随机森林是一个包含多个决策树的分类器，并且其输出的类别是由个别树输出的类别的众数而定。

- 为什么要随机抽样训练集？

如果不进行随机抽样，每棵树的训练集都一样，那么最终训练出的树分类结果也是完全一样的。

- 为什么要有放回地抽样？

如果不是有放回的抽样，那么每棵树的训练样本都是不同的，都是没有交集的，这样每棵树都是“有偏的”，都是绝对“片面的”，也就是说每棵树训练出来都是有很大的差异的；而随机森林最后分类取决于多棵树（弱分类器）的投票表决。

In [40]:
from sklearn.ensemble import RandomForestClassifier
Rforest = RandomForestClassifier()
Rforest.fit(train_x, train_y)
Rforest.score(test_x, test_y)

0.9736842105263158

# 线性回归
正规方程一次性计算得出，只适用于线性模型，小规模数据集
- sklearn.linear_model.LinearRegression # 最小二乘法线性回归（正规方程）

梯度下降法适用于各种类型的模型
- sklearn.linear_model.SGDRegressor # 梯度下降法线性回归

In [70]:
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.preprocessing import StandardScaler
boston = datasets.load_boston()
train_x, test_x, train_y, test_y = train_test_split(boston.data, boston.target, test_size = 0.25)
scaler_x = StandardScaler()
train_x = scaler_x.fit_transform(train_x)
test_x = scaler_x.transform(test_x)
scaler_y = StandardScaler()
train_y = scaler_y.fit_transform(train_y.reshape(-1, 1))
test_y = scaler_y.transform(test_y.reshape(-1, 1))
lr = LinearRegression()
lr.fit(train_x, train_y)
SGDlr = SGDRegressor()
SGDlr.fit(train_x, train_y)

# 优化后的参数
print(lr.coef_, '\n', SGDlr.coef_)

# 预测
lr_predict_y = lr.predict(test_x)
SGDlr_predict_y = SGDlr.predict(test_x)

# 反向转换
lr_predict_y = scaler_y.inverse_transform(lr_predict_y)
SGDlr_predict_y = scaler_y.inverse_transform(SGDlr_predict_y)
print(lr_predict_y[0], '\n', SGDlr_predict_y[0])

[[-0.09884092  0.1553833  -0.01983582  0.05397562 -0.24654452  0.2831318
  -0.03361899 -0.41196611  0.31347168 -0.27157746 -0.20816501  0.0848551
  -0.37203369]] 
 [-0.074848    0.11091426 -0.06123332  0.05689557 -0.17970881  0.30582437
 -0.03447708 -0.34726406  0.16524678 -0.11326285 -0.19289973  0.08569028
 -0.34669325]
[21.94893302] 
 21.89017152256554


  return f(**kwargs)


# 均方误差

In [76]:
from sklearn.metrics import mean_squared_error
lr_mse = mean_squared_error(lr_predict_y, scaler_y.inverse_transform(test_y))
SGDlr_mse = mean_squared_error(SGDlr_predict_y, scaler_y.inverse_transform(test_y))
print(lr_mse)
print(SGDlr_mse)

26.706786948436083
26.753488061996823


# 岭回归
带有正则化的线性回归（L2正则化）

In [82]:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha = 1)
ridge.fit(train_x, train_y)
ridge_predict_y = ridge.predict(test_x)
ridge_mse = mean_squared_error(scaler_y.inverse_transform(ridge_predict_y), 
                               scaler_y.inverse_transform(predict_y))
ridge_mse

119.8053185462501

# 逻辑回归

In [88]:
from sklearn.linear_model import LogisticRegression
lor = LogisticRegression()
train_x, test_x, train_y, test_y = train_test_split(iris.data, iris.target, test_size = 0.25)
lor.fit(train_x, train_y)
predict_y = lor.predict(test_x)
score = lor.score(test_x, test_y)
print(predict_y, score)

[1 1 1 0 2 0 1 2 2 0 0 0 1 2 2 0 0 0 2 1 2 1 1 0 2 0 2 1 1 0 0 0 2 0 2 0 1
 2] 0.9736842105263158


# k-means算法

In [95]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters = 3)
km.fit(train_x)
predict_y = km.predict(test_x)
print(predict_y, '\n', test_y)

[2 1 2 0 1 0 2 2 1 0 0 0 2 1 2 0 0 0 2 2 2 2 2 0 2 0 1 2 2 0 0 0 2 0 1 0 2
 1] 
 [1 1 1 0 2 0 1 2 2 0 0 0 1 2 2 0 0 0 2 1 1 1 1 0 2 0 2 1 1 0 0 0 2 0 2 0 1
 2]


# 轮廓系数 

In [99]:
from sklearn.metrics import silhouette_score
silhouette_score(test_x, predict_y)

0.5531120813377731

# DBSCAN

In [102]:
from sklearn.cluster import DBSCAN
dbscan = DBSCAN()
dbscan.fit(train_x)

DBSCAN()