# KNN 分类

In [9]:
# 加载库
%matplotlib inline
import os
import numpy as np
from scipy import stats
import pandas as pd
import sklearn.model_selection as cross_validation
import matplotlib.pyplot as plt

# 显示的最大行数和列数，如果超额就显示省略号，这个指的是多少个dataFrame的列。如果比较多又不允许换行，就会显得很乱。
# None 表示不限制列数
pd.set_option('display.max_columns', None)

In [10]:
# 加载数据集
orgData = pd.read_csv('data/date_data2.csv')
orgData.describe()

Unnamed: 0,income,attractive,assets,edueduclass,Dated,income_rank,attractive_rank,assets_rank
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,9010.0,50.5,96.0063,3.71,0.5,1.55,1.56,1.51
std,5832.675288,28.810948,91.082226,1.225116,0.502519,1.140397,1.103896,1.123621
min,3000.0,1.0,3.7284,1.0,0.0,0.0,0.0,0.0
25%,5000.0,28.0,31.665269,3.0,0.0,1.0,1.0,0.75
50%,7500.0,51.0,70.746924,4.0,0.5,2.0,2.0,2.0
75%,11500.0,68.875,131.481061,4.0,1.0,3.0,2.25,2.25
max,34000.0,99.5,486.311758,6.0,1.0,3.0,3.0,3.0


In [12]:
# 选取自变量
X = orgData.iloc[:, :4]
y = orgData[['Dated']]
X.head()
X.shape

(100, 4)

In [17]:
# 极值标准化
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(X)
X_scaled[0:4]# 看一下标准化后的结果

  return self.partial_fit(X, y)


array([[0.        , 0.08121827, 0.00293644, 0.        ],
       [0.        , 0.13705584, 0.07649535, 0.6       ],
       [0.        , 0.05076142, 0.00293644, 0.        ],
       [0.        , 0.        , 0.00691908, 0.        ]])

In [18]:
# 划分训练集和测试集
train_data, test_data, train_target, test_target = cross_validation.train_test_split(
    X_scaled, y, train_size=0.8, random_state=123)   #划分训练集和测试集



In [19]:
# 建模
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)  # 默认欧氏距离
model.fit(train_data, train_target.values.flatten())
test_est = model.predict(test_data)

In [25]:
# 验证
import sklearn.metrics as metrics

print(metrics.confusion_matrix(test_target, test_est, labels=[0, 1]))  # 混淆矩阵
print(metrics.classification_report(test_target, test_est))

[[ 8  1]
 [ 1 10]]
              precision    recall  f1-score   support

           0       0.89      0.89      0.89         9
           1       0.91      0.91      0.91        11

   micro avg       0.90      0.90      0.90        20
   macro avg       0.90      0.90      0.90        20
weighted avg       0.90      0.90      0.90        20



In [21]:
# 评分
model.score(test_data, test_target)

0.9

In [29]:
#选择k值
for k in range(1, 25):
    k_model = KNeighborsClassifier(n_neighbors=k)
    k_model.fit(train_data, train_target.values.flatten())
    score = k_model.score(test_data, test_target)
#     print(metrics.confusion_matrix(test_target, test_est, labels=[0, 1]))  # 混淆矩阵
#     print(metrics.classification_report(test_target, test_est))
    print(k, '\t', score)

1 	 0.9
2 	 0.85
3 	 0.9
4 	 0.9
5 	 0.9
6 	 0.85
7 	 0.9
8 	 0.85
9 	 0.9
10 	 0.85
11 	 0.9
12 	 0.95
13 	 1.0
14 	 1.0
15 	 1.0
16 	 1.0
17 	 1.0
18 	 1.0
19 	 1.0
20 	 0.95
21 	 0.95
22 	 0.85
23 	 0.85
24 	 0.85


In [31]:
#交叉验证 选择k值
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold

n_samples = len(train_data)
kf = KFold(n_splits=3)

grid = ParameterGrid({'n_neighbors':[np.arange(1,15)]})
estimator = KNeighborsClassifier()

gridSearchCV = GridSearchCV(estimator, grid, cv=kf)

gridSearchCV.fit(train_data, train_target.values.flatten())

# gridSearchCV.grid_scores_



GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=<sklearn.model_selection._search.ParameterGrid object at 0x1a1c30a7f0>,
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [33]:
gridSearchCV.best_params_ #分类问题 最优模型选择F指标最好的。

{'n_neighbors': 7}

In [34]:
best = gridSearchCV.best_estimator_ 
best.score(test_data, test_target)

0.9