#机器学习100天——第十一天：K近邻法（K-NN）

##第一步：导入相关库

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

##第二步：导入数据集

In [2]:
dataset = pd.read_csv('../datasets/Social_Network_Ads.csv')
print(dataset.head())

    User ID  Gender  Age  EstimatedSalary  Purchased
0  15624510    Male   19            19000          0
1  15810944    Male   35            20000          0
2  15668575  Female   26            43000          0
3  15603246  Female   27            57000          0
4  15804002    Male   19            76000          0


为了方便理解，这里我们只取Age年龄和EstimatedSalary估计工资作为特征

In [3]:
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

##第三步：将数据划分成训练集和测试集

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

##第四步：特征缩放

In [5]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)



##第五步：使用K-NN对训练集数据进行训练

从sklearn的neighbors类中导入KNeighborsClassifier学习器

In [6]:
from sklearn.neighbors import KNeighborsClassifier

设置好相关的参数
n_neighbors = 5(K值的选择，默认选择5)、
metric = 'minkowski'(距离度量的选择，这里选择的是闵氏距离(默认参数))、
p = 2 (距离度量metric的附属参数，只用于闵氏距离和带权重闵氏距离中p值的选择，p=1为曼哈顿距离， p=2为欧式距离。默认为2)

In [7]:
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
print(classifier.fit(X_train, y_train))
print(classifier.score(X_test, y_test))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
0.93


##第六步：对测试集进行预测

In [8]:
y_pred = classifier.predict(X_test)
print(y_pred)

[0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 1 0 1 1 0 0 1 1 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 1 0 0 1
 0 0 0 0 1 1 1 1 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 1 1 1]


##第七步：生成混淆矩阵

混淆矩阵可以对一个分类器性能进行分析，由此可以计算出许多指标，例如：ROC曲线、正确率等

In [9]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[64  4]
 [ 3 29]]



        预测值
        0   1
    实0 64  4   
    际1 3   29
    值

预测集中的0总共有68个，1总共有32个。
在这个混淆矩阵中，实际有68个0，但K-NN预测出有67(64+3)个0，其中有3个实际上是1。
同时K-NN预测出有33(4+29)个1，其中4个实际上是0。

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

numeric_features = ['Age', 'EstimatedSalary']
numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['Gender']
categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2))])

X1 = dataset.drop(['Purchased','User ID'], axis=1)
y1 = dataset['Purchased']

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2)

clf.fit(X1_train, y1_train)
print("model score: %.3f" % clf.score(X1_test, y1_test))


model score: 0.925


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


In [11]:
y1_pred = clf.predict(X1_test)
print(y1_pred)


[0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0
 0 0 1 1 0 1 0 1 0 1 0 0 0 1 1 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 1 0 1 0 0 1 1
 0 0 0 0 0 0]


  Xt = transform.transform(Xt)


In [12]:
cm = confusion_matrix(y1_test, y1_pred)
print(cm)

[[53  5]
 [ 1 21]]


In [4]:
import seaborn as sns
sns.pairplot(dataset, hue='Purchased', size=2.5)


NameError: name 'dataset' is not defined

# 训练stockdata

In [147]:
from pathlib import Path
import pandas as pd

data_path = Path( "c:\\Users\\ywh\\Git_Repository\\stockdata.yahoo\\out.feature\\2019-01-06.100\\^GSPC.2019-01-06.[Adj_Price].pkl\\^GSPC.2019-01-06.[Adj_Price].pkl.features.pkl" )
data = pd.read_pickle(data_path)
data['2018'].head()

feature,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,...,pct_chg_minmax,pct_chg_minmax,pct_chg_minmax,pct_chg_minmax,pct_chg_minmax_gate,pct_chg_minmax_gate,pct_chg_minmax_gate,pct_chg_minmax_gate,pct_chg_minmax_gate,pct_chg_minmax_gate
arg,1,1,1,1,1,2,2,2,2,2,...,"min[-1,-6)","min[-1,-6)","min[-1,-6)","min[-1,-6)","min[-1,-5)","min[-1,-5)","min[-1,-5)","max[-1,-5)","max[-1,-5)","max[-1,-5)"
key,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,High,Low,Close,Volume,Low>-0.02,Low>-0.01,Low>0,High>0.02,High>0.05,High>0.1
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2018-01-02,0.00202,0.003126,-0.003771,-0.003771,-0.274337,0.000883,0.001464,-0.000388,0.00142,-0.360508,...,0.011417,0.005232,0.010929,-0.038794,True,True,True,False,False,False
2018-01-03,-0.005234,-0.000727,-0.005742,-0.000756,-0.048439,-0.003225,-0.002124,-0.008985,-0.008985,-0.309487,...,0.011654,0.007866,0.009689,-0.085354,True,True,True,False,False,False
2018-01-04,-0.007892,-0.001817,-0.007921,-0.002298,-0.042379,-0.013084,-0.008613,-0.013588,-0.008642,-0.088765,...,0.008877,0.003166,0.008767,-0.124116,True,True,False,False,False,False
2018-01-05,-0.004401,-0.000747,-0.004489,-0.002687,0.141703,-0.012258,-0.006209,-0.012287,-0.006689,0.09332,...,0.00629,0.001732,0.005997,0.001863,True,True,False,False,False,False
2018-01-08,-0.004135,0.000284,-0.005378,0.000175,-0.00186,-0.008517,-0.004878,-0.008605,-0.006811,0.13958,...,0.002964,-0.00241,0.002027,0.065018,True,True,False,False,False,False


In [239]:
data['pct_chg_minmax_gate'].describe()

arg,"min[-1,-5)","min[-1,-5)","min[-1,-5)","max[-1,-5)","max[-1,-5)","max[-1,-5)"
key,Low>-0.02,Low>-0.01,Low>0,High>0.02,High>0.05,High>0.1
count,17364,17364,17364,17364,17364,17364
unique,2,2,2,2,2,2
top,True,False,False,False,False,False
freq,11175,11144,16189,15047,17216,17355


In [215]:
data['2010':'2018'].drop(columns=['pct_chg_minmax',("pct_chg_minmax_gate","max[-1,-5)"),("pct_chg_minmax_gate","min[-1,-5)",'Low>-0.02'),("pct_chg_minmax_gate","min[-1,-5)",'Low>-0.01'),]).head()

feature,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg,pct_chg_minmax_gate
arg,1,1,1,1,1,2,2,2,2,2,...,19,19,19,19,20,20,20,20,20,"min[-1,-5)"
key,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,High,Low,Close,Volume,Open,High,Low,Close,Volume,Low>0
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2018-12-24,0.027002,0.043261,0.003328,0.00669,1.910946,0.040078,0.045435,0.016921,0.027852,1.136928,...,0.114053,0.103897,0.113678,0.317537,0.096977,0.102888,0.096032,0.096644,-0.368135,False
2018-12-26,0.015843,0.019982,-0.005087,-0.005087,-0.382632,0.043273,0.05979,0.019225,0.02264,0.797125,...,0.135164,0.123891,0.135012,-0.176847,0.121386,0.131703,0.121386,0.131322,-0.186595,False
2018-12-27,-0.032499,0.010342,-0.039271,0.010317,0.033535,-0.017171,-0.013167,-0.037421,-0.037421,-0.361929,...,0.123439,0.09903,0.123353,-0.03538,0.090583,0.098272,0.087365,0.098125,-0.149243,False
2018-12-28,-0.022519,-0.00387,-0.040352,-0.003978,0.106408,-0.054287,-0.01241,-0.060906,-0.012434,0.143512,...,0.102042,0.089712,0.095659,-0.038311,0.07711,0.09814,0.07428,0.098056,0.067263,False
2018-12-31,-6.8e-05,0.008536,-0.010424,-0.005282,0.075446,-0.022586,-0.003938,-0.040417,-0.004046,0.189882,...,0.10482,0.093568,0.104536,0.353109,0.095252,0.101967,0.089638,0.095585,0.034245,False


In [235]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
from pathlib import Path
import pandas as pd

data_path = Path( "c:\\Users\\ywh\\Git_Repository\\stockdata.yahoo\\out.feature\\2019-01-06.100\\^GSPC.2019-01-06.[Adj_Price].pkl\\^GSPC.2019-01-06.[Adj_Price].pkl.features.pkl" )
data = pd.read_pickle(data_path)
# data['2018'].head()

# subset_data = data.ix['2018',(list(range(25))+[-4])]
subset_data = data.iloc[slice(None),(list(range(100))+[-4])].dropna()
# sns.pairplot( subset_data, hue=('pct_chg_minmax_gate','min[-1,-5)','Low>0'), height=2.5)
subset_data = subset_data['1960':]

numeric_features = list(range(subset_data.columns.__len__()))[slice(None,-1,None)]
numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = list(range(subset_data.columns.__len__()))[slice(-1,None)]
categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
#         ('cat', categorical_transformer, categorical_features)
    ]
)

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2))])

X2 = subset_data.drop([('pct_chg_minmax_gate','min[-1,-5)','Low>0')], axis=1)
y2 = subset_data[('pct_chg_minmax_gate','min[-1,-5)','Low>0')]
# print(X2.info())
# print(y2)

# X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2)
# clf.fit(X2_train, y2_train)
# print("model score: %.3f" % clf.score(X2_test, y2_test))

tscv = TimeSeriesSplit(n_splits=10)
index = 0
for train_index, test_index in tscv.split(subset_data):
    index+=1
    print("TRAIN:", X2.iloc[train_index].index, "TEST:", X2.iloc[test_index].index)
    X_train, X_test = X2.iloc[train_index], X2.iloc[test_index]
    y_train, y_test = y2.iloc[train_index], y2.iloc[test_index]
    if index ==1:
        clf.fit(X_train, y_train)
    print("model score: %.3f" % clf.score(X_test, y_test))

    y_pred = clf.predict(X_test)
#     print(y_pred)
    
    cm = confusion_matrix(y_test, y_pred)
    print(cm)



TRAIN: DatetimeIndex(['1960-01-04', '1960-01-05', '1960-01-06', '1960-01-07',
               '1960-01-08', '1960-01-11', '1960-01-12', '1960-01-13',
               '1960-01-14', '1960-01-15',
               ...
               '1965-05-05', '1965-05-06', '1965-05-07', '1965-05-10',
               '1965-05-11', '1965-05-12', '1965-05-13', '1965-05-14',
               '1965-05-17', '1965-05-18'],
              dtype='datetime64[ns]', name='Date', length=1353, freq=None) TEST: DatetimeIndex(['1965-05-19', '1965-05-20', '1965-05-21', '1965-05-24',
               '1965-05-25', '1965-05-26', '1965-05-27', '1965-05-28',
               '1965-06-01', '1965-06-02',
               ...
               '1970-10-21', '1970-10-22', '1970-10-23', '1970-10-26',
               '1970-10-27', '1970-10-28', '1970-10-29', '1970-10-30',
               '1970-11-02', '1970-11-03'],
              dtype='datetime64[ns]', name='Date', length=1350, freq=None)
model score: 0.994
[[1342    7]
 [   1    0]]
TRAIN: Date

In [227]:
from sklearn.model_selection import TimeSeriesSplit
X = np.array([1, 2, 3, 4, 1, 2, 5,3,34,76,])
y = np.array([1, 2, 3, 4, 5, 6,7,8])
tscv = TimeSeriesSplit(n_splits=5)
print(tscv)  
for train_index, test_index in tscv.split(subset_data):
    print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]


TimeSeriesSplit(max_train_size=None, n_splits=5)
TRAIN: [   0    1    2 ... 2891 2892 2893] TEST: [2894 2895 2896 ... 5781 5782 5783]
TRAIN: [   0    1    2 ... 5781 5782 5783] TEST: [5784 5785 5786 ... 8671 8672 8673]
TRAIN: [   0    1    2 ... 8671 8672 8673] TEST: [ 8674  8675  8676 ... 11561 11562 11563]
TRAIN: [    0     1     2 ... 11561 11562 11563] TEST: [11564 11565 11566 ... 14451 14452 14453]
TRAIN: [    0     1     2 ... 14451 14452 14453] TEST: [14454 14455 14456 ... 17341 17342 17343]


In [179]:
list(range(10))[slice(None,-1,None)]

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [201]:
list(range(subset_data.columns.__len__()))[slice(None,-1,None)]
list(range(subset_data.columns.__len__()))[slice(-1,None)]


[100]

In [186]:
subset_data.columns[slice(-1,None)]

MultiIndex(levels=[['pct_chg', 'pct_chg_minmax', 'pct_chg_minmax_gate'], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 'max[-1,-5)', 'max[-1,-6)', 'min[-1,-5)', 'min[-1,-6)'], ['Close', 'High', 'High>0.02', 'High>0.05', 'High>0.1', 'Low', 'Low>-0.01', 'Low>-0.02', 'Low>0', 'Open', 'Volume']],
           labels=[[2], [22], [8]],
           names=['feature', 'arg', 'key'])