# 1 自己实现LinearRegression
# 2 使用sklearn中的LinearRegression
# 3 用KNN算法进行回归

In [43]:


from sklearn.datasets import load_boston
import numpy as np
import matplotlib.pyplot as plt
from LinearRegression import LinearRegression
from model_selection import train_test_split 

### 加载数据

In [14]:
# Load the Boston housing dataset and pull out the feature matrix and the target vector.
# NOTE(review): load_boston is not shipped by recent scikit-learn releases — confirm the pinned version.
data = load_boston()
data_X, data_y = data.data, data.target

In [15]:
# Show the dimensions of the feature matrix as the cell output.
data_X.shape

(506, 13)

In [16]:
# Show the dimensions of the target vector; its length should equal the row count of data_X.
data_y.shape

(506,)

### 去除边界数据

In [17]:
# Keep only the samples whose target is below 50.
# The mask is computed once and applied to both arrays inside a single cell, so the
# features and targets cannot get out of step: with the filter split over two cells,
# running the target cell first shrinks data_y and makes the mask for data_X the wrong length.
below_cap = data_y < 50
data_X = data_X[below_cap]
data_y = data_y[below_cap]

### 分割数据

In [32]:
# Split with the hand-written train_test_split from the local model_selection module (called with seed=666).
# NOTE(review): the unpacking order here is (X_train, y_train, X_test, y_test), which is not the
# (X_train, X_test, y_train, y_test) order used by the commented-out sklearn call later in this
# notebook — confirm it matches what the local function actually returns.
X_train,y_train,X_test,y_test = train_test_split(data_X,data_y,seed=666)

### 训练模型

In [21]:
# Fit the hand-written LinearRegression (imported from the local LinearRegression module).
# NOTE(review): fit_normal presumably solves the normal equation, going by its name and the
# repr shown as this cell's output — confirm against the local implementation.
LR = LinearRegression()
LR.fit_normal(X_train,y_train)

LinearRegression()  with normal_equation

In [22]:
# Evaluate the hand-written model on the held-out split.
# NOTE(review): the metric is whatever the local class's score() computes — presumably R^2,
# since it agrees with sklearn's score on the same split to about 1e-14; confirm.
LR.score(X_test,y_test)

0.81298026026583592

In [23]:
# Fitted per-feature weights of the hand-written model.
# NOTE(review): the attribute is spelled `cofficients_` (sic); it has to match the name
# defined in the local LinearRegression module, so it cannot be corrected here alone.
LR.cofficients_

array([ -1.18919477e-01,   3.63991462e-02,  -3.56494193e-02,
         5.66737830e-02,  -1.16195486e+01,   3.42022185e+00,
        -2.31470282e-02,  -1.19509560e+00,   2.59339091e-01,
        -1.40112724e-02,  -8.36521175e-01,   7.92283639e-03,
        -3.81966137e-01])

In [24]:
# Fitted intercept of the hand-written model (displayed as a one-element array in the recorded output).
LR.intercept_

array([ 34.1614355])

## 使用sklearn中封装好的LinearRegression

In [33]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [34]:
# Reload the dataset from scratch and keep only the samples whose target is below 50.0,
# repeating the preprocessing of the first section inside a single cell.
data = load_boston()
data_X, data_y = data.data, data.target
# Both right-hand sides are evaluated before either name is rebound, so the same mask is used for X and y.
data_X, data_y = data_X[data_y < 50.0], data_y[data_y < 50.0]

In [40]:
#X_train,X_test,y_train,y_test = train_test_split(data_X,data_y,random_state = 666,test_size = 0.2)
# If everything were done with sklearn, sklearn's own train_test_split (the call above) should be used here.
#    To make the comparison with the hand-written implementation easy, the data already split by
#    our own train_test_split is reused instead.


In [35]:
# NOTE: at this point `LinearRegression` refers to sklearn.linear_model.LinearRegression — the import
# in this section rebinds the name that the first cell bound to the hand-written class, so the
# result depends on which import cell was executed most recently.
sklearn_LR = LinearRegression()

In [36]:
# Train on the split produced earlier by the hand-written train_test_split, so that both
# linear models are fitted on the same samples.
sklearn_LR.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [37]:
# Learned coefficients of the sklearn model, for comparison with the hand-written model's weights.
sklearn_LR.coef_

array([ -1.18919477e-01,   3.63991462e-02,  -3.56494193e-02,
         5.66737830e-02,  -1.16195486e+01,   3.42022185e+00,
        -2.31470282e-02,  -1.19509560e+00,   2.59339091e-01,
        -1.40112724e-02,  -8.36521175e-01,   7.92283639e-03,
        -3.81966137e-01])

In [38]:
# Learned intercept of the sklearn model (a plain float in the recorded output).
sklearn_LR.intercept_

34.161435496245851

In [39]:
# Score of the sklearn model on the same held-out split used for the hand-written model.
sklearn_LR.score(X_test,y_test)

0.81298026026584858

## 二、KNN Regressor

In [44]:
from sklearn.neighbors import KNeighborsRegressor

In [46]:
# Baseline k-nearest-neighbours regressor with library defaults
# (the recorded repr shows n_neighbors=5, weights='uniform', p=2).
KNR = KNeighborsRegressor()

In [47]:
# Fit the baseline KNN regressor on the same training split as the linear models.
KNR.fit(X_train,y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [48]:
# Held-out score of the untuned KNN regressor, the baseline for the grid search below.
KNR.score(X_test,y_test)

0.58654121983008989

### 使用网格搜索选用最好的超参数

In [50]:
from sklearn.model_selection import GridSearchCV

In [52]:
# Search space handed to GridSearchCV, expressed as two independent sub-grids:
#   1. uniform weighting   - only the neighbour count (1..10) varies;
#   2. distance weighting  - neighbour count (1..10) crossed with the Minkowski exponent p (1..5).
# That is 10 + 10 * 5 = 60 candidate settings in total.
para_grid = [
    {"n_neighbors": list(range(1, 11)), "weights": ["uniform"]},
    {
        "n_neighbors": list(range(1, 11)),
        "weights": ["distance"],
        "p": list(range(1, 6)),
    },
]

In [54]:
# Exhaustive search over para_grid; each candidate is scored by cross-validation on the data
# later passed to fit().
# cv is pinned to 3 folds: when cv is left unset the fold count is a library default that
# differs between scikit-learn versions (3 in the run recorded below, 5 from 0.22 onwards),
# which can change which hyper-parameters are selected.
# verbose=2 prints one line per fit; lower it to keep the output short.
grid_search = GridSearchCV(KNR, param_grid=para_grid, cv=3, n_jobs=1, verbose=2)

In [56]:
# Run the search on the training split only; the test split is not seen during model selection.
grid_search.fit(X_train,y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV] n_neighbors=1, weights=uniform ..................................
[CV] ................... n_neighbors=1, weights=uniform, total=   0.0s
[CV] n_neighbors=1, weights=uniform ..................................
[CV] ................... n_neighbors=1, weights=uniform, total=   0.0s
[CV] n_neighbors=1, weights=uniform ..................................
[CV] ................... n_neighbors=1, weights=uniform, total=   0.0s
[CV] n_neighbors=2, weights=uniform ..................................
[CV] ................... n_neighbors=2, weights=uniform, total=   0.0s
[CV] n_neighbors=2, weights=uniform ..................................
[CV] ................... n_neighbors=2, weights=uniform, total=   0.0s
[CV] n_neighbors=2, weights=uniform ..................................
[CV] ................... n_neighbors=2, weights=uniform, total=   0.0s
[CV] n_neighbors=3, weights=uniform ..................................
[CV] ..........

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ............. n_neighbors=1, p=4, weights=distance, total=   0.0s
[CV] n_neighbors=1, p=4, weights=distance ............................
[CV] ............. n_neighbors=1, p=4, weights=distance, total=   0.0s
[CV] n_neighbors=1, p=5, weights=distance ............................
[CV] ............. n_neighbors=1, p=5, weights=distance, total=   0.0s
[CV] n_neighbors=1, p=5, weights=distance ............................
[CV] ............. n_neighbors=1, p=5, weights=distance, total=   0.0s
[CV] n_neighbors=1, p=5, weights=distance ............................
[CV] ............. n_neighbors=1, p=5, weights=distance, total=   0.0s
[CV] n_neighbors=2, p=1, weights=distance ............................
[CV] ............. n_neighbors=2, p=1, weights=distance, total=   0.0s
[CV] n_neighbors=2, p=1, weights=distance ............................
[CV] ............. n_neighbors=2, p=1, weights=distance, total=   0.0s
[CV] n_neighbors=2, p=1, weights=distance ............................
[CV] .

[CV] ............. n_neighbors=5, p=4, weights=distance, total=   0.0s
[CV] n_neighbors=5, p=5, weights=distance ............................
[CV] ............. n_neighbors=5, p=5, weights=distance, total=   0.0s
[CV] n_neighbors=5, p=5, weights=distance ............................
[CV] ............. n_neighbors=5, p=5, weights=distance, total=   0.0s
[CV] n_neighbors=5, p=5, weights=distance ............................
[CV] ............. n_neighbors=5, p=5, weights=distance, total=   0.0s
[CV] n_neighbors=6, p=1, weights=distance ............................
[CV] ............. n_neighbors=6, p=1, weights=distance, total=   0.0s
[CV] n_neighbors=6, p=1, weights=distance ............................
[CV] ............. n_neighbors=6, p=1, weights=distance, total=   0.0s
[CV] n_neighbors=6, p=1, weights=distance ............................
[CV] ............. n_neighbors=6, p=1, weights=distance, total=   0.0s
[CV] n_neighbors=6, p=2, weights=distance ............................
[CV] .

[CV] ............ n_neighbors=10, p=1, weights=distance, total=   0.0s
[CV] n_neighbors=10, p=1, weights=distance ...........................
[CV] ............ n_neighbors=10, p=1, weights=distance, total=   0.0s
[CV] n_neighbors=10, p=1, weights=distance ...........................
[CV] ............ n_neighbors=10, p=1, weights=distance, total=   0.0s
[CV] n_neighbors=10, p=2, weights=distance ...........................
[CV] ............ n_neighbors=10, p=2, weights=distance, total=   0.0s
[CV] n_neighbors=10, p=2, weights=distance ...........................
[CV] ............ n_neighbors=10, p=2, weights=distance, total=   0.0s
[CV] n_neighbors=10, p=2, weights=distance ...........................
[CV] ............ n_neighbors=10, p=2, weights=distance, total=   0.0s
[CV] n_neighbors=10, p=3, weights=distance ...........................
[CV] ............ n_neighbors=10, p=3, weights=distance, total=   0.0s
[CV] n_neighbors=10, p=3, weights=distance ...........................
[CV] .

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:    1.8s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'weights': ['uniform']}, {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'weights': ['distance'], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [57]:
# Hyper-parameter combination with the best cross-validated score found by the search.
grid_search.best_params_

{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

In [58]:
# Score the selected estimator on the held-out test split. The recorded GridSearchCV repr shows
# refit=True, i.e. best_estimator_ was refitted on the full training split.
grid_search.best_estimator_.score(X_test,y_test)

0.70443577270379965

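### 经过对比：KNN 即使选用了网格搜索得到的最优超参数，回归效果依旧一般
调参后的 KNN 在测试集上的 score 约为 0.704，虽然比默认参数的 0.587 有明显提升，但仍低于线性回归的 0.813。

需要注意：GridSearchCV 评估每组超参数的方式和我们上面直接在测试集上调用 score 的做法不一样——它是在训练集上做交叉验证（本次运行为 3 折），按各折 score 的平均值来挑选最优参数，所以选出的参数不一定是在测试集上 score 最高的那一组。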