In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [3]:
# Load the Boston housing data set that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release to run.
boston = datasets.load_boston()
# Keep only the samples whose target is below 50.0 (presumably this drops the
# records whose price was capped at the data set's ceiling -- confirm).
# One mask built from the unfiltered target is applied to both arrays so they stay aligned.
keep = boston.target < 50.0
X = boston.data[keep]
y = boston.target[keep]
# X has 13 features
X.shape, y.shape

In [5]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation (split proportions are the
# train_test_split defaults); the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=600,
)

In [11]:
# Import the project's own implementation under an alias: scikit-learn's class of the
# same name is imported in other cells of this notebook, and binding both to the name
# `LinearRegression` makes the notebook's behavior depend on cell execution order.
from fun_machine_learning.linear_regression import LinearRegression as FunLinearRegression
reg = FunLinearRegression()
# NOTE(review): fit_normal presumably solves the least-squares problem with the
# normal equation -- confirm in fun_machine_learning.linear_regression.
reg.fit_normal(X_train, y_train)
# print() is required here because a cell only displays its last expression.
print(reg.interception_)
# Score of the hand-written model on the held-out test split.
reg.score(X_test, y_test)

28.920513410785453


0.7344367205330666

## Use linear regression in scikit-learn

In [7]:
# Create scikit-learn's LinearRegression estimator with its default settings.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()

In [9]:
# Fit on the training split, then display the learned coefficients (one per feature).
lin_reg.fit(X_train, y_train)
lin_reg.coef_

array([-1.12307724e-01,  2.58183302e-02, -2.73034446e-02,  3.93618556e-01,
       -1.34231029e+01,  4.33171276e+00, -3.03666267e-02, -1.15156923e+00,
        2.18130563e-01, -1.35827830e-02, -8.49632410e-01,  8.07149582e-03,
       -2.91322565e-01])

In [10]:
# R^2 (coefficient of determination) of the scikit-learn model on the held-out test split.
lin_reg.score(X_test, y_test)

0.7344367205330411

## Use a KNN regressor to solve the same regression problem
### Note that the hyperparameters must be tuned to get a higher score

In [12]:
from sklearn.neighbors import KNeighborsRegressor

# Baseline: a KNN regressor with default hyperparameters, fitted on the training split
# (fit returns the estimator itself, so construction and fitting can be chained).
knn_reg = KNeighborsRegressor().fit(X_train, y_train)
# Score of the untuned baseline on the held-out test split.
knn_reg.score(X_test, y_test)

0.567665054412363

In [14]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the KNN regressor, searched as two independent sub-grids:
#   * uniform weighting  -> only the number of neighbours varies (10 candidates)
#   * distance weighting -> number of neighbours x Minkowski power p (10 * 5 = 50 candidates)
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]
# Search starts from a fresh, unfitted estimator (this rebinds knn_reg).
knn_reg = KNeighborsRegressor()
# n_jobs=-1 runs the fits on all available cores; verbose=1 prints progress.
grid_search = GridSearchCV(knn_reg, param_grid, n_jobs=-1, verbose=1)
# Cross-validated search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    1.3s finished


GridSearchCV(estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             verbose=1)

In [15]:
# Hyperparameter combination with the best mean cross-validation score.
grid_search.best_params_

{'n_neighbors': 8, 'p': 1, 'weights': 'distance'}

In [16]:
# best_score_ is the mean cross-validated score of the best parameter combination,
# computed on held-out folds of the training data. It is not a score on X_test, so it
# is not directly comparable with lin_reg.score(X_test, y_test).
grid_search.best_score_

0.6222558260445735

In [18]:
# Score the best estimator found by the search on the held-out test split; this is
# the number that can be compared with the linear regression test score.
grid_search.best_estimator_.score(X_test, y_test)

0.6960157368530511

## Linear regression is intuitive and easy to explain (in contrast, many other machine learning algorithms are black boxes)
### Sorting the feature names by their coefficients shows that RM (number of rooms) has the strongest positive relationship with price, while NOX (nitric oxide concentration, a harmful gas) has the strongest negative relationship (negative correlation). Note that the features are not standardized here, so the coefficient magnitudes also reflect each feature's scale.

In [20]:
# Imported in this cell so that LinearRegression refers to scikit-learn's class here,
# regardless of which import of that name ran last.
from sklearn.linear_model import LinearRegression

# Fit on the complete filtered data set (no train/test split): the goal of this
# section is to inspect the coefficients, not to measure predictive performance.
lin_reg2 = LinearRegression().fit(X, y)
lin_reg2.coef_

array([-1.06715912e-01,  3.53133180e-02, -4.38830943e-02,  4.52209315e-01,
       -1.23981083e+01,  3.75945346e+00, -2.36790549e-02, -1.21096549e+00,
        2.51301879e-01, -1.37774382e-02, -8.38180086e-01,  7.85316354e-03,
       -3.50107918e-01])

In [21]:
# np.argsort orders the coefficients ascending, so the feature names are listed from
# the most negative coefficient (first) to the largest positive coefficient (last).
boston.feature_names[np.argsort(lin_reg2.coef_)]

array(['NOX', 'DIS', 'PTRATIO', 'LSTAT', 'CRIM', 'INDUS', 'AGE', 'TAX',
       'B', 'ZN', 'RAD', 'CHAS', 'RM'], dtype='<U7')