In [47]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import jupyterthemes
import pandas as pd

from jupyterthemes import jtplot
jtplot.style()

In [48]:
boston = load_boston()

# create X (features) and y (response)
X = boston.data
y = boston.target.astype(int)

In [49]:
boston.target[:10]

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9])

In [50]:
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [51]:
from sklearn.metrics import mean_absolute_error

In [52]:
for i in range(10):
    rand_state = np.random.randint(100) # Генерация случайного числа для random state
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=rand_state) # Разбиение датасета на обучаемую и тестовую выборку
    # Обучение модели Knn
    model = LinearRegression() 
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions) # Оценка качества модели
    print('№{} MAE: {}'.format(i, round(mae, 5)))

№0 MAE: 3.26172
№1 MAE: 3.50297
№2 MAE: 3.71938
№3 MAE: 3.54348
№4 MAE: 3.6168
№5 MAE: 3.4452
№6 MAE: 3.16373
№7 MAE: 3.36896
№8 MAE: 3.14641
№9 MAE: 3.86106


## GridSearchCV
В некоторым смысле - эта функция более продвинутая версия цикла for, описанного выше

In [53]:
from sklearn.model_selection import GridSearchCV

In [54]:
# define the parameter values that should be searched
k_range = list(range(1, 10))
print(k_range)

[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [55]:
# create a parameter grid: map the parameter names to the values that should be searched
# simply a python dictionary
# key: parameter name
# value: list of values that should be searched for that parameter
# single key-value pair for param_grid
param_grid = dict(n_jobs=k_range)
print(param_grid)

{'n_jobs': [1, 2, 3, 4, 5, 6, 7, 8, 9]}


In [56]:
# instantiate the grid
grid = GridSearchCV(LinearRegression(), param_grid=param_grid)

Полученный объект будет перебирать параметр "количество соседей" у KNN, делать кросс-валидацию и оценивать качество, используя accuracy в качестве метрики.
n_jobs=-1 заставляет параллелить вычисления по всем ядрам вашего компьютера

In [57]:
# fit the grid with data
grid.fit(X, y)

GridSearchCV(cv=None, error_score=nan,
             estimator=LinearRegression(copy_X=True, fit_intercept=True,
                                        n_jobs=None, normalize=False),
             iid='deprecated', n_jobs=None,
             param_grid={'n_jobs': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [58]:
# view the complete results (list of named tuples)
for k in grid.cv_results_:
    print(k, ":", grid.cv_results_[k])

mean_fit_time : [0.00261741 0.00106373 0.00092201 0.00071831 0.00097818 0.0011117
 0.00096703 0.0009686  0.00095758]
std_fit_time : [3.21521948e-03 5.00674557e-04 1.55274053e-04 2.45417853e-05
 3.10996704e-04 3.71446640e-04 2.94202073e-04 3.28496320e-04
 3.14710090e-04]
mean_score_time : [0.00892758 0.00086069 0.00066957 0.0005692  0.00078201 0.0007205
 0.00071683 0.00070081 0.00082278]
std_score_time : [1.25919929e-02 4.22958027e-04 9.20953417e-05 9.69938823e-06
 2.29581320e-04 1.34326390e-04 2.20793915e-04 1.80379663e-04
 1.79590510e-04]
param_n_jobs : [1 2 3 4 5 6 7 8 9]
params : [{'n_jobs': 1}, {'n_jobs': 2}, {'n_jobs': 3}, {'n_jobs': 4}, {'n_jobs': 5}, {'n_jobs': 6}, {'n_jobs': 7}, {'n_jobs': 8}, {'n_jobs': 9}]
split0_test_score : [0.62725825 0.62725825 0.62725825 0.62725825 0.62725825 0.62725825
 0.62725825 0.62725825 0.62725825]
split1_test_score : [0.71348966 0.71348966 0.71348966 0.71348966 0.71348966 0.71348966
 0.71348966 0.71348966 0.71348966]
split2_test_score : [0.5887338

In [63]:
from sklearn.model_selection import RandomizedSearchCV

In [65]:
rnd_search = RandomizedSearchCV(LinearRegression(), param_distributions=param_grid)
rnd_search.fit(X, y)



RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=LinearRegression(copy_X=True, fit_intercept=True,
                                              n_jobs=None, normalize=False),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'n_jobs': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=0)

In [66]:
rnd_search.best_estimator_

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [67]:
rnd_search.best_estimator_

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [68]:
rnd_search.best_params_

{'n_jobs': 1}

In [69]:
rnd_search.best_params_

{'n_jobs': 1}

In [70]:
rnd_search.best_score_

0.34896596679951525

In [71]:
rnd_search.best_score_

0.34896596679951525

Самый большой вопрос по заданию: какие параметры в линейной регрессии можно перебирать / оптимизировать? Правильно ли я понимаю? что перебирать можно гиперпараметры, то есть то, что мы задаем руками? В линейной регрессии таких параметров не так много... В общем, в правильности сделанного не уверена и буду благодарна за фидбэк.
    