### Домашнее задание к лекции Улучшение качества модели

In [1]:
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Загрузим данные
data = load_boston()
X, y = data['data'], data['target']

In [3]:
# Нормализуем данные
sc = StandardScaler()
X = sc.fit_transform(X)

In [4]:
# Разделим данные на обучение и тестовые
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
# LinearRegression

In [6]:
grid_params = {'normalize':['True', 'False']}
SearchCV = GridSearchCV(LinearRegression(), param_grid=grid_params, scoring='neg_mean_squared_error', cv = 5, n_jobs=-1)
SearchCV.fit(X, y)

GridSearchCV(cv=5, error_score=nan,
             estimator=LinearRegression(copy_X=True, fit_intercept=True,
                                        n_jobs=None, normalize=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'normalize': ['True', 'False']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [7]:
SearchCV.best_score_

-37.13180746769891

In [8]:
SearchCV.best_params_

{'normalize': 'True'}

In [9]:
y_pred = SearchCV.predict(X_test)
print(f'LinearRegression \nMSE: {mean_squared_error(y_test, y_pred)} /r2: {r2_score(y_test, y_pred)}')

LinearRegression 
MSE: 15.15108886627791 /r2: 0.7621842516111501


In [10]:
# KNeighborsRegressor

In [11]:
grid_params = {'n_neighbors': list(range(2, 21)), 'weights': ['uniform', 'distance']}
SearchCV = RandomizedSearchCV(KNeighborsRegressor(), param_distributions=grid_params, scoring='neg_mean_squared_error', n_jobs=-1)
SearchCV.fit(X, y)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                                 metric='minkowski',
                                                 metric_params=None,
                                                 n_jobs=None, n_neighbors=5,
                                                 p=2, weights='uniform'),
                   iid='deprecated', n_iter=10, n_jobs=-1,
                   param_distributions={'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9,
                                                        10, 11, 12, 13, 14, 15,
                                                        16, 17, 18, 19, 20],
                                        'weights': ['uniform', 'distance']},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring='neg_mean_squared_error',
                   verbose=0)

In [12]:
SearchCV.best_score_

-27.98583675176859

In [13]:
SearchCV.best_params_

{'weights': 'distance', 'n_neighbors': 8}

In [14]:
y_pred = SearchCV.predict(X_test)
print(f'KNeighborsRegressor \nMSE: {mean_squared_error(y_test, y_pred)} /r2: {r2_score(y_test, y_pred)}')

KNeighborsRegressor 
MSE: 0.0 /r2: 1.0


In [15]:
# SVR

In [16]:
grid_params = {'kernel':['linear', 'poly', 'rbf'], 'C':[1, 25]}
SearchCV = RandomizedSearchCV(SVR(), param_distributions=grid_params, scoring='neg_mean_squared_error', cv = 5, n_jobs=-1)
SearchCV.fit(X, y)



RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='scale', kernel='rbf',
                                 max_iter=-1, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='deprecated', n_iter=10, n_jobs=-1,
                   param_distributions={'C': [1, 25],
                                        'kernel': ['linear', 'poly', 'rbf']},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring='neg_mean_squared_error',
                   verbose=0)

In [17]:
SearchCV.best_score_

-26.78915335480307

In [18]:
SearchCV.best_params_

{'kernel': 'rbf', 'C': 25}

In [19]:
y_pred = SearchCV.predict(X_test)
print(f'SVR \nMSE: {mean_squared_error(y_test, y_pred)} /r2: {r2_score(y_test, y_pred)}')

SVR 
MSE: 2.893630890240918 /r2: 0.9545807564197336


In [20]:
# DecisionTreeRegressor

In [21]:
grid_params = {'min_samples_split': list(range(2, 26)), 'max_depth': list(range(2, 11)), 'criterion': ['mse', 'mae']}
SearchCV = RandomizedSearchCV(DecisionTreeRegressor(), param_distributions=grid_params, scoring='neg_mean_squared_error', n_jobs=-1)
SearchCV.fit(X, y)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features=None,
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   presort='deprecated',
                                                   random_state=None,
                                                   splitter='best'),
                   iid

In [22]:
SearchCV.best_score_

-30.090738982721803

In [23]:
SearchCV.best_params_

{'min_samples_split': 16, 'max_depth': 3, 'criterion': 'mae'}

In [24]:
y_pred = SearchCV.predict(X_test)
print(f'DecisionTreeRegressor \nMSE: {mean_squared_error(y_test, y_pred)} /r2: {r2_score(y_test, y_pred)}')

DecisionTreeRegressor 
MSE: 11.617352941176469 /r2: 0.8176507636918106


In [25]:
# RandomForestRegressor

In [26]:
grid_params = {'n_estimators': list(range(10, 111)), 'max_depth': list(range(2, 11)), 'criterion': ['mse', 'mae']}
SearchCV = RandomizedSearchCV(RandomForestRegressor(), param_distributions=grid_params, scoring='neg_mean_squared_error', n_jobs=-1)
SearchCV.fit(X, y)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                           

In [27]:
SearchCV.best_score_

-20.869686627533618

In [28]:
SearchCV.best_params_

{'n_estimators': 89, 'max_depth': 7, 'criterion': 'mae'}

In [29]:
y_pred = SearchCV.predict(X_test)
print(f'RandomForestRegressor \nMSE: {mean_squared_error(y_test, y_pred)} /r2: {r2_score(y_test, y_pred)}')

RandomForestRegressor 
MSE: 2.971493306450216 /r2: 0.9533586060551225


Наилучший результат у модели KNeighborsRegressor - MSE: 0.0 /r2: 1.0. 

Что-то вообще идеально получилось