**Домашнее задание**

Взять boston house-prices datase (sklearn.datasets.load_boston). Возмите 7 любых регрессоров (попробовать разные алгоритмы, поподбирать параметры, вывести итоговое качество).

In [1]:
import pandas as pd
from sklearn.datasets import load_boston
from IPython.display import display, HTML
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import warnings

warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv(load_boston()["filename"], skiprows=1)
data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


Отберем данные, выполним нормализацию и разделим на тренировочные и тестовые данные.

In [3]:
X_src = data[["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT"]]
Y_src = data["MEDV"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_src)

X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y_src, test_size=0.3, random_state=42)

Подберем гиперпараметры через RandomizedSearchCV.

In [4]:
def find_hyperparams(names, regressors, dicts, gridsearch=False):
    results = []
    for i in range(len(regressors)):
        if not gridsearch:
            grid = RandomizedSearchCV(regressors[i], dicts[i], cv=10)
        else:
            grid = GridSearchCV(regressors[i], dicts[i], cv=10)
        grid.fit(X_train, Y_train)
        results.append([names[i], grid.best_params_, grid.best_score_, grid.score(X_test, Y_test)])
        
    res_df = pd.DataFrame(results, columns=["Алгоритм", "Параметры", "Качество (трен.)", "Качество (тест.)"])
    display(HTML(res_df.to_html()))


names = [
    "Decision Tree               ",
    "Linear Regression           ",
    "SVR                         ",
    "Stochastic Gradient Descent ",
    "KNeighbors Regressor        ",
    "Random Forest Regressor     ",
    "Extra Trees Regressor       "
]

regressors = [
    DecisionTreeRegressor(random_state=42),
    LinearRegression(),
    SVR(kernel="poly"),
    SGDRegressor(random_state=42),
    KNeighborsRegressor(),
    RandomForestRegressor(random_state=42),
    ExtraTreesRegressor(random_state=42)
]

dicts = [
    {"criterion": ["mse", "friedman_mse", "mae"], "splitter": ["best", "random"], "max_depth": [x for x in range(1, 21)], "max_features": ["auto", "sqrt", "log2", None]},
    {"fit_intercept": [False, True], "normalize": [False, True]},
    {"gamma": ["scale", "auto"], "coef0": [i / 10 for i in (0, 11)], "shrinking": [False, True], "degree": [2, 3, 4, 5]},
    {"loss": ["squared_loss", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"], "penalty": ["l2", "l1", "elasticnet"], "learning_rate": ["constant", "optimal", "invscaling", "adaptive"]},
    {"n_neighbors": [x for x in range(1, 21)], "weights": ["uniform", "distance"], "algorithm": ["auto", "ball_tree", "kd_tree", "brute"]},
    {"criterion": ["mse", "mae"], "max_depth": list(range(1, 31)) + [None], "max_features": ["auto", "sqrt", "log2"], "n_estimators": list(range(50, 101))},
    {"criterion": ["mse", "mae"], "max_depth": list(range(1, 31)) + [None], "max_features": ["auto", "sqrt", "log2"], "n_estimators": list(range(50, 101))}
]


rscv_df = find_hyperparams(names, regressors, dicts)
rscv_df

Unnamed: 0,Алгоритм,Параметры,Качество (трен.),Качество (тест.)
0,Decision Tree,"{'splitter': 'best', 'max_features': 'auto', 'max_depth': 7, 'criterion': 'mse'}",0.685756,0.742271
1,Linear Regression,"{'normalize': False, 'fit_intercept': True}",0.687535,0.711226
2,SVR,"{'shrinking': False, 'gamma': 'scale', 'degree': 4, 'coef0': 1.1}",0.834461,0.838939
3,Stochastic Gradient Descent,"{'penalty': 'elasticnet', 'loss': 'squared_epsilon_insensitive', 'learning_rate': 'invscaling'}",0.691955,0.707836
4,KNeighbors Regressor,"{'weights': 'uniform', 'n_neighbors': 3, 'algorithm': 'kd_tree'}",0.765893,0.764676
5,Random Forest Regressor,"{'n_estimators': 62, 'max_features': 'log2', 'max_depth': 15, 'criterion': 'mae'}",0.835812,0.87077
6,Extra Trees Regressor,"{'n_estimators': 72, 'max_features': 'log2', 'max_depth': None, 'criterion': 'mae'}",0.864077,0.859729


Подберем гиперпараметры через GridSearchCV.

In [5]:
gscv_df = find_hyperparams(names, regressors, dicts, True)
gscv_df

Unnamed: 0,Алгоритм,Параметры,Качество (трен.),Качество (тест.)
0,Decision Tree,"{'criterion': 'friedman_mse', 'max_depth': 8, 'max_features': 'auto', 'splitter': 'best'}",0.714044,0.75398
1,Linear Regression,"{'fit_intercept': True, 'normalize': False}",0.687535,0.711226
2,SVR,"{'coef0': 1.1, 'degree': 4, 'gamma': 'auto', 'shrinking': True}",0.835084,0.839081
3,Stochastic Gradient Descent,"{'learning_rate': 'invscaling', 'loss': 'squared_epsilon_insensitive', 'penalty': 'l2'}",0.691955,0.707836
4,KNeighbors Regressor,"{'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'distance'}",0.787534,0.777751
5,Random Forest Regressor,"{'criterion': 'mse', 'max_depth': 14, 'max_features': 'sqrt', 'n_estimators': 70}",0.843921,0.850908
6,Extra Trees Regressor,"{'criterion': 'mae', 'max_depth': 30, 'max_features': 'sqrt', 'n_estimators': 64}",0.866773,0.864309
