In [53]:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
import sklearn

from sklearn.model_selection import train_test_split

In [14]:
boston = load_boston()
print(boston.DESCR)
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)
boston_df['MEDV'] = boston.target
print(boston_df.shape)
boston_df.head()

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [15]:
boston_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [23]:
X_train, X_test, y_train, y_test = train_test_split(boston_df.drop(columns=['MEDV']), boston_df.MEDV, test_size=0.2)

In [26]:
sc1 = StandardScaler()
sc2 = StandardScaler()
X_train = sc1.fit_transform(X_train)
X_test = sc1.transform(X_test)

In [116]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import mean_absolute_error, mean_squared_error

linear regr

In [36]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
pd.DataFrame({'Actual': y_test, 'Predicted': lin_reg.predict(X_test), 'Diff': y_test-lin_reg.predict(X_test)})

Unnamed: 0,Actual,Predicted,Diff
26,16.6,15.210894,1.389106
190,37.0,30.912074,6.087926
201,24.1,29.113223,-5.013223
37,21.0,22.604305,-1.604305
472,23.2,22.698761,0.501239
...,...,...,...
429,9.5,13.041866,-3.541866
386,10.5,5.316503,5.183497
414,7.0,-5.676961,12.676961
229,31.5,30.803408,0.696592


In [37]:
print('MSE:', mean_squared_error(y_test, lin_reg.predict(X_test)))
print('MAE:', mean_absolute_error(y_test, lin_reg.predict(X_test)))

MSE: 25.788769102187903
MAE: 3.237336194941829


KNeighborsRegressor

In [66]:
params = {'algorithm': ['brute'],
             'leaf_size': list(range(10,50,1)),
             'metric': sorted(sklearn.neighbors.VALID_METRICS['brute']),
             'n_neighbors': list(range(3,20,1)),
             'p': [1, 2],
             'weights': ['uniform', 'distance']}
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

kn_regr = RandomizedSearchCV(KNeighborsRegressor(), params, n_jobs=-1,  random_state=0, n_iter=10)
kn_regr = kn_regr.fit(X_train, y_train).best_estimator_
kn_regr

KNeighborsRegressor(algorithm='brute', leaf_size=31, metric='euclidean',
                    metric_params=None, n_jobs=None, n_neighbors=12, p=1,
                    weights='distance')

In [67]:
print('MSE:', mean_squared_error(y_test, kn_regr.predict(X_test)))
print('MAE:', mean_absolute_error(y_test, kn_regr.predict(X_test)))
pd.DataFrame({'Actual': y_test, 'Predicted': kn_regr.predict(X_test), 'Diff': y_test-kn_regr.predict(X_test)})

MSE: 22.611928536460987
MAE: 2.766805991642916


Unnamed: 0,Actual,Predicted,Diff
26,16.6,16.419822,0.180178
190,37.0,29.383175,7.616825
201,24.1,24.918668,-0.818668
37,21.0,21.705276,-0.705276
472,23.2,20.705216,2.494784
...,...,...,...
429,9.5,11.385665,-1.885665
386,10.5,9.858917,0.641083
414,7.0,10.256377,-3.256377
229,31.5,26.924782,4.575218


RandomForestRegressor

In [98]:
params = {   'criterion': ['mse', 'mae'],
             'max_depth': list(range(5, 30, 1)),
             'max_features': ['auto', 'sqrt', 'log2'],
             'min_samples_leaf': list(range(1, 10, 1)),
             'min_samples_split': list(range(2, 10, 1)),
             'n_estimators': list(range(5, 100, 1)),
             'n_jobs': [-1],
             'random_state': [0]}

rf_regr = RandomizedSearchCV(RandomForestRegressor(), params, cv=5, n_jobs=-1, n_iter=10)
rf_regr =  rf_regr.fit(X_train, y_train).best_estimator_

In [99]:
rf_regr

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=15, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=6, min_weight_fraction_leaf=0.0,
                      n_estimators=87, n_jobs=-1, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [100]:
print('MSE:', mean_squared_error(y_test, rf_regr.predict(X_test)))
print('MAE:', mean_absolute_error(y_test, rf_regr.predict(X_test)))
pd.DataFrame({'Actual': y_test, 'Predicted': rf_regr.predict(X_test), 'Diff': y_test-rf_regr.predict(X_test)})

MSE: 12.456058742422776
MAE: 2.2460258705945866


Unnamed: 0,Actual,Predicted,Diff
26,16.6,17.573727,-0.973727
190,37.0,30.945138,6.054862
201,24.1,22.509287,1.590713
37,21.0,20.570806,0.429194
472,23.2,19.835745,3.364255
...,...,...,...
429,9.5,12.200022,-2.700022
386,10.5,7.205698,3.294302
414,7.0,8.421447,-1.421447
229,31.5,29.496950,2.003050


SVR

In [110]:
params = {'degree': list(range(1,10,1)),
         'epsilon': [0.1, 0.001, 0.0001, 1],
         'gamma': ['scale', 'auto'],
         'max_iter': [-1]
         }

sv_reg = RandomizedSearchCV(SVR(), params, cv=10, random_state=0, n_jobs=-1, n_iter=10)
sv_reg = sv_reg.fit(X_train, y_train).best_estimator_

In [111]:
sv_reg

SVR(C=1.0, cache_size=200, coef0=0.0, degree=4, epsilon=0.001, gamma='auto',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [112]:
print('MSE:', mean_squared_error(y_test, sv_reg.predict(X_test)))
print('MAE:', mean_absolute_error(y_test, sv_reg.predict(X_test)))
pd.DataFrame({'Actual': y_test, 'Predicted': sv_reg.predict(X_test), 'Diff': y_test-sv_reg.predict(X_test)})

MSE: 25.1121317564623
MAE: 2.8702356881464537


Unnamed: 0,Actual,Predicted,Diff
26,16.6,16.374668,0.225332
190,37.0,30.313974,6.686026
201,24.1,25.641261,-1.541261
37,21.0,21.247954,-0.247954
472,23.2,20.296434,2.903566
...,...,...,...
429,9.5,12.134884,-2.634884
386,10.5,11.948707,-1.448707
414,7.0,17.470237,-10.470237
229,31.5,27.975126,3.524874


AdaBoostRegressor

In [123]:
params = {'base_estimator': [LinearRegression()],
         'learning_rate': [0.001, 0.01, 0.1, 1, 10],
         'loss': ['linear','square','exponential'],
         'n_estimators': list(range(5,100,1)),
         'random_state': [0]}

ada_reg = RandomizedSearchCV(AdaBoostRegressor(), params, cv = 5, n_jobs=-1, n_iter=10)
ada_reg = ada_reg.fit(X_train, y_train).best_estimator_

In [124]:
ada_reg

AdaBoostRegressor(base_estimator=LinearRegression(copy_X=True,
                                                  fit_intercept=True,
                                                  n_jobs=None,
                                                  normalize=False),
                  learning_rate=0.01, loss='exponential', n_estimators=61,
                  random_state=0)

In [125]:
print('MSE:', mean_squared_error(y_test, sv_reg.predict(X_test)))
print('MAE:', mean_absolute_error(y_test, sv_reg.predict(X_test)))
pd.DataFrame({'Actual': y_test, 'Predicted': sv_reg.predict(X_test), 'Diff': y_test-sv_reg.predict(X_test)})

MSE: 25.1121317564623
MAE: 2.8702356881464537


Unnamed: 0,Actual,Predicted,Diff
26,16.6,16.374668,0.225332
190,37.0,30.313974,6.686026
201,24.1,25.641261,-1.541261
37,21.0,21.247954,-0.247954
472,23.2,20.296434,2.903566
...,...,...,...
429,9.5,12.134884,-2.634884
386,10.5,11.948707,-1.448707
414,7.0,17.470237,-10.470237
229,31.5,27.975126,3.524874


## Вывод

RandomForestRegressor показал самые лучшие оценки mse и mae, поэтому выбираем его с указанными параметрами!