In [1]:
import warnings

warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_boston

In [3]:
# Load the Boston housing dataset bundle; later cells read its "data",
# "feature_names" and "target" entries.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on older versions — confirm the pinned environment.
boston = load_boston()

In [4]:
# Raw feature matrix from the dataset bundle.
data = boston.data

In [5]:
# Column labels that accompany the feature matrix.
feature_names = boston.feature_names

In [6]:
# Wrap the feature matrix in a DataFrame so each column carries its feature name.
X = pd.DataFrame(data=data, columns=feature_names)

# Preview the first five rows.
X.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [7]:
# Regression target values from the dataset bundle.
target = boston.target

In [8]:
# Single-column frame holding the target under the name "price".
y = pd.DataFrame(data=target, columns=["price"])

# Preview the first five rows.
y.head(n=5)

Unnamed: 0,price
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [9]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [10]:
from sklearn.linear_model import LinearRegression
# Linear regression model with default settings, fitted in the next cell.
lr = LinearRegression()

In [11]:
# Fit the linear model on the training split.
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [12]:
# Predict prices for the held-out rows with the linear model.
y_pred = lr.predict(X=X_test)

# Show the shape of the prediction array.
y_pred.shape

(152, 1)

In [13]:
# Side-by-side table of actual vs. predicted prices for the linear model.
# The index is inherited from y_test, so rows stay keyed by their original position.
check_test = (
    y_test
    .rename(columns={"price": "y_test"})
    .assign(y_pred=y_pred.flatten())
)

check_test.head(n=10)

Unnamed: 0,y_test,y_pred
173,23.6,28.64896
274,32.4,36.495014
491,13.6,15.411193
72,22.8,25.403213
452,16.1,18.85528
76,20.0,23.146689
316,17.8,17.392124
140,14.0,14.078599
471,19.6,23.036927
500,16.8,20.599433


In [14]:
from sklearn.metrics import r2_score
# R^2 of the linear model on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.7112260057484887

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [22]:
# Random forest with 1000 trees, each limited to depth 12; the fixed random_state
# makes the fitted model reproducible. n_estimators must be the integer 1000:
# any other bare name here raises NameError on a fresh kernel.
rfr = RandomForestRegressor(n_estimators=1000, max_depth=12, random_state=42)

In [23]:
# Fit the forest on the training split, passing the target as a 1-D array
# taken from the "price" column of the single-column frame.
rfr.fit(X_train, y_train["price"].to_numpy())

RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)

In [27]:
# Predict prices for the held-out rows with the random forest.
y_pred_rfr = rfr.predict(X=X_test)

In [29]:
# Side-by-side table of actual vs. predicted prices for the random forest.
# The index is inherited from y_test, so rows stay keyed by their original position.
check_test_rfr = (
    y_test
    .rename(columns={"price": "y_test"})
    .assign(y_pred=y_pred_rfr.flatten())
)

check_test_rfr.head(n=10)

Unnamed: 0,y_test,y_pred
173,23.6,22.806412
274,32.4,31.131464
491,13.6,16.339125
72,22.8,23.810726
452,16.1,17.139521
76,20.0,21.832284
316,17.8,19.895747
140,14.0,14.754118
471,19.6,21.240835
500,16.8,20.898658


In [30]:
# R^2 of the random forest on the test split.
r2_score(y_true=y_test, y_pred=y_pred_rfr)

0.87472606157312

In [None]:
# The RandomForestRegressor model is more accurate than the LinearRegression model by the r2_score metric (about 0.875 vs. 0.711 on the test split).

In [31]:
# Display the fitted forest's importance score for each feature (one value per
# training column; presumably in the same order as X.columns — the next cell
# pairs them that way).
rfr.feature_importances_

array([0.03167574, 0.00154252, 0.00713813, 0.00123624, 0.01426897,
       0.40268179, 0.01429864, 0.06397257, 0.00528122, 0.01152493,
       0.01808108, 0.01245085, 0.41584732])

In [32]:
# Pair each feature name with its importance score from the fitted forest.
# The key order fixes the column order: score first, then name.
feature_importance = pd.DataFrame({
    'feature_importance': rfr.feature_importances_,
    'name': X.columns,
})
feature_importance

Unnamed: 0,feature_importance,name
0,0.031676,CRIM
1,0.001543,ZN
2,0.007138,INDUS
3,0.001236,CHAS
4,0.014269,NOX
5,0.402682,RM
6,0.014299,AGE
7,0.063973,DIS
8,0.005281,RAD
9,0.011525,TAX


In [34]:
# The two features with the highest importance scores.
feature_importance.nlargest(n=2, columns='feature_importance')

Unnamed: 0,feature_importance,name
12,0.415847,LSTAT
5,0.402682,RM
