In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_boston

In [3]:
boston = load_boston()

In [4]:
data = boston.data

In [5]:
target = boston.target

In [6]:
feature_names = boston.feature_names

In [7]:
X = pd.DataFrame(data, columns=feature_names)

In [8]:
y = pd.DataFrame(target, columns=['price'])

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [11]:
from sklearn.ensemble import RandomForestRegressor

In [12]:
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=42, max_depth=12, n_jobs=-1)

In [13]:
model.fit(X_train, y_train.values[:, 0]) # обучение

RandomForestRegressor(max_depth=12, n_estimators=1000, n_jobs=-1,
                      oob_score=True, random_state=42)

In [14]:
y_pred = model.predict(X_test) # предсказание

In [15]:
# Метрика оценки качества
# r2
from sklearn.metrics import r2_score

In [16]:
r2_score(y_test, y_pred)

0.87472606157312

In [17]:
# Проверка
check_test = pd.DataFrame({'y_test': y_test['price'],
                          'y_pred': y_pred.flatten()},
                         columns=['y_test', 'y_pred'])
check_test.head(10)

Unnamed: 0,y_test,y_pred
173,23.6,22.806412
274,32.4,31.131464
491,13.6,16.339125
72,22.8,23.810726
452,16.1,17.139521
76,20.0,21.832284
316,17.8,19.895747
140,14.0,14.754118
471,19.6,21.240835
500,16.8,20.898658


In [18]:
importances = model.feature_importances_

In [58]:
importances

array([0.00123624, 0.00154252, 0.00528122, 0.00713813, 0.01152493,
       0.01245085, 0.01426897, 0.01429864, 0.01808108, 0.03167574,
       0.06397257, 0.40268179, 0.41584732])

In [59]:
importances.sum()

0.9999999999999998

In [60]:
feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [61]:
max_sins = pd.DataFrame({'feature names': feature_names,
                          'sum': importances},
                         columns=['feature names', 'sum'])

max_sins.sort_values(by='sum', ascending=False).head(2)

Unnamed: 0,feature names,sum
12,LSTAT,0.415847
11,B,0.402682
