In [2]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# Load the Boston dataset bundled with scikit-learn and pull out features / target.
# NOTE(review): load_boston is not available in recent scikit-learn releases
# (removed in 1.2) — confirm the pinned version before re-running.
boston = load_boston()
X, y = boston.data, boston.target

# Hold out 30% of the rows for testing; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# NOTE(review): the forest is created without a random_state, so the fitted
# trees (and every number printed further down) can differ between runs.
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [3]:
# Pick two held-out rows (test indices 10 and 20) to use as worked examples.
predictSample = X_test[[10, 20]]

for idx, row in enumerate(predictSample):
    # Each row is reshaped to a single-row 2-D array before calling predict().
    print("Instance %d prediction:" % idx, rf.predict(row.reshape(1, -1)))

Instance 0 prediction: [ 8.75]
Instance 1 prediction: [ 23.94]


### 모델 해석

In [4]:
from treeinterpreter import treeinterpreter as ti

# Ask treeinterpreter for the prediction, bias and per-feature contributions
# of each selected sample.
prediction, bias, contributions = ti.predict(rf, predictSample)

for i, (pred_i, bias_i, contrib_i) in enumerate(zip(prediction, bias, contributions)):
    print("Instance : %d" % i)
    print("Bias (trainset mean) : %f" % bias_i)
    print("Feature contributions:")
    # Largest absolute contribution first.
    ranked = sorted(zip(contrib_i, boston.feature_names),
                    key=lambda pair: -abs(pair[0]))
    for value, name in ranked:
        print(name, round(value, 2))
    print('prediction : %f ' % pred_i)
    # Printed next to the prediction so the two values can be compared by eye.
    print('bias + sum of contribution : %f ' % (bias_i + np.sum(contrib_i)))
    print("-" * 20)

Instance : 0
Bias (trainset mean) : 22.898870
Feature contributions:
LSTAT -5.75
CRIM -3.67
RM -2.48
DIS -1.0
NOX -0.65
TAX -0.54
AGE -0.06
B -0.0
ZN 0.0
INDUS 0.0
CHAS 0.0
RAD 0.0
PTRATIO 0.0
prediction : 8.750000 
bias + sum of contribution : 8.750000 
--------------------
Instance : 1
Bias (trainset mean) : 22.898870
Feature contributions:
LSTAT 4.67
RM -3.53
AGE -0.21
TAX -0.21
INDUS 0.11
ZN 0.07
PTRATIO 0.07
RAD 0.05
DIS 0.03
CHAS -0.02
NOX 0.01
CRIM -0.0
B -0.0
prediction : 23.940000 
bias + sum of contribution : 23.940000 
--------------------


### dataset 간의 비교

In [5]:
# Dimensions of the held-out matrix; the last expression is displayed by the notebook.
np.shape(X_test)

(152, 13)

In [7]:
# Split the held-out rows into two equally sized subsets by position.
# The ndarray itself is sliced: its `.data` attribute is the raw memory buffer,
# not the array, and only happens to work when passed on to the model.
half = len(X_test) // 2
ds1 = X_test[:half]
ds2 = X_test[half:]

# Prediction, bias and per-feature contributions for every row of each subset.
prediction1, bias1, contributions1 = ti.predict(rf, ds1)
prediction2, bias2, contributions2 = ti.predict(rf, ds2)

In [8]:
# Average the per-row contributions of each subset, feature by feature.
totalc1 = np.mean(contributions1, axis=0)
totalc2 = np.mean(contributions2, axis=0)

# The bias is the same for both subsets, so the gap between the mean predictions
# is explained entirely by the gap between the mean contributions.
# Mean biases (not sums) are compared so the first line still reads 0 when the
# two subsets do not have the same number of rows.
print(np.mean(bias1) - np.mean(bias2))
print(np.sum(totalc1 - totalc2))
print(np.mean(prediction1) - np.mean(prediction2))

0.0
-0.334342105263
-0.334342105263


In [None]:
# Per-feature difference between the two subsets' mean contributions,
# listed from the largest value down to the smallest.
contribution_gap = totalc1 - totalc2
ranked_gap = sorted(zip(contribution_gap, boston.feature_names), reverse=True)
for gap, name in ranked_gap:
    print(name, round(gap, 2))

### classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
iris = load_iris()

# Seed the forest so the prediction and contributions printed below are
# reproducible from one run to the next.
# NOTE(review): this rebinds `rf`, replacing the regressor fitted earlier in
# the notebook; cells above must be re-run from the top before being reused.
rf = RandomForestClassifier(max_depth=4, random_state=0)
rf.fit(iris.data, iris.target)

# Explain a single sample (row 100); slicing keeps it as a one-row 2-D array.
instance = iris.data[100:101]
prediction, bias, contributions = ti.predict(rf, instance)
print("Prediction ", prediction)
print("Bias (trainset prior) ", bias)
print("Feature contributions:")
# One line per input feature, in the dataset's own column order.
for c, feature in zip(contributions[0], iris.feature_names):
    print(feature, c)