# Random Forest - Boston House Price Prediction - Interpretability 

In [107]:
print("Bismillahir Rahmanir Rahim")

Bismillahir Rahmanir Rahim


In [131]:
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score

from lime.lime_tabular import LimeTabularExplainer

from pprint import pprint
from IPython.display import display

In [228]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import glob
import matplotlib.pyplot as plt
import numpy as np
import sklearn

import eli5
import lime
import treeinterpreter as ti

## Boston House Price Prediction

In [194]:
from treeinterpreter import treeinterpreter as ti
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import (LinearRegression, Ridge, 
                                  Lasso, RandomizedLasso)
from sklearn.feature_selection import RFE, f_regression
from sklearn.preprocessing import MinMaxScaler

In [241]:
def rank_to_dict(ranks, names, order=1):
#     minmax = MinMaxScaler()
#     ranks = minmax.fit_transform(order*np.array([ranks]).T).T[0]
    ranks = map(lambda x: round(x, 2), ranks)
    return dict(zip(names, ranks ))

### Dataset

In [197]:
from sklearn.datasets import load_boston
boston = load_boston()

In [135]:
X, y = boston['data'], boston['target']
print("\nVariables =", boston.data[:1])
print("\nTarget = ", boston.target[:1])


Variables = [[6.320e-03 1.800e+01 2.310e+00 0.000e+00 5.380e-01 6.575e+00 6.520e+01
  4.090e+00 1.000e+00 2.960e+02 1.530e+01 3.969e+02 4.980e+00]]

Target =  [24.]


## Random Forest implementation

In [140]:
rf = RandomForestRegressor()
rf.fit(boston.data[:300], boston.target[:300])
# rf.fit(boston.data[:400], boston.target[:400])



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [191]:
instances = boston.data[[300]]
print("\nNumber of testing instances = ", len(instances))
print("\nRondom forest price predictions = ", rf.predict(instances))
print("\nRandom forest feature importances sorted values:")
weights = dict(zip(boston.feature_names, rf.feature_importances_))
for name, fval in sorted(weights.items(), key=lambda p:p[1], reverse=True):
    print(name, round(fval,6))


Number of testing instances =  1

Rondom forest price predictions =  [28.36]

Random forest feature importances sorted values:
RM 0.820485
LSTAT 0.090758
CRIM 0.017826
AGE 0.015257
PTRATIO 0.014463
TAX 0.014264
DIS 0.010932
B 0.006211
NOX 0.003903
INDUS 0.003559
RAD 0.001036
ZN 0.000688
CHAS 0.000617


### Random forest & TreeInterpreter

In [146]:
prediction, bias, contributions = ti.predict(rf, instances)

In [184]:
for i in range(len(instances)):
    print("\nInstance", i)
    print("\nBias (trainset mean)", bias[i])
    print("\nFeature contributions:")
    for c, feature in sorted(zip(contributions[i], 
                                 boston.feature_names), 
                             key=lambda x: -abs(x[0])):
        print(feature, round(c, 2))
    print("-"*20)


Instance 0

Bias (trainset mean) 25.756266666666676

Feature contributions:
RM 2.88
LSTAT 0.86
TAX -0.59
CRIM -0.22
AGE -0.2
PTRATIO 0.19
DIS -0.18
NOX -0.17
ZN -0.15
RAD 0.15
INDUS 0.03
B 0.01
CHAS 0.0
--------------------


In [148]:
print("\nSanity check")
print("\nPrediction Values", prediction)
print("\nSumming up 'bais' and 'feature contributions' = ", bias + np.sum(contributions, axis=1))


Sanity check

Prediction Values [[28.36]]

Summing up 'bais' and 'feature contributions' =  [28.36]


### Random forest & el5

In [230]:
eli5.explain_weights(rf, feature_names=boston.feature_names)

Weight,Feature
0.7906  ± 0.0503,RM
0.1132  ± 0.0560,LSTAT
0.0229  ± 0.0170,CRIM
0.0173  ± 0.0176,TAX
0.0141  ± 0.0272,AGE
0.0131  ± 0.0151,PTRATIO
0.0093  ± 0.0160,DIS
0.0067  ± 0.0090,B
0.0038  ± 0.0062,NOX
0.0038  ± 0.0056,INDUS


## Ridge Regression Implementation

In [189]:
rg = Ridge()
rg.fit(boston.data[:300], boston.target[:300])
print("\nRondom forest price predictions = ", rg.predict(instances))


Rondom forest price predictions =  [29.68677815]


In [199]:
# eli5.explain_weights(rg, feature_names=boston.feature_names)

## Comparisons

In [243]:
X = boston.data[:300]
Y = boston.target[:300]
names = boston.feature_names
ranks = {}

lr = LinearRegression(normalize=True)
lr.fit(X, Y)
ranks["Reg"] = rank_to_dict(np.abs(lr.coef_), names)

ridge = Ridge(alpha=7)
ridge.fit(X, Y)
ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)
 
lasso = Lasso(alpha=.05)
lasso.fit(X, Y)
ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

# rlasso = RandomizedLasso(alpha=0.04)
# rlasso.fit(X, Y)
# ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

rf = RandomForestRegressor()
rf.fit(X,Y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

f, pval  = f_regression(X, Y, center=True)
ranks["Corr."] = rank_to_dict(f, names)

 
r = {}
for name in names:
    r[name] = round(np.mean([ranks[method][name] 
                             for method in ranks.keys()]), 2)

methods = sorted(ranks.keys())
ranks["Mean"] = r
methods.append("Mean")

 
print("\t\t%s" % "\t\t".join(methods))
for name in names:
    print("%s\t\t%s" % (name, "\t\t".join(map(str, 
                         [ranks[method][name] for method in methods]))))

		Corr.		Lasso		RF		Reg		Ridge		Mean
CRIM		13.94		0.45		0.01		1.2		0.64		3.25
ZN		35.7		0.01		0.0		0.01		0.02		7.15
INDUS		55.38		0.01		0.01		0.02		0.0		11.08
CHAS		1.64		0.09		0.0		0.6		0.56		0.58
NOX		21.15		0.0		0.0		8.83		0.73		6.14
RM		1241.9		9.03		0.81		9.13		8.26		253.83
AGE		27.21		0.05		0.01		0.05		0.04		5.47
DIS		1.73		0.84		0.01		1.01		0.91		0.9
RAD		0.39		0.14		0.0		0.17		0.18		0.18
TAX		31.12		0.01		0.02		0.01		0.02		6.24
PTRATIO		72.86		0.6		0.02		0.64		0.61		14.95
B		14.98		0.02		0.01		0.02		0.02		3.01
LSTAT		290.43		0.13		0.1		0.11		0.2		58.19




In [127]:
# te = TextExplainer(random_state=42)
# doc = newsgroups_test.data[idx]
# te.fit(doc, c.predict_proba)
# te.show_prediction(target_names=class_names)

In [128]:
# # te.metrics_
# pprint(len(te.samples_))
# print(te.samples_[0])

In [16]:
# te.explain_weights(top=None)

In [17]:
# type(te.clf_)
# type(te.vec_)


In [18]:
# import scipy.sparse
# matrix = te.X_
# scipy.sparse.save_npz('/Users/adnanbajwa/Desktop/sparse_matrix.npz', matrix)

### Explaining predictions using lime

In [8]:
# c = make_pipeline(vectorizer, nb)

In [9]:
# pprint(c.predict_proba([newsgroups_test.data[0]]).round(3))

In [10]:
# explainer = LimeTextExplainer(class_names=class_names)

In [11]:
# idx = 102
# exp = explainer.explain_instance(newsgroups_test.data[idx], c.predict_proba, num_features=6, labels=[0,1,2,3])

In [12]:
# print('Document id: %d' % idx)
# print('Predicted class =', class_names[nb.predict(test_vectors[idx]).reshape(1,-1)[0,0]])
# print('True class: %s' % class_names[newsgroups_test.target[idx]])

In [5]:
# exp = explainer.explain_instance(newsgroups_test.data[idx], c.predict_proba, num_features=6, top_labels=2)
# pprint(exp.available_labels())

In [45]:
# exp.show_in_notebook(text=False)

In [13]:
# exp.show_in_notebook(text=newsgroups_test.data[idx])

In [47]:
# exp.show_in_notebook(text=newsgroups_test.data[idx], labels=(0,))

# Conclusion: