In [7]:
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

# Load the training sheet and drop every row that contains a missing value.
# `sheet_name` is the current keyword; the old `sheetname` spelling is no
# longer accepted by recent pandas releases.
df = pd.read_excel('C:/Anaconda3/projects/oled/oled.xlsx', sheet_name='extend_side')
df = df.dropna(axis=0)

# Features: every column except the first one and the last two.
# Target: the 'cd/A' column, kept as a single-column DataFrame.
X = df[list(df.columns)[1:-2]]
y = df[['cd/A']]

# Fixed random_state so the 75/25 hold-out split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7)

# The same preprocessing object is fitted on the training split and then
# applied to both splits (and reused later for new samples).
# NOTE(review): Normalizer is a different transform from the MinMaxScaler /
# StandardScaler also imported above -- confirm it matches the preprocessing
# that was used when the pickled model was trained.
scaler = Normalizer()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

filename = 'rforest_model.sav'
# Load the model from disk. The context manager closes the file handle as
# soon as unpickling is done.
# WARNING: unpickling can execute arbitrary code -- only load model files
# that come from a trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
print('R-squared: ', loaded_model.score(X_test, y_test))


R-squared:  0.696328338358


In [8]:
# Load the sheet that holds the new samples to predict.
# `sheet_name` is the current keyword; the old `sheetname` spelling is no
# longer accepted by recent pandas releases.
new_df = pd.read_excel('C:/Anaconda3/projects/oled/oled.xlsx', sheet_name='extend_side_sample')

# Every column except the last one is used as a feature.
# NOTE(review): the training cell selects columns [1:-2] while this one uses
# [:-1] -- confirm the sample sheet layout yields the same features in the
# same order as the training data.
new_X = new_df[list(new_df.columns)[:-1]]

# Apply the same preprocessing object that was applied to the training data.
new_X = scaler.transform(new_X)

y_pred = loaded_model.predict(new_X)
print(y_pred)

[ 19.587       23.39179487  21.45        20.76333333  23.39179487
  26.42179487]


In [14]:
# predict() expects a 2-D (n_samples, n_features) array. Slicing with a range
# (new_X[i:i + 1]) keeps the row dimension, whereas new_X[i] would hand the
# model a 1-D vector, which current scikit-learn rejects.
for i in range(2):
    print("Instance {} prediction:".format(i), loaded_model.predict(new_X[i:i + 1]))

Instance 0 prediction: [ 19.587]
Instance 1 prediction: [ 23.39179487]




### Tree interpreter
The predictions are interpreted with the `treeinterpreter` package; instructions are available in this blog post: http://blog.datadive.net/random-forest-interpretation-with-scikit-learn/

In [23]:
from treeinterpreter import treeinterpreter as ti

# Decompose every prediction into a bias term plus one contribution per feature.
prediction, bias, contributions = ti.predict(loaded_model, new_X)
names = list(X)
separator = "-" * 20

for i, (instance_bias, instance_contribs) in enumerate(zip(bias, contributions)):
    print("Instance", i)
    print("Bias (trainset mean)", instance_bias)
    print("Feature contributions:")
    # Pair each contribution with its feature name and list the largest
    # absolute contributions first.
    ranked = sorted(zip(instance_contribs, names), key=lambda pair: -abs(pair[0]))
    for value, feature_name in ranked:
        print(feature_name, round(value, 2))
    print(separator)

Instance 0
Bias (trainset mean) 24.2759015097
Feature contributions:
homo2 -4.94
tripletdopant 2.86
wf -1.78
triplet2 -1.41
thick 1.33
lumo5 -1.0
homo_d1 0.85
triplet3 -0.73
lumo_d1 0.6
triplet1 -0.56
homo4 -0.42
HOMO 0.32
homo3 -0.31
thick_etl 0.3
thick_htl 0.24
lumo6 -0.09
LUMO 0.05
homo1 0.0
--------------------
Instance 1
Bias (trainset mean) 24.2759015097
Feature contributions:
homo2 -3.56
homo1 2.99
tripletdopant 2.69
thick_etl -2.14
thick -1.82
homo_d1 1.3
homo4 -1.16
lumo5 -0.94
HOMO 0.84
triplet3 -0.6
lumo_d1 0.6
LUMO 0.56
thick_htl 0.43
triplet1 0.34
triplet2 -0.2
lumo6 -0.09
wf -0.09
homo3 -0.04
--------------------
Instance 2
Bias (trainset mean) 24.2759015097
Feature contributions:
homo2 -3.56
tripletdopant 3.2
thick_etl -1.84
thick -1.75
homo_d1 1.3
homo4 -1.16
lumo5 -0.94
HOMO 0.84
lumo_d1 0.6
LUMO 0.58
lumo6 -0.56
thick_htl 0.39
triplet1 0.34
triplet2 -0.2
wf -0.09
triplet3 0.07
homo1 -0.05
homo3 0.0
--------------------
Instance 3
Bias (trainset mean) 24.2759015097
Fea

In [26]:
import numpy as np

# Sanity check: the bias plus the per-feature contributions summed over each
# row is printed next to the model's own predictions so the two arrays can be
# compared by eye.
reconstructed = bias + np.sum(contributions, axis=1)

print(prediction)
print(reconstructed)

[ 19.587       23.39179487  21.45        20.76333333  23.39179487
  26.42179487]
[ 19.587       23.39179487  21.45        20.76333333  23.39179487
  26.42179487]


In [27]:
# Experiment with scikit-learn's built-in decision_path on the new samples;
# the returned value is left as the cell's last expression so it is displayed.
forest_paths = loaded_model.decision_path(new_X)
forest_paths

(<6x1310 sparse matrix of type '<class 'numpy.int64'>'
 	with 461 stored elements in Compressed Sparse Row format>,
 array([   0,  121,  250,  387,  528,  663,  794,  921, 1058, 1191, 1310], dtype=int32))