# Global interpretation - intuitive

We project the leaf-node participations of the fitted trees into a 2D space with t-SNE and visualize the result.

Hue encodes the predicted output; saturation highlights the samples that match a chosen condition, which shows how those values are treated by the decision trees.

In [1]:
# Execute the data-loading notebook inside this kernel. Later cells use `X` and `y`,
# which are presumably defined there (not visible from this notebook — verify).
%run DataLoading.ipynb

#### Training

In [2]:
from rulefitcustom import RuleFitCustom
from sklearn import manifold

features = X.columns
# DataFrame.as_matrix() was deprecated and then removed in pandas 1.0;
# .values returns the same NumPy array and exists in every pandas version.
X_mat = X.values

# max_rules empirically chosen so as to maximize the graphical representation's AUC (at the end of this notebook)
rf = RuleFitCustom(max_rules=400, model_type='r')
rf.fit(X_mat, y, feature_names=features)

# 2D embedding with PCA initialisation and a fixed random_state
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)

# X_rules contains the tree leaf node participations
X_tsne = tsne.fit_transform(rf.X_rules)


In [3]:
import plotly.graph_objs as go
import numpy as np
import colorsys
from IPython.core.display import display, HTML

def intuitive_map_fig(colors):
    """Build the scatter figure of the 2D t-SNE embedding.

    Reads the notebook-level ``X_tsne`` array: column 0 is drawn on the x
    axis and column 1 on the y axis, one marker per row.

    Parameters
    ----------
    colors
        Value forwarded unchanged to the marker ``color`` property.

    Returns
    -------
    plotly.graph_objs.Figure
        An 800x800 figure holding a single markers-only scatter trace.
    """
    marker_style = dict(symbol='circle-dot', color=colors, size=5)

    # Hover is disabled on the points.
    points = go.Scatter(
        x=X_tsne[:, 0],
        y=X_tsne[:, 1],
        mode='markers',
        hoverinfo='none',
        marker=marker_style,
    )

    page = go.Layout(
        title='Global interpretation - intuitive',
        width=800,
        height=800,
        xaxis=dict(title='t-SNE 1'),
        yaxis=dict(title='t-SNE 2'),
    )

    return go.Figure(data=[points], layout=page)
    

In [4]:
import colorsys

def prediction_highlight_color(prediction, highlight):
    """Turn (prediction, highlight) pairs into ``'rgb(r,g,b)'`` colour strings.

    The hue is taken from the colour ``(p, 0, 1 - p)``, so it runs from blue
    (p = 0) to red (p = 1). The highlight value is used as the saturation and
    the brightness is fixed at 0.90, so a highlight of 0 gives a light grey.

    Parameters
    ----------
    prediction : iterable of float
        Predicted scores; values outside [0, 1] are clamped to that range.
    highlight : iterable of float
        Saturation per sample; values outside [0, 1] are clamped. Iterated
        positionally, so a pandas Series with any index is accepted.

    Returns
    -------
    list of str
        One ``'rgb(r,g,b)'`` string per pair, with integer channels in 0-255.
        Iteration stops at the shorter of the two inputs.
    """
    colors = []
    # zip() walks both inputs by position; indexing a Series with [i] would be
    # label-based and fail on a non-default index.
    for pred, sat in zip(prediction, highlight):
        # A regression output may fall outside [0, 1]; clamp so the hue stays
        # on the blue-red ramp.
        pred = min(1.0, max(0.0, float(pred)))
        sat = min(1.0, max(0.0, float(sat)))

        hue, _, _ = colorsys.rgb_to_hsv(pred, 0, 1 - pred)
        channels = colorsys.hsv_to_rgb(hue, sat, 0.90)

        # colorsys works in [0, 1] while 'rgb(...)' strings expect 0-255 channels.
        red, green, blue = (int(round(c * 255)) for c in channels)
        colors.append('rgb({},{},{})'.format(red, green, blue))
    return colors


#### The exploratory data analysis device

Some interesting queries are left commented out in the cell below; uncomment one `highlight = ...` line at a time to explore it.

In [5]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Exactly one `highlight = ...` line should be active at a time. Each one yields
# a per-sample value of 1.0 (highlighted) or 0.0 (not highlighted).

# No highlighting
#highlight = np.full(y.shape, 1.0)

# Special missing data values
# -9 and -8 seem to indicate behavior special enough that it creates its own pocket.
# We use these values to create features of their own
highlight = (X["MSinceMostRecentInqexcl7days"] == -9).astype(float)
#highlight = (X["MSinceMostRecentInqexcl7days"] == -8).astype(float)
#highlight = (X["MSinceMostRecentInqexcl7days"] == -7).astype(float)


# ExternalRiskEstimate provides a good generic estimate,
# and seems to have influenced the left-right general layout
# At a fixed threshold the colors are not uniform, though
#highlight = (X["ExternalRiskEstimate"] > 70).astype(float)

# Some of the red areas in the top parts of ExternalRiskEstimate
# can be explained by NetFractionRevolvingBurden values
#highlight = (X["NetFractionRevolvingBurden"] > 50).astype(float)

# Some local structures with PercentTradesNeverDelq
#highlight = (X["PercentTradesNeverDelq"] < 90).astype(float)


prediction = rf.predict(X_mat)

# blue-red for prediction 0-1 (blue = low prob of default), saturation for highlighting 0-1
colors = prediction_highlight_color(prediction, highlight)

# iplot() renders the figure itself and returns None; wrapping it in
# display(HTML(...)) only added an empty "<IPython.core.display.HTML object>" output.
iplot(intuitive_map_fig(colors))

<IPython.core.display.HTML object>

#### Evaluation of this 2D space as a map of the different behaviors

In [6]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the embedded points to check how well the 2D coordinates
# alone separate the classes.
# NOTE(review): RuleFit and t-SNE were both fit on the full data set before this
# split, so the held-out points are not unseen by the embedding; the AUC may be
# optimistic.
(X_train_post_tsne, X_test_post_tsne,
 y_train_post_tsne, y_test_post_tsne) = train_test_split(
    X_tsne, y, test_size=0.2, random_state=1
)

# Random forest trained on nothing but the two t-SNE coordinates
rf_rules_tsne = RandomForestClassifier(
    n_estimators=500,
    max_depth=7,
    criterion='gini',
    random_state=0,
)
rf_rules_tsne.fit(X_train_post_tsne, y_train_post_tsne)

# Column 1 of predict_proba is used as the score for the ROC curve
y_pred_rf = rf_rules_tsne.predict_proba(X_test_post_tsne)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test_post_tsne, y_pred_rf)
roc_auc = metrics.auc(fpr, tpr)
roc_auc

0.772264966176788