In [2]:
import gc
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as sc
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

### Code to Build Random Forest and Return Tree Distances

In [1]:
def get_node_depths(tree1):
    """Return the depth of every node of a fitted tree, indexed by node id.

    Parameters
    ----------
    tree1 : object
        Fitted estimator exposing ``tree_.children_left`` and
        ``tree_.children_right`` (child node ids, ``-1`` marking "no child").

    Returns
    -------
    numpy.ndarray
        Integer array where ``result[node_id]`` is the number of edges between
        the root (node 0) and ``node_id``.

    Notes
    -----
    Callers look depths up by node id, so each depth is written at its node's
    position instead of being appended in visiting order; the two layouts only
    coincide when node ids happen to follow a preorder numbering. The walk uses
    an explicit stack so very deep trees cannot exhaust Python's recursion
    limit.
    """
    left = tree1.tree_.children_left
    right = tree1.tree_.children_right
    n_nodes = len(left)
    depths = np.zeros(n_nodes, dtype=int)
    if n_nodes == 0:
        return depths

    pending = [(0, 0)]  # (node id, depth), starting at the root
    while pending:
        node, depth = pending.pop()
        depths[node] = depth
        # Same split test as before: only descend when both children exist.
        if left[node] != -1 and right[node] != -1:
            pending.append((right[node], depth + 1))
            pending.append((left[node], depth + 1))
    return depths

def get_shared_nodes(i1,i2,node_indicator,n_nodes):
    """Return the ids of the tree nodes visited by both sample ``i1`` and ``i2``.

    Parameters
    ----------
    i1, i2 : int
        Row positions of the two samples in ``node_indicator``.
    node_indicator : sparse matrix or array, shape (n_samples, n_nodes)
        Non-zero entry ``[s, k]`` means sample ``s`` passes through node ``k``.
        Entries are assumed to be 0/1 indicators, as the sum-equals-2 test
        below requires.
    n_nodes : int
        Number of nodes in the tree (number of indicator columns).

    Returns
    -------
    numpy.ndarray
        Sorted node ids that lie on both samples' decision paths.
    """
    sample_ids = [i1, i2]
    # Select the two rows *before* densifying: this function runs once per
    # sample pair, so converting the full matrix each time is needlessly costly.
    pair_rows = node_indicator[sample_ids]
    if hasattr(pair_rows, "toarray"):
        pair_rows = pair_rows.toarray()
    pair_rows = np.asarray(pair_rows)

    # A node is shared when both samples flag it.
    common_nodes = pair_rows.sum(axis=0) == len(sample_ids)

    return np.arange(n_nodes)[common_nodes]

# Tree distance between nodes n1, n2 = depth(n1) + depth(n2) - 2 depth(LCA)
def distance_between_samples(indexes,depths,leaves,node_indicator,n_nodes):
    """Tree distance between the leaves reached by two samples.

    The distance is ``depth(leaf_a) + depth(leaf_b) - 2 * depth(LCA)``, where
    the lowest common ancestor (LCA) is the deepest node shared by both
    samples' decision paths.

    Parameters
    ----------
    indexes : sequence
        The two sample positions are read from ``indexes[0]`` and ``indexes[1]``.
    depths : numpy.ndarray
        Depth lookup, indexed by node id.
    leaves : sequence
        ``leaves[s]`` is the node id of the leaf that sample ``s`` lands in.
    node_indicator, n_nodes
        Passed through unchanged to ``get_shared_nodes``.

    Returns
    -------
    The distance, as whatever integer type ``depths`` holds.
    """
    first, second = indexes[0], indexes[1]

    # Depths of the two leaves the samples end up in.
    first_leaf_depth = depths[leaves[first]]
    second_leaf_depth = depths[leaves[second]]

    # The deepest node common to both paths is the lowest common ancestor.
    shared_path = get_shared_nodes(first, second, node_indicator, n_nodes)
    lca_depth = max(depths[shared_path])

    return first_leaf_depth + second_leaf_depth - 2*lca_depth

# bootstraps data and builds a tree, then calculates pairwise distances on the data instances relative to the tree
# Tree distance is calculated via lowest common ancestor
def build_tree(xTrain,yTrain,xTest,X):
    """Fit one tree on a bootstrap resample and measure pairwise tree distances.

    Parameters
    ----------
    xTrain : pandas.DataFrame
        Training features; resampled with replacement to fit the tree.
    yTrain : array-like
        Training labels, attached to ``xTrain`` by column assignment.
    xTest : array-like
        Test features; only used to compute pairwise distances.
    X : array-like
        Full data set; its leaves and decision paths are returned so distances
        on it can be computed later.

    Returns
    -------
    list
        ``[estimator, train_dist_df, test_dist_df, dist_args]`` where the two
        frames have columns ``i1``, ``i2``, ``tree_dist`` (one row per unordered
        pair of row positions) and ``dist_args`` is
        ``[depths, leaves_all, node_indicator_all, n_nodes]`` for ``X``.

    Notes
    -----
    Requires ``gc`` and ``itertools`` to be imported at module level.
    """
    # Use a temporary label column name that cannot overwrite a real feature
    # (a feature literally called 'y' used to be clobbered and then dropped).
    label_col = 'y'
    while label_col in xTrain.columns:
        label_col = '_' + label_col

    train = xTrain.copy()
    train[label_col] = yTrain
    # Bootstrap: same number of rows, drawn with replacement.
    boot = train.sample(n = len(train), replace = True)
    yTrain1 = boot[label_col]
    xTrain1 = boot.drop(label_col, axis = 1)
    gc.collect()

    estimator = DecisionTreeClassifier().fit(xTrain1,yTrain1)
    n_nodes = estimator.tree_.node_count
    depths = get_node_depths(estimator)

    def _pairwise_distances(data):
        """Distance table for every unordered pair of rows in ``data``."""
        leaves = estimator.apply(data)
        node_indicator = estimator.decision_path(data)
        rows = [
            [pair[0], pair[1],
             distance_between_samples(pair, depths, leaves, node_indicator, n_nodes)]
            for pair in itertools.combinations(range(0, len(data)), 2)
        ]
        return pd.DataFrame(rows, columns = ['i1','i2','tree_dist'])

    # Distances are measured on the original (not resampled) training rows.
    train_dist_df = _pairwise_distances(xTrain)
    test_dist_df = _pairwise_distances(xTest)

    leaves_all = estimator.apply(X)
    node_indicator_all = estimator.decision_path(X)
    dist_args = [depths,leaves_all,node_indicator_all,n_nodes]

    return([estimator,train_dist_df,test_dist_df,dist_args])

def _average_pair_distances(frames):
    """Stack per-tree (i1, i2, tree_dist) frames and average tree_dist per pair."""
    non_empty = [frame for frame in frames if len(frame)]
    if not non_empty:
        # No pairs at all (no trees, or fewer than two samples): return an
        # empty, correctly typed table instead of failing inside groupby.
        return pd.DataFrame({
            'i1': pd.Series(dtype='int64'),
            'i2': pd.Series(dtype='int64'),
            'tree_dist': pd.Series(dtype='float64'),
        })
    stacked = pd.concat(non_empty, ignore_index=True)
    return stacked.groupby(['i1','i2']).mean().reset_index()


def fit_random_forest(xTrain,yTrain,num_trees,xTest,X):
    """Build ``num_trees`` bootstrapped trees and average their pair distances.

    Parameters
    ----------
    xTrain, yTrain, xTest, X
        Forwarded unchanged to ``build_tree`` for every tree.
    num_trees : int
        Number of trees to build. Exactly this many are built; the previous
        ``while i <= num_trees`` loop built one extra.

    Returns
    -------
    tuple
        ``(mods, train_final_dist, test_final_dist, dist_args)``: the fitted
        trees, the per-pair mean ``tree_dist`` over all trees for the train and
        test rows (columns ``i1``, ``i2``, ``tree_dist``), and the per-tree
        ``dist_args`` lists returned by ``build_tree``.
    """
    mods = []
    dist_args = []
    train_frames = []
    test_frames = []
    for _ in range(num_trees):
        estimator, train_df, test_df, tree_args = build_tree(xTrain,yTrain,xTest,X)
        mods.append(estimator)
        train_frames.append(train_df)
        test_frames.append(test_df)
        dist_args.append(tree_args)

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and was
    # quadratic when called inside the loop.
    train_final_dist = _average_pair_distances(train_frames)
    test_final_dist = _average_pair_distances(test_frames)
    return(mods,train_final_dist,test_final_dist,dist_args)

def rf_predict(xTest,mods):
    """Combine the trees' predictions into one 0/1 label per sample.

    Each model in ``mods`` predicts on ``xTest``; the predictions are averaged
    element-wise and a sample is labelled 1 when that average is at least 0.5
    (ties go to 1), otherwise 0.

    Parameters
    ----------
    xTest : array-like
        Samples handed to every model's ``predict``.
    mods : iterable
        Fitted models exposing ``predict``.

    Returns
    -------
    list of int
        One 0/1 entry per sample.
    """
    per_tree_votes = [model.predict(xTest) for model in mods]
    vote_share = np.mean(per_tree_votes, axis = 0)
    majority = vote_share >= 0.5
    return list(map(int, majority))

def _pairs_to_symmetric_matrix(pair_dists, n_samples):
    """Expand an (i1, i2, tree_dist) pair table into a dense symmetric matrix."""
    if len(pair_dists):
        # Idempotent when pairs are already unique; averages any duplicates.
        pair_dists = pair_dists.groupby(['i1','i2']).mean().reset_index()

    rows = pair_dists['i1'].to_numpy().astype(int)
    cols = pair_dists['i2'].to_numpy().astype(int)
    values = pair_dists['tree_dist'].to_numpy().astype(float)

    size = n_samples
    if len(rows):
        # Never allocate fewer cells than the largest index in the table needs.
        size = max(size, int(rows.max()) + 1, int(cols.max()) + 1)

    matrix = np.zeros(shape = (size, size))
    # Each unordered pair fills both (i1, i2) and (i2, i1); the diagonal stays 0.
    matrix[rows, cols] = values
    matrix[cols, rows] = values
    return matrix


def return_distance_matrix(xTrain,yTrain,num_trees,xTest,X):
    """Fit the forest and return its models with dense tree-distance matrices.

    Parameters
    ----------
    xTrain, yTrain, num_trees, xTest, X
        Forwarded unchanged to ``fit_random_forest``.

    Returns
    -------
    tuple
        ``(mods, test_dist, train_dist)``: the fitted trees, then symmetric
        matrices of mean pairwise tree distance for the test rows and the
        training rows, with entry ``[i, j]`` for row positions ``i`` and ``j``.
    """
    mods, train_final_dist, test_final_dist, _dist_args = fit_random_forest(
        xTrain,yTrain,num_trees,xTest,X)

    test_dist = _pairs_to_symmetric_matrix(test_final_dist, len(xTest))
    train_dist = _pairs_to_symmetric_matrix(train_final_dist, len(xTrain))

    # The original returned the undefined names `mod` and `dist_train`.
    return mods, test_dist, train_dist