In [220]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
import xgboost
import numpy as np
from scipy.sparse import csr_matrix, diags

In [221]:
# Feature frame kept under `iris` for inspection; the model is fit on the
# plain-array form returned by `return_X_y=True`.
iris = load_iris(as_frame=True)['data']
X, y = load_iris(return_X_y=True)

# Seed the split so that Restart Kernel -> Run All reproduces the same
# train/test partition (and therefore the same forest and proximities).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

skrf = RandomForestClassifier(max_depth=3, random_state=0)
skrf.fit(X_train, y_train)
skpreds = skrf.predict(X_test)

# Leaf index reached by every training sample in every tree;
# unpacked downstream as (n_samples, n_trees).
leaves = skrf.apply(X_train)

In [222]:
def make_adjacency(leaves):
    """Build a leaf co-occurrence proximity matrix from per-tree leaf ids.

    Entry ``(i, j)`` of the result is the average over trees of
    ``1 / leaf_size`` for each tree in which samples ``i`` and ``j`` share a
    leaf, where ``leaf_size`` counts the rows of ``leaves`` in that leaf.

    Parameters
    ----------
    leaves : ndarray of shape (n_samples, n_trees)
        Leaf identifier of each sample in each tree.

    Returns
    -------
    A : ndarray of shape (n_samples, n_samples)
        Dense proximity matrix.
    metadata : dict
        Everything needed to place further samples in the same leaf space:
        ``"offsets"`` (first global column of each tree), ``"leaf_weights"``
        (``1 / leaf_size`` per global leaf), ``"M_norm"`` (sparse membership
        matrix with columns scaled by ``leaf_weights``), ``"total_leaves"``
        and ``"mappings"`` (per-tree dict from raw leaf id to compact index).
    """
    n_samples, n_trees = leaves.shape

    # Give every (tree, leaf) pair its own column in one shared index space.
    offsets = []
    mappings = []
    per_tree_columns = []
    next_free = 0
    for tree_idx in range(n_trees):
        raw_ids, compact_ids = np.unique(leaves[:, tree_idx], return_inverse=True)
        offsets.append(next_free)
        mappings.append(dict(zip(raw_ids, range(len(raw_ids)))))
        per_tree_columns.append(compact_ids + next_free)
        next_free += len(raw_ids)
    total_leaves = next_free

    # Sparse membership matrix: one 1 per (sample, tree) at that tree's leaf.
    sample_rows = np.repeat(np.arange(n_samples), n_trees)
    leaf_cols = np.column_stack(per_tree_columns).ravel()
    ones = np.ones(sample_rows.shape, dtype=np.float32)
    M = csr_matrix((ones, (sample_rows, leaf_cols)), shape=(n_samples, total_leaves))

    # Column sums are leaf populations; the floor of 1 guards the reciprocal.
    leaf_sizes = np.asarray(M.sum(axis=0)).ravel()
    leaf_weights = 1.0 / np.maximum(leaf_sizes, 1)

    M_norm = M @ diags(leaf_weights)
    A = (M_norm @ M.T) / n_trees

    return A.toarray(), {
        "offsets": offsets,
        "leaf_weights": leaf_weights,
        "M_norm": M_norm,
        "total_leaves": total_leaves,
        "mappings": mappings,
    }

In [223]:
# Leaf assignments of the held-out samples, per tree.
new_leaves = skrf.apply(X_test)
# Train-to-train proximities plus the leaf bookkeeping needed for new samples.
A, metadata = make_adjacency(leaves)

In [224]:
def new_adjacency(new_leaves, metadata):
    """Proximity of new samples to the training samples behind ``metadata``.

    Uses the same kernel as ``make_adjacency``: entry ``(i, j)`` averages,
    over trees, ``1 / leaf_size`` for each tree in which new sample ``i``
    lands in the same leaf as training sample ``j``. Feeding the training
    leaves back in therefore reproduces the training proximity matrix.

    Parameters
    ----------
    new_leaves : ndarray of shape (n_new, n_trees)
        Leaf identifier of each new sample in each tree.
    metadata : dict
        The second return value of ``make_adjacency``. Reads ``"offsets"``,
        ``"mappings"``, ``"M_norm"`` and ``"total_leaves"``.

    Returns
    -------
    scipy sparse matrix of shape (n_new, n_train)

    Raises
    ------
    ValueError
        If ``new_leaves`` has a different number of trees than ``metadata``.
    KeyError
        If a new sample lands in a leaf that is absent from the mappings.
    """
    offsets = metadata["offsets"]
    mappings = metadata["mappings"]
    M_norm_train = metadata["M_norm"]
    total_leaves = metadata["total_leaves"]
    n_new, n_trees = new_leaves.shape

    if n_trees != len(mappings):
        raise ValueError(
            f"new_leaves has {n_trees} trees but metadata describes {len(mappings)}"
        )

    new_leafIDs = []
    for t in range(n_trees):
        mapping = mappings[t]
        # Explicit integer dtype keeps the indices valid for an empty batch.
        relabeled = np.array(
            [mapping[lid] for lid in new_leaves[:, t]], dtype=np.intp
        )
        new_leafIDs.append(relabeled + offsets[t])
    leafIDs_global = np.column_stack(new_leafIDs)

    row_ind = np.repeat(np.arange(n_new), n_trees)
    col_ind = leafIDs_global.ravel()
    data = np.ones_like(row_ind, dtype=np.float32)
    M_new = csr_matrix((data, (row_ind, col_ind)), shape=(n_new, total_leaves))

    # M_norm_train already carries the 1/leaf_size column scaling, so the raw
    # membership matrix is used here; scaling M_new as well would weight each
    # shared leaf by 1/leaf_size**2 and disagree with the training matrix.
    A_new = (M_new @ M_norm_train.T) / n_trees
    return A_new

In [226]:
# Proximity of each held-out sample (rows) to each training sample (columns).
A_new = new_adjacency(new_leaves, metadata)

In [227]:
# Show the sparse-matrix summary of the test-to-train proximities.
A_new

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2000 stored elements and shape (38, 112)>