In [1]:
import copy

import numpy as np
import scipy as sp
import random
import os
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error, zero_one_loss, roc_auc_score
from sklearn.linear_model import LassoCV, LogisticRegressionCV, RidgeCV
from sklearn.preprocessing import OneHotEncoder

from imodels.importance.representation_cleaned import *
from imodels.importance.r2f_exp_cleaned import *

## 6. Testing full pipeline

In [14]:
# Synthetic data for the full-pipeline check: n samples and a single feature
# block of width p1. p2 and p3 are assigned but not used in this cell.
# NOTE(review): no seed is set in this cell, so the draws differ run to run.
n, p1, p2, p3 = 100, 5, 7, 1

# Standard-normal features shifted by +1.
data_block1 = 1.0 + np.random.randn(n, p1)

# Only the first column enters the response; added noise is scaled by 0.1.
y = data_block1 @ np.eye(p1)[0] + 0.1 * np.random.randn(n)

# Bernoulli response with success probability expit(y).
y_binary = np.random.binomial(n=1, p=sp.special.expit(y))

In [28]:
%%time

# Run the default GMDI pipeline on the continuous response y. The recorded
# output has one value per column of data_block1 (5 entries).
default_gmdi_pipeline(data_block1, y)

CPU times: user 6.65 s, sys: 1.01 s, total: 7.66 s
Wall time: 2.12 s


array([ 9.88115663e-01,  3.59409907e-03, -9.01820213e-04, -6.73031504e-05,
        3.19628895e-03])

In [17]:
# Same call with the binary response, but without passing `regression`.
# NOTE(review): presumably this scores y_binary with the pipeline's default
# (regression) settings — confirm that is the intended comparison against the
# call that passes regression=False.
default_gmdi_pipeline(data_block1, y_binary)

array([ 0.00542176, -0.01983371, -0.02328567, -0.02018197, -0.01684698])

In [16]:
%%time

# Binary response with regression=False passed explicitly. The recorded %%time
# output shows about 15 s wall time here versus about 2 s for the call on y.
default_gmdi_pipeline(data_block1, y_binary, regression=False)

CPU times: user 48.5 s, sys: 8.71 s, total: 57.2 s
Wall time: 15.1 s


array([0.52201067, 0.28533333, 0.28884267, 0.23638933, 0.322096  ])

## 5. Testing LOO PPM

In [29]:
# Synthetic three-block design: block k has width p_k and standard-normal
# entries shifted by k (k = 1, 2, 3).
# NOTE(review): no seed is set in this cell, so the draws differ run to run.
n, p1, p2, p3 = 100, 5, 7, 1
data_blocks = [
    center + np.random.randn(n, width)
    for center, width in ((1.0, p1), (2.0, p2), (3.0, p3))
]
data_block1, data_block2, data_block3 = data_blocks

# Continuous response: first column of block 1 plus noise scaled by 0.1.
y = data_block1 @ np.eye(p1)[0] + 0.1 * np.random.randn(n)

# Bernoulli response with success probability expit(y).
y_binary = np.random.binomial(n=1, p=sp.special.expit(y))

blocked_data = BlockPartitionedData(data_blocks)

# Categorical response drawn uniformly from {0, 1, 2, 3}, independent of the
# features, plus its dense one-hot encoding.
y_categorical = np.random.randint(low=0, high=4, size=n)
y_cat_one_hot = (
    OneHotEncoder()
    .fit_transform(y_categorical[:, np.newaxis])
    .toarray()
)

In [11]:
# Reference for the LOO PPM check: scikit-learn's RidgeCV on the column-stacked
# blocks, with the per-sample CV values stored so their mean per alpha can be
# compared against what the ridge LOO PPM produces.
ridge_cv = RidgeCV(
    alphas=np.logspace(start=-5, stop=5, num=20),
    store_cv_values=True,
)
ridge_cv.fit(np.concatenate(data_blocks, axis=1), y)
np.mean(ridge_cv.cv_values_, axis=0)

RidgeCV(alphas=array([1.00000000e-05, 3.35981829e-05, 1.12883789e-04, 3.79269019e-04,
       1.27427499e-03, 4.28133240e-03, 1.43844989e-02, 4.83293024e-02,
       1.62377674e-01, 5.45559478e-01, 1.83298071e+00, 6.15848211e+00,
       2.06913808e+01, 6.95192796e+01, 2.33572147e+02, 7.84759970e+02,
       2.63665090e+03, 8.85866790e+03, 2.97635144e+04, 1.00000000e+05]),
        store_cv_values=True)

In [30]:
# Fit RidgeLOOPPM on the blocked design and the continuous response.
ridge_loo_ppm = RidgeLOOPPM()
# set_alphas is intentionally not called here (left commented out), so the
# class's own alpha handling applies — TODO confirm which grid that is.
# ridge_loo_ppm.set_alphas("default", blocked_data, y)
ridge_loo_ppm.fit(blocked_data, y)

In [33]:
# Mean squared error of each block's partial predictions against y, printed
# for blocks 0, 1 and 2 in turn.
for block_idx in range(3):
    partial_pred = ridge_loo_ppm.get_partial_predictions(block_idx)
    print(mean_squared_error(y, partial_pred))

0.009740558985122851
1.0445627125783774
1.038655225654163


In [34]:
# Elementwise difference between the block-0 partial predictions and y.
ridge_loo_ppm.get_partial_predictions(0) - y

array([ 0.03522215, -0.02731276, -0.05306986, -0.15863974, -0.0135043 ,
        0.01488684,  0.05252128, -0.00798004,  0.09640539,  0.05875139,
       -0.02915093, -0.0860761 ,  0.06633575,  0.17918859, -0.16102169,
        0.1825987 , -0.14722516,  0.09257829, -0.05591041,  0.02347763,
        0.10264974,  0.13445467,  0.363944  , -0.10472336,  0.04655741,
       -0.10092641, -0.03921782, -0.14559562, -0.02711224,  0.0472973 ,
        0.01512068, -0.05505554,  0.08162527,  0.01849988, -0.00962624,
       -0.01692529,  0.01722482,  0.04679461,  0.07423115, -0.10081393,
       -0.00691934, -0.0219557 ,  0.0313211 , -0.06756291,  0.07848294,
       -0.14326682, -0.0804344 , -0.00052118,  0.000466  ,  0.0804554 ,
        0.13094288,  0.00486731,  0.01995709,  0.01836871,  0.03528491,
        0.14015391, -0.14077661,  0.11925971, -0.05419787, -0.18064702,
       -0.06848067, -0.02031649, -0.0202443 ,  0.00307043,  0.00817902,
        0.01902598, -0.03093485, -0.04772098,  0.06785038,  0.14

In [32]:
# Inspect alpha_ after fitting. The recorded value (about 0.1624) coincides
# with one grid point of np.logspace(-5, 5, 20).
ridge_loo_ppm.alpha_

0.1623776739188721

In [35]:
# Same workflow as the ridge case, using LogisticLOOPPM with the binary
# response on the same blocked design.
log_loo_ppm = LogisticLOOPPM()
log_loo_ppm.fit(blocked_data, y_binary)

In [36]:
# ROC AUC of each block's partial predictions against the binary response,
# printed for blocks 0, 1 and 2 in turn.
for block_idx in range(3):
    partial_pred = log_loo_ppm.get_partial_predictions(block_idx)
    print(roc_auc_score(y_binary, partial_pred))

0.6974259349198639
0.4035939776590578
0.36765420106847985


In [37]:
# Rebinds log_loo_ppm to a fresh LogisticLOOPPM fitted on the one-hot encoded
# categorical response; the earlier binary fit is no longer reachable under
# this name.
log_loo_ppm = LogisticLOOPPM()
log_loo_ppm.fit(blocked_data, y_cat_one_hot)

In [38]:
# ROC AUC of each block's partial predictions against the one-hot response,
# printed for blocks 0, 1 and 2 in turn. The labels were drawn independently
# of the features, so no block is expected to carry signal.
for block_idx in range(3):
    partial_pred = log_loo_ppm.get_partial_predictions(block_idx)
    print(roc_auc_score(y_cat_one_hot, partial_pred))

0.22062523744911805
0.2537056535504297
0.14837630031659882


In [41]:
# Inspect alpha_ for the one-hot fit. The recorded output is an array of four
# values (the one-hot response has four categories), whereas the ridge model's
# alpha_ was a scalar.
log_loo_ppm.alpha_

array([6.95192796e+01, 1.00000000e+05, 2.06913808e+01, 1.00000000e+05])

## 4. Testing PPM

In [2]:
# Synthetic three-block design: block k has width p_k and standard-normal
# entries shifted by k (k = 1, 2, 3).
# NOTE(review): no seed is set in this cell, so the draws differ run to run.
n, p1, p2, p3 = 100, 5, 7, 1
data_blocks = [
    center + np.random.randn(n, width)
    for center, width in ((1.0, p1), (2.0, p2), (3.0, p3))
]
data_block1, data_block2, data_block3 = data_blocks

# Continuous response: first column of block 1 plus noise scaled by 0.1.
y = data_block1 @ np.eye(p1)[0] + 0.1 * np.random.randn(n)

# Bernoulli response with success probability expit(y).
y_binary = np.random.binomial(n=1, p=sp.special.expit(y))

blocked_data = BlockPartitionedData(data_blocks)

In [3]:
# Fit RidgePPM: the alpha grid is set explicitly with the "default" option
# before fitting.
ridge_ppm = RidgePPM()
ridge_ppm.set_alphas("default", blocked_data, y)
# mode="keep_rest" is the same mode name that get_modified_data accepts in
# section 1 of this notebook.
ridge_ppm.fit(blocked_data, y, mode="keep_rest")

In [13]:
# Elementwise difference between the block-2 partial predictions and y.
ridge_ppm.get_partial_predictions(2) - y

array([ 0.08492051, -0.0349213 , -0.06792441,  0.08407434, -0.07620152,
       -0.0004732 ,  0.05422286, -0.04613619,  0.00037839, -0.01384697,
        0.08827928,  0.06944406, -0.00322979, -0.01649656, -0.0015974 ,
       -0.14507789,  0.04960882,  0.0381334 , -0.04473916,  0.06861695,
        0.00911737, -0.0473678 , -0.18176052,  0.11773713, -0.08485342,
        0.04029759,  0.07034176,  0.05821912, -0.03303226, -0.0178732 ,
       -0.02955732, -0.03761804,  0.01260764,  0.00749997,  0.04575837,
        0.07281188, -0.05025104,  0.09746286,  0.09943579,  0.03628978,
        0.0654146 , -0.05898608,  0.09908187, -0.02896701, -0.07364056,
        0.17138064, -0.06298987, -0.02333959, -0.05114554, -0.05958105,
       -0.04476297, -0.11882374,  0.15732295,  0.00582565,  0.05843596,
       -0.00483641,  0.00345955, -0.06417568, -0.18592493, -0.07238636,
        0.08430831, -0.10502395, -0.06111078,  0.06695921,  0.01078021,
        0.04535849,  0.02325094,  0.00470122,  0.04858282, -0.11

In [15]:
# GenericPPM wrapping a LassoCV estimator, fitted on the continuous response.
lasso_ppm = GenericPPM(estimator=LassoCV())
lasso_ppm.fit(blocked_data, y)

# Print the MSE of the partial predictions for blocks 0 and 1 on one line,
# separated by a space.
print(*(
    mean_squared_error(y, lasso_ppm.get_partial_predictions(block_idx))
    for block_idx in (0, 1)
))

0.007890359106514915 0.8119756854858856


In [24]:
# GenericPPM wrapping a LogisticRegressionCV estimator, fitted on the binary
# response.
logistic_ppm = GenericPPM(estimator=LogisticRegressionCV())
logistic_ppm.fit(blocked_data, y_binary)

# The value of this call is discarded (it is not the cell's last expression);
# it is kept so the sequence of calls made on logistic_ppm is unchanged.
logistic_ppm.get_partial_predictions(0)

# ROC AUC per block, scored on column 1 of the partial predictions
# (presumably the positive-class column — confirm against GenericPPM).
for block_idx in range(3):
    class_one_score = logistic_ppm.get_partial_predictions(block_idx)[:, 1]
    print(roc_auc_score(y_binary, class_one_score))

0.7221978021978022
0.6865934065934065
0.5507692307692307


## 3. Testing tree transformer

In [2]:
# Synthetic single-block data for the tree-transformer checks. p2 and p3 are
# assigned but not used in this cell.
# NOTE(review): no seed is set in this cell, so the draws differ run to run.
n, p1, p2, p3 = 100, 5, 7, 1

# Standard-normal features shifted by +1; the response is the first column
# plus noise scaled by 0.1.
data_block1 = 1.0 + np.random.randn(n, p1)
y = data_block1 @ np.eye(p1)[0] + 0.1 * np.random.randn(n)

# Fit a single regression tree with default settings. The fit call stays the
# last expression so the cell still displays the estimator repr.
tree_model = DecisionTreeRegressor()
tree_model.fit(data_block1, y)

DecisionTreeRegressor()

In [3]:
# Build a TreeTransformer from p1 and the fitted single tree. No `data`
# argument is passed here, unlike the random-forest cell further down.
tree_transformer = TreeTransformer(p1, tree_model)

In [4]:
# Transform data_block1 with second argument 0 (presumably the index of the
# feature to transform — confirm against TreeTransformer).
tree_block = tree_transformer.transform_one_feature(data_block1, 0)

In [5]:
# Squared Euclidean norm of every column of tree_block.
# NOTE(review): the recorded values are whole numbers and the first equals
# n = 100, which is consistent with 0/1 indicator columns — confirm.
np.linalg.norm(tree_block, axis=0) ** 2

array([100.,  54.,  13.,   7.,   2.,   3.,   2.,   3.,  41.,  14.,   7.,
        27.,   2.,  18.,   3.,  46.,  30.,  19.,   8.,  11.,   6.,   2.,
        16.,  11.,   2.,   4.,   2.])

In [3]:
# Repeat the transformer check with a random forest fitted with default
# settings on the same data.
rf_model = RandomForestRegressor()
rf_model.fit(data_block1, y)
# Rebinds tree_transformer and tree_block from the single-tree cells. Here the
# training matrix is also passed through the `data` keyword.
tree_transformer = TreeTransformer(p1, rf_model, data=data_block1)
tree_block = tree_transformer.transform_one_feature(data_block1, 0)

In [4]:
# Squared Euclidean norm of every column of the forest-based tree_block. The
# recorded output is elided in the middle by numpy's array repr.
np.linalg.norm(tree_block, axis=0) ** 2

array([100.,  53.,  13., ...,   8.,   7.,   6.])

In [7]:
# Low-level scikit-learn Tree object (`tree_`) of the forest's first estimator.
tree = rf_model.estimators_[0].tree_

In [10]:
# Experiment: write into the first entry of the n_node_samples array.
# NOTE(review): if that array is a view of the tree's internal storage, this
# alters the fitted rf_model in place and later cells see the changed value —
# confirm, and refit rf_model before relying on it again.
tree.n_node_samples[0] = 5

In [12]:
# Experiment: rebinding the n_node_samples attribute itself (as opposed to
# writing into the array it returns) is rejected by scikit-learn's Tree type.
# The failure is the point of this cell, so it is caught and printed instead
# of being left as an unhandled traceback that would stop "Run All".
try:
    tree.n_node_samples = 1
except AttributeError as err:
    print("AttributeError:", err)

AttributeError: attribute 'n_node_samples' of 'sklearn.tree._tree.Tree' objects is not writable

In [24]:
# Per-node count of rows of data_block1 whose decision path through the first
# forest tree includes that node: getnnz(0) counts the stored entries in each
# column of the sparse matrix returned by decision_path.
rf_model.estimators_[0].decision_path(data_block1).getnnz(0)

array([100,  47,  17,   7,   5,   3,   2,   1,   1,   2,  10,   5,   1,
         4,   5,   3,   2,   1,   2,  30,  10,   4,   2,   2,   1,   1,
         6,   1,   5,   2,   1,   1,   3,   1,   2,  20,   5,   3,   1,
         2,   1,   1,   2,   1,   1,  15,  10,   8,   7,   2,   5,   3,
         2,   1,   2,   5,   4,   3,   1,   2,   1,   1,  53,  38,  18,
        10,   2,   1,   1,   8,   7,   5,   1,   4,   2,   1,   1,   2,
         1,   1,   2,   1,   1,   1,   8,   1,   7,   5,   1,   4,   2,
         1,   1,  20,  12,   2,   1,   1,  10,   6,   3,   1,   2,   1,
         1,   3,   2,   1,   4,   1,   3,   2,   1,   8,   1,   7,   5,
         1,   4,   2,  15,  12,   7,   6,   1,   5,   2,   3,   1,   5,
         2,   3,   3,   1,   2,   1,   1])

In [7]:
# Per-node weighted sample counts stored on the fitted tree.
# NOTE(review): the recorded values differ at several nodes from the
# decision_path counts on data_block1 (e.g. 6 vs 7 at the fourth node),
# presumably because forest trees are fit on bootstrap resamples — confirm.
tree.weighted_n_node_samples

array([100.,  47.,  17.,   6.,   5.,   2.,   3.,   1.,   2.,   1.,  11.,
         4.,   1.,   3.,   7.,   6.,   4.,   2.,   1.,  30.,  13.,   4.,
         1.,   3.,   1.,   2.,   9.,   2.,   7.,   3.,   2.,   1.,   4.,
         1.,   3.,  17.,   6.,   4.,   1.,   3.,   1.,   2.,   2.,   1.,
         1.,  11.,   6.,   5.,   4.,   1.,   3.,   1.,   2.,   1.,   1.,
         5.,   4.,   2.,   1.,   1.,   2.,   1.,  53.,  39.,  20.,  13.,
         3.,   2.,   1.,  10.,   9.,   7.,   1.,   6.,   3.,   1.,   2.,
         3.,   1.,   2.,   2.,   1.,   1.,   1.,   7.,   1.,   6.,   2.,
         1.,   1.,   4.,   3.,   1.,  19.,  12.,   2.,   1.,   1.,  10.,
         6.,   4.,   1.,   3.,   1.,   2.,   2.,   1.,   1.,   4.,   2.,
         2.,   1.,   1.,   7.,   1.,   6.,   4.,   2.,   2.,   2.,  14.,
        11.,   8.,   6.,   1.,   5.,   1.,   4.,   2.,   3.,   2.,   1.,
         3.,   1.,   2.,   1.,   1.])

## 2. Testing block transformers

In [2]:
# Synthetic single-block data for the block-transformer checks. p2 and p3 are
# assigned but not used in this cell.
# NOTE(review): no seed is set in this cell, so the draws differ run to run.
n, p1, p2, p3 = 100, 5, 7, 1

# Standard-normal features shifted by +1.
data_block1 = 1.0 + np.random.randn(n, p1)

# Identity transformer built from p1, applied to the raw matrix to obtain a
# blocked-data object.
id_transformer = IdentityTransformer(p1)
blocked_data = id_transformer.transform(data_block1)

In [6]:
# Modified data for block 0 with the default mode. In the recorded output,
# column 0 varies across rows while every other column shows one repeated value.
blocked_data.get_modified_data(0)

array([[ 1.6219188 ,  0.97284691,  1.14803982,  1.08145469,  0.95946014],
       [ 2.92452526,  0.97284691,  1.14803982,  1.08145469,  0.95946014],
       [ 2.81472583,  0.97284691,  1.14803982,  1.08145469,  0.95946014],
       [ 0.94636508,  0.97284691,  1.14803982,  1.08145469,  0.95946014],
       [-0.51978534,  0.97284691,  1.14803982,  1.08145469,  0.95946014],
       [ 0.89233908,  0.97284691,  1.14803982,  1.08145469,  0.95946014],
       [ 1.76061846,  0.97284691,  1.14803982,  1.08145469,  0.95946014],
       [ 0.89524016,  0.97284691,  1.14803982,  1.08145469,  0.95946014],
       [ 1.00120013,  0.97284691,  1.14803982,  1.08145469,  0.95946014],
       [ 2.21920765,  0.97284691,  1.14803982,  1.08145469,  0.95946014],
       [ 1.42370828,  0.97284691,  1.14803982,  1.08145469,  0.95946014],
       [-0.29063623,  0.97284691,  1.14803982,  1.08145469,  0.95946014],
       [ 1.08470336,  0.97284691,  1.14803982,  1.08145469,  0.95946014],
       [ 1.54916908,  0.97284691,  1.1

In [8]:
# Round-trip check: the identity transformer's full data must equal the input
# matrix exactly, element for element (recorded result: True).
np.all(blocked_data.get_all_data() == data_block1)

True

In [16]:
# With rescale=True, every column of the transformed data should have unit
# standard deviation (recorded output: all ones).
data = id_transformer.transform(data_block1, rescale=True).get_all_data()
np.std(data, axis=0)

array([1., 1., 1., 1., 1.])

In [23]:
# Compose the same identity transformer with itself, using adj_std="max".
composite = CompositeTransformer([id_transformer, id_transformer], adj_std="max")
# Block 2 of the composite output. In the recorded output it has two identical
# columns. Note that `data` is rebound here from the rescale check above.
data = composite.transform(data_block1).get_block(2)
data

array([[ 0.97145808,  0.97145808],
       [ 1.05989336,  1.05989336],
       [-0.13927433, -0.13927433],
       [ 0.89047901,  0.89047901],
       [ 1.36384863,  1.36384863],
       [ 1.81174978,  1.81174978],
       [-0.10637308, -0.10637308],
       [ 1.86508573,  1.86508573],
       [ 1.20759156,  1.20759156],
       [ 2.28089708,  2.28089708],
       [ 1.39909675,  1.39909675],
       [-0.12859915, -0.12859915],
       [ 1.44098019,  1.44098019],
       [ 1.85729251,  1.85729251],
       [ 1.49524595,  1.49524595],
       [ 0.56307112,  0.56307112],
       [-1.67644076, -1.67644076],
       [ 3.21770753,  3.21770753],
       [ 1.65936503,  1.65936503],
       [ 2.38086379,  2.38086379],
       [ 1.13624595,  1.13624595],
       [ 0.20513627,  0.20513627],
       [ 1.77113169,  1.77113169],
       [ 1.14378989,  1.14378989],
       [ 0.02631339,  0.02631339],
       [ 0.51020795,  0.51020795],
       [ 1.41125058,  1.41125058],
       [ 3.04989549,  3.04989549],
       [ 2.28281936,

## 1. Testing block partitioned data

In [3]:
# Synthetic three-block design: block k has width p_k and standard-normal
# entries shifted by k (k = 1, 2, 3).
# NOTE(review): no seed is set in this cell, so the draws differ run to run.
n, p1, p2, p3 = 100, 5, 7, 1
data_blocks = [
    center + np.random.randn(n, width)
    for center, width in ((1.0, p1), (2.0, p2), (3.0, p3))
]
data_block1, data_block2, data_block3 = data_blocks

# Wrap the list of blocks in the partitioned-data container under test.
blocked_data = BlockPartitionedData(data_blocks)

In [4]:
# Full data matrix held by the container; displayed as a single 2-D array.
blocked_data.get_all_data()

array([[ 2.28169397,  1.07445645,  0.34343945, ...,  0.84806761,
         2.89905064,  1.19476034],
       [-0.03749265,  2.08981242,  2.22264514, ...,  1.79143923,
         2.00702395,  3.4788338 ],
       [-0.51604933, -0.52687196,  0.66962039, ...,  3.48781738,
         2.93676149,  2.9407791 ],
       ...,
       [ 0.04854499,  0.46283128,  0.40500174, ...,  1.29235355,
         1.90683027,  2.4985751 ],
       [ 1.27899668,  0.52405581,  0.28625703, ...,  2.40823346,
         2.87838462,  1.11505443],
       [ 2.73609826,  0.3404902 ,  1.42146683, ...,  1.98196607,
         3.15719736,  4.33054791]])

In [7]:
# Modified data for block 1 in "keep_rest" mode. In the recorded output the
# visible block-1 columns each show one repeated value, while the visible
# columns of the other blocks match get_all_data().
blocked_data.get_modified_data(1, mode="keep_rest")

array([[ 2.28169397,  1.07445645,  0.34343945, ...,  2.05453732,
         1.91600772,  1.19476034],
       [-0.03749265,  2.08981242,  2.22264514, ...,  2.05453732,
         1.91600772,  3.4788338 ],
       [-0.51604933, -0.52687196,  0.66962039, ...,  2.05453732,
         1.91600772,  2.9407791 ],
       ...,
       [ 0.04854499,  0.46283128,  0.40500174, ...,  2.05453732,
         1.91600772,  2.4985751 ],
       [ 1.27899668,  0.52405581,  0.28625703, ...,  2.05453732,
         1.91600772,  1.11505443],
       [ 2.73609826,  0.3404902 ,  1.42146683, ...,  2.05453732,
         1.91600772,  4.33054791]])