In [2]:
import os
import numpy as np
import pandas as pd
import time
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sial import CIT, RIT
import plotly.express as px
import plotly.io as pio

In [3]:
def gen_xy(
    corr,
    beta_1,
    n_cases):
    rng = np.random.default_rng()
    n_ivs = 3
    mean = np.zeros((n_ivs,))
    cov = (corr * np.ones((n_ivs, n_ivs)) + 
           (1 - corr) * np.eye(n_ivs))
    x = rng.multivariate_normal(
        mean = mean, 
        cov = cov, 
        size = n_cases)
    beta = np.array([[beta_1], [.3], [.3]])
    m = x @ beta
    r2 = beta.T @ cov @ beta 
    sigma = np.sqrt(1 - r2)
    e = sigma * rng.normal(size = (n_cases, 1))
    y = m + e
    pfi = 2 * (beta_1**2)
    cpfi = 2 * (beta_1**2) * (1-(2*(corr**2))/(1+corr))
    loco = (beta_1**2) * (1-(2*(corr**2))/(1+corr))
    return x, y, pfi, cpfi, loco


In [17]:
x, y, pfi_true, cpfi_true, loco_true = gen_xy(
    corr = .5,
    beta_1 = .4,
    n_cases = 100000)

x_train, x_test, y_train, y_test = train_test_split(
        x, y, 
        test_size = .5, 
        random_state = 0)

removal = 0
learner = LinearRegression()
_ = learner.fit(x_train, y_train)

competitor = LinearRegression()
_ = competitor.fit(
    np.delete(x_train, removal, axis = 1), 
    y_train)

sampler = LinearRegression()
_ = sampler.fit(
    np.delete(x_train, removal, axis = 1), x_train[:, removal])

cit = CIT(learner, None, removal, "CPI")
_ = cit.infer(x_test, y_test)
pfi = cit.summarize(verbose = False)["estimate"].values.item()

cit = CIT(learner, sampler, removal, "CPI")
_ = cit.infer(x_test, y_test)
cpfi = cit.summarize(verbose = False)["estimate"].values.item()


In [18]:
pfi

0.3161591758142697

In [21]:
pfi_true

0.32000000000000006

In [20]:
cpfi

0.21254957992117046

In [63]:
np.random.seed(0)
corr_all = [.0, .2, .4, .6]
beta_1_all = [.0, .1, .2, .3, .4, .5]
n_cases = 5000
n_reps = 1000
n_rep = 0

rsts = []
for corr in corr_all:
    for beta_1 in beta_1_all:
        for n_rep in range(n_reps):
            x, y, pfi_true, cpfi_true, loco_true = gen_xy(
                corr = corr,
                beta_1 = beta_1,
                n_cases = n_cases)
            
            x_train, x_test, y_train, y_test = train_test_split(
                    x, y, 
                    test_size = .5, 
                    random_state = 0)
            
            removal = 0
            learner = LinearRegression()
            _ = learner.fit(x_train, y_train)
            
            competitor = LinearRegression()
            _ = competitor.fit(
                np.delete(x_train, removal, axis = 1), 
                y_train)
            
            sampler = LinearRegression()
            _ = sampler.fit(
                np.delete(x_train, removal, axis = 1), x_train[:, removal])
            
            mse_full = mean_squared_error(
                y_test, 
                learner.predict(x_test))
            x_test_perm = x_test.copy()
            
            rng = np.random.default_rng()
            x_test_perm[:,0] = rng.permutation(x_test_perm[:,0])
            mse_reduced = mean_squared_error(
                y_test, learner.predict(x_test_perm))
            pfi = mse_reduced - mse_full
            diff = (y_test - learner.predict(x_test_perm))**2 - (y_test - learner.predict(x_test))**2
            se = diff.std()
            cover_pfi = (pfi - 1.645 * se/np.sqrt(n_cases/2)) < pfi_true
            
            cit = CIT(learner, sampler, removal, "CPI")
            _ = cit.infer(x_test, y_test)
            cpfi = cit.summarize(verbose = False)["estimate"].values.item()
            diff = cit.r_losses_[0] - cit.l_losses_[0]
            se = diff.std()
            cover_cpfi = (cpfi - 1.645 * se/np.sqrt(n_cases/2)) < cpfi_true
            
            
            rit = RIT(learner, competitor, removal, "LOCO")
            _ = rit.infer(x_test, y_test)
            loco = rit.summarize(verbose = False)["estimate"].values.item()
            diff = rit.r_losses_[0] - rit.l_losses_[0]
            se = diff.std()
            cover_loco = (loco - 1.645 * se/np.sqrt(n_cases/2)) < loco_true
            
            rst = pd.DataFrame(
                    {"corr":[corr]*6, 
                     "beta_1":[beta_1]*6,
                     "n_rep":[n_rep]*6,
                     "measure":["PFI", "PFI", "CPFI", "CPFI", "LOCO", "LOCO"],
                     "type":["Truth", "Estimate", "Truth", "Estimate", "Truth", "Estimate"], 
                     "importance":[pfi_true, pfi, cpfi_true, cpfi, loco_true, loco], 
                     "cover":[cover_pfi, cover_pfi, cover_cpfi, cover_cpfi, cover_loco, cover_loco]})
            rsts.append(rst)

In [4]:
rsts_plot = pd.concat(
    rsts, 
    ignore_index = True).groupby(
    ["corr", "beta_1", "measure", "type"])["importance"].apply(
    lambda x: x.mean()).reset_index()

In [61]:
pio.templates.default = "simple_white"
fig = px.bar(
    rsts_plot, 
    y = "importance",
    x = "beta_1",
    color = "type",
    facet_col = "measure",
    facet_row = "corr",
    barmode = "group",
    labels = {
        "beta_1":"beta",
         "measure": "Method",
        "importance": "Value",
        "type": "Type"
     },
    category_orders = {
                    "measure": ["PFI", "CPFI", "LOCO"]})
fig.update_layout(
    legend=dict(
        y=.5),
    margin=dict(l=20, r=20, t=40, b=20))
pio.write_image(
    fig, "./figures/toy_example.pdf", 
    width = 600, height=500, scale = 2)

ValueError: Value of 'y' is not the name of a column in 'data_frame'. Expected one of ['corr', 'beta_1', 'measure', 'cover'] but received: importance

In [64]:
rsts_plot = pd.concat(
    rsts, 
    ignore_index = True).groupby(
    ["corr", "beta_1", "measure"])["cover"].apply(
    lambda x: x.mean()).reset_index()
pio.templates.default = "simple_white"
fig = px.bar(
    rsts_plot, 
    y = "cover",
    x = "beta_1",
    facet_col = "measure",
    facet_row = "corr",
    barmode = "group",
    labels = {
        "beta_1":"beta",
         "measure": "Method",
        "importance": "Value",
        "type": "Type"
     },
    category_orders = {
                    "measure": ["PFI", "CPFI", "LOCO"]})
fig.update_layout(
    legend=dict(
        y=.5),
    margin=dict(l=20, r=20, t=40, b=20))
fig.add_hline(y=.95, line_width=1.5, line_color="black")
pio.write_image(
    fig, "./figures/toy_example.pdf", 
    width = 600, height=500, scale = 2)

In [56]:
pd.concat(
    rsts, 
    ignore_index = True).groupby(
    ["corr", "beta_1", "measure"])["cover"].apply(
    lambda x: x.mean()).reset_index()

Unnamed: 0,corr,beta_1,measure,cover
0,0.0,0.0,CPFI,0.876
1,0.0,0.0,LOCO,0.948
2,0.0,0.0,PFI,0.886
3,0.0,0.1,CPFI,0.854
4,0.0,0.1,LOCO,0.906
...,...,...,...,...
67,0.6,0.4,LOCO,0.878
68,0.6,0.4,PFI,0.796
69,0.6,0.5,CPFI,0.820
70,0.6,0.5,LOCO,0.886
