# Causal Discovery and Injection for Feed Forward Neural Networks

Notebook collecting results for causal injection paper (real data experiments)

In [77]:
import sys
sys.path.insert(0,'..')
import re
import os
import numpy as np
import pandas as pd
from utils import load_pickle, plot_DAG, plot_ly_lines, heat_mat, DAG_retreive_np
from IPython.display import Image
from IPython.core.display import HTML 

# Folder holding the pickled nested cross-validation result files.
results_folder = '../results'
# Folder holding the pickled estimated adjacency matrices (W_est.*.pkl).
mats_folder = '../results/adjmats'
# Folder the generated figures are written to.
output_folder = 'figures'
# Lists the results folder; the returned listing is not stored or shown here.
os.listdir(results_folder)

# Enables the extra debug printing guarded by `if debug:` further down.
debug = False
# When True, the tau-optimisation plots are also written to disk as PNG files.
save_figs = True

## Appendix - Tau Optimisation

In [78]:
def collect_results(
    version,
    metric,
    res_folder='./results/',
    mat_folder='./results/adjmats',
    dsets = r'([\w.]+)',
    outfs = str(5)
    ):
    """Summarise one tau sweep: predictive metric and edge count per threshold.

    Parameters
    ----------
    version : str
        Experiment tag that closes the file names (e.g. ``'fico_3'``).
    metric : str
        Key of the per-run score to aggregate (e.g. ``'auc'`` or ``'MSE'``).
    res_folder : str
        Folder containing ``Nested{outfs}FoldCASTLE.Reg.Synth.{dsets}.{version}.pkl``.
    mat_folder : str
        Folder containing the ``W_est.*.pkl`` adjacency matrices.
    dsets : str
        Dataset-size part of the file names. It is inserted verbatim into the
        adjacency-matrix regex *and* into the results file name, so a literal
        size such as ``'100000'`` is what works end to end.
    outfs : str
        Number of outer folds as it appears in the results file name.

    Returns
    -------
    pandas.DataFrame
        One row per theta with columns ``theta, count, folds, accuracy_mean,
        accuracy_std, N_edges, Std``. Rows whose mean edge count repeats an
        earlier row are dropped.

    Raises
    ------
    FileNotFoundError
        If no adjacency matrix in ``mat_folder`` matches ``dsets``/``version``.
    """
    ## Collect Edges Numbers
    # Fields after the dataset size are separated by literal dots; the
    # character right after ``W_est`` stays a wildcard as in the original
    # pattern. ``version`` is a plain tag, so it is escaped.
    pattern = re.compile(
        rf'W_est.{dsets}\.(\d)\.(\d)\.(.*?)\.{re.escape(version)}\.pkl$'
    )
    list_edges = []
    for filename in os.listdir(mat_folder):
        match = pattern.search(filename)
        if not match:
            continue
        # ``dsets`` may bring its own capture group (the default does), so
        # only the last three groups are taken.
        *_, out_f, in_f, theta = match.groups()
        adj = load_pickle(os.path.join(mat_folder, filename), verbose=False)
        # An edge is any strictly positive entry of the matrix.
        edges = int(np.count_nonzero(np.asarray(adj) > 0))
        list_edges.append((float(theta), out_f, in_f, edges))

    if not list_edges:
        raise FileNotFoundError(
            f"No adjacency matrices for version {version!r} "
            f"(dsets={dsets!r}) found in {mat_folder!r}"
        )

    n_edges = pd.DataFrame(list_edges).groupby(0, as_index=False).agg({3 : ['mean', 'std']})
    n_edges.columns = ['theta', 'N_edges', 'Std']

    ## Collect Predictive Performances
    filename = f"Nested{outfs}FoldCASTLE.Reg.Synth.{dsets}.{version}.pkl"
    describe = load_pickle(os.path.join(res_folder, filename), verbose=False)
    runs = pd.DataFrame(
        [(describe[c]['theta'], describe[c]['fold'], describe[c][metric])
         for c in describe]
    )
    agg_stats = runs.groupby(0, as_index=False).agg(
        {1 : ['count', 'max'], 2 : ['mean', 'std']}
    )
    agg_stats.columns = ['theta', 'count', 'folds', 'accuracy_mean', 'accuracy_std']

    # Join on theta (exact float match between file-name and stored values),
    # then keep only the first threshold for each distinct mean edge count.
    agg_stats = agg_stats.merge(n_edges, on='theta')
    agg_stats = agg_stats.drop_duplicates(subset=['N_edges'])

    return agg_stats


### Fico

In [79]:
# Set to True to re-run the FICO threshold sweeps through main_realdata.py;
# with False the cell does nothing and the stored results are used.
rerun_fico_optim = False
if rerun_fico_optim:
    ## First run - broader
    %run -i main_realdata.py --version=fico_3
    ## Second run - focused on interval of interest
    %run -i main_realdata.py --version=fico_3 --theta_auto=False --theta_range="0.005","0.008","0.001"
    # ## Third run
    %run -i main_realdata.py --version=fico_3 --theta_auto=False --theta_range="0.009","0.012","0.001"


In [80]:
version = 'fico_3'
metric = 'auc'

# Per-threshold summary of the sweep (predictive metric + number of edges).
agg_stats = collect_results(
    version=version,
    metric=metric,
    dsets='100000',
    res_folder=results_folder,
    mat_folder=mats_folder,
)

# Rows stored with theta == -1 are moved to a small negative x so they are
# drawn left of tau = 0 (presumably the plain CASTLE baseline, cf. the
# "CASTLE" label placed under the left edge of the plot below).
sub = -0.005
agg_stats.loc[agg_stats['theta'] == -1, 'theta'] = sub

# Keep thresholds up to `filter_theta` and drop the ones listed explicitly.
filter_theta = 0.03
filter_list = [0.005, 0.007, 0.009, 0.011]
agg_stats = agg_stats[
    (agg_stats['theta'] <= filter_theta)
    & (~agg_stats['theta'].isin(filter_list))
]
display(agg_stats)

### Plot results
fig = plot_ly_lines(agg_stats, metric, height=200)

fig.update_layout(
    legend=dict(y=-0.58, x=0.8, orientation="h",
                xanchor='center', yanchor='bottom'),
    font=dict(family='Serif', size=18),
    margin=dict(l=10, r=10, b=0, t=25, pad=0),
)

# Set x-axis title; ticks at 0, at the chosen tau and then every 0.01.
tick_values = [0, 0.004] + list(
    np.round(np.arange(0.01, max(agg_stats['theta']), 0.01), 2)
)
fig.update_xaxes(
    showgrid=True,
    title={'text': r'$\text{Threshold } \tau$', 'font': {'size': 18}},
    tickvals=tick_values,
)

# Label under the left edge of the plot (paper coordinates).
fig.add_annotation(x=0.02, y=-0.225, text="CASTLE", showarrow=False,
                   yshift=0, xref='paper', yref='paper')

chosen_tau = 0.004

# Arrows at the chosen tau: down to the x-axis, across to the y2 axis
# (metric value) and across to the y axis (edge count). The y values are
# copied by hand from the table displayed above.
fig.add_annotation(x=chosen_tau, y=0.7929, text=str(chosen_tau),
                   showarrow=True, yshift=0,
                   xref='x', ax=0, yref='y2', ay=110)
fig.add_annotation(x=chosen_tau, y=0.7929, text="0.79",
                   showarrow=True, yshift=0,
                   xref='x', ax=300, yref='y2', ay=0)
fig.add_annotation(x=chosen_tau, y=84.6, text="85",
                   showarrow=True, yshift=0,
                   xref='x', ax=-130, yref='y', ay=0)

fig.show()

# Created unconditionally (as before) so that the folder also exists for
# cells that write into it without going through `save_figs`.
os.makedirs(output_folder, exist_ok=True)

out_path = os.path.join(output_folder, f"plot_tauoptim_{version}.png")

if save_figs:
    # Imported only so a missing static-export backend fails right here.
    import kaleido
    fig.write_image(out_path)

Unnamed: 0,theta,count,folds,accuracy_mean,accuracy_std,N_edges,Std
0,-0.005,25,5,0.795521,0.005878,552.0,0.0
1,0.0,25,5,0.795158,0.003524,276.0,0.0
2,0.004,25,5,0.792953,0.003936,84.6,6.144103
4,0.006,25,5,0.790614,0.002682,60.8,6.812978
6,0.008,25,5,0.780123,0.012351,47.0,4.232808
8,0.01,25,5,0.773345,0.009844,36.4,2.857738
10,0.012,25,5,0.714603,0.1099,29.8,1.979057
11,0.016,25,5,0.55032,0.102716,20.8,2.362908
12,0.02,25,5,0.5,0.0,15.2,1.979057
13,0.024,25,5,0.5,0.0,11.2,0.763763


### Adult

In [81]:
# Set to True to re-run the Adult threshold sweeps through main_realdata.py;
# with False the cell does nothing and the stored results are used.
# NOTE(review): both commands carry a trailing comma after an argument value
# (`--version=adult_fulltest,` and `"0.1",`) — confirm the argument parser
# tolerates it and still resolves the version to 'adult_fulltest'.
rerun_adult_optim = False
if rerun_adult_optim:
    ## Broad
    %run -i main_realdata.py --version=adult_fulltest, --theta_auto=True
    ## Finer grained run
    %run -i main_realdata.py --version=adult_fulltest, --theta_range="-1","0.8","0.1", --theta_auto=True

In [82]:
version = 'adult_fulltest'
metric = 'auc'

# Per-threshold summary of the sweep (predictive metric + number of edges).
agg_stats = collect_results(
    version=version,
    metric=metric,
    dsets='100000',
    res_folder=results_folder,
    mat_folder=mats_folder,
)

# Rows stored with theta == -1 are moved to a small negative x so they are
# drawn left of tau = 0 (presumably the plain CASTLE baseline, cf. the
# "CASTLE" label placed under the left edge of the plot below).
sub = -0.09
agg_stats.loc[agg_stats['theta'] == -1, 'theta'] = sub

# Keep thresholds up to `filter_theta` and drop the ones listed explicitly.
filter_theta = 0.6
filter_list = [0.2, 0.24]
agg_stats = agg_stats[
    (agg_stats['theta'] <= filter_theta)
    & (~agg_stats['theta'].isin(filter_list))
]
display(agg_stats)

### Plot results
fig = plot_ly_lines(agg_stats, metric, height=200)

fig.update_layout(
    legend=dict(y=-0.58, x=0.8, orientation="h",
                xanchor='center', yanchor='bottom'),
    font=dict(family='Serif', size=18),
    margin=dict(l=10, r=10, b=0, t=25, pad=0),
)

# Set x-axis title; ticks at 0, at the chosen tau and then in steps of 0.15
# (rounded to one decimal, as before).
tick_values = [0, 0.08] + list(
    np.round(np.arange(0.15, max(agg_stats['theta']), 0.15), 1)
)
fig.update_xaxes(
    showgrid=True,
    title={'text': r'$\text{Threshold } \tau$', 'font': {'size': 18}},
    tickvals=tick_values,
)

# Label under the left edge of the plot (paper coordinates).
fig.add_annotation(x=0.02, y=-0.225, text="CASTLE", showarrow=False,
                   yshift=0, xref='paper', yref='paper')

chosen_tau = 0.08

# Arrows at the chosen tau: down to the x-axis, across to the y2 axis
# (metric value) and across to the y axis (edge count). The y values are
# copied by hand from the table displayed above.
fig.add_annotation(x=chosen_tau, y=0.8556, text=str(chosen_tau),
                   showarrow=True, yshift=0,
                   xref='x', ax=0, yref='y2', ay=112)
fig.add_annotation(x=chosen_tau, y=0.8556, text="0.86",
                   showarrow=True, yshift=0,
                   xref='x', ax=280, yref='y2', ay=0)
fig.add_annotation(x=chosen_tau, y=46.2, text="46",
                   showarrow=True, yshift=0,
                   xref='x', ax=-160, yref='y', ay=0)

fig.show()

# Created unconditionally (as before) so that the folder also exists for
# cells that write into it without going through `save_figs`.
os.makedirs(output_folder, exist_ok=True)

out_path = os.path.join(output_folder, f"plot_tauoptim_{version}.png")

if save_figs:
    # Imported only so a missing static-export backend fails right here.
    import kaleido
    fig.write_image(out_path)

Unnamed: 0,theta,count,folds,accuracy_mean,accuracy_std,N_edges,Std
0,-0.09,25,5,0.747038,0.027878,210.0,0.0
1,0.0,25,5,0.863468,0.00599,105.0,0.0
3,0.08,25,5,0.855648,0.010548,46.2,1.979057
4,0.1,25,5,0.851808,0.006541,40.8,2.614065
5,0.12,25,5,0.850752,0.006104,36.6,2.783882
6,0.14,25,5,0.820374,0.059331,32.2,1.354006
7,0.16,25,5,0.783488,0.06342,30.8,1.190238
8,0.18,20,5,0.65379,0.161708,29.75,1.332785
11,0.28,5,5,0.5,0.0,27.0,0.0
12,0.32,5,5,0.5,0.0,25.0,0.0


### Boston

In [83]:
# Set to True to re-run the Boston threshold sweep through main_realdata.py;
# with False the cell does nothing and the stored results are used.
rerun_boston_optim = False
if rerun_boston_optim:
    %run -i main_realdata.py --version="boston" 

In [84]:
version = 'boston'
metric = 'MSE'

# Per-threshold summary of the sweep (predictive metric + number of edges).
agg_stats = collect_results(
    version=version,
    metric=metric,
    dsets='100000',
    res_folder=results_folder,
    mat_folder=mats_folder,
)

# Rows stored with theta == -1 are moved to a small negative x so they are
# drawn left of tau = 0 (presumably the plain CASTLE baseline, cf. the
# "CASTLE" label placed under the left edge of the plot below).
sub = -0.17
agg_stats.loc[agg_stats['theta'] == -1, 'theta'] = sub

# Keep thresholds up to `filter_theta`; thetas are rounded to 3 decimals
# before being compared with the (here empty) exclusion list.
filter_theta = 0.8
filter_list = []
agg_stats = agg_stats[
    (agg_stats['theta'] <= filter_theta)
    & (~agg_stats['theta'].round(3).isin(filter_list))
]
display(agg_stats)

### Plot results
fig = plot_ly_lines(agg_stats, metric, height=200)

fig.update_layout(
    legend=dict(y=-0.58, x=0.8, orientation="h",
                xanchor='center', yanchor='bottom'),
    font=dict(family='Serif', size=18),
    margin=dict(l=10, r=10, b=0, t=25, pad=0),
)

# Set x-axis title; ticks at 0, at the chosen tau and then in steps of 0.2.
tick_values = [0, 0.13] + list(
    np.round(np.arange(0.3, max(agg_stats['theta']), 0.2), 1)
)
fig.update_xaxes(
    showgrid=True,
    title={'text': r'$\text{Threshold } \tau$', 'font': {'size': 18}},
    tickvals=tick_values,
)

# Label under the left edge of the plot (paper coordinates).
fig.add_annotation(x=0.02, y=-0.225, text="CASTLE", showarrow=False,
                   yshift=0, xref='paper', yref='paper')

chosen_tau = 0.13

# Annotations at the chosen tau. The first one is positioned in paper
# coordinates on y and shifted down by pixels to sit at the x-axis; the
# other two point to the y2 axis (metric) and the y axis (edge count).
# The y values are copied by hand from the table displayed above.
fig.add_annotation(x=chosen_tau, y=20.363, text=str(chosen_tau),
                   showarrow=True, yshift=-149,
                   xref='x', ax=0, yref='paper', ay=0)
fig.add_annotation(x=chosen_tau, y=20.363, text="20.4",
                   showarrow=True, yshift=0,
                   xref='x', ax=280, yref='y2', ay=0)
fig.add_annotation(x=chosen_tau, y=47.8, text="48",
                   showarrow=True, yshift=0,
                   xref='x', ax=-130, yref='y', ay=0)

fig.show()

# Created unconditionally (as before) so that the folder also exists for
# cells that write into it without going through `save_figs`.
os.makedirs(output_folder, exist_ok=True)

out_path = os.path.join(output_folder, f"plot_tauoptim_{version}.png")

if save_figs:
    # Imported only so a missing static-export backend fails right here.
    import kaleido
    fig.write_image(out_path)

Unnamed: 0,theta,count,folds,accuracy_mean,accuracy_std,N_edges,Std
0,-0.17,25,5,22.231392,7.387012,182.0,0.0
1,0.0,25,5,18.50769,7.04278,91.0,0.0
15,0.037,10,5,16.818329,6.78389,90.5,0.527046
16,0.066,25,5,18.437362,6.868943,86.0,1.290994
17,0.132,25,5,20.363651,5.233823,47.8,7.942502
18,0.198,25,5,40.765085,27.205065,32.8,8.098354
19,0.264,25,5,41.8767,27.307922,23.0,6.390097
20,0.33,25,5,62.497449,29.58318,17.0,5.91608
21,0.396,25,5,84.998406,12.277859,12.2,5.416026
22,0.462,25,5,84.606197,12.862441,9.8,4.804512


### Cali

In [85]:
# Set to True to re-run the California threshold sweeps through
# main_realdata.py; with False the cell does nothing.
# NOTE(review): the second command uses `--theta_interval` / `--thetas`
# whereas the other sweeps in this notebook use `--theta_auto` /
# `--theta_range` — confirm these flags match the script's interface.
rerun_cali_optim = False
if rerun_cali_optim:
    ## First run - broader
    %run -i main_realdata.py --version=cali
    ## Second run
    %run -i main_realdata.py --version=cali --theta_interval=False --thetas='0.35','0.5'



In [86]:
version = 'cali'
metric = 'MSE'

# Per-threshold summary of the sweep (predictive metric + number of edges).
agg_stats = collect_results(
    version=version,
    metric=metric,
    dsets='100000',
    res_folder=results_folder,
    mat_folder=mats_folder,
)

# Rows stored with theta == -1 are moved to a negative x so they are drawn
# left of tau = 0 (presumably the plain CASTLE baseline, cf. the "CASTLE"
# label placed under the left edge of the plot below).
sub = -0.5
agg_stats.loc[agg_stats['theta'] == -1, 'theta'] = sub

# Keep thresholds up to `filter_theta`; thetas are rounded to 3 decimals
# before being compared with the exclusion list.
filter_theta = 1.8
filter_list = [
    0.1, 0.188, 0.2, 0.25, 0.3, 0.4, 0.45, 0.376, 0.55, 0.65,
    0.7, 0.75, 0.8, 0.85, 0.9, 0.94, 1.504, 1.88, 2.256, 2.632,
]
agg_stats = agg_stats[
    (agg_stats['theta'] <= filter_theta)
    & (~agg_stats['theta'].round(3).isin(filter_list))
]
display(agg_stats)

### Plot results
fig = plot_ly_lines(agg_stats, metric, height=200)

fig.update_layout(
    legend=dict(y=-0.58, x=0.8, orientation="h",
                xanchor='center', yanchor='bottom'),
    font=dict(family='Serif', size=18),
    margin=dict(l=10, r=10, b=0, t=25, pad=0),
)

# Set x-axis title; ticks at the chosen tau and then in steps of 0.5.
tick_values = [0.05] + list(
    np.round(np.arange(0.5, max(agg_stats['theta']), 0.5), 1)
)
fig.update_xaxes(
    showgrid=True,
    title={'text': r'$\text{Threshold } \tau$', 'font': {'size': 18}},
    tickvals=tick_values,
)

# Label under the left edge of the plot (paper coordinates).
fig.add_annotation(x=0.02, y=-0.225, text="CASTLE", showarrow=False,
                   yshift=0, xref='paper', yref='paper')

chosen_tau = 0.05

# Annotations at the chosen tau. The first one is positioned in paper
# coordinates on y and shifted down by pixels to sit at the x-axis; the
# other two point to the y2 axis (metric) and the y axis (edge count).
# The y values are copied by hand from the table displayed above.
fig.add_annotation(x=chosen_tau, y=1.0185, text=str(chosen_tau),
                   showarrow=True, yshift=-126,
                   xref='x', ax=0, yref='paper', ay=0)
fig.add_annotation(x=chosen_tau, y=1.0185, text="1.02",
                   showarrow=True, yshift=0,
                   xref='x', ax=310, yref='y2', ay=0)
fig.add_annotation(x=chosen_tau, y=30.6, text="31",
                   showarrow=True, yshift=0,
                   xref='x', ax=-120, yref='y', ay=0)

fig.show()

# Created unconditionally (as before) so that the folder also exists for
# cells that write into it without going through `save_figs`.
os.makedirs(output_folder, exist_ok=True)

out_path = os.path.join(output_folder, f"plot_tauoptim_{version}.png")

if save_figs:
    # Imported only so a missing static-export backend fails right here.
    import kaleido
    fig.write_image(out_path)

Unnamed: 0,theta,count,folds,accuracy_mean,accuracy_std,N_edges,Std
0,-0.5,25,5,0.662208,0.081241,72.0,0.0
1,0.0,25,5,1.062381,0.312212,36.0,0.0
2,0.05,25,5,1.018462,0.348869,30.6,2.929733
4,0.15,25,5,1.049525,0.365987,23.4,0.816497
9,0.35,25,5,1.081789,0.323505,18.4,1.527525
13,0.5,25,5,1.083272,0.321736,16.8,1.5
20,0.752,20,5,1.174837,0.294267,14.25,0.850696
25,0.95,15,5,1.339808,0.030016,11.0,2.236068
27,1.316,20,5,1.339957,0.023627,8.75,1.681947
29,1.692,25,5,1.331741,0.02689,6.2,1.0


## Table 1 - *Full* DAG Discovery and Injection Experiment

In [87]:
def collect_and_build_tab(
                version,
                metric,
                tau,
                sizes = ("100", "500", "1000", "2000", "5000", "10000", "20000"),
                cols = ("CASTLE", "Injected"),
                out_folds = 5,
                debug=False,
                display_tab = False,
                display_edge = False,
                print_latex = True
                ):
    """Build the "mean (std)" table of a data-size experiment.

    For every dataset size the nested-CV results file
    ``Nested{out_folds}FoldCASTLE.Reg.Synth.{size}.{version}.pkl`` is read
    from the notebook-level ``results_folder`` and ``metric`` is aggregated
    per theta. Theta ``-1`` becomes the column ``"CASTLE"`` and theta
    ``tau`` the column ``"Injected"``.

    Parameters
    ----------
    version : str
        Experiment tag that closes the file names.
    metric : str
        Key of the per-run score to aggregate (e.g. ``'auc'``, ``'MSE'``).
    tau : float
        Threshold whose results are reported as ``"Injected"``.
    sizes : sequence of str
        Dataset sizes to include, one table row each.
    cols : sequence of str
        Columns to keep in the returned table, in this order.
    out_folds : int
        Number of outer folds as it appears in the results file name.
    debug : bool
        Print intermediate frames.
    display_tab, print_latex : bool
        Accepted for compatibility; not used by this function.
    display_edge : bool
        Also scan the notebook-level ``mats_folder`` for this version's
        adjacency matrices and display edge-count statistics per theta.

    Returns
    -------
    pandas.DataFrame
        Indexed by dataset size, one string column per entry of ``cols``.
    """
    size_list = [int(s) for s in sizes]

    # The edge statistics are only ever displayed, so the (pickle-loading)
    # scan of the adjacency folder is skipped unless it was asked for.
    if display_edge:
        pattern = re.compile(
            rf'W_est.(.*?)[.](\d)[.](\d)[.](.*?)[.]{re.escape(version)}[.]pkl$'
        )
        list_edges = []
        for filename in os.listdir(mats_folder):
            match = pattern.search(filename)
            if not match:
                continue
            _, out_f, in_f, theta = match.groups()
            adj = load_pickle(os.path.join(mats_folder, filename), verbose=False)
            # An edge is any strictly positive entry of the matrix.
            edges = int(np.count_nonzero(np.asarray(adj) > 0))
            list_edges.append((float(theta), out_f, in_f, edges))

        if list_edges:
            n_edges = pd.DataFrame(list_edges).groupby(0, as_index=False).agg(
                {3 : ['count', 'mean', 'std']}
            )
            n_edges.columns = ['theta', 'N runs', 'N_edges', 'Std']
            display(n_edges)
        else:
            print(f"No adjacency matrices found for version {version!r}")

    per_size = []
    for size in size_list:
        filename = f"Nested{out_folds}FoldCASTLE.Reg.Synth.{size}.{version}.pkl"
        describe = load_pickle(os.path.join(results_folder, filename), verbose=False)

        runs = pd.DataFrame(
            [(describe[c]['theta'], describe[c]['fold'], describe[c][metric])
             for c in describe]
        )
        agg_stats = runs.groupby(0, as_index=False).agg(
            {1 : ['count'], 2 : ['mean', 'std']}
        )

        if debug:
            print(size)
            print(agg_stats)

        agg_stats.columns = ['theta', 'folds', 'accuracy_mean', 'accuracy_std']

        # theta == -1 is relabelled to -0.05 here and to "CASTLE" below;
        # the float cast keeps that assignment dtype-compatible.
        agg_stats['theta'] = agg_stats['theta'].astype(float)
        agg_stats.loc[agg_stats['theta'] == -1, 'theta'] = -0.05

        agg_stats['Dset_sz'] = size
        per_size.append(agg_stats)

    agg_stats_all = pd.concat(per_size)

    # Work on a copy so the added/relabelled columns do not write into a
    # slice of `agg_stats_all`.
    ds = agg_stats_all[['theta', 'accuracy_mean', 'accuracy_std', 'Dset_sz']].copy()

    ds['values'] = [
        f"{round(mean, 3):.2f} ({round(std, 2):.2f})"
        for mean, std in zip(ds['accuracy_mean'], ds['accuracy_std'])
    ]

    # The theta column is about to hold labels next to numbers.
    ds['theta'] = ds['theta'].astype(object)
    ds.loc[ds['theta'] == -0.05, 'theta'] = "CASTLE"
    ds.loc[ds['theta'] == tau, 'theta'] = "Injected"
    if debug:
        print(ds)

    table = ds.pivot(index='Dset_sz', columns='theta', values='values')
    if debug:
        display(table)

    return table[list(cols)]


def format_tab(table):
    """Print ``table`` as a LaTeX ``tabular`` with a custom header row.

    The header is ``Dataset`` followed by one ``\\multicolumn`` cell per
    distinct column name (in order of first appearance). Every cell value
    is wrapped in ``\\!``; the booktabs rules emitted by
    ``DataFrame.to_latex`` are replaced by the header / ``\\hline``.
    Nothing is returned, the LaTeX source is printed.

    Reads the notebook-level ``debug`` flag to optionally print the ordered
    column names.
    """
    def compute_top_2(names):
        """Return the ``& \\multicolumn{..}{c}{name}`` cells of the header."""
        if len(names) == 0:
            return ''
        # Number of table columns spanned by each distinct name.
        pcount = round(len(names) / len(np.unique(names)))
        # np.unique sorts, so recover the order of first appearance.
        _, first_idx = np.unique(names, return_index=True)
        names = names[np.sort(first_idx)]
        if debug:
            print(names)
        return ''.join(
            ' & \\multicolumn{' + str(pcount) + '}{c}{' + str(name) + '}'
            for name in names
        )

    def print_table(table, top):
        """Render ``table`` to LaTeX and substitute the rules with ``top``."""
        latex = table.to_latex(
            escape=False,
            index=True,
            index_names=False,
            header=False,
            formatters=[(lambda x: '\\!' + str(x) + '\\!')
                        for _ in table.columns.values],
            column_format='r' + 'c' * len(table.columns.values),
        )
        return (
            latex
            .replace(r'\toprule', top)
            .replace(r'\midrule', r'\hline')
            .replace(r'\bottomrule', r'\hline')
            .replace('.0%', '%')
            .replace('\\!%', '\\!0%')
            .replace('%', '\\%')
        )

    top1 = 'Dataset' + compute_top_2(table.columns.values) + '\\\\ \n\\hline '
    print(print_table(table, top1))

### Cali

In [88]:
# Set to True to re-run the California data-size experiment; with False
# only the stored results are tabulated.
rerun_cali_size = False
if rerun_cali_size:
    ### Nested
    %run -i main_realdata.py --version="cali_size3" --out_folds=5 --in_folds=5 --dataset_szs="100","500","1000","2000","5000","10000","20000" --thetas="-1","0.05"

# CASTLE column: all default sizes from the 'cali_size3' run.
format_tab(collect_and_build_tab(version = f'cali_size3', cols = ["CASTLE"], metric = 'MSE', tau = 0.05))
# Injected column: rows are stitched together from different runs.
# NOTE(review): size 500 is read from version 'cali_lr8', which the rerun
# command above does not produce — confirm where that run comes from.
format_tab(pd.concat(
[collect_and_build_tab(version = f'cali_size3', metric = 'MSE', tau = 0.05, cols = ["Injected"], sizes = ["100"]),
collect_and_build_tab(version = f'cali_lr8', metric = 'MSE', tau = 0.05, cols = ["Injected"], sizes = ["500"]),
collect_and_build_tab(version = f'cali_size3', metric = 'MSE', tau = 0.05, cols = ["Injected"], sizes = ["1000","2000","5000","10000","20000"])]
))


In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.



\begin{tabular}{rc}
Dataset & \multicolumn{1}{c}{CASTLE}\\ 
\hline 
100   &  \!7.05 (12.81)\! \\
500   &   \!2.33 (1.39)\! \\
1000  &   \!2.96 (4.12)\! \\
2000  &   \!3.86 (3.68)\! \\
5000  &   \!4.91 (7.41)\! \\
10000 &   \!1.74 (1.70)\! \\
20000 &   \!0.66 (0.08)\! \\
\hline
\end{tabular}

\begin{tabular}{rc}
Dataset & \multicolumn{1}{c}{Injected}\\ 
\hline 
100   &  \!2.94 (2.63)\! \\
500   &  \!2.25 (1.07)\! \\
1000  &  \!1.68 (1.14)\! \\
2000  &  \!1.71 (0.57)\! \\
5000  &  \!1.51 (0.62)\! \\
10000 &  \!1.16 (0.31)\! \\
20000 &  \!1.02 (0.35)\! \\
\hline
\end{tabular}




In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.



### Boston

In [89]:
# Set to True to re-run the Boston data-size experiment; with False only
# the stored results are tabulated.
rerun_boston_size = False
if rerun_boston_size:
    %run -i main_realdata.py --version="boston_size3" --out_folds=5 --in_folds=5 --dataset_szs="100","506" --thetas='-1','0.13'

# Both columns (CASTLE and Injected at tau = 0.13) come from 'boston_size3'.
format_tab(collect_and_build_tab(version = f'boston_size3', metric = 'MSE', tau = 0.13, sizes = ["100", "506"]))


\begin{tabular}{rcc}
Dataset & \multicolumn{1}{c}{CASTLE} & \multicolumn{1}{c}{Injected}\\ 
\hline 
100 &  \!112.04 (91.06)\! &  \!86.17 (13.75)\! \\
506 &    \!21.95 (6.84)\! &   \!20.46 (5.12)\! \\
\hline
\end{tabular}




In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.



### Fico

In [90]:
# Set to True to re-run the FICO data-size experiments; with False only
# the stored results are tabulated.
rerun_fico_size = False
if rerun_fico_size:
    %run -i main_realdata.py --version="fico_size5" --out_folds=5 --in_folds=5 --dataset_szs="100","500","1000","2000","5000","8000" --thetas="-1"
    %run -i main_realdata.py --version="fico_size7" --out_folds=5 --in_folds=5 --dataset_szs="100","500","1000","2000","5000","8000" --thetas="0.004"

# CASTLE column: theta == -1 results of the 'fico_size5' run.
format_tab(collect_and_build_tab(version = f'fico_size5', metric = 'auc', tau = -1, cols = ["CASTLE"], sizes = ["100","500","1000","2000","5000","8000"]))
# Injected column: rows are stitched together from different runs.
# NOTE(review): sizes 100 and 500 are read from 'fico_lr_75' and
# 'fico_size10', which the rerun commands above do not produce — confirm.
format_tab(pd.concat([
collect_and_build_tab(version = f'fico_lr_75', metric = 'auc', tau = 0.004, cols = ["Injected"], sizes = ["100"]),
collect_and_build_tab(version = f'fico_size10', metric = 'auc', tau = 0.004, cols = ["Injected"], sizes = ["500"]),
collect_and_build_tab(version = f'fico_size7', metric = 'auc', tau = 0.004, cols = ["Injected"], sizes = ["1000","2000","5000","8000"])
]))


In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.



\begin{tabular}{rc}
Dataset & \multicolumn{1}{c}{CASTLE}\\ 
\hline 
100  &  \!0.75 (0.02)\! \\
500  &  \!0.79 (0.01)\! \\
1000 &  \!0.78 (0.01)\! \\
2000 &  \!0.79 (0.01)\! \\
5000 &  \!0.79 (0.01)\! \\
8000 &  \!0.80 (0.01)\! \\
\hline
\end{tabular}

\begin{tabular}{rc}
Dataset & \multicolumn{1}{c}{Injected}\\ 
\hline 
100  &  \!0.74 (0.04)\! \\
500  &  \!0.78 (0.01)\! \\
1000 &  \!0.78 (0.01)\! \\
2000 &  \!0.78 (0.01)\! \\
5000 &  \!0.79 (0.01)\! \\
8000 &  \!0.79 (0.01)\! \\
\hline
\end{tabular}




In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.



### Adult

In [91]:
# Set to True to re-run the Adult data-size experiments; with False only
# the stored results are tabulated.
rerun_adult_size = False
if rerun_adult_size:
    %run -i main_realdata.py --version="adult_size" --out_folds=5 --in_folds=5 --dataset_szs="100","500","1000","2000","5000","10000","20000" --thetas="-1"
    %run -i main_realdata.py --version="adult_size2" --out_folds=5 --in_folds=5 --dataset_szs="100","500","1000","2000","5000","10000","20000" --thetas="0.08"

# CASTLE column: theta == -1 results of the 'adult_size' run.
format_tab(collect_and_build_tab(version = f'adult_size', metric = 'auc', tau = -1, cols = ["CASTLE"]))
# Injected column: rows are stitched together from different runs.
# NOTE(review): sizes 100 and 500 are read from 'adult_size5' and
# 'adult_size4', which the rerun commands above do not produce — confirm.
format_tab(pd.concat([
collect_and_build_tab(version = f'adult_size5', metric = 'auc', tau = 0.08, cols = ["Injected"], sizes = ["100"]),
collect_and_build_tab(version = f'adult_size4', metric = 'auc', tau = 0.08, cols = ["Injected"], sizes = ["500"]),
collect_and_build_tab(version = f'adult_size2', metric = 'auc', tau = 0.08, cols = ["Injected"], sizes = ["1000","2000","5000","10000","20000"])
]))


In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.



\begin{tabular}{rc}
Dataset & \multicolumn{1}{c}{CASTLE}\\ 
\hline 
100   &  \!0.67 (0.03)\! \\
500   &  \!0.72 (0.04)\! \\
1000  &  \!0.75 (0.03)\! \\
2000  &  \!0.74 (0.03)\! \\
5000  &  \!0.75 (0.03)\! \\
10000 &  \!0.75 (0.02)\! \\
20000 &  \!0.76 (0.02)\! \\
\hline
\end{tabular}

\begin{tabular}{rc}
Dataset & \multicolumn{1}{c}{Injected}\\ 
\hline 
100   &  \!0.69 (0.04)\! \\
500   &  \!0.74 (0.02)\! \\
1000  &  \!0.76 (0.03)\! \\
2000  &  \!0.77 (0.01)\! \\
5000  &  \!0.79 (0.03)\! \\
10000 &  \!0.85 (0.01)\! \\
20000 &  \!0.86 (0.01)\! \\
\hline
\end{tabular}




In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.



## Table 1 - Adult Partial Experiment

### Create a Partial Input Graph

This code also lives in utils.py, from where main_synth.py picks it up; it is repeated here for illustration only.

In [92]:
debug = False
from pmlb import fetch_data

# load dataset and descriptive statistics
df_Name = 'adult'
df = fetch_data(df_Name)

# Reorder the columns so that the last one of the fetched frame comes first
# (index 0 is what the `1:` slice below treats as the target column).
cols = list(df.columns)
cols = [cols[-1]] + cols[:-1]
df = df[cols]

# Start from "every edge allowed except self-loops". As used below, entry
# [r, c] == 1 means that an edge from variable r to variable c is allowed.
partial_mat = 1 - np.identity(df.shape[1])
if debug:
    print(partial_mat)
    print(df.columns)

## Define what cannot be caused by other variables
prime_causes = ['age', 'sex', 'race', 'native-country']
only_effects = ['target','capital-gain', 'capital-loss']
not_caused_by = [('education',['occupation','hours-per-week'])
                 ,('education-num',['occupation','hours-per-week'])
                 ,('fnlwgt',['occupation','hours-per-week'])
                 ,('relationship',['occupation','hours-per-week'])
                 ,('marital-status',['occupation','hours-per-week'])
                ]

column_names = list(df.columns)
# variable -> variables that may not cause it (first entry wins on repeats)
excluded_causes = {}
for effect, causes in not_caused_by:
    excluded_causes.setdefault(effect, causes)

for i, col in enumerate(column_names):
    if col in prime_causes:
        ## the whole column is set to 0
        partial_mat[:, i] = 0
    elif col in only_effects:
        ## the whole row, apart from target, is set to 0
        partial_mat[i, 1:] = 0
    elif col in excluded_causes:
        not_causes = excluded_causes[col]
        not_causes_idxs = [j for j, name in enumerate(column_names)
                           if name in not_causes]
        if debug:
            print("0ing:", not_causes_idxs)
        partial_mat[not_causes_idxs, i] = 0
if debug:
    print(partial_mat)

def heat_mat(mat, names=None, colour="#003E74"):
    """Display ``mat`` as a colour-graded table.

    The matrix is rounded to 3 decimals, optionally labelled with ``names``
    on both axes, shaded with a light seaborn palette built from ``colour``
    and shown with ``display``. Returns whatever ``display`` returns.
    """
    import seaborn as sns

    frame = pd.DataFrame(mat).round(3)
    if names is not None:
        frame.columns = names
        frame.index = names

    # NOTE(review): the Styler produced by this call is not kept, so the
    # font-size property does not appear to reach the table displayed
    # below — confirm whether that is intended.
    frame.style.set_properties(**{
    # 'background-color': 'white',
    'font-size': '100pt',
    })

    palette = sns.light_palette(colour, as_cmap=True)
    # NOTE(review): `low`/`high` receive the data minimum/maximum here; in
    # pandas they scale the colour range rather than set its limits
    # (`vmin`/`vmax` do that) — verify this is the intended shading.
    shaded = frame.style.background_gradient(
        cmap=palette,
        low=np.min(mat.flatten()),
        high=np.max(mat.flatten()),
    )
    shaded = shaded.format("{:.3}")

    return display(shaded)

# Show the partial constraint matrix, labelled with the dataframe's columns.
heat_mat(partial_mat, names= df.columns.values)


Unnamed: 0,target,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
target,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
age,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
workclass,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
fnlwgt,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
education,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
education-num,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
marital-status,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
occupation,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
relationship,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
race,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0


### Data Size Experiment

In [93]:
# Set to True to re-run the Adult data-size experiment for version
# 'adult_partial'; with False the cell does nothing.
rerun_size_p = False
if rerun_size_p:
    %run -i main_realdata.py --version="adult_partial" --out_folds=5 --in_folds=5 --dataset_szs="100","500","1000","2000","5000","10000","20000" --thetas="0.08"

    # The commented block below repeats the constraint lists used to build
    # `partial_mat` earlier in this notebook.
    ## Define what cannot be caused by other variables
    # prime_causes = ['age', 'sex', 'race', 'native-country']
    # only_effects = ['target','capital-gain', 'capital-loss']
    # not_caused_by = [('education',['occupation','hours-per-week'])
    #                 ,('education-num',['occupation','hours-per-week'])
    #                 ,('fnlwgt',['occupation','hours-per-week'])
    #                 ,('relationship',['occupation','hours-per-week'])
    #                 ,('marital-status',['occupation','hours-per-week'])
    #                 ]

In [94]:
# Injected column (tau = 0.08) of the 'adult_partial' run, default sizes.
format_tab(collect_and_build_tab(version = f'adult_partial', metric = 'auc', tau = 0.08, cols = ["Injected"]))

\begin{tabular}{rc}
Dataset & \multicolumn{1}{c}{Injected}\\ 
\hline 
100   &  \!0.66 (0.02)\! \\
500   &  \!0.71 (0.02)\! \\
1000  &  \!0.74 (0.03)\! \\
2000  &  \!0.76 (0.03)\! \\
5000  &  \!0.76 (0.02)\! \\
10000 &  \!0.76 (0.02)\! \\
20000 &  \!0.77 (0.02)\! \\
\hline
\end{tabular}




In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.



## Table 1 - Adult Refine Experiment

### Display computed (*full*) DAG

In [95]:
debug = False
import re
dset = '100000'
theta = '0.08'
version = "adult"+"_fulltest"
folder = "../results/adjmats/"

# Average every stored adjacency matrix of this dataset size / theta /
# version. Fields are separated by literal dots; the character right after
# ``W_est`` stays a wildcard as in the original pattern, and ``(\d+)?``
# tolerates extra digits after the theta value.
pattern = re.compile(
    rf'W_est.{re.escape(dset)}\.(\d)\.(\d)\.{re.escape(theta)}(\d+)?'
    rf'\.{re.escape(version)}\.pkl$'
)
count = 0
W_est = np.zeros((df.shape[1], df.shape[1]))
for filename in os.listdir(folder):
    if not pattern.search(filename):
        continue
    if debug:
        print(filename)
    w_est = load_pickle(os.path.join(folder, filename), verbose=False)
    W_est += w_est
    if debug:
        print(W_est)
    count += 1

if count == 0:
    # Without this the division below would silently produce NaNs.
    raise FileNotFoundError(
        f"No adjacency matrices for dset={dset!r}, theta={theta!r}, "
        f"version={version!r} found in {folder!r}"
    )
W_est = W_est/count
print(count)

tau = float(theta)
print("tau:",tau)
# NOTE(review): the result of DAG_retreive_np is replaced by the plain
# average on the next line, so the averaged matrix is what gets plotted.
# The call is kept in case it has side effects — confirm which was meant.
maxed_adj = DAG_retreive_np(W_est, 0)
maxed_adj = W_est
plot_DAG(maxed_adj, graphic_type = "py", names = list(df.columns), name= "adult", version=theta, output_folder=output_folder)

25
tau: 0.08


### Data Size Experiment

In [96]:
# Set to True to re-run the Adult data-size experiment for version
# 'adult_refine'; with False the cell does nothing.
rerun_size_refine = False
if rerun_size_refine:
    %run -i main_realdata.py --version="adult_refine" --out_folds=5 --in_folds=5 --dataset_szs="100","500","1000","2000","5000","10000","20000" --thetas="0.08"

    # The commented block below repeats the constraint lists used to build
    # `partial_mat` earlier in this notebook.
    ## Define what cannot be caused by other variables
    # prime_causes = ['age', 'sex', 'race', 'native-country']
    # only_effects = ['target','capital-gain', 'capital-loss']
    # not_caused_by = [('education',['occupation','hours-per-week'])
    #                 ,('education-num',['occupation','hours-per-week'])
    #                 ,('fnlwgt',['occupation','hours-per-week'])
    #                 ,('relationship',['occupation','hours-per-week'])
    #                 ,('marital-status',['occupation','hours-per-week'])
    #                 ]

In [97]:
# Injected column (tau = 0.08) of the 'adult_refine' run, default sizes.
format_tab(collect_and_build_tab(version = f'adult_refine', metric = 'auc', tau = 0.08, cols = ["Injected"]))

\begin{tabular}{rc}
Dataset & \multicolumn{1}{c}{Injected}\\ 
\hline 
100   &  \!0.69 (0.04)\! \\
500   &  \!0.74 (0.02)\! \\
1000  &  \!0.76 (0.02)\! \\
2000  &  \!0.77 (0.02)\! \\
5000  &  \!0.79 (0.03)\! \\
10000 &  \!0.85 (0.01)\! \\
20000 &  \!0.86 (0.01)\! \\
\hline
\end{tabular}




In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.

