## Fico

In [18]:
import os
import pandas as pd
import numpy as np
from utils import load_pickle
# --- Fico: experiment settings and per-threshold edge counts ---------------
save = True

version = 'fico_3'
metric = 'auc'

dset='100000'
# Signed-decimal pattern. Raw string so `\d` / `\.` reach the regex engine
# untouched. NOTE: the loop below rebinds `theta` to the threshold text
# captured from each matching file name.
theta = r'(([+-])?\d(\.\d+)?)'

folder = "./results/adjmats/"

import re

# Built once instead of on every iteration. The raw f-string yields exactly the
# same pattern text as before, without the invalid `\d` string escapes.
# Groups: two single digits (unpacked as out_f, in_f) and the threshold text.
adjmat_pattern = rf'W_est.{dset}.(\d).(\d).(.*?).{version}.pkl$'

list_edges = []
for filename in os.listdir(folder):
    match = re.search(adjmat_pattern, filename)
    if match:
        out_f, in_f, theta = match.groups()
        w_est = load_pickle(os.path.join(folder, filename), verbose=False)
        # Number of strictly positive entries in the stored matrix.
        edges = int((np.asarray(w_est) > 0).sum())
        list_edges.append((float(theta), out_f, in_f, edges))

if not list_edges:
    # Without this the groupby below fails with an opaque `KeyError: 0`.
    raise FileNotFoundError(
        f"no adjacency matrices matching {adjmat_pattern!r} found in {folder!r}"
    )

# Mean / std of the edge count per threshold (column 0 = threshold, 3 = edges).
n_edges = pd.DataFrame(list_edges).groupby(0, as_index=False).agg({3 : ['mean', 'std']})
n_edges.columns = n_edges.columns.droplevel()
n_edges.columns = ['theta','N_edges','Std']


# --- Fico: per-threshold score statistics joined with the edge counts -------
folder = "./results/"
filename = f"Nested5FoldCASTLE.Reg.Synth.100000.{version}.pkl"
describe = load_pickle(os.path.join(folder, filename), verbose=False)

# One tuple per stored run: (threshold, 10th element of the key, fold, score).
run_rows = []
for run_key in describe:
    run = describe[run_key]
    run_rows.append((run['theta'], run_key[9], run['fold'], run[metric]))

# Per threshold: number of runs, largest fold id, mean and std of the score.
agg_stats = (
    pd.DataFrame(run_rows)
    .groupby(0, as_index=False)
    .agg({2: ['count', 'max'], 3: ['mean', 'std']})
)
agg_stats.columns = ['theta', 'count', 'folds', 'accuracy_mean', 'accuracy_std']

# Attach the edge statistics, then keep only the first threshold for each
# distinct mean edge count.
agg_stats = agg_stats.merge(n_edges, on='theta').drop_duplicates(subset=['N_edges'])

if version == 'fico2':
    # Hard-coded "normal CASTLE" row, inserted in front of both tables:
    # write it at index -1, shift every index up by one, re-sort.
    agg_stats.loc[-1] = [-1, 5, 5, 0.723377, 0.009611, 405.12, 10.401602]
    agg_stats.index = agg_stats.index + 1
    agg_stats = agg_stats.sort_index()

    n_edges.loc[-1] = [-1, 405.12, 10.401602]
    n_edges.index = n_edges.index + 1
    n_edges = n_edges.sort_index()

# Rows stored with theta == -1 are moved to a small negative x position.
sub = -0.007
agg_stats.loc[agg_stats['theta'].eq(-1), 'theta'] = sub

# Keep thresholds up to `filter_theta`, minus the explicitly excluded ones.
filter_theta = 0.03
filter_list = [0.005, 0.007, 0.009, 0.011]
keep_mask = agg_stats['theta'].le(filter_theta) & ~agg_stats['theta'].isin(filter_list)
agg_stats = agg_stats[keep_mask]
display(agg_stats)

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Legend / axis label for the score: "Accuracy" for accuracy, otherwise the
# metric name in upper case.
metric = metric.capitalize() if metric == 'accuracy' else metric.upper()

# Single plot with two y axes: edge counts on the primary axis, the score on
# the secondary one.
fig = make_subplots(specs=[[{"secondary_y": True}]])

score_trace = go.Scatter(
    x=agg_stats['theta'],
    y=agg_stats['accuracy_mean'],
    name=metric,
    # Error bars are given in data coordinates.
    error_y=dict(type='data', array=agg_stats['accuracy_std'], visible=True),
)
fig.add_trace(score_trace, secondary_y=True)

edge_trace = go.Scatter(
    x=agg_stats['theta'],
    y=agg_stats['N_edges'],
    name="Edges",
    error_y=dict(type='data', array=agg_stats['Std'], visible=True),
)
fig.add_trace(edge_trace, secondary_y=False)

# Layout: empty title, horizontal legend, fixed 600x250 canvas, tight margins.
fig.update_layout(
    title='',
    legend=dict(y=-0.23, x=0.85, orientation="h", xanchor='center', yanchor='bottom'),
    template='plotly_white',
    autosize=True,
    width=600,
    height=250,
    margin=dict(l=10, r=10, b=0, t=10, pad=0),
    font=dict(family='Serif', size=18),
)

# X axis: ticks every 0.01 from 0 up to (excluding) the largest plotted theta.
x_ticks = list(np.round(np.arange(0, max(agg_stats['theta']), 0.01), 3))
fig.update_xaxes(
    showgrid=True,
    title={'text': r'$\text{Threshold } \tau$', 'font': {'size': 18}},
    tickvals=x_ticks,
)

# Y axes: score on the secondary axis, edge count on the primary axis.
fig.update_yaxes(
    showgrid=False,
    nticks=6,
    zeroline=False,
    title={'text': metric},
    secondary_y=True,
)
fig.update_yaxes(title={'text': "Number of Edges"}, secondary_y=False)

# "CASTLE" label in paper coordinates, just below the left end of the plot.
fig.add_annotation(
    x=0.02, y=-0.094,
    text="CASTLE",
    showarrow=False,
    yshift=0,
    xref='paper',
    yref='paper',
)

chosen_tau = 0.004
y_co = agg_stats.loc[agg_stats['theta'] == chosen_tau, 'accuracy_mean']

# Arrow call-outs at the chosen threshold:
# (y position, text, y-axis reference, arrow offset ax, arrow offset ay).
# The y positions and rounded labels are hard-coded.
callouts = [
    (0.7929, str(chosen_tau), 'y2', 0, 170),
    (0.7929, '0.79', 'y2', 300, 0),
    (84.6, '85', 'y', -150, 0),
]
for note_y, note_text, note_yref, note_ax, note_ay in callouts:
    fig.add_annotation(
        x=chosen_tau, y=note_y,
        text=note_text,
        showarrow=True,
        yshift=0,
        xref='x', ax=note_ax,
        yref=note_yref, ay=note_ay,
    )

fig.show()

# --- Fico: write the figure to disk ------------------------------------------
output_folder = "figures"
# exist_ok=True replaces the exists()/mkdir() pair: no check-then-create race,
# and re-running the cell is a no-op when the folder is already there.
os.makedirs(output_folder, exist_ok=True)

out_path = os.path.join(output_folder, f"plot_tauoptim_{version}.png")

if save:
    # Kept as an up-front availability check -- presumably fig.write_image
    # needs kaleido for static export (TODO confirm for the installed plotly).
    import kaleido
    fig.write_image(out_path)



Unnamed: 0,theta,count,folds,accuracy_mean,accuracy_std,N_edges,Std
0,-0.007,25,5,0.795521,0.005878,552.0,0.0
1,0.0,25,5,0.795158,0.003524,276.0,0.0
2,0.004,25,5,0.792953,0.003936,84.6,6.144103
4,0.006,25,5,0.790614,0.002682,60.8,6.812978
6,0.008,25,5,0.780123,0.012351,47.0,4.232808
8,0.01,25,5,0.773345,0.009844,36.4,2.857738
10,0.012,25,5,0.714603,0.1099,29.8,1.979057
11,0.016,25,5,0.55032,0.102716,20.8,2.362908
12,0.02,25,5,0.5,0.0,15.2,1.979057
13,0.024,25,5,0.5,0.0,11.2,0.763763


In [17]:
# --- FICO data: feature table with the binary target as first column --------
dag_type = 'fico'
label = 'RiskPerformance'
csv = './data/fico/WOE_Rud_data.csv'
csv_y = './data/fico/y_data.csv'

df = pd.read_csv(csv)

# Target: the 'Bad' indicator column of the one-hot encoded label, kept as a
# single-column 2-D array (double-bracket selection).
label_frame = pd.read_csv(csv_y)
y = pd.get_dummies(label_frame[label])[['Bad']].to_numpy()

# The target goes in front of all feature columns.
df.insert(loc=0, column=label, value=y)
from utils import DAG_retreive_np, heat_mat, plot_DAG


import re
# --- Average the stored adjacency matrices for one threshold and plot --------
dset = '100000'
theta = '0.006'
version = dag_type+"_3"
folder = "./results/adjmats/"
count = 0
# Accumulator: one row/column per column of df.
W_est = np.zeros((df.shape[1],df.shape[1]))

# Raw f-strings: same pattern text as before, minus the invalid `\w` / `\d`
# string escapes. Chosen once, outside the loop.
if version in ['v3','p1']:
    adjmat_pattern = rf'W_est.{dset}.([\w]+).([\w.]+).{version}.pkl$'
else:
    # Matches files whose threshold text starts with `theta`, optionally
    # followed by further digits.
    adjmat_pattern = rf'W_est.{dset}.(\d).(\d).{theta}(\d+)?.{version}.pkl$'

for filename in os.listdir(folder):
    # The captured groups are not used. The old 3-way unpack would also have
    # raised ValueError on the two-group 'v3'/'p1' pattern.
    if re.search(adjmat_pattern, filename):
        w_est = load_pickle(os.path.join(folder,filename), verbose=False)
        W_est += w_est
        count += 1

if count == 0:
    # Dividing by zero would silently turn W_est into NaNs (0/0) and feed them
    # to the plotting helpers below.
    raise FileNotFoundError(
        f"no adjacency matrices matching {adjmat_pattern!r} found in {folder!r}"
    )
W_est = W_est/count

tau = float(theta)
print("tau:",tau)
maxed_adj = DAG_retreive_np(W_est, tau)
heat_mat(maxed_adj, names= df.columns.values)
plot_DAG(maxed_adj, graphic_type = "py", names = list(df.columns))

tau: 0.006


Unnamed: 0,RiskPerformance,ExternalRiskEstimate_bin_WOE,MSinceOldestTradeOpen_bin_WOE,MSinceMostRecentTradeOpen_bin_WOE,AverageMInFile_bin_WOE,NumSatisfactoryTrades_bin_WOE,NumTrades60Ever2DerogPubRec_bin_WOE,NumTrades90Ever2DerogPubRec_bin_WOE,NumTotalTrades_bin_WOE,NumTradesOpeninLast12M_bin_WOE,PercentTradesNeverDelq_bin_WOE,MSinceMostRecentDelq_bin_WOE,MaxDelq2PublicRecLast12M_bin_WOE,MaxDelqEver_bin_WOE,PercentInstallTrades_bin_WOE,NetFractionInstallBurden_bin_WOE,NumInstallTradesWBalance_bin_WOE,MSinceMostRecentInqexcl7days_bin_WOE,NumInqLast6M_bin_WOE,NumInqLast6Mexcl7days_bin_WOE,NetFractionRevolvingBurden_bin_WOE,NumRevolvingTradesWBalance_bin_WOE,NumBank2NatlTradesWHighUtilization_bin_WOE,PercentTradesWBalance_bin_WOE
RiskPerformance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ExternalRiskEstimate_bin_WOE,0.011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MSinceOldestTradeOpen_bin_WOE,0.0,0.0,0.0,0.0,0.028,0.0,0.0,0.0,0.007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MSinceMostRecentTradeOpen_bin_WOE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AverageMInFile_bin_WOE,0.009,0.021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015,0.0,0.0,0.0,0.0,0.0,0.0
NumSatisfactoryTrades_bin_WOE,0.007,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009,0.015
NumTrades60Ever2DerogPubRec_bin_WOE,0.0,0.016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NumTrades90Ever2DerogPubRec_bin_WOE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NumTotalTrades_bin_WOE,0.0,0.0,0.0,0.0,0.0,0.023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NumTradesOpeninLast12M_bin_WOE,0.0,0.011,0.0,0.0,0.009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Total Number of Edges in G: 42
Max in degree: 14
DAG: True


In [None]:
# Optionally write W_est to a CSV named after the dataset type (no index column).
save_mat = True
if save_mat:
    adj_csv_path = f"{dag_type}_adj_matrix.csv"
    adj_frame = pd.DataFrame(W_est)
    adj_frame.to_csv(adj_csv_path, index=False)

In [94]:
import networkx as nx
# --- Score vs. dataset size, with edge statistics of the injected graph ------
debug=True

W_est = pd.read_csv(f"{dag_type}_adj_matrix.csv").to_numpy()

v = f'{dag_type}_size2'

metric = 'auc'
theta = 0.006 ## fico: 0.006
# size_list = [100,500,1000,4000,8000] ##fico
# size_list = [50,100,200,300,400,506] ## boston
size_list = [100, 200, 300, 400, 500, 1000, 2000, 4000, 8000]
# out_folds = 5 ##fico
out_folds = 1
# metric = 'accuracy' ##fico


def _summarise_runs(describe, metric):
    """Group the stored runs by threshold.

    Returns a frame with the threshold (column 0), the largest fold id
    (column 1) and the mean / std of ``describe[run][metric]`` (column 2).
    """
    rows = [(describe[c]['theta'], describe[c]['fold'], describe[c][metric]) for c in describe]
    return pd.DataFrame(rows).groupby(0, as_index=False).agg({1: ['max'], 2: ['mean', 'std']})


def _edge_stats(W, thresholds, unconstrained=-0.05):
    """Edge count and acyclicity of the graph obtained from W at each threshold.

    Each threshold is passed to ``DAG_retreive_np`` and the result is read as a
    directed graph. For the ``unconstrained`` marker the edge count is taken
    from W itself instead; ``is_DAG`` still refers to the ``DAG_retreive_np``
    output. Returns a frame with columns theta, N_edges, is_DAG.
    """
    rows = []
    for tau in thresholds:
        # from_numpy_array replaces from_numpy_matrix, which networkx 3.0 removed.
        graph = nx.from_numpy_array(DAG_retreive_np(W, tau), create_using=nx.DiGraph, parallel_edges=False)
        edge_count = graph.number_of_edges()
        if tau == unconstrained:
            edge_count = nx.from_numpy_array(W, create_using=nx.DiGraph, parallel_edges=False).number_of_edges()
        rows.append((tau, edge_count, nx.is_directed_acyclic_graph(graph)))
    return pd.DataFrame(rows, columns=['theta', 'N_edges', 'is_DAG'])


per_size_stats = []
for size in size_list:
    filename = f"results/Nested{out_folds}FoldCASTLE.Reg.Synth.{size}.{v}.pkl"
    describe = load_pickle(filename, verbose=False)

    agg_stats = _summarise_runs(describe, metric)
    if debug:
        # Printed before the columns are renamed, so the debug output keeps
        # its two-level header.
        print(size)
        print(agg_stats)
    agg_stats.columns = ['theta', 'folds', 'accuracy_mean', 'accuracy_std']

    # Runs stored with theta == -1 are relabelled to the marker value -0.05.
    agg_stats.loc[agg_stats['theta']==-1,'theta'] = -0.05

    n_edges_DAG = _edge_stats(W_est, agg_stats['theta'])

    # merge() returns a new frame; the result used to be thrown away, so the
    # edge columns never reached agg_stats_all.
    agg_stats = agg_stats.merge(n_edges_DAG,on='theta')
    agg_stats['Dset_sz'] = size

    per_size_stats.append(agg_stats)

# Concatenate once instead of growing a frame inside the loop.
agg_stats_all = pd.concat(per_size_stats) if per_size_stats else pd.DataFrame()

agg_stats_all

# --- Pivot: one row per dataset size, one column per setting -----------------
# Explicit copy: the assignments below must not write into a slice of
# agg_stats_all (avoids pandas' SettingWithCopyWarning).
ds = agg_stats_all[['theta', 'accuracy_mean', 'accuracy_std','Dset_sz']].copy()

# Cell text "mean (std)" with 3 and 2 decimals.
ds['values'] = [f"{mean:.3f} ({std:.2f})" for mean, std in zip(ds['accuracy_mean'], ds['accuracy_std'])]

# The thresholds are replaced by text labels below; switch the column to
# object dtype first so strings are not forced into a float column.
ds['theta'] = ds['theta'].astype(object)
ds.loc[ds['theta']==-0.05,'theta'] = "Unconstrained"
ds.loc[ds['theta']==theta,'theta'] = "Injected"
table = ds.pivot(index='Dset_sz',columns='theta', values='values')

display(table)


def compute_top_2(names):
    r"""Build the LaTeX header cells for a table with column labels *names*.

    Each distinct label, in order of first appearance, becomes one
    `` & \multicolumn{k}{c}{label}`` cell, where ``k`` is the rounded number of
    columns per distinct label.

    Args:
        names: 1-D sequence (list or numpy array) of column labels; repeated
            labels are collapsed.

    Returns:
        The concatenated header cells, or ``''`` when *names* is empty.

    Side effects:
        Prints the de-duplicated labels.
    """
    # Accept plain lists too: the fancy indexing below needs an ndarray.
    names = np.asarray(names)
    if names.size == 0:
        # Avoids dividing by zero distinct labels.
        return ''
    _, first_idx = np.unique(names, return_index=True)
    pcount = round(len(names) / len(first_idx))
    # np.unique sorts the labels; sorting the first-occurrence indices restores
    # the original column order.
    names = names[np.sort(first_idx)]
    print(names)
    return ''.join(
        r' & \multicolumn{' + str(pcount) + '}{c}{' + str(name) + '}'
        for name in names
    )
            
# top1 = '\multirow{2}{*}{Dataset} & \multirow{2}{*}{BC$^{*}$}' + compute_top(table.columns.values[1:]) + '\\\\'
# Header row: "Dataset" plus one cell per column label, ended by a LaTeX line
# break and an \hline. The raw piece keeps `\h` from being an invalid escape.
top1 = 'Dataset' + compute_top_2(table.columns.values) + '\\\\ \n' + r'\hline '


def print_table(table, top):
    r"""Return *table* rendered as a LaTeX tabular.

    The frame's own header is suppressed; *top* is substituted for
    ``\toprule``, the remaining booktabs rules become ``\hline``, every cell is
    wrapped in ``\!``, and percent signs are escaped.
    """
    n_cols = len(table.columns.values)
    latex = table.to_latex(
        escape=False,
        index=True,
        index_names=False,
        header=False,
        # One formatter per column: wrap the value in \! ... \!.
        formatters=[(lambda x: r'\!' + str(x) + r'\!') for _ in range(n_cols)],
        column_format='r' + 'c' * n_cols,
    )
    return (
        latex.replace(r'\toprule', top)
        .replace(r'\midrule', r'\hline')
        .replace(r'\bottomrule', r'\hline')
        .replace('.0%', '%')
        .replace(r'\!%', r'\!0%')
        .replace('%', r'\%')
    )


print(print_table(table, top1))

100
       0   1         2          
         max      mean       std
0 -1.000  10  0.747526  0.012529
1  0.006  10  0.500000  0.000000
200
       0   1         2          
         max      mean       std
0 -1.000  10  0.786898  0.005297
1  0.006  10  0.500000  0.000000
300
       0   1         2          
         max      mean       std
0 -1.000  10  0.790780  0.005481
1  0.006  10  0.714671  0.002736
400
       0   1         2          
         max      mean       std
0 -1.000  10  0.793683  0.001849
1  0.006  10  0.500000  0.000000
500
       0   1         2          
         max      mean       std
0 -1.000  10  0.794163  0.002011
1  0.006  10  0.790653  0.002072
1000
       0   1         2          
         max      mean       std
0 -1.000  10  0.794397  0.003449
1  0.006  10  0.792225  0.002034
2000
       0   1         2          
         max      mean       std
0 -1.000  10  0.794649  0.003458
1  0.006  10  0.793043  0.002927
4000
       0   1         2          
        

theta,Injected,Unconstrained
Dset_sz,Unnamed: 1_level_1,Unnamed: 2_level_1
100,0.500 (0.00),0.748 (0.01)
200,0.500 (0.00),0.787 (0.01)
300,0.715 (0.00),0.791 (0.01)
400,0.500 (0.00),0.794 (0.00)
500,0.791 (0.00),0.794 (0.00)
1000,0.792 (0.00),0.794 (0.00)
2000,0.793 (0.00),0.795 (0.00)
4000,0.777 (0.00),0.797 (0.00)
8000,0.793 (0.00),0.797 (0.00)


['Injected' 'Unconstrained']
\begin{tabular}{rcc}
Dataset & \multicolumn{1}{c}{Injected} & \multicolumn{1}{c}{Unconstrained}\\ 
\hline 
100  &  \!0.500 (0.00)\! &  \!0.748 (0.01)\! \\
200  &  \!0.500 (0.00)\! &  \!0.787 (0.01)\! \\
300  &  \!0.715 (0.00)\! &  \!0.791 (0.01)\! \\
400  &  \!0.500 (0.00)\! &  \!0.794 (0.00)\! \\
500  &  \!0.791 (0.00)\! &  \!0.794 (0.00)\! \\
1000 &  \!0.792 (0.00)\! &  \!0.794 (0.00)\! \\
2000 &  \!0.793 (0.00)\! &  \!0.795 (0.00)\! \\
4000 &  \!0.777 (0.00)\! &  \!0.797 (0.00)\! \\
8000 &  \!0.793 (0.00)\! &  \!0.797 (0.00)\! \\
\hline
\end{tabular}



## Adult

In [5]:
import os
import pandas as pd
import numpy as np
from utils import load_pickle
# --- Adult: experiment settings and per-threshold edge counts ---------------
save = True

version = 'adult_fulltest'
metric = 'auc'

dset='100000'
# Signed-decimal pattern. Raw string so `\d` / `\.` reach the regex engine
# untouched. NOTE: the loop below rebinds `theta` to the threshold text
# captured from each matching file name.
theta = r'(([+-])?\d(\.\d+)?)'

folder = "./results/adjmats/"

import re

# Built once instead of on every iteration. The raw f-string yields exactly the
# same pattern text as before, without the invalid `\d` string escapes.
# Groups: two single digits (unpacked as out_f, in_f) and the threshold text.
adjmat_pattern = rf'W_est.{dset}.(\d).(\d).(.*?).{version}.pkl$'

list_edges = []
for filename in os.listdir(folder):
    match = re.search(adjmat_pattern, filename)
    if match:
        out_f, in_f, theta = match.groups()
        w_est = load_pickle(os.path.join(folder, filename), verbose=False)
        # Number of strictly positive entries in the stored matrix.
        edges = int((np.asarray(w_est) > 0).sum())
        list_edges.append((float(theta), out_f, in_f, edges))

if not list_edges:
    # Without this the groupby below fails with an opaque `KeyError: 0`.
    raise FileNotFoundError(
        f"no adjacency matrices matching {adjmat_pattern!r} found in {folder!r}"
    )

# Mean / std of the edge count per threshold (column 0 = threshold, 3 = edges).
n_edges = pd.DataFrame(list_edges).groupby(0, as_index=False).agg({3 : ['mean', 'std']})
n_edges.columns = n_edges.columns.droplevel()
n_edges.columns = ['theta','N_edges','Std']


# --- Adult: per-threshold score statistics joined with the edge counts ------
folder = "./results/"
filename = f"Nested5FoldCASTLE.Reg.Synth.100000.{version}.pkl"
describe = load_pickle(os.path.join(folder, filename), verbose=False)

# One tuple per stored run: (threshold, 10th element of the key, fold, score).
run_rows = []
for run_key in describe:
    run = describe[run_key]
    run_rows.append((run['theta'], run_key[9], run['fold'], run[metric]))

# Per threshold: number of runs, largest fold id, mean and std of the score.
agg_stats = (
    pd.DataFrame(run_rows)
    .groupby(0, as_index=False)
    .agg({2: ['count', 'max'], 3: ['mean', 'std']})
)
agg_stats.columns = ['theta', 'count', 'folds', 'accuracy_mean', 'accuracy_std']

# Attach the edge statistics, then DROP DUPLICATES: only the first threshold
# for each distinct mean edge count is kept.
agg_stats = agg_stats.merge(n_edges, on='theta').drop_duplicates(subset=['N_edges'])

if version == 'fico2':
    # Hard-coded "normal CASTLE" row, inserted in front of both tables:
    # write it at index -1, shift every index up by one, re-sort.
    agg_stats.loc[-1] = [-1, 5, 5, 0.723377, 0.009611, 405.12, 10.401602]
    agg_stats.index = agg_stats.index + 1
    agg_stats = agg_stats.sort_index()

    n_edges.loc[-1] = [-1, 405.12, 10.401602]
    n_edges.index = n_edges.index + 1
    n_edges = n_edges.sort_index()

# Rows stored with theta == -1 are moved to a small negative x position.
sub = -0.07
agg_stats.loc[agg_stats['theta'].eq(-1), 'theta'] = sub

# Keep thresholds up to `filter_theta`, minus the explicitly excluded ones.
filter_theta = 0.6
filter_list = [0.2, 0.24]
keep_mask = agg_stats['theta'].le(filter_theta) & ~agg_stats['theta'].isin(filter_list)
agg_stats = agg_stats[keep_mask]
display(agg_stats)

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Legend / axis label for the score: "Accuracy" for accuracy, otherwise the
# metric name in upper case.
metric = metric.capitalize() if metric == 'accuracy' else metric.upper()

# Single plot with two y axes: edge counts on the primary axis, the score on
# the secondary one.
fig = make_subplots(specs=[[{"secondary_y": True}]])

score_trace = go.Scatter(
    x=agg_stats['theta'],
    y=agg_stats['accuracy_mean'],
    name=metric,
    # Error bars are given in data coordinates.
    error_y=dict(type='data', array=agg_stats['accuracy_std'], visible=True),
)
fig.add_trace(score_trace, secondary_y=True)

edge_trace = go.Scatter(
    x=agg_stats['theta'],
    y=agg_stats['N_edges'],
    name="Edges",
    error_y=dict(type='data', array=agg_stats['Std'], visible=True),
)
fig.add_trace(edge_trace, secondary_y=False)

# Layout: empty title, horizontal legend, fixed 600x250 canvas, tight margins.
fig.update_layout(
    title='',
    legend=dict(y=-0.23, x=0.85, orientation="h", xanchor='center', yanchor='bottom'),
    template='plotly_white',
    autosize=True,
    width=600,
    height=250,
    margin=dict(l=10, r=10, b=0, t=10, pad=0),
    font=dict(family='Serif', size=18),
)

# X axis: ticks every 0.1 from 0 up to (excluding) the largest plotted theta.
x_ticks = list(np.round(np.arange(0, max(agg_stats['theta']), 0.1), 3))
fig.update_xaxes(
    showgrid=True,
    title={'text': r'$\text{Threshold } \tau$', 'font': {'size': 18}},
    tickvals=x_ticks,
)

# Y axes: score on the secondary axis (no grid), edge count on the primary
# axis (grid and zero line shown).
fig.update_yaxes(
    showgrid=False,
    nticks=6,
    zeroline=False,
    title={'text': metric},
    secondary_y=True,
)
fig.update_yaxes(
    showgrid=True,
    nticks=6,
    zeroline=True,
    title={'text': "Number of Edges"},
    secondary_y=False,
)

# "CASTLE" label in paper coordinates, just below the left end of the plot.
fig.add_annotation(
    x=0.02, y=-0.094,
    text="CASTLE",
    showarrow=False,
    yshift=0,
    xref='paper',
    yref='paper',
)

chosen_tau = 0.08
y_co = agg_stats.loc[agg_stats['theta'] == chosen_tau, 'accuracy_mean']

# Arrow call-outs at the chosen threshold:
# (y position, text, y-axis reference, arrow offset ax, arrow offset ay).
# The y positions and rounded labels are hard-coded.
callouts = [
    (0.8556, str(chosen_tau), 'y2', 0, 165),
    (0.8556, '0.86', 'y2', 300, 0),
    (46.2, '46', 'y', -150, 0),
]
for note_y, note_text, note_yref, note_ax, note_ay in callouts:
    fig.add_annotation(
        x=chosen_tau, y=note_y,
        text=note_text,
        showarrow=True,
        yshift=0,
        xref='x', ax=note_ax,
        yref=note_yref, ay=note_ay,
    )

fig.show()

# --- Adult: write the figure to disk -----------------------------------------
output_folder = "figures"
# exist_ok=True replaces the exists()/mkdir() pair: no check-then-create race,
# and re-running the cell is a no-op when the folder is already there.
os.makedirs(output_folder, exist_ok=True)

out_path = os.path.join(output_folder, f"plot_tauoptim_{version}.png")

if save:
    # Kept as an up-front availability check -- presumably fig.write_image
    # needs kaleido for static export (TODO confirm for the installed plotly).
    import kaleido
    fig.write_image(out_path)

# Last expression of the cell, so the notebook displays it.
y_co

Unnamed: 0,theta,count,folds,accuracy_mean,accuracy_std,N_edges,Std
0,-0.07,25,5,0.747038,0.027878,210.0,0.0
1,0.0,25,5,0.863468,0.00599,105.0,0.0
3,0.08,25,5,0.855648,0.010548,46.2,1.979057
4,0.1,25,5,0.851808,0.006541,40.8,2.614065
5,0.12,25,5,0.850752,0.006104,36.6,2.783882
6,0.14,25,5,0.820374,0.059331,32.2,1.354006
7,0.16,25,5,0.783488,0.06342,30.8,1.190238
8,0.18,20,5,0.65379,0.161708,29.75,1.332785
11,0.28,5,5,0.5,0.0,27.0,0.0
12,0.32,5,5,0.5,0.0,25.0,0.0


3    0.855648
Name: accuracy_mean, dtype: float64

## Boston

In [15]:
import re
from utils import load_pickle
import pandas as pd
import numpy as np

# --- Boston: experiment settings and per-threshold edge counts --------------
# This cell's import list has no `os`, although the code below uses it;
# importing here lets the cell run without relying on an earlier cell.
import os

save = True
dset='100000'
version = 'boston'#'(adult2|adult)'
metric = 'MSE'
# Signed-decimal pattern. Raw string so `\d` / `\.` reach the regex engine
# untouched. NOTE: the loop below rebinds `theta` to the threshold text
# captured from each matching file name.
theta = r'(([+-])?\d(\.\d+)?)'
folder = "./results/adjmats/"

# Built once instead of on every iteration. The raw f-string yields exactly the
# same pattern text as before, without the invalid `\d` string escapes.
# Groups: two single digits (unpacked as out_f, in_f) and the threshold text.
adjmat_pattern = rf'W_est.{dset}.(\d).(\d).(.*?).{version}.pkl$'

list_edges = []
for filename in os.listdir(folder):
    match = re.search(adjmat_pattern, filename)
    if match:
        out_f, in_f, theta = match.groups()
        w_est = load_pickle(os.path.join(folder, filename), verbose=False)
        # Number of strictly positive entries in the stored matrix.
        edges = int((np.asarray(w_est) > 0).sum())
        list_edges.append((float(theta), out_f, in_f, edges))

if not list_edges:
    # Without this the groupby below fails with an opaque `KeyError: 0`.
    raise FileNotFoundError(
        f"no adjacency matrices matching {adjmat_pattern!r} found in {folder!r}"
    )

# Mean / std of the edge count per threshold (column 0 = threshold, 3 = edges).
n_edges = pd.DataFrame(list_edges).groupby(0, as_index=False).agg({3 : ['mean', 'std']})
n_edges.columns = n_edges.columns.droplevel()
n_edges.columns = ['theta','N_edges','Std']

# --- Boston: per-threshold score statistics joined with the edge counts -----
folder = "./results/"
filename = f"Nested5FoldCASTLE.Reg.Synth.100000.{version}.pkl"
describe = load_pickle(os.path.join(folder, filename), verbose=False)

# One tuple per stored run: (threshold, 10th element of the key, fold, score).
run_rows = []
for run_key in describe:
    run = describe[run_key]
    run_rows.append((run['theta'], run_key[9], run['fold'], run[metric]))

# Per threshold: number of runs, largest fold id, mean and std of the score.
agg_stats = (
    pd.DataFrame(run_rows)
    .groupby(0, as_index=False)
    .agg({2: ['count', 'max'], 3: ['mean', 'std']})
)
agg_stats.columns = ['theta', 'count', 'folds', 'accuracy_mean', 'accuracy_std']

# Attach the edge statistics, then keep only the first threshold for each
# distinct mean edge count.
agg_stats = agg_stats.merge(n_edges, on='theta').drop_duplicates(subset=['N_edges'])

# Rows stored with theta == -1 are moved to a small negative x position.
sub = -0.17
agg_stats.loc[agg_stats['theta'].eq(-1), 'theta'] = sub

# Keep thresholds up to `filter_theta`; thresholds are rounded to 3 decimals
# before being checked against the (currently empty) exclusion list.
filter_theta = 0.8
filter_list = []
keep_mask = agg_stats['theta'].le(filter_theta) & ~agg_stats['theta'].round(3).isin(filter_list)
agg_stats = agg_stats[keep_mask]
display(agg_stats)

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Legend / axis label for the score: "Accuracy" for accuracy, otherwise the
# metric name in upper case.
metric = metric.capitalize() if metric == 'accuracy' else metric.upper()

# Single plot with two y axes: edge counts on the primary axis, the score on
# the secondary one.
fig = make_subplots(specs=[[{"secondary_y": True}]])

score_trace = go.Scatter(
    x=agg_stats['theta'],
    y=agg_stats['accuracy_mean'],
    name=metric,
    # Error bars are given in data coordinates.
    error_y=dict(type='data', array=agg_stats['accuracy_std'], visible=True),
)
fig.add_trace(score_trace, secondary_y=True)

edge_trace = go.Scatter(
    x=agg_stats['theta'],
    y=agg_stats['N_edges'],
    name="Edges",
    error_y=dict(type='data', array=agg_stats['Std'], visible=True),
)
fig.add_trace(edge_trace, secondary_y=False)

# Layout: empty title, horizontal legend, fixed 600x250 canvas, tight margins.
fig.update_layout(
    title='',
    legend=dict(y=-0.23, x=0.85, orientation="h", xanchor='center', yanchor='bottom'),
    template='plotly_white',
    autosize=True,
    width=600,
    height=250,
    margin=dict(l=10, r=10, b=0, t=10, pad=0),
    font=dict(family='Serif', size=18),
)

# X axis: ticks every 0.3 from 0 up to (excluding) the largest plotted theta.
x_ticks = list(np.round(np.arange(0, max(agg_stats['theta']), 0.3), 3))
fig.update_xaxes(
    showgrid=True,
    title={'text': r'$\text{Threshold } \tau$', 'font': {'size': 18}},
    tickvals=x_ticks,
)

# Y axes: score on the secondary axis (no grid), edge count on the primary
# axis (grid and zero line shown).
fig.update_yaxes(
    showgrid=False,
    nticks=6,
    zeroline=False,
    title={'text': metric},
    secondary_y=True,
)
fig.update_yaxes(
    showgrid=True,
    nticks=6,
    zeroline=True,
    title={'text': "Number of Edges"},
    secondary_y=False,
)

# "CASTLE" label in paper coordinates, just below the left end of the plot.
fig.add_annotation(
    x=0.02, y=-0.094,
    text="CASTLE",
    showarrow=False,
    yshift=0,
    xref='paper',
    yref='paper',
)

chosen_tau = 0.13
y_co = agg_stats.loc[agg_stats['theta'] == chosen_tau, 'accuracy_mean']

# Arrow call-outs at the chosen threshold:
# (y position, text, y-axis reference, arrow offset ax, arrow offset ay).
# The y positions and rounded labels are hard-coded.
callouts = [
    (20.363, str(chosen_tau), 'y2', 0, 80),
    (20.363, '20.4', 'y2', 300, 0),
    (47.8, '48', 'y', -150, 0),
]
for note_y, note_text, note_yref, note_ax, note_ay in callouts:
    fig.add_annotation(
        x=chosen_tau, y=note_y,
        text=note_text,
        showarrow=True,
        yshift=0,
        xref='x', ax=note_ax,
        yref=note_yref, ay=note_ay,
    )

fig.show()

# --- Boston: write the figure to disk ----------------------------------------
output_folder = "figures"
# exist_ok=True replaces the exists()/mkdir() pair: no check-then-create race,
# and re-running the cell is a no-op when the folder is already there.
os.makedirs(output_folder, exist_ok=True)

out_path = os.path.join(output_folder, f"plot_tauoptim_{version}.png")

if save:
    # Kept as an up-front availability check -- presumably fig.write_image
    # needs kaleido for static export (TODO confirm for the installed plotly).
    import kaleido
    fig.write_image(out_path)



Unnamed: 0,theta,count,folds,accuracy_mean,accuracy_std,N_edges,Std
0,-0.17,25,5,22.231392,7.387012,182.0,0.0
1,0.0,25,5,18.50769,7.04278,91.0,0.0
15,0.037,10,5,16.818329,6.78389,90.5,0.527046
16,0.066,25,5,18.437362,6.868943,86.0,1.290994
17,0.132,25,5,20.363651,5.233823,47.8,7.942502
18,0.198,25,5,40.765085,27.205065,32.8,8.098354
19,0.264,25,5,41.8767,27.307922,23.0,6.390097
20,0.33,25,5,62.497449,29.58318,17.0,5.91608
21,0.396,25,5,84.998406,12.277859,12.2,5.416026
22,0.462,25,5,84.606197,12.862441,9.8,4.804512


## Cali

In [25]:
import re
from utils import load_pickle
import pandas as pd
import numpy as np

# --- Cali: experiment settings and per-threshold edge counts ----------------
# This cell's import list has no `os`, although the code below uses it;
# importing here lets the cell run without relying on an earlier cell.
import os

save = True
dset='100000'
version = 'cali'#'(adult2|adult)'
metric = 'MSE'
# Signed-decimal pattern. Raw string so `\d` / `\.` reach the regex engine
# untouched. NOTE: the loop below rebinds `theta` to the threshold text
# captured from each matching file name.
theta = r'(([+-])?\d(\.\d+)?)'
folder = "./results/adjmats/"

# Built once instead of on every iteration. The raw f-string yields exactly the
# same pattern text as before, without the invalid `\d` string escapes.
# Groups: two single digits (unpacked as out_f, in_f) and the threshold text.
adjmat_pattern = rf'W_est.{dset}.(\d).(\d).(.*?).{version}.pkl$'

list_edges = []
for filename in os.listdir(folder):
    match = re.search(adjmat_pattern, filename)
    if match:
        out_f, in_f, theta = match.groups()
        w_est = load_pickle(os.path.join(folder, filename), verbose=False)
        # Number of strictly positive entries in the stored matrix.
        edges = int((np.asarray(w_est) > 0).sum())
        list_edges.append((float(theta), out_f, in_f, edges))

if not list_edges:
    # Without this the groupby below fails with an opaque `KeyError: 0`.
    raise FileNotFoundError(
        f"no adjacency matrices matching {adjmat_pattern!r} found in {folder!r}"
    )

# Mean / std of the edge count per threshold (column 0 = threshold, 3 = edges).
n_edges = pd.DataFrame(list_edges).groupby(0, as_index=False).agg({3 : ['mean', 'std']})
n_edges.columns = n_edges.columns.droplevel()
n_edges.columns = ['theta','N_edges','Std']

# --- Cali: per-threshold score statistics joined with the edge counts -------
folder = "./results/"
filename = f"Nested5FoldCASTLE.Reg.Synth.100000.{version}.pkl"
describe = load_pickle(os.path.join(folder, filename), verbose=False)

# One tuple per stored run: (threshold, 10th element of the key, fold, score).
run_rows = []
for run_key in describe:
    run = describe[run_key]
    run_rows.append((run['theta'], run_key[9], run['fold'], run[metric]))

# Per threshold: number of runs, largest fold id, mean and std of the score.
agg_stats = (
    pd.DataFrame(run_rows)
    .groupby(0, as_index=False)
    .agg({2: ['count', 'max'], 3: ['mean', 'std']})
)
agg_stats.columns = ['theta', 'count', 'folds', 'accuracy_mean', 'accuracy_std']

# Attach the edge statistics, then keep only the first threshold for each
# distinct mean edge count.
agg_stats = agg_stats.merge(n_edges, on='theta').drop_duplicates(subset=['N_edges'])

# Rows stored with theta == -1 are moved to a small negative x position.
sub = -0.5
agg_stats.loc[agg_stats['theta'].eq(-1), 'theta'] = sub

# Keep thresholds up to `filter_theta`; thresholds are rounded to 3 decimals
# before being checked against the exclusion list.
filter_theta = 3
# Earlier selection: [0.1,0.188,0.2,0.3,0.35,0.376,0.45,0.65,0.7,0.8,0.85,0.9,0.95]
filter_list = [
    0.1, 0.188, 0.2, 0.25, 0.3, 0.4, 0.45, 0.376, 0.55, 0.65,
    0.7, 0.75, 0.8, 0.85, 0.9, 0.94, 1.504, 1.88, 2.256, 2.632,
]
keep_mask = agg_stats['theta'].le(filter_theta) & ~agg_stats['theta'].round(3).isin(filter_list)
agg_stats = agg_stats[keep_mask]
display(agg_stats)

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Single plot with two y axes: edge counts on the primary axis, the score on
# the secondary one.
fig = make_subplots(specs=[[{"secondary_y": True}]])

score_trace = go.Scatter(
    x=agg_stats['theta'],
    y=agg_stats['accuracy_mean'],
    name=metric,
    # Error bars are given in data coordinates.
    error_y=dict(type='data', array=agg_stats['accuracy_std'], visible=True),
)
fig.add_trace(score_trace, secondary_y=True)

edge_trace = go.Scatter(
    x=agg_stats['theta'],
    y=agg_stats['N_edges'],
    name="Edges",
    error_y=dict(type='data', array=agg_stats['Std'], visible=True),
)
fig.add_trace(edge_trace, secondary_y=False)

# Layout: empty title, horizontal legend, fixed 600x250 canvas, tight margins.
fig.update_layout(
    title='',
    legend=dict(y=-0.23, x=0.85, orientation="h", xanchor='center', yanchor='bottom'),
    template='plotly_white',
    autosize=True,
    width=600,
    height=250,
    margin=dict(l=10, r=10, b=0, t=10, pad=0),
    font=dict(family='Serif', size=18),
)

# X axis: ticks every 0.5 from 0 up to (excluding) the largest plotted theta.
x_ticks = list(np.round(np.arange(0, max(agg_stats['theta']), 0.5), 1))
fig.update_xaxes(
    showgrid=True,
    title={'text': r'$\text{Threshold } \tau$', 'font': {'size': 18}},
    tickvals=x_ticks,
)

# Y axes: edge count on the primary axis, score on the secondary axis.
fig.update_yaxes(title={'text': "Number of Edges"}, secondary_y=False)
fig.update_yaxes(
    showgrid=False,
    nticks=6,
    zeroline=False,
    title={'text': metric},
    secondary_y=True,
)

# "CASTLE" label in paper coordinates, just below the left end of the plot.
fig.add_annotation(
    x=0.02, y=-0.094,
    text="CASTLE",
    showarrow=False,
    yshift=0,
    xref='paper',
    yref='paper',
)

chosen_tau = 0.05
y_co = agg_stats.loc[agg_stats['theta'] == chosen_tau, 'accuracy_mean']

# Arrow call-outs at the chosen threshold:
# (y position, text, y-axis reference, arrow offset ax, arrow offset ay).
# The y positions and rounded labels are hard-coded.
callouts = [
    (1.0185, str(chosen_tau), 'y2', 0, 90),
    (1.0185, '1.02', 'y2', 350, 0),
    (30.6, '31', 'y', -100, 0),
]
for note_y, note_text, note_yref, note_ax, note_ay in callouts:
    fig.add_annotation(
        x=chosen_tau, y=note_y,
        text=note_text,
        showarrow=True,
        yshift=0,
        xref='x', ax=note_ax,
        yref=note_yref, ay=note_ay,
    )

fig.show()

# --- Cali: write the figure to disk ------------------------------------------
output_folder = "figures"
# exist_ok=True replaces the exists()/mkdir() pair: no check-then-create race,
# and re-running the cell is a no-op when the folder is already there.
os.makedirs(output_folder, exist_ok=True)

out_path = os.path.join(output_folder, f"plot_tauoptim_{version}.png")

if save:
    # Kept as an up-front availability check -- presumably fig.write_image
    # needs kaleido for static export (TODO confirm for the installed plotly).
    import kaleido
    fig.write_image(out_path)



Unnamed: 0,theta,count,folds,accuracy_mean,accuracy_std,N_edges,Std
0,-0.5,25,5,0.662208,0.081241,72.0,0.0
1,0.0,25,5,1.062381,0.312212,36.0,0.0
2,0.05,25,5,1.018462,0.348869,30.6,2.929733
4,0.15,25,5,1.049525,0.365987,23.4,0.816497
9,0.35,25,5,1.081789,0.323505,18.4,1.527525
13,0.5,25,5,1.083272,0.321736,16.8,1.5
20,0.752,20,5,1.174837,0.294267,14.25,0.850696
25,0.95,15,5,1.339808,0.030016,11.0,2.236068
27,1.316,20,5,1.339957,0.023627,8.75,1.681947
29,1.692,25,5,1.331741,0.02689,6.2,1.0
