# MVPC Pipeline — Running MVPC on Synthetic Data
This notebook loads the synthetic datasets generated in `01_generate_synthetic_data.ipynb` and runs the full MVPC pipeline:
- missingness‑parent detection  
- initial skeleton extraction  
- corrected skeleton search  
- orientation of the corrected skeleton  


In [1]:
import sys
import os

# Path to the project root (one level above the notebooks folder)
project_root = os.path.abspath("..")

# Add to Python path if not already present
if project_root not in sys.path:
    sys.path.append(project_root)

print("Project root added:", project_root)

Project root added: /home/zervaki/Thesis_New


In [2]:
%load_ext autoreload
%autoreload 2


In [26]:

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm


from mvpc.mvpc_pipeline import MVPC
from mvpc.ci_tests.gauss_permc import gauss_ci_td, gauss_ci_permc
from mvpc.ci_tests.gauss_drw import gauss_ci_drw

# binary tests
from mvpc.ci_tests.bin_drw import bin_ci_drw
from mvpc.ci_tests.bin_permc import bin_ci_permc
from mvpc.ci_tests.bin_td import bin_ci_td  

from mvpc.missingness import detection_prt_m
from mvpc.skeleton import skeleton2


In [27]:

from mvpc.ci_tests.gauss_permc import PERMC_COUNTER, PERMC_FAIL 
from mvpc.ci_tests.gauss_drw import DRW_COUNTER

In [28]:
# Load MAR / MNAR data 
data_mar = pd.read_csv("../data/synthetic_data_csv_files/missing_mar.csv").values 
data_mnar = pd.read_csv("../data/synthetic_data_csv_files/missing_mnar.csv").values 
print("Shapes:", data_mar.shape, data_mnar.shape)

Shapes: (2000, 20) (2000, 20)


In [29]:
# --- Define three MVPC variants --- 
mvpc_td = MVPC( 
    indep_test=gauss_ci_td, 
    corr_test=gauss_ci_td, 
    alpha=0.05 
)
 
mvpc_permc = MVPC( 
    indep_test=gauss_ci_td, 
    corr_test=gauss_ci_permc, 
    alpha=0.05 
)

mvpc_drw = MVPC( 
    indep_test=gauss_ci_td, 
    corr_test=gauss_ci_drw, 
    alpha=0.05 
)

In [30]:
# --- Run all methods on MAR / MNAR ---

res_mar_td = mvpc_td.run(data_mar) 
res_mnar_td = mvpc_td.run(data_mnar)

res_mar_permc = mvpc_permc.run(data_mar) 
res_mnar_permc = mvpc_permc.run(data_mnar)

res_mar_drw = mvpc_drw.run(data_mar) 
res_mnar_drw = mvpc_drw.run(data_mnar)

[Step 1] m_inds (vars with NaNs): [1, 2, 5, 7, 14, 16]


Detecting parents of missingness indicators:   0%|          | 0/6 [00:00<?, ?it/s]

[Step 1] R_ind=1, parents=[np.int64(15)]




[Step 1] R_ind=2, parents=[np.int64(4), np.int64(8), np.int64(13)]




[Step 1] R_ind=5, parents=[np.int64(0)]




[Step 1] R_ind=7, parents=[np.int64(4)]




[Step 1] R_ind=14, parents=[np.int64(11)]


Detecting parents of missingness indicators: 100%|██████████| 6/6 [00:00<00:00, 202.67it/s]


[Step 1] R_ind=16, parents=[np.int64(13)]
[Step 1] m_inds_filtered (with ≥1 parent): [1, 2, 5, 7, 14, 16]
[Step 1] prt dict: {1: [np.int64(15)], 2: [np.int64(4), np.int64(8), np.int64(13)], 5: [np.int64(0)], 7: [np.int64(4)], 14: [np.int64(11)], 16: [np.int64(13)]}


                                                                 

[Step 1] m_inds (vars with NaNs): [1, 2, 4, 7, 8, 15]


Detecting parents of missingness indicators:   0%|          | 0/6 [00:00<?, ?it/s]

[Step 1] R_ind=1, parents=[np.int64(15)]




[Step 1] R_ind=2, parents=[np.int64(8), np.int64(13)]




[Step 1] R_ind=4, parents=[np.int64(1), np.int64(16)]




[Step 1] R_ind=7, parents=[np.int64(4)]




[Step 1] R_ind=8, parents=[np.int64(10)]


Detecting parents of missingness indicators: 100%|██████████| 6/6 [00:00<00:00, 217.57it/s]


[Step 1] R_ind=15, parents=[np.int64(5)]
[Step 1] m_inds_filtered (with ≥1 parent): [1, 2, 4, 7, 8, 15]
[Step 1] prt dict: {1: [np.int64(15)], 2: [np.int64(8), np.int64(13)], 4: [np.int64(1), np.int64(16)], 7: [np.int64(4)], 8: [np.int64(10)], 15: [np.int64(5)]}


                                                                 

[Step 1] m_inds (vars with NaNs): [1, 2, 5, 7, 14, 16]


Detecting parents of missingness indicators:   0%|          | 0/6 [00:00<?, ?it/s]

[Step 1] R_ind=1, parents=[np.int64(15)]




[Step 1] R_ind=2, parents=[np.int64(4), np.int64(8), np.int64(13)]




[Step 1] R_ind=5, parents=[np.int64(0)]




[Step 1] R_ind=7, parents=[np.int64(4)]




[Step 1] R_ind=14, parents=[np.int64(11)]


Detecting parents of missingness indicators: 100%|██████████| 6/6 [00:00<00:00, 206.82it/s]


[Step 1] R_ind=16, parents=[np.int64(13)]
[Step 1] m_inds_filtered (with ≥1 parent): [1, 2, 5, 7, 14, 16]
[Step 1] prt dict: {1: [np.int64(15)], 2: [np.int64(4), np.int64(8), np.int64(13)], 5: [np.int64(0)], 7: [np.int64(4)], 14: [np.int64(11)], 16: [np.int64(13)]}


                                                                 

[Step 1] m_inds (vars with NaNs): [1, 2, 4, 7, 8, 15]


Detecting parents of missingness indicators:   0%|          | 0/6 [00:00<?, ?it/s]

[Step 1] R_ind=1, parents=[np.int64(15)]




[Step 1] R_ind=2, parents=[np.int64(8), np.int64(13)]




[Step 1] R_ind=4, parents=[np.int64(1), np.int64(16)]




[Step 1] R_ind=7, parents=[np.int64(4)]




[Step 1] R_ind=8, parents=[np.int64(10)]


Detecting parents of missingness indicators: 100%|██████████| 6/6 [00:00<00:00, 223.33it/s]


[Step 1] R_ind=15, parents=[np.int64(5)]
[Step 1] m_inds_filtered (with ≥1 parent): [1, 2, 4, 7, 8, 15]
[Step 1] prt dict: {1: [np.int64(15)], 2: [np.int64(8), np.int64(13)], 4: [np.int64(1), np.int64(16)], 7: [np.int64(4)], 8: [np.int64(10)], 15: [np.int64(5)]}


                                                                 

[Step 1] m_inds (vars with NaNs): [1, 2, 5, 7, 14, 16]


Detecting parents of missingness indicators:   0%|          | 0/6 [00:00<?, ?it/s]

[Step 1] R_ind=1, parents=[np.int64(15)]




[Step 1] R_ind=2, parents=[np.int64(4), np.int64(8), np.int64(13)]




[Step 1] R_ind=5, parents=[np.int64(0)]




[Step 1] R_ind=7, parents=[np.int64(4)]




[Step 1] R_ind=14, parents=[np.int64(11)]


Detecting parents of missingness indicators: 100%|██████████| 6/6 [00:00<00:00, 200.83it/s]


[Step 1] R_ind=16, parents=[np.int64(13)]
[Step 1] m_inds_filtered (with ≥1 parent): [1, 2, 5, 7, 14, 16]
[Step 1] prt dict: {1: [np.int64(15)], 2: [np.int64(4), np.int64(8), np.int64(13)], 5: [np.int64(0)], 7: [np.int64(4)], 14: [np.int64(11)], 16: [np.int64(13)]}


                                                                           

[Step 1] m_inds (vars with NaNs): [1, 2, 4, 7, 8, 15]


Detecting parents of missingness indicators:   0%|          | 0/6 [00:00<?, ?it/s]

[Step 1] R_ind=1, parents=[np.int64(15)]




[Step 1] R_ind=2, parents=[np.int64(8), np.int64(13)]




[Step 1] R_ind=4, parents=[np.int64(1), np.int64(16)]




[Step 1] R_ind=7, parents=[np.int64(4)]




[Step 1] R_ind=8, parents=[np.int64(10)]


Detecting parents of missingness indicators: 100%|██████████| 6/6 [00:00<00:00, 153.82it/s]


[Step 1] R_ind=15, parents=[np.int64(5)]
[Step 1] m_inds_filtered (with ≥1 parent): [1, 2, 4, 7, 8, 15]
[Step 1] prt dict: {1: [np.int64(15)], 2: [np.int64(8), np.int64(13)], 4: [np.int64(1), np.int64(16)], 7: [np.int64(4)], 8: [np.int64(10)], 15: [np.int64(5)]}


                                                                           

In [31]:
# --- Inspect missingness parents (Step 1, TD-based) ---
print("=== MAR prt_m (TD) ===") 
print("m:", res_mar_td["prt_m"]["m"]) 
print("prt:", res_mar_td["prt_m"]["prt"])

=== MAR prt_m (TD) ===
m: [1, 2, 5, 7, 14, 16]
prt: {1: [np.int64(15)], 2: [np.int64(4), np.int64(8), np.int64(13)], 5: [np.int64(0)], 7: [np.int64(4)], 14: [np.int64(11)], 16: [np.int64(13)]}


In [32]:
print("\n=== MNAR prt_m (TD) ===") 
print("m:", res_mnar_td["prt_m"]["m"]) 
print("prt:", res_mnar_td["prt_m"]["prt"])


=== MNAR prt_m (TD) ===
m: [1, 2, 4, 7, 8, 15]
prt: {1: [np.int64(15)], 2: [np.int64(8), np.int64(13)], 4: [np.int64(1), np.int64(16)], 7: [np.int64(4)], 8: [np.int64(10)], 15: [np.int64(5)]}


In [33]:
# Optionally show that DRW / PermC share the same prt_m

print("\nCheck prt_m consistency across methods (MAR):") 
print("TD: ", res_mar_td["prt_m"]["prt"]) 
print("PermC:", res_mar_permc["prt_m"]["prt"]) 
print("DRW: ", res_mar_drw["prt_m"]["prt"])


Check prt_m consistency across methods (MAR):
TD:  {1: [np.int64(15)], 2: [np.int64(4), np.int64(8), np.int64(13)], 5: [np.int64(0)], 7: [np.int64(4)], 14: [np.int64(11)], 16: [np.int64(13)]}
PermC: {1: [np.int64(15)], 2: [np.int64(4), np.int64(8), np.int64(13)], 5: [np.int64(0)], 7: [np.int64(4)], 14: [np.int64(11)], 16: [np.int64(13)]}
DRW:  {1: [np.int64(15)], 2: [np.int64(4), np.int64(8), np.int64(13)], 5: [np.int64(0)], 7: [np.int64(4)], 14: [np.int64(11)], 16: [np.int64(13)]}


In [34]:
# --- Ground truth missingness parents --- 

gt_mar = pd.read_csv("../data/synthetic_data_csv_files/missingness_structure_mar.csv") 
true_ms_mar = gt_mar["m_ind"].tolist() 
true_prt_ms_mar = gt_mar["parent_m_ind"].tolist() 
true_prt_mar = {m: p for m, p in zip(true_ms_mar, true_prt_ms_mar)}

In [35]:
gt_mnar = pd.read_csv("../data/synthetic_data_csv_files/missingness_structure_mnar.csv") 
true_ms_mnar = gt_mnar["m_ind"].tolist() 
true_prt_ms_mnar = gt_mnar["parent_m_ind"].tolist() 
true_prt_mnar = {m: p for m, p in zip(true_ms_mnar, true_prt_ms_mnar)}

In [36]:
def evaluate_missingness_parents(true_prt, est_prt): 
    rows = [] 
    for m, true_parent in true_prt.items(): 
        est = est_prt.get(m, []) 
        rows.append({ 
            "R_ind": m, 
            "true_parent": true_parent, 
            "estimated_parents": est, 
            "correct": true_parent in est 
        }) 
    return pd.DataFrame(rows)

In [37]:
eval_mar_td = evaluate_missingness_parents(true_prt_mar, res_mar_td["prt_m"]["prt"]) 
eval_mnar_td = evaluate_missingness_parents(true_prt_mnar, res_mnar_td["prt_m"]["prt"])

In [38]:
eval_mar_drw = evaluate_missingness_parents(true_prt_mar, res_mar_drw["prt_m"]["prt"]) 
eval_mnar_drw = evaluate_missingness_parents(true_prt_mnar, res_mnar_drw["prt_m"]["prt"])

In [39]:
print(pd.concat([ 
    eval_mar_td.assign(method="TD"), 
    eval_mar_drw.assign(method="DRW") 
]))

   R_ind  true_parent estimated_parents  correct method
0      2            8        [4, 8, 13]     True     TD
1      7            4               [4]     True     TD
2      1           15              [15]     True     TD
3      5            0               [0]     True     TD
4     16           13              [13]     True     TD
5     14           11              [11]     True     TD
0      2            8        [4, 8, 13]     True    DRW
1      7            4               [4]     True    DRW
2      1           15              [15]     True    DRW
3      5            0               [0]     True    DRW
4     16           13              [13]     True    DRW
5     14           11              [11]     True    DRW


In [40]:
print(pd.concat([ 
    eval_mnar_td.assign(method="TD"), 
    eval_mnar_drw.assign(method="DRW") 
]))

   R_ind  true_parent estimated_parents  correct method
0      2            8           [8, 13]     True     TD
1      7            4               [4]     True     TD
2      1           15              [15]     True     TD
3      8           10              [10]     True     TD
4      4           16           [1, 16]     True     TD
5     15            5               [5]     True     TD
0      2            8           [8, 13]     True    DRW
1      7            4               [4]     True    DRW
2      1           15              [15]     True    DRW
3      8           10              [10]     True    DRW
4      4           16           [1, 16]     True    DRW
5     15            5               [5]     True    DRW


In [41]:
# --- SHD for corrected skeletons ---


def shd_skeleton(G_est, G_true): 
    G_est_sym = ((G_est + G_est.T) > 0).astype(int) 
    G_true_sym = ((G_true + G_true.T) > 0).astype(int) 
    return np.sum(G_est_sym != G_true_sym)

In [42]:
adj_true_mar = pd.read_csv("../data/synthetic_data_csv_files/adjacency_mar.csv").values 
adj_true_mnar = pd.read_csv("../data/synthetic_data_csv_files/adjacency_mnar.csv").values

In [43]:
shd_td_mar = shd_skeleton(res_mar_td["G_corrected"], adj_true_mar) 
shd_permc_mar = shd_skeleton(res_mar_permc["G_corrected"], adj_true_mar) 
shd_drw_mar = shd_skeleton(res_mar_drw["G_corrected"], adj_true_mar)

In [44]:
shd_td_mnar = shd_skeleton(res_mnar_td["G_corrected"], adj_true_mnar) 
shd_permc_mnar = shd_skeleton(res_mnar_permc["G_corrected"], adj_true_mnar) 
shd_drw_mnar = shd_skeleton(res_mnar_drw["G_corrected"], adj_true_mnar)

In [45]:
print("MAR SHD:") 
print("TD-PC:", shd_td_mar) 
print("PermC:", shd_permc_mar) 
print("DRW:", shd_drw_mar)

MAR SHD:
TD-PC: 16
PermC: 14
DRW: 10


In [46]:
print("MNAR SHD:") 
print("TD-PC:", shd_td_mnar) 
print("PermC:", shd_permc_mnar) 
print("DRW:", shd_drw_mnar)

MNAR SHD:
TD-PC: 12
PermC: 10
DRW: 8


In [47]:
# --- PermC / DRW diagnostics --- 
print("\nPERMC_COUNTER:", PERMC_COUNTER) 
print("PERMC_FAIL:", PERMC_FAIL) 
print("DRW_COUNTER:", DRW_COUNTER)


PERMC_COUNTER: {'total_calls': 355, 'used': 63, 'fallback': 292}
PERMC_FAIL: {'no_W': 15, 'tw_too_small': 0, 'no_W_columns': 0, 'regression_fail': 0, 'success': 63}
DRW_COUNTER: {'used': 110, 'fallback': 286}


In [48]:
from mvpc.utils.mvpc_utils import PERMC_DIAG
print("PERMC_DIAG:", PERMC_DIAG)


PERMC_DIAG: {'total_calls': 751, 'used': 188, 'fallback': 563}
