In [10]:
import os
import json
import numpy as np
import pandas as pd
from collections import defaultdict
from scipy.stats import pearsonr

# Directory that is expected to contain the "MotifLoss_<value>" run folders.
ROOT = "."
# Motif-loss settings to scan; each value is formatted with str() into the
# "MotifLoss_<value>" directory name during run discovery.
# NOTE(review): 0.0 is a float, so it renders as "MotifLoss_0.0" while 1
# renders as "MotifLoss_1" — confirm both match the on-disk folder names.
MOTIF_LOSSES = [0.0, 1]
# Small constant; not referenced by the visible code in this cell.
EPS = 1e-8
# Optional split filter: when truthy, only records whose "split" field equals
# this value are used.
SPLIT = None  # set to "test" if needed


# -------------------------------------------------
# Utils
# -------------------------------------------------
def load_jsonl(path):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        path: Path of a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Blank lines (typically a trailing newline at end of file) carry
            # no record and would otherwise make json.loads raise.
            if line:
                records.append(json.loads(line))
    return records


def pearson_safe(x, y):
    """Return the Pearson correlation of ``x`` and ``y``, or NaN if undefined.

    The length check looks at ``x`` only; with fewer than two points the
    coefficient is not computed and ``np.nan`` is returned instead.
    Otherwise the first element of ``pearsonr``'s result (the coefficient)
    is returned and the p-value is discarded.
    """
    enough_points = len(x) >= 2
    if not enough_points:
        return np.nan
    result = pearsonr(x, y)
    return result[0]


def motif_node_consistency(scores):
    """Within-motif node score consistency.

    The absolute scores are normalised to sum to (almost) one and treated
    as a distribution; the result is ``1 - H`` where ``H`` is that
    distribution's Shannon entropy in nats. ``H`` is not divided by
    ``log(n)``, so the value is 1 when a single node carries all the mass
    and becomes negative once ``H`` exceeds 1.

    The 1e-8 terms guard against division by zero and ``log(0)``.
    """
    magnitudes = np.abs(scores)
    weights = magnitudes / (magnitudes.sum() + 1e-8)
    entropy = -np.sum(weights * np.log(weights + 1e-8))
    return 1 - entropy


# -------------------------------------------------
# Discover runs
# -------------------------------------------------
def _iter_subdirs(parent):
    """Yield ``(name, path)`` for every immediate subdirectory of *parent*."""
    for entry in os.listdir(parent):
        full = os.path.join(parent, entry)
        if os.path.isdir(full):
            yield entry, full


# Layout walked below: ROOT/MotifLoss_<ml>/<dataset>/<arch>/<fold>/<seed>/
runs = []

for ml in MOTIF_LOSSES:
    ml_root = os.path.join(ROOT, f"MotifLoss_{str(ml)}")
    if not os.path.isdir(ml_root):
        continue

    for dataset, droot in _iter_subdirs(ml_root):
        for arch, aroot in _iter_subdirs(droot):
            for fold, froot in _iter_subdirs(aroot):
                for seed, sroot in _iter_subdirs(froot):
                    node_path = os.path.join(sroot, "node_scores.jsonl")
                    edge_path = os.path.join(sroot, "masked-edge-impact.jsonl")

                    # Node scores are mandatory; the edge-impact file is
                    # optional and recorded as None when it is absent.
                    if os.path.exists(node_path):
                        edge_or_none = edge_path if os.path.exists(edge_path) else None
                        runs.append({
                            "dataset": dataset,
                            "arch": arch,
                            "fold": fold,
                            "seed": seed,
                            "motif_loss": ml,
                            "node_path": node_path,
                            "edge_path": edge_or_none,
                        })


# -------------------------------------------------
# PART A — Within-motif node consistency + score stats
# -------------------------------------------------
part_a_rows = []

for run_info in runs:
    # Bucket node scores by motif index, honouring the optional split filter.
    grouped_scores = defaultdict(list)
    for record in load_jsonl(run_info["node_path"]):
        if not SPLIT or record["split"] == SPLIT:
            grouped_scores[record["motif_index"]].append(record["score"])

    # One array per motif; empty groups are dropped.
    score_arrays = [
        arr for arr in map(np.asarray, grouped_scores.values()) if len(arr) != 0
    ]

    motif_consistencies = [
        value
        for value in map(motif_node_consistency, score_arrays)
        if value is not None
    ]
    # Runs that yield no consistency value produce no output row.
    if not motif_consistencies:
        continue

    motif_means = [arr.mean() for arr in score_arrays]
    motif_stds = [arr.std(ddof=0) for arr in score_arrays]

    row = {key: run_info[key] for key in ("dataset", "arch", "fold", "seed", "motif_loss")}
    # consistency, aggregated over motifs
    row["avg_within_motif_consistency"] = float(np.mean(motif_consistencies))
    row["std_within_motif_consistency"] = float(np.std(motif_consistencies))
    # per-motif score statistics, averaged over motifs
    row["avg_node_score_mean"] = float(np.mean(motif_means))
    row["avg_node_score_std"] = float(np.mean(motif_stds))
    row["num_motifs"] = len(motif_consistencies)
    part_a_rows.append(row)

part_a_df = pd.DataFrame(part_a_rows)
part_a_df.to_csv("per_run_within_motif_stats.csv", index=False)


# -------------------------------------------------
# PART B — Avg node score vs masked-edge impact
# -------------------------------------------------
part_b_rows = []

for run_info in runs:
    edge_file = run_info["edge_path"]
    # Runs without a masked-edge-impact file cannot be correlated.
    if edge_file is None:
        continue

    node_records = load_jsonl(run_info["node_path"])
    edge_records = load_jsonl(edge_file)

    # ---- mean node score per motif ----
    scores_by_motif = defaultdict(list)
    for record in node_records:
        if not SPLIT or record["split"] == SPLIT:
            scores_by_motif[record["motif_index"]].append(record["score"])

    motif_avg_node_score = {
        motif: float(np.mean(values))
        for motif, values in scores_by_motif.items()
        if values
    }

    # ---- mean prediction change per motif ----
    impacts_by_motif = defaultdict(list)
    for record in edge_records:
        if SPLIT and record["split"] != SPLIT:
            continue
        motif = record["motif_idx"]
        if motif == -1:
            continue  # skip whole-graph masking
        impacts_by_motif[motif].append(
            record["new_prediction"] - record["old_prediction"]
        )

    motif_avg_impact = {
        motif: float(np.mean(values))
        for motif, values in impacts_by_motif.items()
        if values
    }

    # ---- correlate over motifs that appear in both files ----
    common_motifs = sorted(motif_avg_node_score.keys() & motif_avg_impact.keys())
    mean_scores = [motif_avg_node_score[m] for m in common_motifs]
    mean_impacts = [motif_avg_impact[m] for m in common_motifs]

    row = {key: run_info[key] for key in ("dataset", "arch", "fold", "seed", "motif_loss")}
    row["pearson_avg_node_score_vs_impact"] = pearson_safe(mean_scores, mean_impacts)
    row["num_motifs"] = len(common_motifs)
    part_b_rows.append(row)

part_b_df = pd.DataFrame(part_b_rows)
part_b_df.to_csv("avg_node_score_vs_masked_edge_impact.csv", index=False)


# -------------------------------------------------
# Done
# -------------------------------------------------
# Report the files this cell actually writes; the names mirror the two
# to_csv() calls earlier in the cell.
print("Saved:")
print(" - per_run_within_motif_stats.csv")
print(" - avg_node_score_vs_masked_edge_impact.csv")


Saved:
 - per_run_within_motif_consistency.csv
 - motif_loss_avg_consistency.csv
 - consistency_vs_masked_edge_impact.csv


In [2]:
import wandb
# Authenticate this notebook session with Weights & Biases.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkokatea[0m ([33mteam20[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [6]:
# IPython shell escape: show the notebook's current working directory.
!pwd

/nfs/hpc/share/kokatea/ChemIntuit/GSAT


In [3]:
# NOTE(review): presumably hooks TensorBoard logging under this directory into
# W&B syncing — confirm the exact semantics against the W&B documentation.
wandb.tensorboard.patch(root_logdir="./data/BBBP/logs")

In [1]:
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# One hand-picked event file from a single run directory, inspected manually.
path = "./data/BBBP/logs/06_22_2025-18_57_53-BBBP-GIN-seed2-GSAT/events.out.tfevents.1750644321.cn-c22.hpc.engr.oregonstate.edu.61705.7"

# Read the file's contents so its tags can be queried.
ea = EventAccumulator(path)
ea.Reload()

# Show which tag names exist under each category (scalars, images, ...).
print(ea.Tags())

{'images': [], 'audio': [], 'histograms': [], 'scalars': ['metric/best_clf_epoch', 'metric/best_clf_valid_loss', 'metric/best_clf_train', 'metric/best_clf_valid', 'metric/best_clf_test', 'metric/best_x_roc_train', 'metric/best_x_roc_valid', 'metric/best_x_roc_test', 'metric/best_x_precision_train', 'metric/best_x_precision_valid', 'metric/best_x_precision_test'], 'distributions': [], 'tensors': [], 'graph': False, 'meta_graph': False, 'run_metadata': []}


In [2]:
import os
import wandb
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


# ============================================================
# CONFIG
# ============================================================

# Directory whose immediate subdirectories are each treated as one run.
LOG_ROOT = "./data/BBBP/logs"
# W&B project that receives one uploaded run per run directory.
WANDB_PROJECT = "GSAT-BBBP"


# ============================================================
# HELPERS
# ============================================================

def parse_run_name(run_name):
    """Split a run directory name into its metadata fields.

    Example:
        06_22_2025-18_57_53-BBBP-GIN-seed2-GSAT

    The name is split on "-" and the first six parts are read as date,
    time, dataset, architecture, seed (an int, after removing the "seed"
    text) and method. Any further parts are ignored.

    Args:
        run_name: Name of the run directory.

    Returns:
        A dict of the parsed fields. Parsing is best-effort: if a part is
        missing or the seed is not an integer, the fields parsed up to that
        point are kept and the original name is added under
        "run_name_raw".
    """
    parts = run_name.split("-")

    meta = {}

    try:
        meta["date"] = parts[0]
        meta["time"] = parts[1]
        meta["dataset"] = parts[2]
        meta["architecture"] = parts[3]
        meta["seed"] = int(parts[4].replace("seed", ""))
        meta["method"] = parts[5]
    except (IndexError, ValueError):
        # IndexError: too few "-"-separated parts; ValueError: non-numeric
        # seed. Nothing else in the try body is expected to fail, so other
        # exception types are deliberately not swallowed.
        meta["run_name_raw"] = run_name

    return meta


def load_event_scalars(event_file):
    """Return the last recorded value of every scalar tag in an event file.

    Args:
        event_file: Path handed to ``EventAccumulator``.

    Returns:
        A dict mapping each scalar tag to the ``value`` of its final event.
        Tags that have no events are left out.
    """
    accumulator = EventAccumulator(event_file)
    accumulator.Reload()

    final_values = {}
    scalar_tags = accumulator.Tags().get("scalars", [])
    for tag in scalar_tags:
        history = accumulator.Scalars(tag)
        # GSAT logs only once, so the final entry is the value of interest.
        if len(history) > 0:
            final_values[tag] = history[-1].value

    return final_values


def find_event_file(run_dir):
    """Return the most recently modified TensorBoard event file in a run dir.

    A run directory can hold several event files. Returning whichever one
    ``os.listdir`` happens to list first makes the result depend on
    directory order, so the choice is made explicit: the file with the
    latest modification time wins, with the path as a tie-breaker.

    NOTE(review): this assumes the final metrics live in the file written
    last — confirm against how the training code creates its writers.

    Args:
        run_dir: Directory to search (not recursive).

    Returns:
        The full path of the selected entry whose name contains
        "tfevents", or None when there is no such entry.
    """
    candidates = [
        os.path.join(run_dir, name)
        for name in os.listdir(run_dir)
        if "tfevents" in name
    ]
    if not candidates:
        return None
    return max(candidates, key=lambda p: (os.path.getmtime(p), p))


# ============================================================
# MAIN
# ============================================================

def main():
    """Upload the final scalars of every run under LOG_ROOT to W&B.

    Each immediate subdirectory of LOG_ROOT is treated as one run, visited
    in sorted name order. Directories without an event file are skipped
    with a message. For the others a W&B run is created with the directory
    name as run name and the parsed name fields as config, and every
    scalar is stored in the run summary.
    """
    wandb.login()

    for run_name in sorted(os.listdir(LOG_ROOT)):
        run_dir = os.path.join(LOG_ROOT, run_name)

        if not os.path.isdir(run_dir):
            continue

        event_file = find_event_file(run_dir)

        if event_file is None:
            print(f"[SKIP] No event file in {run_name}")
            continue

        print(f"[UPLOAD] {run_name}")

        metrics = load_event_scalars(event_file)
        metadata = parse_run_name(run_name)

        run = wandb.init(
            project=WANDB_PROJECT,
            name=run_name,
            config=metadata,
            reinit=True,
        )

        try:
            # --------------------------------------------
            # Store final metrics as SUMMARY (important)
            # --------------------------------------------
            for k, v in metrics.items():
                run.summary[k] = v
        finally:
            # Always close the run, even if writing the summary fails, so a
            # half-written run is not left open in the session.
            run.finish()

    print("✅ All TensorBoard runs uploaded to W&B summaries.")


if __name__ == "__main__":
    main()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkokatea[0m ([33mteam20[0m). Use [1m`wandb login --relogin`[0m to force relogin


[UPLOAD] 06_22_2025-18_50_43-BBBP-GIN-seed0-GSAT


0,1
metric/best_clf_epoch,0.0
metric/best_clf_test,0.0
metric/best_clf_train,0.0
metric/best_clf_valid,0.0
metric/best_clf_valid_loss,0.0
metric/best_x_precision_test,0.0
metric/best_x_precision_train,0.0
metric/best_x_precision_valid,0.0
metric/best_x_roc_test,0.0
metric/best_x_roc_train,0.0


[UPLOAD] 06_22_2025-18_54_55-BBBP-GIN-seed0-GSAT


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668594333653648, max=1.0…

[UPLOAD] 06_22_2025-18_55_23-BBBP-GIN-seed0-GSAT


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668331863669058, max=1.0…

VBox(children=(Label(value='0.005 MB of 0.024 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.214824…

0,1
metric/best_clf_epoch,0.0
metric/best_clf_test,0.0
metric/best_clf_train,0.0
metric/best_clf_valid,0.0
metric/best_clf_valid_loss,0.0
metric/best_x_precision_test,0.0
metric/best_x_precision_train,0.0
metric/best_x_precision_valid,0.0
metric/best_x_roc_test,0.0
metric/best_x_roc_train,0.0


[UPLOAD] 06_22_2025-18_57_53-BBBP-GIN-seed0-GSAT


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668450032981733, max=1.0…

0,1
metric/best_clf_epoch,0.0
metric/best_clf_test,0.0
metric/best_clf_train,0.0
metric/best_clf_valid,0.0
metric/best_clf_valid_loss,0.0
metric/best_x_precision_test,0.0
metric/best_x_precision_train,0.0
metric/best_x_precision_valid,0.0
metric/best_x_roc_test,0.0
metric/best_x_roc_train,0.0


[UPLOAD] 06_22_2025-18_57_53-BBBP-GIN-seed1-GSAT


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668282899384698, max=1.0…

VBox(children=(Label(value='0.005 MB of 0.024 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.214608…

0,1
metric/best_clf_epoch,54.0
metric/best_clf_test,143.86632
metric/best_clf_train,1006.43713
metric/best_clf_valid,137.56757
metric/best_clf_valid_loss,0.33077
metric/best_x_precision_test,0.0
metric/best_x_precision_train,0.0
metric/best_x_precision_valid,0.0
metric/best_x_roc_test,0.0
metric/best_x_roc_train,0.0


[UPLOAD] 06_22_2025-18_57_53-BBBP-GIN-seed2-GSAT


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669313586317003, max=1.0…

0,1
metric/best_clf_epoch,0.0
metric/best_clf_test,0.0
metric/best_clf_train,0.0
metric/best_clf_valid,0.0
metric/best_clf_valid_loss,0.0
metric/best_x_precision_test,0.0
metric/best_x_precision_train,0.0
metric/best_x_precision_valid,0.0
metric/best_x_roc_test,0.0
metric/best_x_roc_train,0.0


[UPLOAD] 06_22_2025-18_57_53-BBBP-GIN-seed3-GSAT


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668062393243113, max=1.0…

0,1
metric/best_clf_epoch,60.0
metric/best_clf_test,135.55615
metric/best_clf_train,1002.52991
metric/best_clf_valid,130.81081
metric/best_clf_valid_loss,0.38099
metric/best_x_precision_test,0.0
metric/best_x_precision_train,0.0
metric/best_x_precision_valid,0.0
metric/best_x_roc_test,0.0
metric/best_x_roc_train,0.0


[UPLOAD] 06_22_2025-18_57_53-BBBP-GIN-seed4-GSAT


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668231948278843, max=1.0…

0,1
metric/best_clf_epoch,56.0
metric/best_clf_test,141.58824
metric/best_clf_train,992.08539
metric/best_clf_valid,139.92973
metric/best_clf_valid_loss,0.4836
metric/best_x_precision_test,0.0
metric/best_x_precision_train,0.0
metric/best_x_precision_valid,0.0
metric/best_x_roc_test,0.0
metric/best_x_roc_train,0.0


[UPLOAD] 07_17_2025-12_44_38-BBBP-GIN-seed0-GSAT


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668189029830197, max=1.0…

VBox(children=(Label(value='0.005 MB of 0.024 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.218438…

[UPLOAD] 07_17_2025-12_44_57-BBBP-GIN-seed0-GSAT


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668209068787593, max=1.0…

0,1
metric/best_clf_epoch,0.0
metric/best_clf_test,0.0
metric/best_clf_train,0.0
metric/best_clf_valid,0.0
metric/best_clf_valid_loss,0.0
metric/best_x_precision_test,0.0
metric/best_x_precision_train,0.0
metric/best_x_precision_valid,0.0
metric/best_x_roc_test,0.0
metric/best_x_roc_train,0.0


[UPLOAD] 07_17_2025-13_20_05-BBBP-GIN-seed0-GSAT


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666818243296196, max=1.0)…

0,1
metric/best_clf_epoch,0.0
metric/best_clf_test,0.0
metric/best_clf_train,0.0
metric/best_clf_valid,0.0
metric/best_clf_valid_loss,0.0
metric/best_x_precision_test,0.0
metric/best_x_precision_train,0.0
metric/best_x_precision_valid,0.0
metric/best_x_roc_test,0.0
metric/best_x_roc_train,0.0


[UPLOAD] 07_17_2025-13_21_45-BBBP-GIN-seed0-GSAT


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668004969445368, max=1.0…

0,1
metric/best_clf_epoch,0.0
metric/best_clf_test,0.0
metric/best_clf_train,0.0
metric/best_clf_valid,0.0
metric/best_clf_valid_loss,0.0
metric/best_x_precision_test,0.0
metric/best_x_precision_train,0.0
metric/best_x_precision_valid,0.0
metric/best_x_roc_test,0.0
metric/best_x_roc_train,0.0


[UPLOAD] 07_17_2025-13_43_21-BBBP-GIN-seed0-GSAT


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668250031458835, max=1.0…

VBox(children=(Label(value='0.005 MB of 0.025 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.206010…

0,1
gsat_best/best_clf_epoch,0.0
gsat_best/best_clf_test,0.0
gsat_best/best_clf_train,0.0
gsat_best/best_clf_valid,0.0
gsat_best/best_clf_valid_loss,0.0
gsat_best/best_x_precision_test,0.0
gsat_best/best_x_precision_train,0.0
gsat_best/best_x_precision_valid,0.0
gsat_best/best_x_roc_test,0.0
gsat_best/best_x_roc_train,0.0


[UPLOAD] 07_17_2025-13_56_46-BBBP-GIN-seed0-GSAT


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666829459524403, max=1.0)…

0,1
metric/best_clf_epoch,81.0
metric/best_clf_test,128.29411
metric/best_clf_train,1026.04309
metric/best_clf_valid,133.97298
metric/best_clf_valid_loss,0.45599
metric/best_x_precision_test,0.0
metric/best_x_precision_train,0.0
metric/best_x_precision_valid,0.0
metric/best_x_roc_test,0.0
metric/best_x_roc_train,0.0


[UPLOAD] 07_17_2025-13_56_46-BBBP-GIN-seed99-GSAT-stat


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668326586174467, max=1.0…

0,1
metric/best_clf_epoch,81.0
metric/best_clf_epoch/std,0.0
metric/best_clf_test,128.29411
metric/best_clf_test/std,0.0
metric/best_clf_train,1026.04309
metric/best_clf_train/std,0.0
metric/best_clf_valid,133.97298
metric/best_clf_valid/std,0.0
metric/best_clf_valid_loss,0.45599
metric/best_clf_valid_loss/std,0.0


[UPLOAD] 07_17_2025-15_25_35-BBBP-GIN-seed0-GSAT


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666828014422208, max=1.0)…

0,1
metric/best_clf_epoch,0.0
metric/best_clf_test,0.0
metric/best_clf_train,0.0
metric/best_clf_valid,0.0
metric/best_clf_valid_loss,0.0
metric/best_x_precision_test,0.0
metric/best_x_precision_train,0.0
metric/best_x_precision_valid,0.0
metric/best_x_roc_test,0.0
metric/best_x_roc_train,0.0


✅ All TensorBoard runs uploaded to W&B summaries.


In [7]:
import os

LOGDIR = "./data/BBBP/logs"

# Walk the whole log tree and report every file whose name marks it as a
# TensorBoard event file.
for dirpath, _subdirs, filenames in os.walk(LOGDIR):
    for filename in filenames:
        if "tfevents" not in filename:
            continue
        print("FOUND:", os.path.join(dirpath, filename))

FOUND: ./data/BBBP/logs/07_17_2025-13_43_21-BBBP-GIN-seed0-GSAT/events.out.tfevents.1752785003.cn-gpu5.hpc.engr.oregonstate.edu.2770374.0
FOUND: ./data/BBBP/logs/07_17_2025-13_43_21-BBBP-GIN-seed0-GSAT/events.out.tfevents.1752785003.cn-gpu5.hpc.engr.oregonstate.edu.2770374.1
FOUND: ./data/BBBP/logs/07_17_2025-12_44_38-BBBP-GIN-seed0-GSAT/events.out.tfevents.1752781481.cn-gpu5.hpc.engr.oregonstate.edu.2769552.0
FOUND: ./data/BBBP/logs/07_17_2025-12_44_38-BBBP-GIN-seed0-GSAT/events.out.tfevents.1752781481.cn-gpu5.hpc.engr.oregonstate.edu.2769552.1
FOUND: ./data/BBBP/logs/06_22_2025-18_57_53-BBBP-GIN-seed2-GSAT/events.out.tfevents.1750644321.cn-c22.hpc.engr.oregonstate.edu.61705.7
FOUND: ./data/BBBP/logs/06_22_2025-18_57_53-BBBP-GIN-seed2-GSAT/events.out.tfevents.1750644528.cn-c22.hpc.engr.oregonstate.edu.61705.8
FOUND: ./data/BBBP/logs/06_22_2025-18_57_53-BBBP-GIN-seed2-GSAT/events.out.tfevents.1750644321.cn-c22.hpc.engr.oregonstate.edu.61705.6
FOUND: ./data/BBBP/logs/07_17_2025-15_25_35

In [4]:
# Open a W&B run named "tensorboard-import" in the "GSAT-test" project,
# labelled as an import job with dataset/source recorded in its config.
run = wandb.init(
    project="GSAT-test",
    name="tensorboard-import",
    job_type="import",
    config={
        "dataset": "BBBP",
        "source": "tensorboard",
    }
)

In [5]:
# Close the currently active W&B run.
wandb.finish()

In [12]:
import pandas as pd

# Load
df = pd.read_csv("per_run_within_motif_stats.csv")

# Columns that identify one run independently of its motif-loss setting.
RUN_KEYS = ["dataset", "arch", "fold", "seed"]


def _paired_by_motif_loss(frame, value_col, ml0_col, ml1_col, delta_col):
    """Put the motif_loss 0 and 1 values of one metric side by side per run.

    The frame is pivoted so each motif_loss value becomes a column, the 0
    and 1 columns are renamed to ``ml0_col`` / ``ml1_col``, runs lacking
    either value are dropped, and ``delta_col`` is added as ml1 minus ml0.
    """
    wide = frame.pivot_table(
        index=RUN_KEYS,
        columns="motif_loss",
        values=value_col,
    ).reset_index()
    wide = wide.rename(columns={0: ml0_col, 1: ml1_col})
    wide = wide.dropna(subset=[ml0_col, ml1_col])
    wide[delta_col] = wide[ml1_col] - wide[ml0_col]
    return wide


# =========================
# 1. WITHIN-MOTIF CONSISTENCY
# =========================
consistency_wide = _paired_by_motif_loss(
    df,
    "avg_within_motif_consistency",
    "consistency_ml0",
    "consistency_ml1",
    "consistency_delta_ml1_minus_ml0",
)

# =========================
# 2. AVG NODE SCORE MEAN
# =========================
score_wide = _paired_by_motif_loss(
    df,
    "avg_node_score_mean",
    "avg_score_ml0",
    "avg_score_ml1",
    "avg_score_delta_ml1_minus_ml0",
)

# =========================
# 3. MERGE BOTH TABLES
# =========================
# Inner join: only runs present in both tables are kept.
final = consistency_wide.merge(score_wide, on=RUN_KEYS, how="inner")

# Save
final.to_csv("within_motif_consistency_and_score_comparison.csv", index=False)

print(final.head())



motif_loss          dataset      arch   fold   seed  consistency_ml0  \
0           Alkane_Carbonyl  modelGAT  fold0  seed0        -4.422129   
1           Alkane_Carbonyl  modelGAT  fold1  seed0        -4.333544   
2           Alkane_Carbonyl  modelGAT  fold2  seed0        -4.334370   
3           Alkane_Carbonyl  modelGAT  fold3  seed0        -4.297377   
4           Alkane_Carbonyl  modelGCN  fold0  seed0        -4.422141   

motif_loss  consistency_ml1  consistency_delta_ml1_minus_ml0  avg_score_ml0  \
0                 -4.422116                         0.000013       0.499054   
1                 -4.333552                        -0.000008       0.499993   
2                 -4.334376                        -0.000006       0.499593   
3                 -4.297376                         0.000001       0.500621   
4                 -4.422156                        -0.000014       0.501247   

motif_loss  avg_score_ml1  avg_score_delta_ml1_minus_ml0  
0                0.500886        

In [3]:
import os
import pandas as pd

ROOT = "/nfs/hpc/share/kokatea/ChemIntuit/GSAT"
MOTIF_LOSSES = [0, 1]

# Column order of the report. Passed to the DataFrame explicitly so the CSV
# still gets a header row when no run is missing anything; without it an
# empty result is written as a header-less file that pd.read_csv cannot parse.
REPORT_COLUMNS = [
    "dataset",
    "arch",
    "fold",
    "seed",
    "motif_loss",
    "missing_files",
    "path",
]

rows = []

# Layout walked below: ROOT/MotifLoss_<ml>/<dataset>/<arch>/<fold>/<seed>/
for ml in MOTIF_LOSSES:
    ml_root = os.path.join(ROOT, f"MotifLoss_{ml}")
    if not os.path.isdir(ml_root):
        print(f"[MISSING DIR] {ml_root}")
        continue

    for dataset in os.listdir(ml_root):
        droot = os.path.join(ml_root, dataset)
        if not os.path.isdir(droot):
            continue

        for arch in os.listdir(droot):
            aroot = os.path.join(droot, arch)
            if not os.path.isdir(aroot):
                continue

            for fold in os.listdir(aroot):
                froot = os.path.join(aroot, fold)
                if not os.path.isdir(froot):
                    continue

                for seed in os.listdir(froot):
                    sroot = os.path.join(froot, seed)
                    if not os.path.isdir(sroot):
                        continue

                    node_path = os.path.join(sroot, "node_scores.jsonl")
                    edge_path = os.path.join(sroot, "masked-edge-impact.jsonl")

                    # Collect the names of whichever expected files are absent.
                    missing = []
                    if not os.path.exists(node_path):
                        missing.append("node_scores.jsonl")
                    if not os.path.exists(edge_path):
                        missing.append("masked-edge-impact.jsonl")

                    if missing:
                        rows.append({
                            "dataset": dataset,
                            "arch": arch,
                            "fold": fold,
                            "seed": seed,
                            "motif_loss": ml,
                            "missing_files": ",".join(missing),
                            "path": sroot
                        })

                        print(
                            f"[MISSING] MotifLoss_{ml} | {dataset} | {arch} | "
                            f"{fold} | {seed} → {missing}"
                        )

df = pd.DataFrame(rows, columns=REPORT_COLUMNS)
df.to_csv("missing_jsonl_files.csv", index=False)

print("\nSummary:")
print(df["missing_files"].value_counts() if not df.empty else "No missing files 🎉")
print("\nSaved → missing_jsonl_files.csv")



Summary:
No missing files 🎉

Saved → missing_jsonl_files.csv
