In [1]:
import json
import os
import shutil
import zipfile
from pathlib import PurePosixPath

from tqdm.auto import tqdm

# ----------------------------- helpers -----------------------------

def _safe_join(root, rel):
    """Join *rel* onto *root* and reject any result that escapes *root*.

    Args:
        root: Directory that must contain the resulting path.
        rel: Path to resolve underneath *root* (e.g. a zip member's
            relative name).

    Returns:
        The absolute, normalized destination path.

    Raises:
        RuntimeError: If the resolved path is neither *root* itself nor
            located inside it.
    """
    root_abs = os.path.abspath(root)
    dest = os.path.abspath(os.path.join(root_abs, rel))
    # abspath() keeps a trailing separator only for a filesystem/drive root
    # ("/" or "C:\\"). Blindly appending another one would produce "//",
    # which no child path starts with, so every join under such a root
    # would be reported as unsafe.
    root_prefix = root_abs if root_abs.endswith(os.sep) else root_abs + os.sep
    if dest != root_abs and not dest.startswith(root_prefix):
        raise RuntimeError(f"Unsafe path detected: {dest}")
    return dest

def _detect_dataset_dirs(runs_root):
    """Collect the names of dataset directories found under every experiment.

    Walks the layout ``<runs_root>/<experiment>/<dataset>/`` two levels deep
    and gathers the dataset directory names. Nothing below the dataset level
    is inspected. A short report (count and sorted names) is printed.

    Args:
        runs_root: Directory holding one sub-directory per experiment.

    Returns:
        Set of dataset directory names, de-duplicated across experiments.
    """
    experiment_names = [
        d for d in os.listdir(runs_root)
        if os.path.isdir(os.path.join(runs_root, d))
    ]
    datasets = set()
    for exp in tqdm(experiment_names, desc="Scanning experiments (existing)", leave=False):
        p_exp = os.path.join(runs_root, exp)
        # Plain files sitting next to the dataset folders are ignored.
        datasets.update(
            ds for ds in os.listdir(p_exp)
            if os.path.isdir(os.path.join(p_exp, ds))
        )
    print(f"Detected {len(datasets)} dataset folders in existing runs.")
    if datasets:
        print("Datasets:", sorted(datasets))
    return datasets

def _scan_existing_triples(runs_root):
    """Index the model runs already present on disk.

    Walks ``<runs_root>/<experiment>/<dataset>/<model>/`` and records a model
    directory as a run only when it directly contains ``results.json``.
    Prints how many runs were indexed and how many model directories were
    inspected in total.

    Args:
        runs_root: Directory holding one sub-directory per experiment.

    Returns:
        Set of ``(experiment, dataset, model)`` name triples.
    """
    triples = set()
    n_checked = 0  # every model directory looked at, with or without results.json
    experiment_names = [
        d for d in os.listdir(runs_root)
        if os.path.isdir(os.path.join(runs_root, d))
    ]
    for exp in tqdm(experiment_names, desc="Indexing existing experiments", leave=False):
        p_exp = os.path.join(runs_root, exp)
        for ds in os.listdir(p_exp):
            p_ds = os.path.join(p_exp, ds)
            if not os.path.isdir(p_ds):
                continue
            for model in os.listdir(p_ds):
                p_model = os.path.join(p_ds, model)
                if not os.path.isdir(p_model):
                    continue
                # Count before the results.json test so the report really
                # reflects directories checked, not just directories accepted.
                n_checked += 1
                # treat as a valid model-run only if it contains results.json
                if os.path.isfile(os.path.join(p_model, "results.json")):
                    triples.add((exp, ds, model))
    print(f"Existing model-run folders indexed: {len(triples)} (across {n_checked} dirs checked).")
    return triples

def _scan_zip_triples(zip_path):
    """Index the model runs stored in a zip archive.

    A run is recognised by a member named ``results.json`` whose path has at
    least four components, i.e. ``.../<exp>/<dataset>/<model>/results.json``.
    The three directory names directly above the file form the triple.

    Args:
        zip_path: Path of the zip archive to scan.

    Returns:
        A pair ``(triples, prefixes)``: the set of
        ``(experiment, dataset, model)`` triples, and a dict mapping each
        triple to the in-zip directory prefix (ending with ``/``) of its
        model folder. When a triple occurs under several paths, the shortest
        prefix is kept.
    """
    shortest_prefix = {}  # triple -> model-dir prefix inside the zip

    with zipfile.ZipFile(zip_path, "r") as archive:
        for entry in tqdm(archive.infolist(), desc="Scanning zip contents", leave=False):
            if entry.is_dir():
                continue
            member = PurePosixPath(entry.filename)  # zip names are posix-style
            if member.name != "results.json" or len(member.parts) < 4:
                continue
            run_dir = member.parent
            ds_dir = run_dir.parent
            key = (ds_dir.parent.name, ds_dir.name, run_dir.name)
            candidate = str(run_dir) + "/"
            known = shortest_prefix.get(key)
            if known is None or len(candidate) < len(known):
                shortest_prefix[key] = candidate

    # Every triple seen has exactly one entry in the prefix map.
    found = set(shortest_prefix)
    print(f"Model-run folders discovered in zip (by results.json): {len(found)}")
    return found, shortest_prefix

def _extract_triple_from_zip(zip_path, triple, prefix, dest_root, overwrite=False):
    """Extract one model-run folder from the zip into ``dest_root/exp/ds/model``.

    Every zip member whose name starts with *prefix* is written below the
    destination folder, keeping its path relative to *prefix*.

    Args:
        zip_path: Path of the zip archive.
        triple: ``(experiment, dataset, model)`` names of the run.
        prefix: In-zip directory prefix of the model folder, ending with ``/``.
        dest_root: Root of the runs tree to extract into.
        overwrite: When False, an already existing destination folder is left
            untouched. When True, members are written over existing files;
            files on disk that are not in the zip are not removed.

    Returns:
        True if extraction was performed, False if the destination already
        existed and *overwrite* is False.

    Raises:
        RuntimeError: If the destination folder or any member would resolve
            outside its expected parent directory.
    """
    exp, ds, model = triple
    # The triple names come from the archive, so a component such as ".."
    # must not be allowed to move the destination out of dest_root.
    dest_dir = _safe_join(dest_root, os.path.join(exp, ds, model))

    if os.path.exists(dest_dir) and not overwrite:
        return False  # already exists

    os.makedirs(dest_dir, exist_ok=True)

    # NOTE: the archive is reopened and its full member list is scanned on
    # every call; extracting many triples repeats that scan each time.
    with zipfile.ZipFile(zip_path, "r") as zf:
        for info in zf.infolist():
            name = info.filename
            if not name.startswith(prefix):
                continue
            rel = name[len(prefix):]
            if not rel:  # the folder itself
                continue
            out_path = _safe_join(dest_dir, rel)
            if name.endswith("/"):
                os.makedirs(out_path, exist_ok=True)
                continue
            os.makedirs(os.path.dirname(out_path), exist_ok=True)
            with zf.open(info, "r") as src, open(out_path, "wb") as dst:
                # Stream in chunks instead of loading the whole member into memory.
                shutil.copyfileobj(src, dst)
    return True

# ----------------------------- main -----------------------------

def copy_new_model_runs(runs_root, runs_new_zip, dry_run=True, overwrite=False):
    """Copy model runs that exist in a zip but not yet under *runs_root*.

    Runs are identified by ``(experiment, dataset, model)`` triples. The
    function prints a four-step progress report and a summary of what it
    found before (optionally) extracting the missing runs.

    Args:
        runs_root: Existing runs directory to merge into.
        runs_new_zip: Zip archive containing the candidate runs.
        dry_run: When True, only report what would be copied.
        overwrite: Forwarded to the extraction helper for destination
            folders that already exist.

    Returns:
        List of triples that were actually extracted; empty for a dry run.
    """
    print("=== Step 1/4: Detect dataset names in existing runs ===")
    # Called for the report it prints; its return value is not used below.
    _detect_dataset_dirs(runs_root)

    print("\n=== Step 2/4: Index existing model-run folders ===")
    on_disk = _scan_existing_triples(runs_root)

    print("\n=== Step 3/4: Index model-run folders inside zip ===")
    in_zip, zip_prefixes = _scan_zip_triples(runs_new_zip)

    # Runs present in the archive but not yet indexed on disk.
    pending = sorted(in_zip - on_disk)

    # Stats
    disk_exps = {exp for exp, _, _ in on_disk}
    arch_exps = {exp for exp, _, _ in in_zip}
    disk_sets = {ds for _, ds, _ in on_disk}
    arch_sets = {ds for _, ds, _ in in_zip}

    print("\n=== Summary ===")
    print(f"Existing experiments: {len(disk_exps)} | in zip: {len(arch_exps)}")
    print(f"Existing datasets:   {len(disk_sets)} | in zip: {len(arch_sets)}")
    print(f"Existing model-runs: {len(on_disk)}")
    print(f"Zip model-runs:      {len(in_zip)}")
    print(f"New model-runs to copy: {len(pending)}")
    if pending:
        head = "".join("\n  - " + "/".join(run) for run in pending[:10])
        tail = f"\n  … and {len(pending) - 10} more" if len(pending) > 10 else ""
        print("First ones:" + head + tail)

    if dry_run:
        print("\nDry run only. No files were copied. Set dry_run=False to perform the copy.")
        return []

    print("\n=== Step 4/4: Copying new model-run folders from zip ===")
    copied = []
    for run in tqdm(pending, desc="Copying", leave=False):
        extracted = _extract_triple_from_zip(
            runs_new_zip, run, zip_prefixes[run], runs_root, overwrite=overwrite
        )
        if extracted:
            copied.append(run)

    print(f"Done. Copied {len(copied)} model-run folders.")
    return copied

# ----------------------------- how to run (in a Jupyter cell) -----------------------------
# runs_dir = "/path/to/runs"          # existing folder
# runs_zip = "/path/to/runs_new.zip"  # zip sitting next to it
#
# # 1) Preview what would be copied:
# _ = copy_new_model_runs(runs_dir, runs_zip, dry_run=True)
#
# # 2) Actually copy:
# copied = copy_new_model_runs(runs_dir, runs_zip, dry_run=False)
# copied[:5]


In [2]:
# Merge the model runs stored in runs_new.zip into the existing runs directory.
runs_dir = "/disk/10tb/home/shmelev/GENLINK/downstream_tasks/runs"          # existing folder
runs_zip = "/disk/10tb/home/shmelev/GENLINK/downstream_tasks/runs_new.zip"  # zip sitting next to it

# 1) Preview what would be copied (dry run: prints the summary, copies nothing):
_ = copy_new_model_runs(runs_dir, runs_zip, dry_run=True)

# 2) Actually copy. The directory and zip scans are repeated, so the summary
#    is printed a second time before the extraction starts:
copied = copy_new_model_runs(runs_dir, runs_zip, dry_run=False)
# Last expression of the cell: the first five copied (experiment, dataset, model) triples.
copied[:5]


=== Step 1/4: Detect dataset names in existing runs ===


Scanning experiments (existing):   0%|          | 0/20 [00:00<?, ?it/s]

Detected 146 dataset folders in existing runs.
Datasets: ['CR', 'CR_class_balance_interpolation_step_0', 'CR_class_balance_interpolation_step_1', 'CR_class_balance_interpolation_step_2', 'CR_class_balance_interpolation_step_3', 'CR_class_balance_interpolation_step_4', 'CR_class_balance_interpolation_step_5', 'CR_maxed_equal_class_balance', 'CR_real_masks', 'CR_real_masks_more_labeled_veritices', 'CR_real_masks_more_labeled_veritices_agreed', 'CR_real_masks_more_labeled_veritices_frac', 'CR_real_masks_threshold_14', 'NC_graph_rel_eng', 'NC_graph_rel_eng_non_diagonal_edge_prob_add_0', 'NC_graph_rel_eng_non_diagonal_edge_prob_add_0_0', 'NC_graph_rel_eng_non_diagonal_edge_prob_add_0_01', 'NC_graph_rel_eng_non_diagonal_edge_prob_add_0_02', 'Scandinavia', 'Scandinavia_all_probs_0', 'Scandinavia_non_diagonal_edge_prob_add_0', 'Scandinavia_non_diagonal_edge_prob_add_0_0', 'Scandinavia_non_diagonal_edge_prob_add_0_01', 'Scandinavia_non_diagonal_edge_prob_add_0_02', 'Volga', 'Volga_all_probs_0',

Indexing existing experiments:   0%|          | 0/20 [00:00<?, ?it/s]

Existing model-run folders indexed: 33089 (across 33089 dirs checked).

=== Step 3/4: Index model-run folders inside zip ===


Scanning zip contents:   0%|          | 0/110982 [00:00<?, ?it/s]

Model-run folders discovered in zip (by results.json): 40099

=== Summary ===
Existing experiments: 19 | in zip: 19
Existing datasets:   145 | in zip: 145
Existing model-runs: 33089
Zip model-runs:      40099
New model-runs to copy: 8660
First ones:
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_0
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_1
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_2
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_3
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_4
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_5
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_6
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_7
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_8
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_9
  … and 8650 more

Dry run only. No files were copied. Set dry_run=False to perform the copy.
=== Step 1/4: De

Scanning experiments (existing):   0%|          | 0/20 [00:00<?, ?it/s]

Detected 146 dataset folders in existing runs.
Datasets: ['CR', 'CR_class_balance_interpolation_step_0', 'CR_class_balance_interpolation_step_1', 'CR_class_balance_interpolation_step_2', 'CR_class_balance_interpolation_step_3', 'CR_class_balance_interpolation_step_4', 'CR_class_balance_interpolation_step_5', 'CR_maxed_equal_class_balance', 'CR_real_masks', 'CR_real_masks_more_labeled_veritices', 'CR_real_masks_more_labeled_veritices_agreed', 'CR_real_masks_more_labeled_veritices_frac', 'CR_real_masks_threshold_14', 'NC_graph_rel_eng', 'NC_graph_rel_eng_non_diagonal_edge_prob_add_0', 'NC_graph_rel_eng_non_diagonal_edge_prob_add_0_0', 'NC_graph_rel_eng_non_diagonal_edge_prob_add_0_01', 'NC_graph_rel_eng_non_diagonal_edge_prob_add_0_02', 'Scandinavia', 'Scandinavia_all_probs_0', 'Scandinavia_non_diagonal_edge_prob_add_0', 'Scandinavia_non_diagonal_edge_prob_add_0_0', 'Scandinavia_non_diagonal_edge_prob_add_0_01', 'Scandinavia_non_diagonal_edge_prob_add_0_02', 'Volga', 'Volga_all_probs_0',

Indexing existing experiments:   0%|          | 0/20 [00:00<?, ?it/s]

Existing model-run folders indexed: 33089 (across 33089 dirs checked).

=== Step 3/4: Index model-run folders inside zip ===


Scanning zip contents:   0%|          | 0/110982 [00:00<?, ?it/s]

Model-run folders discovered in zip (by results.json): 40099

=== Summary ===
Existing experiments: 19 | in zip: 19
Existing datasets:   145 | in zip: 145
Existing model-runs: 33089
Zip model-runs:      40099
New model-runs to copy: 8660
First ones:
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_0
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_1
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_2
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_3
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_4
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_5
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_6
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_7
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_8
  - real_data_mask_0/CR/GL_GATConv_3l_128h_graph_based+_split_9
  … and 8650 more

=== Step 4/4: Copying new model-run folders from zip ===


Copying:   0%|          | 0/8660 [00:00<?, ?it/s]

Done. Copied 8660 model-run folders.


[('real_data_mask_0', 'CR', 'GL_GATConv_3l_128h_graph_based+_split_0'),
 ('real_data_mask_0', 'CR', 'GL_GATConv_3l_128h_graph_based+_split_1'),
 ('real_data_mask_0', 'CR', 'GL_GATConv_3l_128h_graph_based+_split_2'),
 ('real_data_mask_0', 'CR', 'GL_GATConv_3l_128h_graph_based+_split_3'),
 ('real_data_mask_0', 'CR', 'GL_GATConv_3l_128h_graph_based+_split_4')]