# Data Analysis 

In [None]:
import re
import os
import pandas as pd

# Root directory that summarize_dataset_partitions joins with a dataset name.
# NOTE: machine-specific absolute path -- adjust it for your environment.
DATASETS_ROOT = "/Users/yasmineakaichi/fed-popper/fedpopper"   

def count_pos_neg_in_file(filepath):
    """Count positive and negative examples in a Popper ``exs.pl`` file.

    The whole file is read as text and every occurrence of ``pos(`` and
    ``neg(`` that starts at a word boundary is counted, wherever it appears
    in the file (several occurrences on one line are all counted).

    Args:
        filepath: Path of the examples file to scan.

    Returns:
        A ``(num_pos, num_neg)`` tuple of occurrence counts.
    """
    with open(filepath, "r") as handle:
        text = handle.read()

    def occurrences(pattern):
        # Tally matches lazily instead of materialising the list of matches.
        return sum(1 for _ in re.finditer(pattern, text))

    return occurrences(r"\bpos\("), occurrences(r"\bneg\(")


def summarize_dataset_partitions(dataset_name):
    """Build a per-partition table of example counts for one dataset.

    Every entry of ``DATASETS_ROOT/<dataset_name>`` is visited in sorted
    order. An entry contributes a row only when it contains an ``exs.pl``
    file directly inside it; all other entries are skipped.

    Args:
        dataset_name: Name of the dataset directory under ``DATASETS_ROOT``.

    Returns:
        A ``pandas.DataFrame`` with one row per partition and the columns
        ``dataset``, ``partition``, ``pos``, ``neg`` and ``total``.
    """
    root = os.path.join(DATASETS_ROOT, dataset_name)

    records = []
    for entry in sorted(os.listdir(root)):
        examples_path = os.path.join(root, entry, "exs.pl")
        if os.path.isfile(examples_path):
            n_pos, n_neg = count_pos_neg_in_file(examples_path)
            record = {
                "dataset": dataset_name,
                "partition": entry,
                "pos": n_pos,
                "neg": n_neg,
                "total": n_pos + n_neg,
            }
            records.append(record)

    return pd.DataFrame(records)


def summarize_all_datasets(dataset_list):
    """Return a single summary dataframe covering several datasets.

    Args:
        dataset_list: Iterable of dataset names; each one is summarised with
            ``summarize_dataset_partitions``.

    Returns:
        A ``pandas.DataFrame`` that stacks the per-dataset summaries with a
        fresh 0..n-1 index. When ``dataset_list`` is empty, an empty frame
        with the columns ``dataset``, ``partition``, ``pos``, ``neg`` and
        ``total`` is returned.
    """
    frames = [summarize_dataset_partitions(ds) for ds in dataset_list]
    if not frames:
        # pd.concat raises ValueError on an empty sequence; hand back an
        # empty table with the expected columns instead of crashing.
        return pd.DataFrame(columns=["dataset", "partition", "pos", "neg", "total"])
    return pd.concat(frames, ignore_index=True)


# List the datasets you want to analyse.
ALL_DATASETS = [
    "trains",
    "trains_part1",
    "trains_part2",
    "trains_part3",
    "iggp-rps",
    "iggp-rps_part1",
    "iggp-rps_part2",
    "iggp-rps_part3",
    "zendo1",
    "zendo1_part1",
    "zendo1_part2",
    "zendo1_part3",
]

df_summary = summarize_all_datasets(ALL_DATASETS)
df_summary


In [5]:
# Bare expression: shows the summary dataframe as this notebook cell's output.
df_summary

# Data Partitioning

In [2]:
import os
import shutil
import re
import random

def partition_data(kbpath, num_parts=2, shuffle=True, seed=None):
    """Split a Popper knowledge base into ``num_parts`` balanced partitions.

    The examples of ``<kbpath>/exs.pl`` are dealt round-robin into sibling
    directories named ``<kbpath>_part1`` ... ``<kbpath>_partN``, positives
    and negatives separately, so partition sizes differ by at most one
    example per class. Each partition directory receives a copy of ``bk.pl``
    and ``bias.pl`` plus its own ``exs.pl`` that lists the positive examples
    first, then the negative ones.

    Only lines starting with ``pos(`` or ``neg(`` (after leading whitespace)
    are treated as examples; any other line of ``exs.pl`` is not carried
    over to the partitions.

    Args:
        kbpath: Directory containing ``exs.pl``, ``bk.pl`` and ``bias.pl``.
            A trailing path separator is ignored.
        num_parts: Number of partitions to create.
        shuffle: Shuffle positives and negatives before splitting.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        The list of partition directory paths, in partition order.

    Raises:
        FileNotFoundError: If ``exs.pl``, ``bk.pl`` or ``bias.pl`` is missing
            from ``kbpath``. Checked before any directory is created.
    """
    # A trailing separator would turn "<kb>/" + "_part1" into a directory
    # inside the knowledge base rather than next to it.
    separators = os.sep + (os.altsep or "")
    kbpath = os.fspath(kbpath)
    base = kbpath.rstrip(separators) or kbpath

    ex_file = os.path.join(base, "exs.pl")
    bk_file = os.path.join(base, "bk.pl")
    bias_file = os.path.join(base, "bias.pl")

    # Fail before touching the filesystem so that a missing input does not
    # leave half-built partition directories behind.
    for required in (ex_file, bk_file, bias_file):
        if not os.path.isfile(required):
            raise FileNotFoundError(f"Required file not found: {required}")

    # --- Read all POS / NEG examples ---
    with open(ex_file, "r") as f:
        # The last line may lack a newline; once shuffled into the middle of
        # a partition it would be glued onto the next example.
        lines = [line if line.endswith("\n") else line + "\n" for line in f]

    # Match "pos(" / "neg(" rather than the bare prefix so that unrelated
    # lines such as "position(...)" are not mistaken for examples.
    pos = [line for line in lines if line.lstrip().startswith("pos(")]
    neg = [line for line in lines if line.lstrip().startswith("neg(")]

    # --- Optional: shuffle the order of the examples ---
    if shuffle:
        rng = random if seed is None else random.Random(seed)
        rng.shuffle(pos)
        rng.shuffle(neg)

    # --- Balanced split: element i goes to partition i % num_parts ---
    pos_splits = [pos[i::num_parts] for i in range(num_parts)]
    neg_splits = [neg[i::num_parts] for i in range(num_parts)]

    # --- Create the partition directories ---
    new_dirs = []
    for index, (pos_part, neg_part) in enumerate(zip(pos_splits, neg_splits), start=1):
        part_dir = f"{base}_part{index}"
        os.makedirs(part_dir, exist_ok=True)

        # Every partition shares the same background knowledge and bias.
        shutil.copy(bk_file, os.path.join(part_dir, "bk.pl"))
        shutil.copy(bias_file, os.path.join(part_dir, "bias.pl"))

        # Write the partition's exs.pl: always POS first, then NEG.
        with open(os.path.join(part_dir, "exs.pl"), "w") as f:
            f.writelines(pos_part)
            f.writelines(neg_part)

        new_dirs.append(part_dir)

        print(f"ðŸ“‚ Created {part_dir}: {len(pos_part)} POS, {len(neg_part)} NEG")

    return new_dirs


In [3]:
# Split the "noisy-alzheimer_acetyl" knowledge base (a relative path) into
# three partitions.
partition_data("noisy-alzheimer_acetyl", num_parts=3)

ðŸ“‚ Created noisy-alzheimer_acetyl_part1: 177 POS, 177 NEG
ðŸ“‚ Created noisy-alzheimer_acetyl_part2: 177 POS, 177 NEG
ðŸ“‚ Created noisy-alzheimer_acetyl_part3: 176 POS, 176 NEG


['noisy-alzheimer_acetyl_part1',
 'noisy-alzheimer_acetyl_part2',
 'noisy-alzheimer_acetyl_part3']