In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Load VCF
# ----------------------------
# VCF loader (plain .vcf, not gzipped)
# ----------------------------
def _gt_field_to_dosage(sample_field):
    """Convert one per-sample VCF field (e.g. "0|1:35") into a dosage or NaN."""
    gt = sample_field.split(":", 1)[0]  # GT is the first ':'-separated subfield
    if gt in ("./.", ".|."):
        return np.nan
    try:
        # Dosage is the sum of the allele indices; phased and unphased calls
        # are treated the same way.
        return sum(int(allele) for allele in gt.replace("|", "/").split("/"))
    except ValueError:
        # Half-missing ("0/."), haploid missing (".") or otherwise non-numeric GT.
        return np.nan


def load_vcf_plain_with_ids(path, keep_autosomes=True):
    """
    Parse an uncompressed, tab-delimited VCF file into a dosage matrix.

    Lines starting with "##" are skipped, sample IDs are read from the "#CHROM"
    header line (columns 10 onwards), and every other line with at least ten
    tab-separated fields is treated as a variant record. Bytes that are not
    valid UTF-8 are replaced rather than raising.

    Parameters
    ----------
    path : str
        Path to a plain-text (not gzipped) VCF file.
    keep_autosomes : bool, default True
        When True, keep only records whose chromosome name, after removing
        every "chr" substring, is an integer in 1..22.

    Returns
    -------
    samples : list of str
        Sample IDs from the header line (empty if no "#CHROM" line was seen).
    meta : list of tuple
        One ``(chrom, pos, ref, alt)`` tuple per kept variant; ``chrom`` is
        kept as written in the file and ``pos`` is converted to int.
    var_chr : list of str
        Chromosome name per kept variant with "chr" removed.
    G : numpy.ndarray of float, shape (n_samples, n_variants)
        Dosage per sample and variant (sum of allele indices in GT), NaN where
        the genotype is missing or cannot be parsed. When no variant is kept
        the shape is ``(len(samples), 0)``.

    Raises
    ------
    ValueError
        If the POS column of a kept record is not an integer.
    """
    samples, meta, var_chr, cols = [], [], [], []
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            if line.startswith("##"):
                continue
            fields = line.rstrip("\n").split("\t")
            if line.startswith("#CHROM"):
                samples = fields[9:]
                continue
            if len(fields) < 10:
                continue  # not a complete record (needs 9 fixed columns + >=1 sample)

            chrom, pos, ref, alt = fields[0], fields[1], fields[3], fields[4]
            chrom_bare = chrom.replace("chr", "")
            if keep_autosomes and not (chrom_bare.isdigit() and 1 <= int(chrom_bare) <= 22):
                continue

            dosages = [_gt_field_to_dosage(field) for field in fields[9:]]
            meta.append((chrom, int(pos), ref, alt))
            var_chr.append(chrom_bare)
            cols.append(np.asarray(dosages, dtype=float))

    # Each parsed column is one variant; transpose to samples x variants.
    G = np.vstack(cols).T if cols else np.empty((len(samples), 0))
    return samples, meta, var_chr, G

#real_samples, real_meta, VAR_CHR, G_real = load_vcf_plain_with_ids(REAL_VCF)
#print("Loaded REAL:", G_real.shape, " (samples × SNPs)")

In [5]:
# ================= Leaky baseline generator =================

def _flip_genotypes(dos_row, flip_rate=0.01, rng=None, keep_ultra_rare_mask=None):
    """
    dos_row: 1D float array in {0,1,2} or NaN
    flip_rate: per-SNP probability to perturb genotype by ±1 (clipped to [0,2])
    keep_ultra_rare_mask: boolean mask (m,) -> variants to *avoid* flipping (preserve leakage)
    """
    if rng is None:
        rng = np.random.default_rng(0)
    out = dos_row.copy()
    m = out.shape[0]
    flips = rng.random(m) < flip_rate
    if keep_ultra_rare_mask is not None:
        flips = flips & (~keep_ultra_rare_mask)  # DO NOT touch ultra-rare variants
    # only flip where not NaN
    valid = ~np.isnan(out)
    idx = np.where(flips & valid)[0]
    if idx.size:
        step = rng.choice([-1.0, 1.0], size=idx.size)
        out[idx] = np.clip(out[idx] + step, 0.0, 2.0)
    return out

def make_ultra_rare_mask(G, maf_thresh=0.001, eps=1e-9):
    """
    Flag the columns of a dosage matrix whose minor-allele frequency is tiny.

    The allele frequency of a column is (sum of non-NaN dosages / 2) divided by
    the number of non-NaN entries; columns with no observed entry get frequency
    0. A column is flagged when ``min(p, 1 - p) + eps < maf_thresh``.

    Returns a boolean array with one entry per column of G.
    """
    dosages = np.asarray(G, dtype=float)

    # Per-column count of observed genotypes and total dosage, NaN ignored.
    n_called = (~np.isnan(dosages)).sum(axis=0)
    dosage_total = np.nansum(dosages, axis=0)

    # Division by zero for fully-missing columns is masked out by np.where.
    with np.errstate(invalid="ignore", divide="ignore"):
        alt_freq = np.where(n_called > 0, (dosage_total / 2.0) / n_called, 0.0)

    minor_freq = np.minimum(alt_freq, 1.0 - alt_freq)
    return (minor_freq + eps) < maf_thresh

def generate_leaky_copycat(
    G_real,
    n_samples: int | None = None,
    copy_frac=0.60,
    neardup_frac=0.30,
    random_frac=0.10,
    flip_rate_neardup=0.01,
    flip_rate_random=0.05,
    keep_ultra_rare=True,
    ultra_maf=0.001,
    seed=123
):
    """
    Build a leaky cohort from G_real (n_real x m) of size n_samples (defaults to n_real).
    Fractions refer to the *output* size and must sum to 1.
    """
    assert abs(copy_frac + neardup_frac + random_frac - 1.0) < 1e-9
    rng = np.random.default_rng(seed)
    n_real, m = G_real.shape
    if n_samples is None:
        n_samples = n_real

    keep_rare_mask = make_ultra_rare_mask(G_real, maf_thresh=ultra_maf) if keep_ultra_rare else None

    n_copy = int(round(copy_frac * n_samples))
    n_near = int(round(neardup_frac * n_samples))
    n_rand = n_samples - n_copy - n_near

    base_idx = rng.choice(n_real, size=n_samples, replace=True)
    Gb = G_real[base_idx, :]

    rows = []
    rows.extend(Gb[:n_copy, :])  # exact copies

    for i in range(n_copy, n_copy + n_near):
        rows.append(_flip_genotypes(Gb[i, :], flip_rate=flip_rate_neardup,
                                    rng=rng, keep_ultra_rare_mask=keep_rare_mask))
    for i in range(n_copy + n_near, n_copy + n_near + n_rand):
        rows.append(_flip_genotypes(Gb[i, :], flip_rate=flip_rate_random,
                                    rng=rng, keep_ultra_rare_mask=keep_rare_mask))

    return np.vstack(rows)

def generate_leaky_kindoped(
    G_real,
    n_samples: int | None = None,
    # base mix (copycat-ε)
    copy_frac=0.50,
    neardup_frac=0.40,
    random_frac=0.10,
    flip_rate_neardup=0.008,
    flip_rate_random=0.03,
    # kin-doping controls (on the *output*)
    dup_pairs_frac=0.20,
    sib_flip_rate=0.008,
    keep_ultra_rare=True,
    ultra_maf=0.001,
    seed=123
):
    """
    Copycat-ε + kin-doping. Output size = n_samples (defaults to n_real).
    dup_pairs_frac: fraction of output rows to turn into sibling-like near-dups.
    """
    rng = np.random.default_rng(seed)
    n_real, m = G_real.shape
    if n_samples is None:
        n_samples = n_real

    # Step 1: base leaky cohort of size n_samples
    G_base = generate_leaky_copycat(
        G_real, n_samples=n_samples,
        copy_frac=copy_frac, neardup_frac=neardup_frac, random_frac=random_frac,
        flip_rate_neardup=flip_rate_neardup, flip_rate_random=flip_rate_random,
        keep_ultra_rare=keep_ultra_rare, ultra_maf=ultra_maf, seed=seed
    )

    # Step 2: kin-doping
    k = int(round(dup_pairs_frac * n_samples))
    if k > 0:
        idx = rng.choice(n_samples, size=k, replace=False)
        for i in idx:
            G_base[i, :] = _flip_genotypes(G_base[i, :], flip_rate=sib_flip_rate, rng=rng)

    return G_base

In [6]:
def write_vcf_from_matrix(out_path, samples, meta, G, ref_keep_chr_format="as_is"):
    """
    Write a dosage matrix as a minimal VCF v4.2 file containing only GT.

    Parameters
    ----------
    out_path : str
        Destination path; an existing file is overwritten.
    samples : iterable of str
        Source sample names. Output row i is named ``"<samples[i]>_LEAKY<i:04d>"``
        while names are available and ``"LEAKY_<i:04d>"`` afterwards.
    meta : sequence of (chrom, pos, ref, alt)
        One entry per column of G, indexed by column number.
    G : array-like, shape (n, m)
        Dosages, rows = samples, columns = variants. NaN is written as "./.";
        other values are rounded and written as "0/0" (<= 0), "0/1" (1) or
        "1/1" (>= 2).
    ref_keep_chr_format : str
        "strip_chr" removes every "chr" substring from the chromosome name;
        any other value writes the name unchanged.

    Returns
    -------
    list of str
        The sample IDs written to the header, one per row of G.
    """
    G = np.asarray(G)
    n, m = G.shape

    # Use the provided names while they last, then synthesize the remainder.
    base = [f"{s}_LEAKY" for s in samples]
    n_named = min(n, len(base))
    new_samples = [f"{base[i]}{i:04d}" for i in range(n_named)]
    new_samples += [f"LEAKY_{i:04d}" for i in range(n_named, n)]

    # Lookup table indexed by genotype code: 0, 1, 2 = dosage, 3 = missing.
    gt_lookup = np.array(["0/0", "0/1", "1/1", "./."])
    strip_chr = ref_keep_chr_format == "strip_chr"

    with open(out_path, "w", encoding="utf-8") as f:
        f.write("##fileformat=VCFv4.2\n")
        f.write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
        header = ["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] + new_samples
        f.write("\t".join(header) + "\n")
        for j in range(m):
            chrom, pos, ref, alt = meta[j]
            if strip_chr:
                chrom = str(chrom).replace("chr","")

            # Encode the whole column at once instead of one Python-level
            # branch per sample. NaN is swapped for 0 before rounding so the
            # integer cast is well defined, then re-marked as missing.
            col = np.asarray(G[:, j], dtype=float)
            missing = np.isnan(col)
            codes = np.clip(np.rint(np.where(missing, 0.0, col)), 0, 2).astype(np.intp)
            codes[missing] = 3

            fixed = [str(chrom), str(pos), ".", str(ref), str(alt), ".", "PASS", ".", "GT"]
            f.write("\t".join(fixed + gt_lookup[codes].tolist()) + "\n")
    return new_samples

In [None]:
# Driver cell: load the real cohort, then generate and write two leaky baselines.
REAL_VCF = "1000G_65K_SNP_chr1.vcf"

# Load REAL *after* defining the path
real_samples, real_meta, VAR_CHR, G_real = load_vcf_plain_with_ids(REAL_VCF)
print("Loaded REAL:", G_real.shape, " (samples × SNPs)")

# Choose output size for leaky baselines (match your GAN/RBM if you want)
N_SYN = 2500

# Copycat-ε leaky set of size N_SYN
G_leaky = generate_leaky_copycat(
    G_real,
    n_samples=N_SYN,
    copy_frac=0.80,
    neardup_frac=0.15,
    random_frac=0.05,
    flip_rate_neardup=0.003,
    flip_rate_random=0.05,
    keep_ultra_rare=True,
    ultra_maf=0.001,
    seed=777
)
assert G_leaky.shape == (N_SYN, G_real.shape[1])
LEAKY_VCF = "LEAKY_copycat_chr1.vcf"
write_vcf_from_matrix(LEAKY_VCF, real_samples, real_meta, G_leaky)
print("Leaky baseline written to:", LEAKY_VCF, "with shape", G_leaky.shape)

# Kin-doped leaky set of size N_SYN
# NOTE(review): sib_flip_rate=0.08 is ten times the function default (0.008);
# confirm this is the intended "hot" setting and not a typo.
G_leaky_hot = generate_leaky_kindoped(
    G_real,
    n_samples=N_SYN,
    dup_pairs_frac=0.20,
    sib_flip_rate=0.08,
    seed=888
)
assert G_leaky_hot.shape == (N_SYN, G_real.shape[1])
LEAKY_KINDOPED_VCF = "LEAKY_kindoped_chr1.vcf"
write_vcf_from_matrix(LEAKY_KINDOPED_VCF, real_samples, real_meta, G_leaky_hot)
# Ending the cell on a print (not the bare writer call) keeps the returned
# list of N_SYN sample IDs from being echoed as the cell's output.
print("Kin-doped baseline written to:", LEAKY_KINDOPED_VCF, "with shape", G_leaky_hot.shape)

Loaded REAL: (2504, 65535)  (samples × SNPs)
Leaky baseline written to: LEAKY_copycat_chr1.vcf with shape (2500, 65535)
