In [1]:
!pip install pandas numpy faker tqdm psutil




Collecting faker
  Downloading Faker-36.1.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-36.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-36.1.0


In [2]:
import os
import random
import csv
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
from tqdm import tqdm
import tkinter as tk
from tkinter import ttk
import gc
import psutil
import time

# ---------------------------
# Helper Functions
# ---------------------------
def random_date_after(start_date, end_date):
    """Return a random datetime between start_date and end_date."""
    delta = (end_date - start_date).days
    random_days = np.random.randint(0, delta + 1)
    return start_date + timedelta(days=int(random_days))

def get_random_chromosome():
    """Randomly select a chromosome with realistic weighting.
       Autosomes (1-22) share 90% weight; X gets 8%, Y gets 1.5%, MT gets 0.5%."""
    autosomes = [str(i) for i in range(1, 23)]
    others = ["X", "Y", "MT"]
    chromosomes = autosomes + others
    weights = [0.9/22]*22 + [0.08, 0.015, 0.005]
    return random.choices(chromosomes, weights=weights, k=1)[0]

def get_functional_effect(region_type):
    """Return a functional effect based on the genomic region type."""
    if region_type == "exonic":
        return random.choices(["synonymous", "nonsynonymous", "stop_gain", "stop_loss"],
                              weights=[0.7, 0.25, 0.03, 0.02], k=1)[0]
    elif region_type in ["promoter", "enhancer"]:
        return "regulatory"
    elif region_type in ["5' UTR", "3' UTR"]:
        return "untranslated"
    else:
        return "non-coding"

def check_memory(limit_percent=80):
    """
    Pause processing if system memory usage exceeds limit_percent.
    (E.g. limit_percent=80 means pause if >80% memory is used.)
    """
    while psutil.virtual_memory().percent > limit_percent:
        print(f"Memory usage high: {psutil.virtual_memory().percent}% - waiting...")
        time.sleep(1)

# ---------------------------
# Global Parameters
# ---------------------------
NUM_INDIVIDUALS = 1_000_000    # For full-scale; for testing, you might use a lower number (e.g., 10_000)
NUM_SNPS = 1000              # Number of SNP markers
NUM_INDELS = 200             # Number of Indels
ANCESTRY_GROUPS = ["North European", "Sub-Saharan African", "Native American",
                   "East Asian", "South Asian", "Middle Eastern"]
SV_PROBABILITY = 0.05        # 5% of individuals have a structural variant record

# Use a more memory-efficient float type
float_dtype = np.float32

# Approximate chromosome lengths and region types
chr_lengths = {str(i): 150_000_000 for i in range(1, 23)}
chr_lengths.update({"X": 155_000_000, "Y": 60_000_000, "MT": 16_569})
region_types = ["intergenic", "intronic", "exonic", "promoter", "5' UTR", "3' UTR", "enhancer"]
region_weights = [0.5, 0.3, 0.1, 0.05, 0.02, 0.02, 0.01]

# Memory threshold: pause if memory usage exceeds this percent
MEMORY_THRESHOLD_PERCENT = 80

# ---------------------------
# Setup Output Directory and File Paths
# ---------------------------
output_dir = "data_lake"
os.makedirs(output_dir, exist_ok=True)
individuals_file = os.path.join(output_dir, "individuals.csv")
relationships_file = os.path.join(output_dir, "relationships.csv")
marriages_file = os.path.join(output_dir, "marriages.csv")
snp_def_file = os.path.join(output_dir, "snp_definitions.csv")
snp_file = os.path.join(output_dir, "snps.csv")
indel_def_file = os.path.join(output_dir, "indel_definitions.csv")
indel_file = os.path.join(output_dir, "indels.csv")
sv_file = os.path.join(output_dir, "structural_variants.csv")
health_file = os.path.join(output_dir, "health_phenotypes.csv")
lifestyle_file = os.path.join(output_dir, "lifestyle.csv")

# Initialize Faker
fake = Faker()

# =============================================================================
# Step 1: Generate Individuals Data in Chunks
# =============================================================================
def generate_individuals_chunk(start_idx, chunk_size, total):
    """Generate a chunk of individuals data and return a DataFrame."""
    n = chunk_size if start_idx + chunk_size <= total else total - start_idx
    ids = [f"ID_{i+1:07d}" for i in range(start_idx, start_idx+n)]
    genders = np.random.choice(["Male", "Female"], size=n)
    # Generate fixed pools per chunk for consistency
    male_first_pool = [fake.first_name_male() for _ in range(1000)]
    female_first_pool = [fake.first_name_female() for _ in range(1000)]
    last_pool = [fake.last_name() for _ in range(1000)]
    country_pool = [fake.country() for _ in range(500)]
    city_pool = [fake.city() for _ in range(500)]
    state_pool = [fake.state() for _ in range(500)]
    lat_pool = [float(fake.latitude()) for _ in range(500)]
    lon_pool = [float(fake.longitude()) for _ in range(500)]

    first_names = np.where(genders == "Male",
                           np.random.choice(male_first_pool, size=n),
                           np.random.choice(female_first_pool, size=n))
    last_names = np.random.choice(last_pool, size=n)
    countries = np.random.choice(country_pool, size=n)
    cities = np.random.choice(city_pool, size=n)
    states = np.random.choice(state_pool, size=n)
    latitudes = np.random.choice(lat_pool, size=n)
    longitudes = np.random.choice(lon_pool, size=n)

    start_birth = pd.Timestamp("1920-01-01")
    end_birth = pd.Timestamp("2010-12-31")
    birth_days = np.random.randint(0, (end_birth - start_birth).days + 1, size=n)
    birth_dates = start_birth + pd.to_timedelta(birth_days, unit="D")
    birth_dates_str = birth_dates.strftime("%Y-%m-%d")

    min_sample_dates = birth_dates + pd.to_timedelta(18*365, unit="D")
    max_sample_date = pd.Timestamp("2022-12-31")
    available_days = np.clip((max_sample_date - min_sample_dates).days, a_min=1, a_max=None)
    sample_offsets = np.array([np.random.randint(0, avail + 1) for avail in available_days])
    sample_collection_dates = min_sample_dates + pd.to_timedelta(sample_offsets, unit="D")
    sample_collection_dates_str = sample_collection_dates.strftime("%Y-%m-%d")

    processing_offsets = np.random.randint(7, 31, size=n)
    processing_dates = sample_collection_dates + pd.to_timedelta(processing_offsets, unit="D")
    processing_dates_str = processing_dates.strftime("%Y-%m-%d")

    coverage = np.maximum(5, np.random.normal(30, 5, n)).astype(float_dtype)
    coverage = np.round(coverage, 1)
    consent = np.random.choice(["Yes", "No"], size=n, p=[0.95, 0.05])
    ancestry = np.random.dirichlet(np.ones(len(ANCESTRY_GROUPS)), size=n) * 100
    ancestry = np.round(ancestry, 2)
    ancestry_df = pd.DataFrame(ancestry, columns=[g.replace(" ", "_").lower() for g in ANCESTRY_GROUPS])
    batch_ids = [f"BATCH_{random.randint(1,100):03d}" for _ in range(n)]

    df = pd.DataFrame({
        "individual_id": ids,
        "first_name": first_names,
        "last_name": last_names,
        "gender": genders,
        "birth_date": birth_dates_str,
        "country": countries,
        "city": cities,
        "state": states,
        "latitude": latitudes,
        "longitude": longitudes,
        "sample_collection_date": sample_collection_dates_str,
        "processing_date": processing_dates_str,
        "coverage": coverage,
        "consent": consent,
        "batch_id": batch_ids
    })
    df = pd.concat([df, ancestry_df], axis=1)
    df['birth_date_dt'] = pd.to_datetime(df['birth_date'])
    df['sample_collection_date_dt'] = pd.to_datetime(df['sample_collection_date'])
    return df

print("Generating individuals data in chunks...")
with open(individuals_file, "w", newline="") as f_out:
    first_chunk = True
    for start in tqdm(range(0, NUM_INDIVIDUALS, 100000), desc="Individuals Chunks"):
        check_memory(MEMORY_THRESHOLD_PERCENT)
        chunk_df = generate_individuals_chunk(start, 100000, NUM_INDIVIDUALS)
        if first_chunk:
            expected_cols = list(chunk_df.columns)
            chunk_df = chunk_df[expected_cols]
            chunk_df.to_csv(f_out, mode="w", index=False, header=True)
            first_chunk = False
        else:
            chunk_df = chunk_df[expected_cols]
            chunk_df.to_csv(f_out, mode="a", index=False, header=False)
        del chunk_df
        gc.collect()
print(f"Individuals data saved to {individuals_file}")

# =============================================================================
# Step 2: Generate Family Relationships (Using np.searchsorted)
# =============================================================================
print("Generating family relationships...")
individuals_df = pd.read_csv(individuals_file, parse_dates=["birth_date", "sample_collection_date"])
individuals_df['birth_date_dt'] = pd.to_datetime(individuals_df['birth_date'])
individuals_df['sample_collection_date_dt'] = pd.to_datetime(individuals_df['sample_collection_date'])
individuals_df["gender"] = individuals_df["gender"].astype("category")
individuals_df["consent"] = individuals_df["consent"].astype("category")
children = individuals_df[["individual_id", "birth_date_dt"]].copy().sort_values("birth_date_dt")
child_thresholds = (children["birth_date_dt"] - pd.DateOffset(years=20)).values.astype("datetime64[D]")
eligible_fathers = individuals_df[individuals_df['gender'] == "Male"].copy().sort_values("birth_date_dt")
father_births = eligible_fathers["birth_date_dt"].values.astype("datetime64[D]")
indices = np.searchsorted(father_births, child_thresholds)
father_ids = np.empty(len(indices), dtype=object)
mask = (indices == 0)
father_ids[~mask] = np.array(eligible_fathers["individual_id"], dtype=object)[indices[~mask] - 1]
father_ids[mask] = None
eligible_mothers = individuals_df[individuals_df['gender'] == "Female"].copy().sort_values("birth_date_dt")
mother_births = eligible_mothers["birth_date_dt"].values.astype("datetime64[D]")
indices_m = np.searchsorted(mother_births, child_thresholds)
mother_ids = np.empty(len(indices_m), dtype=object)
mask_m = (indices_m == 0)
mother_ids[~mask_m] = np.array(eligible_mothers["individual_id"], dtype=object)[indices_m[~mask_m] - 1]
mother_ids[mask_m] = None
relationships_df = pd.DataFrame({
    "individual_id": children["individual_id"],
    "father_id": father_ids,
    "mother_id": mother_ids
})
relationships_df.to_csv(relationships_file, index=False)
print(f"Relationships data saved to {relationships_file}")
del children, eligible_fathers, eligible_mothers, father_ids, mother_ids
gc.collect()

# =============================================================================
# Step 3: Generate Marriages Data (Vectorized Pairing)
# =============================================================================
print("Generating marriages data...")
individuals_df["age"] = (individuals_df["sample_collection_date_dt"] - individuals_df["birth_date_dt"]).dt.days / 365.25
eligible = individuals_df[individuals_df["age"] >= 18].copy()
males = eligible[eligible["gender"] == "Male"].sample(frac=1, random_state=42).reset_index(drop=True)
females = eligible[eligible["gender"] == "Female"].sample(frac=1, random_state=42).reset_index(drop=True)
min_len = min(len(males), len(females))
paired = pd.DataFrame({
    "male_id": males.loc[:min_len-1, "individual_id"].values,
    "female_id": females.loc[:min_len-1, "individual_id"].values,
    "male_age": males.loc[:min_len-1, "age"].values,
    "female_age": females.loc[:min_len-1, "age"].values,
    "male_birth": males.loc[:min_len-1, "birth_date_dt"].values,
    "female_birth": females.loc[:min_len-1, "birth_date_dt"].values,
    "male_sample": males.loc[:min_len-1, "sample_collection_date_dt"].values,
    "female_sample": females.loc[:min_len-1, "sample_collection_date_dt"].values,
})
paired = paired[np.abs(paired["male_age"] - paired["female_age"]) <= 10]
paired["male_18"] = paired["male_birth"] + pd.to_timedelta(18*365, unit="D")
paired["female_18"] = paired["female_birth"] + pd.to_timedelta(18*365, unit="D")
paired["marriage_start"] = paired[["male_18", "female_18"]].max(axis=1)
paired["marriage_end"] = paired[["male_sample", "female_sample"]].min(axis=1)
def vectorized_random_date(start_series, end_series):
    delta = (end_series - start_series).dt.days.clip(lower=0)
    random_days = (np.random.rand(len(delta)) * delta).astype(int)
    return start_series + pd.to_timedelta(random_days, unit="D")
paired["marriage_date"] = vectorized_random_date(paired["marriage_start"], paired["marriage_end"])
divorce_mask = np.random.rand(len(paired)) < 0.2
divorce_start = paired.loc[divorce_mask, "marriage_date"] + pd.to_timedelta(365, unit="D")
divorce_end = pd.Timestamp("2022-12-31")
paired.loc[divorce_mask, "divorce_date"] = vectorized_random_date(divorce_start, pd.Series(divorce_end, index=divorce_start.index))
marriages_df = paired[["male_id", "female_id", "marriage_date", "divorce_date"]].copy()
marriages_df.to_csv(marriages_file, index=False)
print(f"Marriages data saved to {marriages_file}")
del paired, eligible, males, females
gc.collect()

# =============================================================================
# Step 4: Generate SNP Definitions with Genomic Annotation
# =============================================================================
print("Generating SNP definitions with genomic annotations...")
nucleotides = ['A', 'C', 'G', 'T']
snp_definitions = []
for i in tqdm(range(NUM_SNPS), desc="SNP Definitions"):
    snp_id = f"SNP_{i+1:04d}"
    alleles = random.sample(nucleotides, 2)
    allele1_freq = random.uniform(0.1, 0.9)
    call_rate = round(random.uniform(0.95, 0.99), 3)
    quality_score = random.randint(20, 40)
    chromosome = get_random_chromosome()
    position = random.randint(1, chr_lengths[chromosome])
    region_type = random.choices(region_types, weights=region_weights, k=1)[0]
    functional_effect = get_functional_effect(region_type)
    snp_definitions.append({
        "snp_id": snp_id,
        "allele1": alleles[0],
        "allele2": alleles[1],
        "allele1_freq": round(allele1_freq, 3),
        "call_rate": call_rate,
        "quality_score": quality_score,
        "chromosome": chromosome,
        "position": position,
        "region_type": region_type,
        "functional_effect": functional_effect
    })
snp_def_df = pd.DataFrame(snp_definitions)
snp_def_df["chromosome"] = snp_def_df["chromosome"].astype("category")
snp_def_df["region_type"] = snp_def_df["region_type"].astype("category")
snp_def_df["functional_effect"] = snp_def_df["functional_effect"].astype("category")
snp_def_df.to_csv(snp_def_file, index=False)
print(f"SNP definitions saved to {snp_def_file}")
del snp_definitions
gc.collect()
# For later use in SNP genotype generation, we will use the dictionary form:
snp_annotations = snp_def_df.to_dict('records')

# =============================================================================
# Step 5: Generate SNP Genotypes (Chunked Writing to CSV, CPU Only)
# =============================================================================
print("Generating SNP genotypes for individuals (writing in chunks)...")
# Helper function for genotype computation for one SNP marker for a chunk of individuals.
def compute_snp_genotype_for_chunk(snp_info, genders):
    """Compute genotype calls for a given SNP marker for a chunk of individuals.
       genders: list-like of "Male" or "Female" for the individuals in this chunk."""
    n = len(genders)
    genotypes = np.full(n, "NA", dtype=object)
    # Determine which individuals have a call based on call_rate
    call_mask = np.random.rand(n) <= snp_info["call_rate"]
    if snp_info["chromosome"] not in ["X", "Y", "MT"]:
        p = snp_info["allele1_freq"]
        prob_aa = p ** 2
        prob_ab = 2 * p * (1 - p)
        r = np.random.rand(n)
        called_idx = np.where(call_mask)[0]
        if len(called_idx) > 0:
            geno = np.empty(len(called_idx), dtype=object)
            r_called = r[called_idx]
            geno[r_called < prob_aa] = ''.join(sorted(snp_info["allele1"] * 2))
            geno[(r_called >= prob_aa) & (r_called < (prob_aa + prob_ab))] = ''.join(sorted(snp_info["allele1"] + snp_info["allele2"]))
            geno[r_called >= (prob_aa + prob_ab)] = ''.join(sorted(snp_info["allele2"] * 2))
            genotypes[called_idx] = geno
        return genotypes
    elif snp_info["chromosome"] == "X":
        p = snp_info["allele1_freq"]
        r = np.random.rand(n)
        called_idx = np.where(call_mask)[0]
        if len(called_idx) > 0:
            # Process males and females separately:
            male_indices = [i for i in called_idx if genders[i] == "Male"]
            if male_indices:
                r_male = np.random.rand(len(male_indices))
                geno_male = np.where(r_male < p, snp_info["allele1"], snp_info["allele2"])
                for idx, val in zip(male_indices, geno_male):
                    genotypes[idx] = val
            female_indices = [i for i in called_idx if genders[i] == "Female"]
            if female_indices:
                r_female = np.random.rand(len(female_indices))
                prob_aa = p ** 2
                prob_ab = 2 * p * (1 - p)
                geno_female = np.empty(len(female_indices), dtype=object)
                geno_female[r_female < prob_aa] = ''.join(sorted(snp_info["allele1"] * 2))
                geno_female[(r_female >= prob_aa) & (r_female < (prob_aa + prob_ab))] = ''.join(sorted(snp_info["allele1"] + snp_info["allele2"]))
                geno_female[r_female >= (prob_aa + prob_ab)] = ''.join(sorted(snp_info["allele2"] * 2))
                for idx, val in zip(female_indices, geno_female):
                    genotypes[idx] = val
        return genotypes
    elif snp_info["chromosome"] == "Y":
        r = np.random.rand(n)
        called_idx = np.where(call_mask)[0]
        for i in called_idx:
            if genders[i] == "Male":
                r_val = random.random()  # CPU random value
                genotypes[i] = snp_info["allele1"] if r_val < snp_info["allele1_freq"] else snp_info["allele2"]
        return genotypes
    elif snp_info["chromosome"] == "MT":
        r = np.random.rand(n)
        called_idx = np.where(call_mask)[0]
        for i in called_idx:
            r_val = random.random()
            genotypes[i] = snp_info["allele1"] if r_val < snp_info["allele1_freq"] else snp_info["allele2"]
        return genotypes

# Get the list of genders for all individuals.
genders_list = individuals_df["gender"].tolist()

# Write the SNP genotype matrix one chunk at a time.
chunk_size = 100000
print("Writing SNP genotypes in chunks...")
with open(snp_file, "w", newline="") as f_out:
    writer = csv.writer(f_out)
    # Write header: first column is individual_id then one column per SNP marker.
    header = ["individual_id"] + [snp["snp_id"] for snp in snp_annotations]
    writer.writerow(header)
    # Process individuals in chunks
    for start in tqdm(range(0, NUM_INDIVIDUALS, chunk_size), desc="SNP Genotype Chunks"):
        end = min(start + chunk_size, NUM_INDIVIDUALS)
        chunk_genders = genders_list[start:end]
        chunk_ind_ids = individuals_df["individual_id"].iloc[start:end].tolist()
        # For each SNP marker, compute the genotype calls for this chunk.
        chunk_genotypes = []  # List of arrays; one per SNP.
        for snp in snp_annotations:
            check_memory(MEMORY_THRESHOLD_PERCENT)
            geno_chunk = compute_snp_genotype_for_chunk(snp, chunk_genders)
            chunk_genotypes.append(geno_chunk)
        # Write one row per individual in this chunk.
        for i in range(len(chunk_genders)):
            row = [chunk_ind_ids[i]] + [chunk_genotypes[j][i] for j in range(len(chunk_genotypes))]
            writer.writerow(row)
        del chunk_genotypes
        gc.collect()
print(f"SNP genotype data saved to {snp_file}")

# =============================================================================
# Step 6: Generate Indel Definitions and Genotypes (Chunked Writing to CSV, CPU Only)
# =============================================================================
print("Generating Indel definitions with genomic annotations...")
indel_definitions = []
for i in tqdm(range(NUM_INDELS), desc="Indel Definitions"):
    indel_id = f"INDEL_{i+1:04d}"
    ref_allele = random.choice(nucleotides)
    alt_length = random.randint(1, 10)
    alt_allele = ''.join(random.choices(nucleotides, k=alt_length))
    frequency = random.uniform(0.05, 0.5)
    call_rate = round(random.uniform(0.95, 0.99), 3)
    chromosome = get_random_chromosome()
    position = random.randint(1, chr_lengths[chromosome])
    region_type = random.choices(region_types, weights=region_weights, k=1)[0]
    functional_effect = get_functional_effect(region_type)
    indel_definitions.append({
        "indel_id": indel_id,
        "ref_allele": ref_allele,
        "alt_allele": alt_allele,
        "frequency": round(frequency, 3),
        "call_rate": call_rate,
        "length": alt_length,
        "chromosome": chromosome,
        "position": position,
        "region_type": region_type,
        "functional_effect": functional_effect
    })
indel_def_df = pd.DataFrame(indel_definitions)
indel_def_df["chromosome"] = indel_def_df["chromosome"].astype("category")
indel_def_df["region_type"] = indel_def_df["region_type"].astype("category")
indel_def_df["functional_effect"] = indel_def_df["functional_effect"].astype("category")
indel_def_df.to_csv(indel_def_file, index=False)
print(f"Indel definitions saved to {indel_def_file}")
del indel_definitions
gc.collect()
# Convert to list of dictionaries for later use.
indel_annotations = indel_def_df.to_dict('records')

def compute_indel_genotype_for_chunk(indel_info, genders):
    """Compute genotype calls for a given indel marker for a chunk of individuals."""
    n = len(genders)
    genotypes = np.full(n, "NA", dtype=object)
    call_mask = np.random.rand(n) <= indel_info["call_rate"]
    if indel_info["chromosome"] not in ["X", "Y", "MT"]:
        p = indel_info["frequency"]
        prob_rr = p ** 2
        prob_ra = 2 * p * (1 - p)
        r = np.random.rand(n)
        called_idx = np.where(call_mask)[0]
        if len(called_idx) > 0:
            geno = np.empty(len(called_idx), dtype=object)
            r_called = r[called_idx]
            geno[r_called < prob_rr] = ''.join(sorted(indel_info["ref_allele"] * 2))
            geno[(r_called >= prob_rr) & (r_called < (prob_rr + prob_ra))] = ''.join(sorted(indel_info["ref_allele"] + indel_info["alt_allele"]))
            geno[r_called >= (prob_rr + prob_ra)] = ''.join(sorted(indel_info["alt_allele"] * 2))
            genotypes[called_idx] = geno
        return genotypes
    elif indel_info["chromosome"] == "X":
        p = indel_info["frequency"]
        r = np.random.rand(n)
        called_idx = np.where(call_mask)[0]
        if len(called_idx) > 0:
            male_indices = [i for i in called_idx if genders[i] == "Male"]
            if male_indices:
                r_male = np.random.rand(len(male_indices))
                geno_male = np.where(r_male < p, indel_info["ref_allele"], indel_info["alt_allele"])
                for idx, val in zip(male_indices, geno_male):
                    genotypes[idx] = val
            female_indices = [i for i in called_idx if genders[i] == "Female"]
            if female_indices:
                r_female = np.random.rand(len(female_indices))
                prob_rr = p ** 2
                prob_ra = 2 * p * (1 - p)
                geno_female = np.empty(len(female_indices), dtype=object)
                geno_female[r_female < prob_rr] = ''.join(sorted(indel_info["ref_allele"] * 2))
                geno_female[(r_female >= prob_rr) & (r_female < (prob_rr + prob_ra))] = ''.join(sorted(indel_info["ref_allele"] + indel_info["alt_allele"]))
                geno_female[r_female >= (prob_rr + prob_ra)] = ''.join(sorted(indel_info["alt_allele"] * 2))
                for idx, val in zip(female_indices, geno_female):
                    genotypes[idx] = val
        return genotypes
    elif indel_info["chromosome"] == "Y":
        r = np.random.rand(n)
        called_idx = np.where(call_mask)[0]
        for i in called_idx:
            if genders[i] == "Male":
                r_val = random.random()
                genotypes[i] = indel_info["ref_allele"] if r_val < indel_info["frequency"] else indel_info["alt_allele"]
        return genotypes
    elif indel_info["chromosome"] == "MT":
        r = np.random.rand(n)
        called_idx = np.where(call_mask)[0]
        for i in called_idx:
            r_val = random.random()
            genotypes[i] = indel_info["ref_allele"] if r_val < indel_info["frequency"] else indel_info["alt_allele"]
        return genotypes

print("Writing indel genotypes in chunks...")
with open(indel_file, "w", newline="") as f_out:
    writer = csv.writer(f_out)
    header = ["individual_id"] + [indel["indel_id"] for indel in indel_annotations]
    writer.writerow(header)
    for start in tqdm(range(0, NUM_INDIVIDUALS, chunk_size), desc="Indel Genotype Chunks"):
        end = min(start + chunk_size, NUM_INDIVIDUALS)
        chunk_genders = genders_list[start:end]
        chunk_ind_ids = individuals_df["individual_id"].iloc[start:end].tolist()
        chunk_genotypes = []
        for indel in indel_annotations:
            check_memory(MEMORY_THRESHOLD_PERCENT)
            geno_chunk = compute_indel_genotype_for_chunk(indel, chunk_genders)
            chunk_genotypes.append(geno_chunk)
        for i in range(len(chunk_genders)):
            row = [chunk_ind_ids[i]] + [chunk_genotypes[j][i] for j in range(len(chunk_genotypes))]
            writer.writerow(row)
        del chunk_genotypes
        gc.collect()
print(f"Indel genotype data saved to {indel_file}")

# =============================================================================
# Step 7: Generate Structural Variants Data (with size bounds check)
# =============================================================================
print("Generating structural variants data...")
sv_data = []
variant_types_list = ["Deletion", "Duplication", "Inversion", "Translocation"]
variant_counter = 1
for ind in tqdm(individuals_df['individual_id'], desc="Structural Variants"):
    check_memory(MEMORY_THRESHOLD_PERCENT)
    if random.random() < SV_PROBABILITY:
        num_variants = random.randint(1, 3)
        for _ in range(num_variants):
            sv_id = f"SV_{variant_counter:06d}"
            variant_counter += 1
            sv_type = random.choice(variant_types_list)
            chromosome = get_random_chromosome()
            max_chr_length = chr_lengths[chromosome]
            # Compute maximum possible size for this chromosome.
            max_possible_size = max_chr_length - 1  # leave at least one base for start_pos
            # If the maximum possible size is less than our desired minimum variant size, skip this variant.
            if max_possible_size < 1000:
                continue
            # Choose size between 1,000 and the smaller of 1,000,000 or max_possible_size.
            size = random.randint(1000, min(1_000_000, max_possible_size))
            start_pos = random.randint(1, max_chr_length - size)
            end_pos = start_pos + size
            quality = round(random.uniform(20, 60), 1)
            sv_data.append({
                "individual_id": ind,
                "sv_id": sv_id,
                "variant_type": sv_type,
                "chromosome": chromosome,
                "start_pos": start_pos,
                "end_pos": end_pos,
                "size_bp": size,
                "quality": quality
            })
sv_df = pd.DataFrame(sv_data)
sv_df.to_csv(sv_file, index=False)
print(f"Structural variants data saved to {sv_file}")


# =============================================================================
# Step 8: Generate Health and Phenotype Data (Vectorized)
# =============================================================================
print("Generating health and phenotype data (vectorized)...")
n = len(individuals_df)
health_data = {
    "individual_id": individuals_df["individual_id"],
    "height_cm": np.round(np.random.normal(170, 10, n), 1),
    "weight_kg": np.round(np.random.normal(70, 15, n), 1)
}
health_data["bmi"] = np.round(health_data["weight_kg"] / ((health_data["height_cm"] / 100) ** 2), 1)
health_data["eye_color"] = np.random.choice(["Brown", "Blue", "Green", "Hazel", "Gray"], n)
health_data["blood_type"] = np.random.choice(["A", "B", "AB", "O"], n)
health_data["diabetes"] = np.random.choice(["Yes", "No"], n, p=[0.1, 0.9])
health_data["hypertension"] = np.random.choice(["Yes", "No"], n, p=[0.15, 0.85])
pd.DataFrame(health_data).to_csv(health_file, index=False)
print(f"Health and phenotype data saved to {health_file}")
del health_data
gc.collect()

# =============================================================================
# Step 9: Generate Lifestyle Data (Vectorized)
# =============================================================================
print("Generating lifestyle data (vectorized)...")
occupations = [
    "Accountant", "Actor", "Actuary", "Administrative Assistant", "Advertising Manager",
    "Aerospace Engineer", "Agricultural Scientist", "Air Traffic Controller", "Architect",
    "Artist", "Astronomer", "Attorney", "Biologist", "Biomedical Engineer", "Business Analyst",
    "Carpenter", "Chef", "Chemical Engineer", "Civil Engineer", "Clerk", "Computer Programmer",
    "Data Scientist", "Dentist", "Doctor", "Economist", "Editor", "Electrical Engineer",
    "Elementary School Teacher", "Engineer", "Environmental Scientist", "Farmer",
    "Financial Analyst", "Firefighter", "Graphic Designer", "Healthcare Administrator",
    "Historian", "Human Resources Manager", "Industrial Engineer", "Insurance Agent",
    "Interior Designer", "Investment Banker", "Journalist", "Lawyer", "Librarian",
    "Logistician", "Machinist", "Marketing Manager", "Mechanical Engineer", "Meteorologist",
    "Microbiologist", "Musician", "Nurse", "Occupational Therapist", "Operations Manager",
    "Optometrist", "Pharmacist", "Photographer", "Physicist", "Physician", "Plumber",
    "Police Officer", "Professor", "Project Manager", "Psychologist", "Public Relations Specialist",
    "Real Estate Agent", "Research Scientist", "Sales Manager", "Software Developer",
    "Statistician", "Surgeon", "Teacher", "Technical Writer", "Translator", "Urban Planner",
    "Veterinarian", "Web Developer", "Writer", "Chief Executive Officer", "Chief Financial Officer",
    "Consultant", "Executive Assistant", "Social Worker", "Event Planner", "Quality Assurance Analyst",
    "Supply Chain Manager", "Systems Analyst", "UX Designer", "Customer Service Representative",
    "Environmental Engineer", "Biochemist", "Geneticist", "Biomedical Researcher", "Forensic Scientist"
]
lifestyle_data = {
    "individual_id": individuals_df["individual_id"],
    "smoker": np.random.choice(["Yes", "No"], n, p=[0.3, 0.7]),
    "alcohol_consumption": np.random.choice(["None", "Occasional", "Regular", "Heavy"], n, p=[0.2, 0.5, 0.2, 0.1]),
    "exercise": np.random.choice(["Sedentary", "Moderate", "Active"], n, p=[0.4, 0.4, 0.2]),
    "occupation": np.random.choice(occupations, n)
}
pd.DataFrame(lifestyle_data).to_csv(lifestyle_file, index=False)
print(f"Lifestyle data saved to {lifestyle_file}")
del lifestyle_data
gc.collect()

# =============================================================================
# Step 10: Simple UI for Output Summary using Tkinter
# =============================================================================
def show_summary_ui(file_list):
    root = tk.Tk()
    root.title("Data Generation Summary")
    root.geometry("500x300")

    frame = ttk.Frame(root, padding=20)
    frame.pack(expand=True, fill="both")

    ttk.Label(frame, text="Data Generation Completed!", font=("Arial", 16, "bold")).pack(pady=10)

    summary_text = ""
    for fname in file_list:
        try:
            df = pd.read_csv(fname)
            rows = len(df)
        except Exception as e:
            rows = f"Error: {e}"
        summary_text += f"{os.path.basename(fname)}: {rows} rows\n"

    txt = tk.Text(frame, wrap="word", font=("Consolas", 10))
    txt.insert("end", summary_text)
    txt.config(state="disabled")
    txt.pack(expand=True, fill="both")

    ttk.Button(frame, text="Close", command=root.destroy).pack(pady=10)
    root.mainloop()

files_generated = [individuals_file, relationships_file, marriages_file,
                   snp_def_file, snp_file, indel_def_file, indel_file,
                   sv_file, health_file, lifestyle_file]

print("All stages completed. Launching output summary UI...")
show_summary_ui(files_generated)


Generating individuals data in chunks...


Individuals Chunks: 100%|██████████| 10/10 [00:42<00:00,  4.26s/it]


Individuals data saved to data_lake/individuals.csv
Generating family relationships...
Relationships data saved to data_lake/relationships.csv
Generating marriages data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paired["male_18"] = paired["male_birth"] + pd.to_timedelta(18*365, unit="D")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paired["female_18"] = paired["female_birth"] + pd.to_timedelta(18*365, unit="D")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paired["marriage_start"] = paired[["male_18", "

Marriages data saved to data_lake/marriages.csv
Generating SNP definitions with genomic annotations...


SNP Definitions: 100%|██████████| 1000/1000 [00:00<00:00, 64223.43it/s]

SNP definitions saved to data_lake/snp_definitions.csv





Generating SNP genotypes for individuals (writing in chunks)...
Writing SNP genotypes in chunks...


SNP Genotype Chunks: 100%|██████████| 10/10 [10:51<00:00, 65.11s/it]


SNP genotype data saved to data_lake/snps.csv
Generating Indel definitions with genomic annotations...


Indel Definitions: 100%|██████████| 200/200 [00:00<00:00, 27791.57it/s]


Indel definitions saved to data_lake/indel_definitions.csv
Writing indel genotypes in chunks...


Indel Genotype Chunks: 100%|██████████| 10/10 [02:57<00:00, 17.74s/it]


Indel genotype data saved to data_lake/indels.csv
Generating structural variants data...


Structural Variants: 100%|██████████| 1000000/1000000 [01:11<00:00, 13917.71it/s]


Structural variants data saved to data_lake/structural_variants.csv
Generating health and phenotype data (vectorized)...
Health and phenotype data saved to data_lake/health_phenotypes.csv
Generating lifestyle data (vectorized)...
Lifestyle data saved to data_lake/lifestyle.csv
All stages completed. Launching output summary UI...


TclError: no display name and no $DISPLAY environment variable