<a href="https://colab.research.google.com/github/amar9929-3/Bioinformatics_Project/blob/main/Notebooks/Simulate_Data_Lee.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np

# ----------------------------------------
# 1. Load your dataset
# ----------------------------------------
# Colab-only: opens the browser upload widget.
from google.colab import files
uploaded = files.upload()

# NOTE(review): the file name is hard-coded, so the uploaded file must be
# named exactly 'Lee_3.1.csv' — confirm this is intended.
df = pd.read_csv('Lee_3.1.csv')

# ----------------------------------------
# 2. Identify categorical and numeric columns
# ----------------------------------------
# Every column that is not listed as categorical is treated as numeric.
cat_cols = ['Population']
num_cols = [c for c in df.columns if c not in cat_cols]

# Tunable constants
SEED = 42              # fixed seed so the simulation is reproducible
COV_JITTER = 1e-6      # small value added to the covariance diagonal
MIN_GROUP_SIZE = 3     # smaller groups fall back to the global covariance

# Random number generator (fixed seed)
rng = np.random.default_rng(SEED)

# ----------------------------------------
# 3. Compute global covariance for fallback
# ----------------------------------------
jitter = np.eye(len(num_cols)) * COV_JITTER
global_cov = df[num_cols].cov() + jitter

# Loop-invariant helpers: observed per-column range (used for clipping) and
# the columns that were integer-typed in the input (rounded back to int).
col_min = df[num_cols].min()
col_max = df[num_cols].max()
int_cols = [c for c in num_cols if pd.api.types.is_integer_dtype(df[c])]
int_round = dict.fromkeys(int_cols, 0)
int_cast = dict.fromkeys(int_cols, int)

sim_blocks = []

# ----------------------------------------
# 4. Simulate group-conditioned numeric data
# ----------------------------------------
for _key, group in df.groupby(cat_cols):
    n = len(group)
    gnum = group[num_cols]

    # The group mean is always used; the group covariance only when the group
    # has enough rows and some variance, otherwise the global covariance.
    means = gnum.mean()
    if n >= MIN_GROUP_SIZE and gnum.var().sum() > 0:
        cov = gnum.cov() + jitter
    else:
        cov = global_cov

    # Draw as many simulated rows as the group has real rows.
    sim = rng.multivariate_normal(means, cov, size=n)

    # Clip to the original per-column ranges and round integer columns.
    sim_df = (
        pd.DataFrame(sim, columns=num_cols)
        .clip(lower=col_min, upper=col_max, axis=1)
        .round(int_round)
        .astype(int_cast)
    )

    # ----------------------------------------
    # 5. Attach the group's categorical labels
    # ----------------------------------------
    # The labels come from the group itself. Redrawing them at random from the
    # whole dataset would detach each simulated row from the group whose
    # mean/covariance generated it and change the per-group row counts.
    sim_cat_df = group[cat_cols].reset_index(drop=True)

    # Combine group (categorical columns first, then numeric columns)
    sim_blocks.append(pd.concat([sim_cat_df, sim_df], axis=1))

# ----------------------------------------
# 6. Final simulated dataset
# ----------------------------------------
simulated_df = pd.concat(sim_blocks, ignore_index=True)

# ----------------------------------------
# 7. Save output
# ----------------------------------------
simulated_df.to_csv("simulated_Lee.csv", index=False)

print("Simulated dataset created: simulated_Lee.csv")


Saving Lee_3.1.csv to Lee_3.1.csv
Simulated dataset created: simulated_Lee.csv
