In [1]:
import pandas as pd
import numpy as np

# === Configuration ===
SOURCE_XLSX = "Final_LungCancer Dataset.xlsx"
OUTPUT_XLSX = "Synthetic_LungCancer_next8k.xlsx"
N_NEW = 8000             # number of new patients you want
SEED = 42                # one seed drives both the row resampling and the jitter
JITTER_FRACTION = 0.02   # jitter std dev as a fraction of each column's std dev (2%)
ROUND_DECIMALS = 2       # decimals kept on jittered float columns

# === Load your real dataset ===
# Keep the unfiltered frame around: it is needed below to pick collision-free IDs.
raw = pd.read_excel(SOURCE_XLSX)

# Keep only lung_cancer == 1 rows (if not already all 1)
orig = raw[raw["lung_cancer"] == 1].copy()
if orig.empty:
    # Sampling from an empty frame fails with an unhelpful message; fail early instead.
    raise ValueError(f"No rows with lung_cancer == 1 found in {SOURCE_XLSX!r}")

# Separate columns by type
float_cols = orig.select_dtypes(include=["float64"]).columns.tolist()
int_cols = orig.select_dtypes(include=["int64"]).columns.tolist()

# We will resample whole rows to preserve correlations,
# then add small jitter to continuous columns.
new = orig.sample(n=N_NEW, replace=True, random_state=SEED).reset_index(drop=True)

# Dedicated, seeded generator so the jitter is reproducible on Restart & Run All
# (the global np.random state is unseeded and would give a different file each run).
rng = np.random.default_rng(SEED)

# Add slight noise to float columns to avoid perfect duplicates
# NOTE(review): every float64 column is jittered, including any categorical flag that
# pandas loaded as float (e.g. because of missing values) — confirm that is intended.
for col in float_cols:
    std = orig[col].std()
    # Constant columns (std == 0) and all-NaN columns (std is NaN) are left untouched.
    if std > 0:
        low, high = orig[col].min(), orig[col].max()
        noise = rng.normal(0, JITTER_FRACTION * std, N_NEW)
        # Clip after rounding so jitter never produces values outside the observed
        # range (e.g. a negative value for a non-negative measurement).
        new[col] = (new[col] + noise).round(ROUND_DECIMALS).clip(low, high)

# Ensure integer columns stay integers and within observed bounds.
# Resampled rows already satisfy this, so it acts as a safeguard; casting back to the
# column's original dtype avoids the platform-dependent width of plain `int`.
for col in int_cols:
    low, high = orig[col].min(), orig[col].max()
    new[col] = new[col].clip(low, high).round().astype(orig[col].dtype)

# Give unique patient IDs continuing from your max pid.
# The max is taken over the unfiltered data so the new IDs cannot collide with
# patients that were dropped by the lung_cancer filter above.
max_pid = raw["pid"].max()
new["pid"] = np.arange(max_pid + 1, max_pid + 1 + N_NEW)

# Save
new.to_excel(OUTPUT_XLSX, index=False)
print(f"Saved {OUTPUT_XLSX}")


Saved Synthetic_LungCancer_next8k.xlsx


In [None]:
!pip install sdv


Collecting sdv
  Downloading sdv-1.27.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.40.35-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.40.35-py3-none-any.whl.metadata (5.7 kB)
Collecting cloudpickle>=2.1.0 (from sdv)
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting graphviz>=0.13.2 (from sdv)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.3-py3-none-any.whl.metadata (9.5 kB)
Collecting ctgan>=0.11.0 (from sdv)
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.17.0 (from sdv)
  Downloading rdt-1.18.1-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.23.0-py3-none-any.whl.metadata (9.4 kB)
Coll