In [1]:
import pandas as pd
import numpy as np

In [2]:
# Fetch laboratory data
glu = pd.read_sas("GLU_L.xpt", format="xport")
hdl = pd.read_sas("HDL_L.xpt", format="xport")
biopro = pd.read_sas("BIOPRO_L.xpt", format="xport")
tchol = pd.read_sas("TCHOL_L.xpt", format="xport")
trigly = pd.read_sas("TRIGLY_L.xpt", format="xport")
tst = pd.read_sas("TST_L.xpt", format="xport")

In [3]:
# Fetch demographics data with required columns
demo = pd.read_sas("DEMO_L.xpt", format="xport")
demo = demo[['SEQN', 'RIAGENDR', 'RIDAGEYR', 'RIDRETH3',
             'DMDEDUC2', 'INDFMPIR', 'DMDMARTZ', 'DMDHHSIZ',
             'RIDEXPRG', 'RIDSTATR']]

# Keep only examined participants
demo = demo[demo['RIDSTATR'] == 2]

# Decode numerical codes for interpretability
demo['RIAGENDR'] = demo['RIAGENDR'].map({1: 'Male', 2: 'Female'})
demo['DMDMARTZ'] = demo['DMDMARTZ'].map({
    1: 'Married/Living with partner',
    2: 'Widowed/Divorced/Separated',
    3: 'Never married'
})
demo['RIDRETH3'] = demo['RIDRETH3'].map({
    1: 'Mexican American',
    2: 'Other Hispanic',
    3: 'Non-Hispanic White',
    4: 'Non-Hispanic Black',
    6: 'Non-Hispanic Asian',
    7: 'Other/Multi-Racial'
})

In [4]:
combined_trials = (
    glu[['SEQN','LBXGLU']]
    .merge(hdl[['SEQN','LBDHDD']], on='SEQN', how='inner')
    .merge(tchol[['SEQN','LBXTC']], on='SEQN', how='inner')
    .merge(trigly[['SEQN','LBXTLG','LBDLDL']], on='SEQN', how='inner')
    .merge(biopro[['SEQN','LBXSAL','LBXSAPSI','LBXSASSI','LBXSATSI','LBXSCR','LBXSUA']], on='SEQN', how='inner')
    .merge(tst[['SEQN','LBXTST']], on='SEQN', how='left')
)

In [5]:
final_df = combined_trials.merge(demo, on='SEQN', how='inner')

In [6]:
num_cols = final_df.select_dtypes(include=[np.number]).columns
final_df[num_cols] = final_df[num_cols].apply(lambda x: x.fillna(x.mean()))
final_df['RIAGENDR'] = final_df['RIAGENDR'].fillna('Unknown')
final_df['RIDRETH3'] = final_df['RIDRETH3'].fillna('Other/Multi-Racial')

final_df.isna().sum()

SEQN          0
LBXGLU        0
LBDHDD        0
LBXTC         0
LBXTLG        0
LBDLDL        0
LBXSAL        0
LBXSAPSI      0
LBXSASSI      0
LBXSATSI      0
LBXSCR        0
LBXSUA        0
LBXTST        0
RIAGENDR      0
RIDAGEYR      0
RIDRETH3      0
DMDEDUC2      0
INDFMPIR      0
DMDMARTZ    575
DMDHHSIZ      0
RIDEXPRG      0
RIDSTATR      0
dtype: int64

In [7]:
np.random.seed(42)

# Randomize patients into Drug vs Placebo arms
final_df['treatment_arm'] = np.random.choice(['Drug', 'Placebo'], size=len(final_df))

# Simulate baseline-adjusted treatment effect
# Define baseline biomarker influences on response
# Lower glucose, lower ALT (better liver), and younger age increase odds of response
prob = (
    0.4
    - 0.002 * final_df['LBXGLU']          # high glucose → lower odds
    - 0.001 * final_df['LBXSATSI']        # high ALT → lower odds
    + 0.005 * (final_df['RIAGENDR'] == 'Female')  # females slightly more likely
    + 0.003 * (final_df['RIDRETH3'] == 'Non-Hispanic White')
)

# Add a treatment boost for the Drug arm
prob += (final_df['treatment_arm'] == 'Drug') * 0.25

# Clip between 0–1 and sample binary outcomes
prob = np.clip(prob, 0.01, 0.95)
final_df['Responded'] = np.random.binomial(1, prob)


In [8]:
final_df

Unnamed: 0,SEQN,LBXGLU,LBDHDD,LBXTC,LBXTLG,LBDLDL,LBXSAL,LBXSAPSI,LBXSASSI,LBXSATSI,...,RIDAGEYR,RIDRETH3,DMDEDUC2,INDFMPIR,DMDMARTZ,DMDHHSIZ,RIDEXPRG,RIDSTATR,treatment_arm,Responded
0,130378.0,113.000000,45.000000,264.000000,153.000000,188.000000,4.300000,91.000000,25.000000,39.000000,...,43.0,Non-Hispanic Asian,5.0,5.00,Married/Living with partner,4.0,1.979719,2.0,Drug,0
1,130379.0,99.000000,60.000000,214.000000,86.000000,137.000000,3.900000,64.000000,20.000000,17.000000,...,66.0,Non-Hispanic White,5.0,5.00,Married/Living with partner,2.0,1.979719,2.0,Placebo,0
2,130380.0,156.000000,49.000000,187.000000,375.000000,63.000000,3.700000,78.000000,15.000000,13.000000,...,44.0,Other Hispanic,3.0,1.41,Married/Living with partner,7.0,2.000000,2.0,Drug,0
3,130386.0,100.000000,46.000000,183.000000,142.000000,109.000000,4.300000,54.000000,20.000000,34.000000,...,34.0,Mexican American,4.0,1.33,Married/Living with partner,3.0,1.979719,2.0,Drug,0
4,130394.0,88.000000,48.000000,183.000000,57.000000,124.000000,4.400000,69.000000,24.000000,28.000000,...,51.0,Non-Hispanic White,5.0,5.00,Married/Living with partner,4.0,1.979719,2.0,Drug,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3991,142301.0,110.000000,48.000000,138.000000,143.000000,61.000000,4.300000,61.000000,28.000000,16.000000,...,80.0,Non-Hispanic White,5.0,1.20,Widowed/Divorced/Separated,1.0,1.979719,2.0,Drug,1
3992,142303.0,160.000000,34.000000,110.000000,154.000000,45.000000,4.300000,72.000000,30.000000,27.000000,...,69.0,Other/Multi-Racial,3.0,0.98,Widowed/Divorced/Separated,2.0,1.979719,2.0,Placebo,0
3993,142305.0,132.000000,51.000000,180.000000,185.000000,92.000000,4.000000,64.000000,30.000000,18.000000,...,76.0,Mexican American,1.0,2.25,Widowed/Divorced/Separated,4.0,1.979719,2.0,Drug,0
3994,142308.0,107.884532,54.509306,183.126904,115.967302,105.697126,4.069198,91.992023,22.089245,20.994581,...,50.0,Other Hispanic,4.0,1.95,Married/Living with partner,3.0,1.979719,2.0,Placebo,0


In [9]:
final_df.to_csv("sample_dataset_with_outcome.csv", index=False)