# Imports

In [3]:
import sys
sys.path.append('..')

import os
import json
import pickle
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score

In [4]:
# Input locations: one CSV per split, all under DATA_DIR.
DATA_DIR = '../data'
TRAIN_DATA, VAL_DATA, TEST_DATA = (
    f'{DATA_DIR}/{split}.csv' for split in ('train', 'val', 'test')
)

# Columns passed as `num_features` (StandardScaler) in the expanded runs below.
NUM_FEATURES = [
    'BMI',
    'HDL cholesterol', 'LDL cholesterol', 'Total cholesterol', 'Triglycerides',
    'Diastolic blood pressure',
]

# Columns passed as `cat_features` (OneHotEncoder) in the expanded runs below.
# NOTE(review): 'Age' is one-hot encoded here but treated as numeric in the
# baseline run further down -- confirm this difference is intended.
CAT_FEATURES = [
    'Age', 'Sex',
    'Ever smoked',
    'Snoring', 'Insomnia', 'Daytime napping', 'Chronotype', 'Sleep duration',
]

# Target columns; one model pair (baseline / expanded) is trained per entry.
DISEASES = [
    'Asthma', 'Cataract', 'Diabetes', 'GERD', 'Hay-fever & Eczema',
    'Major depression', 'Myocardial infarction', 'Osteoarthritis',
    'Pneumonia', 'Stroke',
]

# Load data
DF_TRAIN, DF_VAL, DF_TEST = map(pd.read_csv, (TRAIN_DATA, VAL_DATA, TEST_DATA))

# All Diseases

In [8]:
def train_and_predict(X_train, y_train, X_val, y_val, num_features, cat_features, disease_name):
    """Fit a logistic-regression pipeline and report its scores on another split.

    A new pipeline is built and fitted on every call: columns in
    `num_features` are standardised, columns in `cat_features` are one-hot
    encoded (categories not seen while fitting are ignored), and the result is
    fed to a `LogisticRegression`.

    Args:
        X_train: DataFrame holding at least the `num_features` and
            `cat_features` columns; any other columns are not used.
        y_train: Training labels aligned with `X_train`.
        X_val: DataFrame to score; despite the name, callers in this notebook
            pass either the validation or the test split.
        y_val: Labels aligned with `X_val`.
        num_features: List of column names to standardise.
        cat_features: List of column names to one-hot encode (may be empty).
        disease_name: Label used only in the printed report.

    Returns:
        `(model, y_pred, auroc, auprc)`: the fitted pipeline, the second
        column of `predict_proba` for `X_val`, and the two metrics computed
        from it.
    """
    # Both lists are concatenated with `+`, so they must be real lists.
    feature_cols = num_features + cat_features

    column_prep = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ]
    )
    model = Pipeline([
        ('preprocessor', column_prep),
        ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
    ])
    model.fit(X_train[feature_cols], y_train)

    # Column 1 of predict_proba is the score for the second class in
    # model.classes_ (presumably the positive label -- labels look binary).
    y_pred = model.predict_proba(X_val[feature_cols])[:, 1]

    auroc = roc_auc_score(y_val, y_pred)
    auprc = average_precision_score(y_val, y_pred)

    print(
        f"Disease: {disease_name}",
        f"  AUROC: {auroc:.4f}",
        f"  AUPRC: {auprc:.4f}",
        sep="\n",
    )

    return model, y_pred, auroc, auprc


def save_scores(eids, y_true, y_scores, disease, feature_set, out_dir):
    """Write sample ids, labels and predicted scores to a JSON file.

    The file goes to ``<out_dir>/<feature_set>/rs_<trait>.json`` where
    ``<trait>`` is `disease` lower-cased with spaces replaced by hyphens.
    The directory is created if missing; an existing file is overwritten.

    Args:
        eids: Per-sample identifiers.
        y_true: Per-sample labels.
        y_scores: Per-sample predicted scores.
        disease: Name used to build the file stem.
        feature_set: Sub-directory of `out_dir` to write into.
        out_dir: Root output directory.

    `eids`, `y_true` and `y_scores` may be NumPy arrays, pandas objects or
    plain Python sequences. They are expected to be aligned element-wise; a
    warning is emitted if their lengths differ, because the resulting file
    cannot be interpreted row by row.
    """
    import warnings  # local import: only needed for the alignment warning

    def _as_list(values):
        # .tolist() (NumPy / pandas) also converts NumPy scalars to plain
        # Python numbers, which json can serialise; plain sequences lack it.
        return values.tolist() if hasattr(values, "tolist") else list(values)

    # Convert everything before touching the filesystem so a conversion
    # error cannot leave a truncated JSON file behind.
    payload = {
        "eids": _as_list(eids),
        "y_true": _as_list(y_true),
        "y_scores": _as_list(y_scores),
    }

    if all(isinstance(column, list) for column in payload.values()):
        lengths = {key: len(column) for key, column in payload.items()}
        if len(set(lengths.values())) > 1:
            warnings.warn(
                f"save_scores: inputs differ in length ({lengths}); "
                "ids, labels and scores will not line up",
                stacklevel=2,
            )

    target_dir = os.path.join(out_dir, feature_set)
    os.makedirs(target_dir, exist_ok=True)
    trait_name = disease.lower().replace(" ", "-")
    filename = f"rs_{trait_name}.json"
    full_path = os.path.join(target_dir, filename)

    with open(full_path, "w") as f:
        json.dump(payload, f)
    print(f"Scores saved successfully: {full_path}\n")

In [9]:
# Baseline features
baseline_features = ['Age', 'Sex', 'BMI']

# Expanded features
expanded_features = NUM_FEATURES + CAT_FEATURES

# Root directory for the saved predictions (also used by later cells)
out_dir = '../scores/logreg'

# One entry per model variant:
# (output sub-directory, report tag, numeric columns, categorical columns)
feature_configs = [
    ('baseline', 'Baseline', ['Age', 'BMI'], ['Sex']),
    ('expanded', 'Expanded', NUM_FEATURES, CAT_FEATURES),
]

# Train models, predict, and save predictions for each disease
for disease in DISEASES:
    print(f"Training models for {disease}")

    # The full frames are passed through; train_and_predict selects columns.
    X_train, y_train = DF_TRAIN, DF_TRAIN[disease]
    X_val, y_val = DF_VAL, DF_VAL[disease]
    X_test, y_test = DF_TEST, DF_TEST[disease]

    for feature_set, tag, num_features, cat_features in feature_configs:
        run_name = f"{disease} ({tag})"

        # Validation (metrics are printed by train_and_predict)
        model, pred, auroc, auprc = train_and_predict(
            X_train, y_train, X_val, y_val, num_features, cat_features, run_name
        )

        # Test: train_and_predict refits on the same training data, then
        # scores the test split; those scores are what gets saved.
        model, pred, auroc, auprc = train_and_predict(
            X_train, y_train, X_test, y_test, num_features, cat_features, run_name
        )
        save_scores(X_test['eid'].values, y_test.values, pred, disease, feature_set, out_dir)

Training models for Asthma
Disease: Asthma (Baseline)
  AUROC: 0.5522
  AUPRC: 0.1839
Disease: Asthma (Baseline)
  AUROC: 0.5612
  AUPRC: 0.1786
Scores saved successfully: ../scores/logreg/baseline/rs_asthma.json

Disease: Asthma (Expanded)
  AUROC: 0.5754
  AUPRC: 0.1940
Disease: Asthma (Expanded)
  AUROC: 0.5658
  AUPRC: 0.1866
Scores saved successfully: ../scores/logreg/expanded/rs_asthma.json

Training models for Cataract
Disease: Cataract (Baseline)
  AUROC: 0.7454
  AUPRC: 0.3371
Disease: Cataract (Baseline)
  AUROC: 0.7469
  AUPRC: 0.3353
Scores saved successfully: ../scores/logreg/baseline/rs_cataract.json

Disease: Cataract (Expanded)
  AUROC: 0.7479
  AUPRC: 0.3404
Disease: Cataract (Expanded)
  AUROC: 0.7487
  AUPRC: 0.3414
Scores saved successfully: ../scores/logreg/expanded/rs_cataract.json

Training models for Diabetes
Disease: Diabetes (Baseline)
  AUROC: 0.7501
  AUPRC: 0.2679
Disease: Diabetes (Baseline)
  AUROC: 0.7592
  AUPRC: 0.2722
Scores saved successfully: ../sco

# Asthma

In [10]:
def create_data(df, embeds, eids, labels):
    """Attach embedding columns to a tabular frame and align the labels.

    Args:
        df: Tabular DataFrame indexed by sample id.
        embeds: 2-D array with one row per entry of `eids`.
        eids: Sample ids for the rows of `embeds` and the entries of `labels`.
        labels: One label per entry of `eids`.

    Returns:
        `(merged_df, labels_array, embeds_cols)` where `merged_df` is the
        inner join of `df` with the embedding columns (so only ids present on
        both sides survive), `labels_array` holds the labels for the rows of
        `merged_df` in that same row order, and `embeds_cols` is the list of
        generated column names ``E_0 .. E_{d-1}``.

    The row order of the result is that of the join, not of `eids`; take
    ids from `merged_df.index` when they must line up with the labels.
    """
    embeds_cols = [f'E_{i}' for i in range(embeds.shape[1])]

    # One row per eid, one named column per embedding dimension.
    embed_frame = pd.DataFrame(embeds, index=eids)
    embed_frame.columns = embeds_cols
    embed_frame.index.name = 'eid'

    merged_df = df.join(embed_frame, how='inner')

    # Look the labels up by id so they follow merged_df's row order.
    label_by_eid = pd.Series(labels, index=eids, name='label')
    row_labels = label_by_eid.loc[merged_df.index]

    return merged_df, row_labels.values, embeds_cols

In [11]:
# ResNet embeddings: one pickle per split, each unpacked below as an
# (embeddings, labels, eids) triple.
TRAIN_EMBEDS, VAL_EMBEDS, TEST_EMBEDS = (
    f'{DATA_DIR}/{split}_spiro_embeds.pkl' for split in ('train', 'val', 'test')
)

# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open(TRAIN_EMBEDS, 'rb') as f:
    E_train, y_train, eids_train = pickle.load(f)

with open(VAL_EMBEDS, 'rb') as f:
    E_val, y_val, eids_val = pickle.load(f)

with open(TEST_EMBEDS, 'rb') as f:
    E_test, y_test, eids_test = pickle.load(f)

# Tabular data: keep only the model features, indexed by eid for the join.
tabular_cols = ['eid'] + NUM_FEATURES + CAT_FEATURES
df_train = DF_TRAIN[tabular_cols].set_index('eid')
df_val = DF_VAL[tabular_cols].set_index('eid')
df_test = DF_TEST[tabular_cols].set_index('eid')

# Merge tabular and embeddings. From here on y_* hold the labels re-aligned
# to the rows of X_* (they replace the raw label arrays loaded above).
X_train, y_train, embeds_cols = create_data(df_train, E_train, eids_train, y_train)
X_val, y_val, _ = create_data(df_val, E_val, eids_val, y_val)
X_test, y_test, _ = create_data(df_test, E_test, eids_test, y_test)

In [12]:
# Tabular only
num_features = NUM_FEATURES
cat_features = CAT_FEATURES

# Validation
model, pred, auroc, auprc = train_and_predict(X_train, y_train, X_val, y_val, num_features, cat_features, 'Asthma (Tabular)')

# Test
model, pred, auroc, auprc = train_and_predict(X_train, y_train, X_test, y_test, num_features, cat_features, 'Asthma (Tabular)')

# y_test and pred follow the rows of X_test (the inner join built by
# create_data), which need not match the order or length of the raw
# `eids_test` array -- so the ids are taken from X_test's 'eid' index.
# NOTE(review): 'tabular' is passed as `disease` and 'asthma' as `feature_set`
# so the file lands at <out_dir>/asthma/rs_tabular.json; kept as is.
save_scores(X_test.index.values, y_test, pred, disease='tabular', feature_set='asthma', out_dir=out_dir)

Disease: Asthma (Tabular)
  AUROC: 0.5681
  AUPRC: 0.1884
Disease: Asthma (Tabular)
  AUROC: 0.5699
  AUPRC: 0.1844
Scores saved successfully: ../scores/logreg/asthma/rs_tabular.json



In [13]:
# Spiro only
num_features = embeds_cols
cat_features = []

# Validation
model, pred, auroc, auprc = train_and_predict(X_train, y_train, X_val, y_val, num_features, cat_features, 'Asthma (Spiro)')

# Test
model, pred, auroc, auprc = train_and_predict(X_train, y_train, X_test, y_test, num_features, cat_features, 'Asthma (Spiro)')

# y_test and pred follow the rows of X_test (the inner join built by
# create_data), which need not match the order or length of the raw
# `eids_test` array -- so the ids are taken from X_test's 'eid' index.
# NOTE(review): 'spiro' is passed as `disease` and 'asthma' as `feature_set`
# so the file lands at <out_dir>/asthma/rs_spiro.json; kept as is.
save_scores(X_test.index.values, y_test, pred, disease='spiro', feature_set='asthma', out_dir=out_dir)

Disease: Asthma (Spiro)
  AUROC: 0.7241
  AUPRC: 0.3726
Disease: Asthma (Spiro)
  AUROC: 0.7273
  AUPRC: 0.3830
Scores saved successfully: ../scores/logreg/asthma/rs_spiro.json



In [14]:
# Tabular and Spiro
num_features = NUM_FEATURES + embeds_cols
cat_features = CAT_FEATURES

# Validation
model, pred, auroc, auprc = train_and_predict(X_train, y_train, X_val, y_val, num_features, cat_features, 'Asthma (Tabular + Spiro)')

# Test
model, pred, auroc, auprc = train_and_predict(X_train, y_train, X_test, y_test, num_features, cat_features, 'Asthma (Tabular + Spiro)')

# y_test and pred follow the rows of X_test (the inner join built by
# create_data), which need not match the order or length of the raw
# `eids_test` array -- so the ids are taken from X_test's 'eid' index.
# NOTE(review): 'tabular_spiro' is passed as `disease` and 'asthma' as
# `feature_set` so the file lands at <out_dir>/asthma/rs_tabular_spiro.json;
# kept as is.
save_scores(X_test.index.values, y_test, pred, disease='tabular_spiro', feature_set='asthma', out_dir=out_dir)

Disease: Asthma (Tabular + Spiro)
  AUROC: 0.7302
  AUPRC: 0.3831
Disease: Asthma (Tabular + Spiro)
  AUROC: 0.7403
  AUPRC: 0.4014
Scores saved successfully: ../scores/logreg/asthma/rs_tabular_spiro.json

