In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import StratifiedKFold, cross_val_score

In [2]:

embeddings_df = pd.read_csv("out/embeddings_processed.csv")
structured_df = pd.read_csv("out/data.csv")

embeddings_df = embeddings_df['embedding'].str.split(',', expand=True)

# convert all columns to float
embeddings_df = embeddings_df.astype(float)

# rename columns like 'emb_0', 'emb_1', ...
embeddings_df.columns = [f'emb_{i}' for i in range(embeddings_df.shape[1])]

data = pd.concat([embeddings_df, structured_df], axis=1)

data = data.drop(columns=["cases.submitter_id"])

# Step 3: Initial exploration
print("Initial data shape:", data.shape)
print(data.head())
print(data.info())
print(data['OS'].value_counts())

print(data.shape)


Initial data shape: (658, 788)
      emb_0     emb_1     emb_2     emb_3     emb_4     emb_5     emb_6  \
0 -0.372847  1.175885 -0.136382  0.584974 -0.075458  0.242736 -0.091984   
1 -0.356913  0.706686  0.037918  0.460797 -0.655461 -0.034329 -0.260771   
2 -0.650745  0.622192 -0.541576 -0.147439 -0.688726  0.005337  0.160113   
3 -0.243454  0.907251 -0.278341  0.578471 -0.371568  0.370125 -0.096557   
4 -0.501727  0.816661 -0.331489  0.102610 -0.754066  0.199273  0.021237   

      emb_7     emb_8     emb_9  ...  diagnoses.laterality  \
0  0.850612  1.272928 -0.353911  ...                  Left   
1  0.986547  1.546444 -0.446621  ...                 Right   
2  0.956877  0.844839 -0.026274  ...                 Right   
3  0.984593  1.147140 -0.293077  ...                 Right   
4  0.814512  1.281691 -0.193750  ...                 Right   

   diagnoses.morphology  diagnoses.prior_malignancy  \
0                8140/3                         yes   
1                8140/3            

In [3]:
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].astype(str)

In [4]:
num_cols = data.select_dtypes(include=[np.number]).columns
cat_cols = data.select_dtypes(include=['object']).columns

In [5]:
num_imputer = SimpleImputer(strategy='median')
for col in num_cols:
    if pd.api.types.is_numeric_dtype(data[col]):
        data[col] = data[col].astype(float)
        data[col] = num_imputer.fit_transform(data[[col]])

In [6]:
if len(cat_cols) > 0:
    cat_imputer = SimpleImputer(strategy='most_frequent')
    data[cat_cols] = pd.DataFrame(
        cat_imputer.fit_transform(data[cat_cols]),
        columns=cat_cols,
        index=data.index
    )


In [7]:
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    le_dict[col] = le


In [8]:
X = data.drop('OS', axis=1)
y = data['OS']

# --- 1. Identify feature groups ---
# All columns starting with 'emb_' are embedding features
embedding_cols = [col for col in X.columns if col.startswith('emb_')]
# All other columns are structured data features (now label-encoded and imputed)
data_cols = [col for col in X.columns if not col.startswith('emb_')]

print(f"Number of embedding features: {len(embedding_cols)}")
print(f"Number of structured data features: {len(data_cols)}")

# --- 2. Define Pipelines ---

# Pipeline 1: For Embeddings (Scale -> PCA)
pca_pipeline = Pipeline(steps=[
    # It is essential to scale the data before applying PCA
    ('scaler', StandardScaler()),
    # Reduce the embeddings to 20 components
    ('pca', PCA(n_components=20))
])

# Pipeline 2: For Structured Data (Only Scale)
# We scale the structured data features to ensure they are on the same magnitude
# as the new PCA components, improving model performance.
structured_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
])


# --- 3. Create the ColumnTransformer (Combined Data) ---
preprocessor_combined = ColumnTransformer(
    transformers=[
        # Apply the PCA pipeline to the embedding columns
        ('pca_transform', pca_pipeline, embedding_cols),
        # Apply the scaling pipeline to the structured data columns
        ('structured_scale', structured_pipeline, data_cols)
    ],
    remainder='drop', # Drop any columns not explicitly named
    n_jobs=-1
)

# --- 4. Apply the Transformation (Combined Data) ---
X_combined = preprocessor_combined.fit_transform(X)

print(f"New total feature count: {X_combined.shape[1]} (20 PCA components + {len(data_cols)} scaled features)")


Number of embedding features: 768
Number of structured data features: 19
New total feature count: 39 (20 PCA components + 19 scaled features)


In [9]:
# === SCENARIO 1: JUST STRUCTURED DATA ===

# 1. Create a ColumnTransformer that isolates structured data
preprocessor_structured = ColumnTransformer(
    transformers=[
        # Apply scaling to structured features
        ('structured_scale', structured_pipeline, data_cols),
        # DROP embedding features
        ('drop_embeddings', 'drop', embedding_cols)
    ],
    remainder='drop', 
    n_jobs=-1
)

# 2. Apply the Transformation
X_structured_only = preprocessor_structured.fit_transform(X)

print("\n--- SCENARIO 1: Structured Data Only ---")
print(f"Total features: {X_structured_only.shape[1]}")

# 3. Run Cross-Validation on Structured Data Only
model = RandomForestClassifier(
    n_estimators=100, 
    random_state=42, 
    class_weight='balanced', 
    n_jobs=-1
)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = 'roc_auc'

cv_scores_structured = cross_val_score(
    estimator=model, 
    X=X_structured_only, 
    y=y, 
    cv=skf,             
    scoring=scoring,    
    n_jobs=-1           
)

print(f"Mean CV Score (Structured Only): {cv_scores_structured.mean():.4f}")
print(f"Std Dev CV Score: {cv_scores_structured.std():.4f}")


--- SCENARIO 1: Structured Data Only ---
Total features: 19
Mean CV Score (Structured Only): 0.6964
Std Dev CV Score: 0.0446


In [10]:
# === SCENARIO 2: JUST EMBEDDING DATA ===

# 1. Create a ColumnTransformer that isolates embedding data (with PCA)
preprocessor_embedding = ColumnTransformer(
    transformers=[
        # Apply scaling and PCA to embedding features
        ('pca_transform', pca_pipeline, embedding_cols),
        # DROP structured features
        ('drop_structured', 'drop', data_cols)
    ],
    remainder='drop', 
    n_jobs=-1
)

# 2. Apply the Transformation
X_embedding_only = preprocessor_embedding.fit_transform(X)

print("\n--- SCENARIO 2: Embedding Data Only ---")
print(f"Total features: {X_embedding_only.shape[1]}")

# 3. Run Cross-Validation on Embedding Data Only
model = RandomForestClassifier(
    n_estimators=100, 
    random_state=42, 
    class_weight='balanced', 
    n_jobs=-1
)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = 'roc_auc'

cv_scores_embedding = cross_val_score(
    estimator=model, 
    X=X_embedding_only, 
    y=y, 
    cv=skf,             
    scoring=scoring,    
    n_jobs=-1           
)

print(f"Mean CV Score (Embedding Only): {cv_scores_embedding.mean():.4f}")
print(f"Std Dev CV Score: {cv_scores_embedding.std():.4f}")


--- SCENARIO 2: Embedding Data Only ---
Total features: 20
Mean CV Score (Embedding Only): 0.5214
Std Dev CV Score: 0.0195


In [11]:
# === SCENARIO 3: COMBINED DATA (Baseline) ===
# This cell uses the X_combined variable calculated in cell 21.

# 1. Define the Model
model = RandomForestClassifier(
    n_estimators=100, 
    random_state=42, 
    class_weight='balanced', 
    n_jobs=-1
)

# 2. Define the Cross-Validation Strategy
skf = StratifiedKFold(
    n_splits=5,         
    shuffle=True,       
    random_state=42     
)

# 3. Define the Metric
scoring = 'roc_auc'

# 4. Perform Stratified Cross-Validation
print(f"Starting Stratified {skf.n_splits}-Fold Cross-Validation...")

cv_scores = cross_val_score(
    estimator=model, 
    X=X_combined, 
    y=y, 
    cv=skf,             
    scoring=scoring,    
    n_jobs=-1           
)

# 5. Output the Results
print(f"\nStratified {skf.n_splits}-Fold Cross-Validation ({scoring} Scores):")
print(cv_scores)
print(f"\nMean CV Score (Combined Data): {cv_scores.mean():.4f}")
print(f"Standard Deviation of CV Score: {cv_scores.std():.4f}")

Starting Stratified 5-Fold Cross-Validation...

Stratified 5-Fold Cross-Validation (roc_auc Scores):
[0.67716942 0.63106921 0.66567665 0.65050211 0.73471787]

Mean CV Score (Combined Data): 0.6718
Standard Deviation of CV Score: 0.0350
