In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import xgboost as xgb

# Step 1: Load merged data
df = pd.read_csv('merged_radiomics_metadata.csv')

# Step 2: Define features (X) and create quantile-based classes from "OS"
if "OS" not in df.columns:
    raise ValueError("Target column 'OS' not found in the dataset.")

df = df[df["OS"].notna()]  # Drop rows with missing OS
# Create 4 quantile-based classes for OS
quantile_classes, bins = pd.qcut(df["OS"], q=4, labels=[0, 1, 2, 3], retbins=True)
y = quantile_classes.astype(int)

# Print class info
print("Survival classes created based on OS quantiles:")
for i in range(4):
    print(f"Class {i}: OS in range ({bins[i]:.2f}, {bins[i+1]:.2f}]")

# Drop target and irrelevant columns from features
X = df.drop(columns=["OS", "Survival_Category", "1-dead 0-alive"], errors="ignore")

# Step 3: Encode categorical features
categorical_cols = X.select_dtypes(include=["object"]).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Step 4: Handle missing values
imputer = SimpleImputer(strategy="median")
X_imputed = imputer.fit_transform(X)

# Step 5: Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Step 6: Apply LDA for dimensionality reduction
lda = LinearDiscriminantAnalysis(n_components=None)  # Auto: will reduce to n_classes - 1 (3 for 4 classes)
X_lda = lda.fit_transform(X_scaled, y)
print(f"\nLDA-reduced feature shape: {X_lda.shape}")

# Step 7: Apply SMOTE to balance classes
print("\nClass distribution before SMOTE:\n", y.value_counts())
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_lda, y)
print("Class distribution after SMOTE:\n", pd.Series(y_smote).value_counts())

# Step 8: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_smote, y_smote, test_size=0.3, stratify=y_smote, random_state=42
)

# Step 9: Define base models
rf = RandomForestClassifier(n_estimators=15, max_depth=8, random_state=42)
xgb_clf = xgb.XGBClassifier(
    n_estimators=10, max_depth=3, learning_rate=0.01,
    subsample=0.8, colsample_bytree=0.8, use_label_encoder=False,
    eval_metric='mlogloss', random_state=42
)
logreg = LogisticRegression(max_iter=500)

# Step 10: Define ensemble models
voting = VotingClassifier(estimators=[
    ("rf", rf), ("xgb", xgb_clf), ("logreg", logreg)
], voting="hard")

stacking = StackingClassifier(
    estimators=[("rf", rf), ("xgb", xgb_clf), ("logreg", logreg)],
    final_estimator=LogisticRegression(max_iter=100)
)

# Step 11: Train and evaluate
print("\n🔧 Training Voting Classifier...")
voting.fit(X_train, y_train)
y_pred_voting = voting.predict(X_test)
print("\nVoting Classifier Report:")
print(classification_report(y_test, y_pred_voting))

print("\n🔧 Training Stacking Classifier...")
stacking.fit(X_train, y_train)
y_pred_stacking = stacking.predict(X_test)
print("\nStacking Classifier Report:")
print(classification_report(y_test, y_pred_stacking))


Survival classes created based on OS quantiles:
Class 0: OS in range (13.00, 181.75]
Class 1: OS in range (181.75, 414.00]
Class 2: OS in range (414.00, 812.75]
Class 3: OS in range (812.75, 2209.00]

LDA-reduced feature shape: (216, 3)

Class distribution before SMOTE:
 OS
3    54
0    54
2    54
1    54
Name: count, dtype: int64
Class distribution after SMOTE:
 OS
3    54
0    54
2    54
1    54
Name: count, dtype: int64

🔧 Training Voting Classifier...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Voting Classifier Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        16
           1       0.94      1.00      0.97        16
           2       1.00      0.82      0.90        17
           3       0.94      1.00      0.97        16

    accuracy                           0.95        65
   macro avg       0.96      0.96      0.95        65
weighted avg       0.96      0.95      0.95        65


🔧 Training Stacking Classifier...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Stacking Classifier Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        16
           1       0.94      1.00      0.97        16
           2       1.00      0.82      0.90        17
           3       0.94      1.00      0.97        16

    accuracy                           0.95        65
   macro avg       0.96      0.96      0.95        65
weighted avg       0.96      0.95      0.95        65

