In [7]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('UCSF-PDGM-metadata_v2.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501 entries, 0 to 500
Data columns (total 16 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   PatientID                              501 non-null    object 
 1   Sex                                    501 non-null    object 
 2   Age at MRI                             501 non-null    int64  
 3   WHO CNS Grade                          501 non-null    int64  
 4   Final pathologic diagnosis (WHO 2021)  501 non-null    object 
 5   MGMT status                            421 non-null    object 
 6   MGMT index                             413 non-null    float64
 7   1p/19q                                 410 non-null    object 
 8   IDH                                    501 non-null    object 
 9   1-dead 0-alive                         501 non-null    int64  
 10  OS                                     500 non-null    float64
 11  EOR   

In [5]:
# Load merged cleaned dataset
df = pd.read_csv("radiomics_cleaned_merged.csv")  # Replace with your actual merged filename

In [8]:
# Keep only patients who are dead (status = 1) with valid OS values
df = df[df["1-dead 0-alive"] == 1]
df = df[df["OS"].notna()]
df["OS"] = pd.to_numeric(df["OS"], errors="coerce")
df = df[df["OS"] > 0]

# Quartile binning on OS
percentiles = np.percentile(df["OS"], [25, 50, 75])
bins = [0, percentiles[0], percentiles[1], percentiles[2], np.inf]
labels = [0, 1, 2, 3]
df["Survival_Category"] = pd.cut(df["OS"], bins=bins, labels=labels)

In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Drop identifiers and target columns
X = df.drop(columns=[
    "PatientID", "OS", "Survival_Category", "1-dead 0-alive",
    "BraTS21 ID", "BraTS21 Segmentation Cohort", "BraTS21 MGMT Cohort"
], errors="ignore")
y = df["Survival_Category"]

# Encode categorical variables
categorical_cols = X.select_dtypes(include=["object"]).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Impute and scale
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
X_imputed = imputer.fit_transform(X)
X_scaled = scaler.fit_transform(X_imputed)


In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(n_components=3)
X_lda = lda.fit_transform(X_scaled, y)

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

X_train, X_test, y_train, y_test = train_test_split(
    X_lda, y, test_size=0.1, stratify=y, random_state=42
)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)


In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb

# Base models
rf = RandomForestClassifier(n_estimators=150, random_state=42)
xgb_clf = xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42, use_label_encoder=False, eval_metric='mlogloss')
logreg = LogisticRegression(max_iter=1000)
lda_clf = LinearDiscriminantAnalysis()

# Voting and stacking
voting = VotingClassifier(estimators=[
    ("rf", rf), ("xgb", xgb_clf), ("logreg", logreg), ("lda", lda_clf)
], voting="hard")

stacking = StackingClassifier(
    estimators=[("rf", rf), ("xgb", xgb_clf), ("logreg", logreg), ("lda", lda_clf)],
    final_estimator=RandomForestClassifier()
)

In [21]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

print("Cross-validation scores:")
for name, model in [("Voting", voting), ("Stacking", stacking)]:
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    print(f"{name} Accuracy: {scores.mean():.3f} ± {scores.std():.3f}")


Cross-validation scores:


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Voting Accuracy: 0.670 ± 0.182


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Stacking Accuracy: 0.570 ± 0.179


In [19]:
from sklearn.metrics import classification_report

# Fit models
voting.fit(X_train, y_train)
stacking.fit(X_train, y_train)

# Predict
y_pred_voting = voting.predict(X_test)
y_pred_stacking = stacking.predict(X_test)

# Report
print("\nVoting Classifier Report:")
print(classification_report(y_test, y_pred_voting))

print("\nStacking Classifier Report:")
print(classification_report(y_test, y_pred_stacking))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Voting Classifier Report:
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.00      0.00      0.00         1
           2       0.33      1.00      0.50         1
           3       1.00      1.00      1.00         1

    accuracy                           0.60         5
   macro avg       0.58      0.62      0.54         5
weighted avg       0.67      0.60      0.57         5


Stacking Classifier Report:
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.00      0.00      0.00         1
           2       0.33      1.00      0.50         1
           3       1.00      1.00      1.00         1

    accuracy                           0.60         5
   macro avg       0.58      0.62      0.54         5
weighted avg       0.67      0.60      0.57         5



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
