In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb
from sklearn.ensemble import StackingRegressor

# Load dataset
file_path = r"D:\mlpr data\Glioblastoma-ML-model\stackAndModel\merged_data.csv"
df = pd.read_csv(file_path)

# Clean target
df["Survival_from_surgery_days_UPDATED"] = pd.to_numeric(df["Survival_from_surgery_days_UPDATED"], errors="coerce")
df = df.dropna(subset=["Survival_from_surgery_days_UPDATED"])
y = df["Survival_from_surgery_days_UPDATED"]

# Drop ID and target from features
X = df.drop(columns=["PatientID", "Survival_from_surgery_days_UPDATED"])

# Encode categoricals
categorical_cols = X.select_dtypes(include=["object"]).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Handle missing values
X = X.apply(pd.to_numeric, errors="coerce").fillna(X.median())
feature_names = X.columns.tolist()

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --------- LDA-BASED FEATURE SELECTION ---------
# Step 1: Bin the target into 4 ordinal classes (for LDA)
percentiles = np.percentile(y, [25, 50, 75])
bins = [0, percentiles[0], percentiles[1], percentiles[2], np.inf]
labels = [0, 1, 2, 3]
y_binned = pd.cut(y, bins=bins, labels=labels)

# Step 2: Fit LDA and get feature importances
lda_temp = LinearDiscriminantAnalysis()
lda_temp.fit(X_scaled, y_binned)
lda_importance = np.abs(lda_temp.coef_).sum(axis=0)

# Step 3: Select top 500 features
importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": lda_importance
}).sort_values(by="Importance", ascending=False)

top_features = importance_df.head(500)["Feature"].tolist()
X = X[top_features]
X_scaled = scaler.fit_transform(X)  # re-scale after selecting top features

# --------- LDA TRANSFORMATION FOR FINAL INPUT ---------
# Use LDA to transform features (into n_classes - 1 = 3 components)
lda_final = LinearDiscriminantAnalysis(n_components=3)
X_lda = lda_final.fit_transform(X_scaled, y_binned)

# --------- REGRESSION ---------
X_train, X_test, y_train, y_test = train_test_split(X_lda, y, test_size=0.1, random_state=42)

# Define regressors
rf_reg = RandomForestRegressor(n_estimators=150, random_state=42)
xgb_reg = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42)
lr_reg = LinearRegression()

# Stacking Regressor
stacking_reg = StackingRegressor(
    estimators=[
        ("RandomForest", rf_reg),
        ("XGBoost", xgb_reg),
        ("LinearReg", lr_reg)
    ],
    final_estimator=RandomForestRegressor()
)

# Train
stacking_reg.fit(X_train, y_train)

# Predict and evaluate
y_pred = stacking_reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nRegression Performance on LDA-Transformed Features:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R² Score: {r2:.2f}")


  df = pd.read_csv(file_path)



Regression Performance on LDA-Transformed Features:
RMSE: 291.27
MAE: 166.78
R² Score: 0.66


In [5]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# Convert the target to a binary class for classification task
threshold = 1000  # You can change this threshold based on your understanding
y_binary = (y >= threshold).astype(int)

# Train-Test Split (now for classification)
X_train, X_test, y_train, y_test = train_test_split(X_lda, y_binary, test_size=0.1, random_state=42)

# Classifier models (use classification models instead of regression models)
rf_clf = RandomForestClassifier(n_estimators=150, random_state=42)
xgb_clf = xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42)
lr_clf = LogisticRegression()

# Stacking Classifier
stacking_clf = StackingClassifier(
    estimators=[
        ("RandomForest", rf_clf),
        ("XGBoost", xgb_clf),
        ("LogisticRegression", lr_clf)
    ],
    final_estimator=RandomForestClassifier()
)

# Train the stacking classifier
stacking_clf.fit(X_train, y_train)

# Predict probabilities (for AUC-ROC, we need probabilities)
y_prob = stacking_clf.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Calculate AUC-ROC score
from sklearn.metrics import roc_auc_score

auc_roc = roc_auc_score(y_test, y_prob)
print(f"AUC-ROC: {auc_roc:.2f}")


AUC-ROC: 0.93
