In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb
from sklearn.ensemble import StackingRegressor

# Load data
file_path = r"D:\mlpr data\Glioblastoma-ML-model\stackAndModel\merged_data.csv"
df = pd.read_csv(file_path)

# Clean target
df["Survival_from_surgery_days_UPDATED"] = pd.to_numeric(df["Survival_from_surgery_days_UPDATED"], errors="coerce")
df = df.dropna(subset=["Survival_from_surgery_days_UPDATED"])
y = df["Survival_from_surgery_days_UPDATED"]

# Drop unwanted columns and extract features
X = df.drop(columns=["PatientID", "Survival_from_surgery_days_UPDATED"])

# Encode categorical features
categorical_cols = X.select_dtypes(include=["object"]).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Handle missing values
X = X.apply(pd.to_numeric, errors="coerce").fillna(X.median())

# Save feature names for importance mapping
feature_names = X.columns.tolist()

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- LDA-based Feature Selection (requires classification labels) ---

# Create temporary binned survival categories
percentiles = np.percentile(y, [25, 50, 75])
bins = [0, percentiles[0], percentiles[1], percentiles[2], np.inf]
labels = [0, 1, 2, 3]
y_binned = pd.cut(y, bins=bins, labels=labels)

# Fit LDA to compute feature importance
lda_temp = LinearDiscriminantAnalysis()
lda_temp.fit(X_scaled, y_binned)
lda_importance = np.abs(lda_temp.coef_).sum(axis=0)

# Rank top 500 features
importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": lda_importance
}).sort_values(by="Importance", ascending=False)

top_features = importance_df.head(500)["Feature"].tolist()

# Select top features
X = X[top_features]
X_scaled = scaler.fit_transform(X)  # re-scale after reducing dimensions

# --- Regression Pipeline ---

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42)

# Define base regressors
rf_reg = RandomForestRegressor(n_estimators=150, random_state=42)
xgb_reg = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42)
lr_reg = LinearRegression()

# Stacking Regressor
stacking_reg = StackingRegressor(
    estimators=[
        ("RandomForest", rf_reg),
        ("XGBoost", xgb_reg),
        ("LinearReg", lr_reg)
    ],
    final_estimator=RandomForestRegressor()
)

# Train model
stacking_reg.fit(X_train, y_train)

# Predict and evaluate
y_pred = stacking_reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nRegression Performance using LDA-selected features:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R² Score: {r2:.2f}")

  df = pd.read_csv(file_path)



Regression Performance using LDA-selected features:
RMSE: 499.06
MAE: 328.77
R² Score: 0.00
