# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE  # May not be needed for regression
import xgboost as xgb
from sklearn.ensemble import StackingRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  # Will be removed for regression

# Survival regression with a stacked ensemble.
#
# Pipeline: load the merged CSV, coerce the target to numeric, label-encode
# object columns, split into train/test, impute and standardize using
# statistics from the TRAINING rows only, fit a StackingRegressor
# (RandomForest + XGBoost + LinearRegression, RandomForest meta-learner),
# and report RMSE / MAE / R² on the held-out rows.

# Load dataset
file_path = r"D:\mlpr data\Glioblastoma-ML-model\stackAndModel\merged_data.csv"
df = pd.read_csv(file_path)

# Convert target column to numeric; unparseable entries become NaN
df["Survival_from_surgery_days_UPDATED"] = pd.to_numeric(df["Survival_from_surgery_days_UPDATED"], errors="coerce")

# Drop rows where target variable is NaN
df = df.dropna(subset=["Survival_from_surgery_days_UPDATED"])

# Separate features (X) and target (y)
X = df.drop(columns=["PatientID", "Survival_from_surgery_days_UPDATED"])
y = df["Survival_from_surgery_days_UPDATED"]

# Optional: filter features based on importance_df if available
# top_100_features = importance_df.head(420)['Feature'].tolist()
# X = X[top_100_features]

# Encode categorical columns. Values are cast to str first, so a missing
# value is encoded as its own "nan" category rather than left as NaN.
categorical_cols = X.select_dtypes(include=["object"]).columns
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Coerce every column to numeric BEFORE computing any statistics, so the
# medians below are taken over the same values that get imputed.
X = X.apply(pd.to_numeric, errors="coerce")

# Train-test split (90:10). Splitting before imputation/scaling keeps the
# held-out rows from influencing the preprocessing statistics. The row
# partition depends only on the sample count and random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Fill missing numeric values with the TRAINING median of each column.
# A column with no observed training value has a NaN median; fall back to
# 0.0 for it so no NaN reaches the estimators.
train_medians = X_train.median().fillna(0.0)
X = X.fillna(train_medians)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Standardize numeric features: fit on the training rows, apply to both.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full scaled matrix, kept under its original name in case later code
# (not visible here) relies on it.
X_scaled = scaler.transform(X)

# Define Base Regressors
rf_reg = RandomForestRegressor(n_estimators=150, random_state=42)
xgb_reg = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42)
lr_reg = LinearRegression()

# Stacking Regressor. The meta-learner is seeded like the base models so
# repeated runs give the same metrics.
stacking_reg = StackingRegressor(
    estimators=[
        ("RandomForest", rf_reg),
        ("XGBoost", xgb_reg),
        ("LinearReg", lr_reg),
    ],
    final_estimator=RandomForestRegressor(random_state=42),
)

# Train the model
stacking_reg.fit(X_train, y_train)

# Predict
y_pred = stacking_reg.predict(X_test)

# Evaluate on the held-out rows
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Stacking Regressor Performance:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R² Score: {r2:.2f}")


# [cell output residue — likely the source line echoed by a warning]: df = pd.read_csv(file_path)
