In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sksurv.ensemble import RandomSurvivalForest
from sksurv.util import Surv
from sksurv.metrics import concordance_index_censored
from sklearn.model_selection import KFold
from scipy.stats import rankdata

# Load the pivoted training and evaluation tables.
df_train = pd.read_csv('../../data/train_pivot.csv')
df_test = pd.read_csv('../../data/eval_pivot.csv')

# Rows missing either survival target cannot be used for fitting or scoring.
target_cols = ['OS_YEARS', 'OS_STATUS']
df_train = df_train.dropna(subset=target_cols)

# Features are every remaining column except the targets and the identifier.
X = df_train.drop(columns=[*target_cols, 'ID'])
y_time = df_train['OS_YEARS']
y_event = df_train['OS_STATUS'].astype(bool)

# Give the evaluation features exactly the training column layout:
# columns unknown to training are dropped, columns missing from the
# evaluation file are created and filled with 0.
X_test = (
    df_test
    .drop(columns=['ID'], errors='ignore')
    .reindex(columns=X.columns, fill_value=0)
)

# Models

# Random survival forest: hyper-parameters kept in one place so they are
# easy to inspect or log next to the scores.
rsf_params = {
    "n_estimators": 1000,
    "min_samples_leaf": 15,
    "max_features": "sqrt",
    "n_jobs": -1,
    "random_state": 42,
}
rsf = RandomSurvivalForest(**rsf_params)

# Gradient-boosted Cox model. With the 'survival:cox' objective the label
# passed to fit() is the survival time, negated for censored samples.
xgb_params = {
    "objective": 'survival:cox',
    "n_estimators": 800,
    "learning_rate": 0.04,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "n_jobs": -1,
    "random_state": 42,
}
xgb_model = xgb.XGBRegressor(**xgb_params)

# Cross-validation + rank-average stacking
#
# Each fold fits both models on the training part, scores them (and their
# rank-averaged blend) on the held-out part with the concordance index, and
# adds that fold's test-set predictions to a running mean.

# Single source of truth for the fold count: it configures the splitter AND
# the averaging of the per-fold test predictions further down.
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
scores_rsf = []
scores_xgb = []
scores_stack = []

# Test-set predictions, accumulated as the mean over the per-fold models
final_preds_rsf = np.zeros(len(X_test))
final_preds_xgb = np.zeros(len(X_test))

# NOTE: plain KFold ignores the second argument of split(), so the folds are
# not stratified by event status.
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y_event)):
    # Split
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_time_tr, y_time_val = y_time.iloc[train_idx], y_time.iloc[val_idx]
    y_event_tr, y_event_val = y_event.iloc[train_idx], y_event.iloc[val_idx]

    # RSF: scikit-survival expects a structured (event, time) array as target
    y_tr_struct = Surv.from_arrays(event=y_event_tr, time=y_time_tr)
    rsf.fit(X_tr, y_tr_struct)
    pred_rsf_val = rsf.predict(X_val)
    score_rsf = concordance_index_censored(y_event_val, y_time_val, pred_rsf_val)[0]
    scores_rsf.append(score_rsf)

    # XGBoost Cox: the label is the survival time, negated for censored rows.
    # np.where builds a fresh array, so nothing relies on Series.values being
    # writable or on indexing a NumPy array with a pandas boolean Series.
    y_xgb_tr = np.where(y_event_tr, y_time_tr, -y_time_tr)
    y_xgb_val_target = np.where(y_event_val, y_time_val, -y_time_val)

    # No early stopping is configured here, so eval_set only records the
    # validation metric per boosting round.
    xgb_model.fit(
        X_tr, y_xgb_tr,
        eval_set=[(X_val, y_xgb_val_target)],
        verbose=False
    )

    pred_xgb_val = xgb_model.predict(X_val)
    score_xgb = concordance_index_censored(y_event_val, y_time_val, pred_xgb_val)[0]
    scores_xgb.append(score_xgb)

    # Stacking: the two models output scores on different scales, so blend
    # their ranks rather than their raw values.
    pred_stack_val = (rankdata(pred_rsf_val) + rankdata(pred_xgb_val)) / 2
    score_stack = concordance_index_censored(y_event_val, y_time_val, pred_stack_val)[0]
    scores_stack.append(score_stack)

    print(f"Fold {fold+1} | RSF: {score_rsf:.4f} | XGB: {score_xgb:.4f} | >> STACK: {score_stack:.4f}")

    # Each fold contributes an equal share of the final test prediction.
    final_preds_rsf += rsf.predict(X_test) / N_SPLITS
    final_preds_xgb += xgb_model.predict(X_test) / N_SPLITS

# Summarise cross-validation: mean concordance index over the folds for each
# base model and for the rank-averaged blend.
mean_score_rsf = np.mean(scores_rsf)
mean_score_xgb = np.mean(scores_xgb)
mean_score_stack = np.mean(scores_stack)

print("\n--- Mean Results ---")
print(f"RSF  Avg C-Index: {mean_score_rsf:.4f}")
print(f"XGB  Avg C-Index: {mean_score_xgb:.4f}")
print(f"STACK Avg C-Index: {mean_score_stack:.4f}")


# Final blend: rank each model's fold-averaged test predictions and take the
# mean rank as the submitted risk score.
test_ranks_rsf = rankdata(final_preds_rsf)
test_ranks_xgb = rankdata(final_preds_xgb)
final_risk_scores = (test_ranks_rsf + test_ranks_xgb) / 2

# One row per evaluation ID, written without the DataFrame index.
submission_columns = {
    'ID': df_test['ID'],
    'risk_score': final_risk_scores,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('../../submissions/submission_stacking_pivot.csv', index=False)