In [9]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from xgboost import XGBClassifier

The below block sets up the main settings for the model (data file, target column, split size, and columns to drop to avoid cheating) and defines how probabilities will be converted into Low, Medium, or High risk.

In [None]:
# Path to your dataset file
CSV_PATH = "Breast Cancer METABRIC.csv"

# This is the column the model will try to predict
TARGET_COL = "Risk_Level"

# Makes results reproducible (same split every run)
RANDOM_STATE = 42

# 20% of data will be used for testing
TEST_SIZE = 0.20

# # Remove columns that directly reveal survival outcome (prevents data leakage)
DROP_COLS_IF_PRESENT = ["Overall Survival Months"]


# Function to convert predicted probability into risk category
def to_risk_bucket(p):
    # If probability is low → Low Risk
    if p < 0.33:
        return "Low"
    # If probability is moderate → Medium Risk
    elif p < 0.66:
        return "Medium"
    # If probability is high → High Risk
    return "High"

In [18]:
# Load the dataset from the CSV file
df = pd.read_csv(CSV_PATH)

# Print all column names (helps verify exact column spelling)
print(df.columns.tolist())

# Remove duplicate patient records
df = df.drop_duplicates()

# Create 3 risk groups based on survival months
# q=3 splits the data into 3 equal-sized groups (quantiles)
df["Risk_Level"] = pd.qcut(
    df["Overall Survival (Months)"],  # column used to define risk
    q=3,                              # divide into 3 equal groups
    labels=["High", "Medium", "Low"]  # shortest survival → High risk
)

# Print how many patients fall into each risk category
print("Risk level distribution:")
print(df["Risk_Level"].value_counts())

['Patient ID', 'Age at Diagnosis', 'Type of Breast Surgery', 'Cancer Type', 'Cancer Type Detailed', 'Cellularity', 'Chemotherapy', 'Pam50 + Claudin-low subtype', 'Cohort', 'ER status measured by IHC', 'ER Status', 'Neoplasm Histologic Grade', 'HER2 status measured by SNP6', 'HER2 Status', 'Tumor Other Histologic Subtype', 'Hormone Therapy', 'Inferred Menopausal State', 'Integrative Cluster', 'Primary Tumor Laterality', 'Lymph nodes examined positive', 'Mutation Count', 'Nottingham prognostic index', 'Oncotree Code', 'Overall Survival (Months)', 'Overall Survival Status', 'PR Status', 'Radio Therapy', 'Relapse Free Status (Months)', 'Relapse Free Status', 'Sex', '3-Gene classifier subtype', 'Tumor Size', 'Tumor Stage', "Patient's Vital Status"]
Risk level distribution:
Risk_Level
Medium    661
High      660
Low       660
Name: count, dtype: int64


In [None]:
# Remove survival-related columns so the model cannot cheat
# (We already used survival months to create Risk_Level)
df = df.drop(columns=["Overall Survival (Months)", "Overall Survival Status"])

# Drop any additional leakage columns if they exist in the dataset
df = df.drop(columns=[c for c in DROP_COLS_IF_PRESENT if c in df.columns], errors="ignore")

# Remove rows where the target (Risk_Level) is missing
df = df.dropna(subset=[TARGET_COL]).copy()

In [20]:
# If target isn't numeric 0/1, map it
# if df[TARGET_COL].dtype == "object":
#     classes = sorted(df[TARGET_COL].dropna().unique())
#     if len(classes) != 2:
#         raise ValueError(f"Target is not binary. Found: {classes}")
#     mapping = {classes[0]: 0, classes[1]: 1}
#     df[TARGET_COL] = df[TARGET_COL].map(mapping)


# Set target column to Risk_Level (Low/Medium/High)
TARGET_COL = "Risk_Level"

# Ensure there are no missing risk labels
df = df.dropna(subset=[TARGET_COL]).copy()

# Convert risk labels into numeric classes for XGBoost
# Low = 0, Medium = 1, High = 2
df[TARGET_COL] = df[TARGET_COL].map({
    "Low": 0,
    "Medium": 1,
    "High": 2
}).astype(int)


# Feature engineering: create nonlinear age feature
# Helps model capture nonlinear age-risk relationships
if "Age at Diagnosis" in df.columns:
    df["Age_Squared"] = df["Age at Diagnosis"] ** 2

Split the data into training and testing sets, preprocess the features (handle missing values and convert text to numbers), and train an XGBoost model to predict the risk category.

Then you evaluate how well the model performs by checking its accuracy, detailed classification metrics, and confusion matrix on unseen test data.

In [22]:

# SPLIT
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# PREPROCESS

num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X_train.columns if c not in num_cols]

preprocess = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]), cat_cols),
    ],
    remainder="drop"
)

# MODEL (XGBoost)

xgb = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=RANDOM_STATE,
    n_jobs=-1
)

model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", xgb)
])

model.fit(X_train, y_train)

# EVALUATE (binary)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))



Accuracy: 0.9093198992443325

Classification report:
               precision    recall  f1-score   support

           0       0.96      0.89      0.92       132
           1       0.86      0.91      0.89       133
           2       0.91      0.93      0.92       132

    accuracy                           0.91       397
   macro avg       0.91      0.91      0.91       397
weighted avg       0.91      0.91      0.91       397


Confusion matrix:
 [[117  11   4]
 [  4 121   8]
 [  1   8 123]]


In [None]:
# RISK BUCKETS (Low/Med/High)
proba_class1 = model.predict_proba(X_test)[:, 1]
risk_bucket = [to_risk_bucket(p) for p in proba_class1]

out = X_test.copy()
out["y_true"] = y_test.values
out["pred_prob_class1"] = proba_class1
out["risk_bucket"] = risk_bucket

out_path = "risk_predictions_test.csv"
out.to_csv(out_path, index=False)

print("\nSaved:", out_path)
print("\nRisk bucket counts:\n", pd.Series(risk_bucket).value_counts())


Saved: risk_predictions_test.csv

Risk bucket counts:
 Low       250
High      133
Medium     14
Name: count, dtype: int64


In [24]:
df = pd.read_csv("risk_predictions_test.csv")

In [30]:
df["risk_bucket"].head()

0     Low
1     Low
2    High
3     Low
4     Low
Name: risk_bucket, dtype: object