In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

# ---------------------------------------------------------
# 1. LOAD DATA
# ---------------------------------------------------------
# Load the dataset (adjust the path if necessary)
df = pd.read_csv('rotterdam.csv')

print(f"Original Dataset Shape: {df.shape}")

Original Dataset Shape: (2982, 16)


In [None]:
# ---------------------------------------------------------
# TASK 2: LONG-TERM MORTALITY PREDICTION
# ---------------------------------------------------------

# 1. TARGET ENGINEERING (Similar to Task 1, but for DEATH)
cutoff_days = 3650 # 10 Years

# Create Target: Did patient die within cutoff?
# Class 1: Death == 1 AND dtime <= cutoff
# Class 0: Alive at cutoff (dtime > cutoff)
# Exclude: Died after cutoff (Target=0) or Censored before cutoff (Unknown)

df['target_death'] = np.where((df['death'] == 1) & (df['dtime'] <= cutoff_days), 1, 0)

# Filter for valid rows (Known status at 5 years)
valid_mask_death = (df['target_death'] == 1) | (df['dtime'] > cutoff_days)
df_death = df[valid_mask_death].copy()

print(f"Task 2 Dataset: {df_death.shape}")
print(f"Death Rate (Balance): {df_death['target_death'].mean():.2%}")

# 2. FEATURE SELECTION (LASSO for Mortality)
# We re-run Lasso because predictors for Death might differ (e.g., Age might be more important)

X_d = df_death[['age', 'nodes', 'pgr', 'er', 'size', 'grade', 'meno', 'hormon', 'chemo']]
y_d = df_death['target_death']

# Preprocessing (Same as before)
# Note: We must redefine X/y specific to this task
preproc_death = ColumnTransformer([
    ('num', StandardScaler(), ['age', 'nodes', 'pgr', 'er']),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), ['size', 'grade', 'meno', 'hormon', 'chemo'])
])

# Lasso Pipeline
lasso_death = Pipeline([
    ('preprocessor', preproc_death),
    ('clf', LogisticRegressionCV(cv=5, penalty='l1', solver='liblinear', class_weight='balanced', max_iter=2000))
])

lasso_death.fit(X_d, y_d)

# 3. EXTRACT SELECTED FEATURES
model_d = lasso_death.named_steps['clf']
feat_names = ['age', 'nodes', 'pgr', 'er'] + list(lasso_death.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out())

# Show meaningful coeffs
coef_death = pd.DataFrame({'Feature': feat_names, 'Coeff': model_d.coef_[0]})
selected_death = coef_death[coef_death['Coeff'] != 0].sort_values(by='Coeff', ascending=False)

print("\n--- Features Selected for Mortality Prediction ---")
print(selected_death)

# 4. FINAL MODEL FOR REPORT (Statsmodels)
# Based on Lasso results, we build the inferential table.
# (This block assumes Lasso picks similar vars; adjust 'cols_final' based on the print above!)

# Example: Assuming Lasso picks nodes, age, grade, size (typical for this dataset)
# We manually construct the design matrix again
import statsmodels.api as sm

# Define features based on Lasso output (Update this list after running the cell above!)
# Usually: Age becomes stronger for death, Nodes remains strong.
cols_final = ['age', 'nodes', 'grade', 'size', 'pgr'] 

X_fin_d = df_death[cols_final].copy()

# Manual Encoding
if 'size' in cols_final:
    X_fin_d['size_small'] = (X_fin_d['size'] == '<=20').astype(int)
    X_fin_d = X_fin_d.drop('size', axis=1)
if 'grade' in cols_final:
    X_fin_d = pd.concat([X_fin_d, pd.get_dummies(X_fin_d['grade'], prefix='grade', drop_first=True)], axis=1)
    X_fin_d = X_fin_d.drop('grade', axis=1)

X_fin_d = sm.add_constant(X_fin_d)

# Ensure all predictors are numeric (statsmodels errors on object/boolean dtypes)
X_fin_d = X_fin_d.apply(pd.to_numeric)
bool_cols = X_fin_d.select_dtypes(include=['bool']).columns
if len(bool_cols):
    X_fin_d[bool_cols] = X_fin_d[bool_cols].astype(int)

y_fin_d = df_death['target_death']

# Fit
logit_death = sm.Logit(y_fin_d, X_fin_d).fit()

# Report Table
conf_d = logit_death.conf_int()
conf_d['Odds Ratio'] = logit_death.params
conf_d.columns = ['Lower CI', 'Upper CI', 'Odds Ratio']
print("\n=== TASK 2 (MORTALITY) FINAL TABLE ===")
print(np.exp(conf_d).round(3))
print(logit_death.summary())

Task 2 Dataset: (2837, 17)
Death Rate (Balance): 26.54%

--- Features Selected for Mortality Prediction ---
Empty DataFrame
Columns: [Feature, Coeff]
Index: []
Optimization terminated successfully.
         Current function value: 0.509632
         Iterations 6

=== TASK 2 (MORTALITY) FINAL TABLE ===
            Lower CI  Upper CI  Odds Ratio
const          0.104     0.256       0.163
age            1.003     1.017       1.010
nodes          1.119     1.167       1.143
pgr            0.999     0.999       0.999
size_small     0.421     0.623       0.512
grade_3        1.107     1.727       1.383
                           Logit Regression Results                           
Dep. Variable:           target_death   No. Observations:                 2837
Model:                          Logit   Df Residuals:                     2831
Method:                           MLE   Df Model:                            5
Date:                Sun, 11 Jan 2026   Pseudo R-squ.:                  0.1193
Ti