In [44]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index

In [45]:
def stratified_concordance_index(y_true, y_pred, race, event_observed=None):
    """
    Calculates the stratified concordance index: the mean of the per-race
    concordance indices minus their standard deviation.

    Inputs are matched by position, not by index label, so the arguments may
    carry different pandas indexes as long as they are in the same row order.

    Args:
        y_true (pd.Series): True survival times.
        y_pred (pd.Series): Predicted risk scores (higher = higher risk,
            i.e. shorter expected survival).
        race (pd.Series): Race of each patient.
        event_observed (pd.Series, optional): Event indicator per patient
            (truthy = event observed, falsy = censored). When omitted, it is
            passed on as None, as in the original three-argument call.

    Returns:
        float: Stratified concordance index.

    Raises:
        ValueError: If the inputs do not all have the same length.
    """
    times = np.asarray(y_true, dtype=float).ravel()
    risks = np.asarray(y_pred, dtype=float).ravel()
    groups = pd.Series(np.asarray(race).ravel())
    events = None if event_observed is None else np.asarray(event_observed).ravel()

    lengths = {len(times), len(risks), len(groups)}
    if events is not None:
        lengths.add(len(events))
    if len(lengths) != 1:
        raise ValueError("y_true, y_pred, race and event_observed must have the same length")

    c_indices = []
    for r in groups.unique():
        mask = (groups == r).to_numpy()
        # concordance_index expects scores that grow with survival time, so
        # risk scores (which grow with hazard) have to be negated.
        c_index = concordance_index(
            times[mask],
            -risks[mask],
            event_observed=None if events is None else events[mask],
        )
        c_indices.append(c_index)
    return np.mean(c_indices) - np.std(c_indices)


In [46]:
def train_model(train_df):
    """
    Trains a Cox proportional hazards survival model.

    Preprocessing is done on a copy, so the caller's DataFrame is left
    unmodified and the function can be re-run on the same raw frame.

    Args:
        train_df (pd.DataFrame): Training data. Must contain 'race_group',
            'age_at_hct', 'donor_age', 'comorbidity_score',
            'karnofsky_score', 'efs_time' and 'efs'.

    Returns:
        tuple: (model, label_encoders, features, scaler) where
            model is the fitted CoxPHFitter,
            label_encoders maps each categorical feature name to its fitted
            LabelEncoder,
            features is the list of column names fed to the model, and
            scaler is the StandardScaler fitted on the numerical features.
    """
    # Never write into the caller's frame.
    train_df = train_df.copy()

    # Preprocess categorical features
    categorical_features = ['race_group']
    label_encoders = {}
    for feature in categorical_features:
        label_encoder = LabelEncoder()
        train_df[feature + '_encoded'] = label_encoder.fit_transform(train_df[feature])
        label_encoders[feature] = label_encoder

    # Preprocess numerical features: coerce non-numeric entries to NaN, then
    # replace NaN with the column median.
    numerical_features = ['age_at_hct', 'donor_age', 'comorbidity_score', 'karnofsky_score']
    numeric = train_df[numerical_features].apply(pd.to_numeric, errors='coerce')
    numeric = numeric.fillna(numeric.median())

    scaler = StandardScaler()
    scaled = np.asarray(scaler.fit_transform(numeric))
    # Column-by-column assignment replaces each column outright, so the
    # result dtype does not depend on the original (possibly object) dtype.
    for position, feature in enumerate(numerical_features):
        train_df[feature] = scaled[:, position]

    # Define features and target
    features = [feature + '_encoded' for feature in categorical_features] + numerical_features
    time_col = 'efs_time'
    event_col = 'efs'

    # Every model input is numeric at this point; cast to float for the fit.
    model_frame = train_df[features + [time_col, event_col]].astype(
        {feature: float for feature in features}
    )

    # Train Cox Proportional Hazards model
    cph = CoxPHFitter()
    cph.fit(model_frame, duration_col=time_col, event_col=event_col)

    return cph, label_encoders, features, scaler

In [47]:
def predict_risk(model, test_df, label_encoders, features, scaler):
    """
    Predicts risk scores for the test data.

    Preprocessing is done on a copy of ``test_df``, so the caller's frame is
    not modified and repeated calls on the same frame give the same result.

    Args:
        model: Trained model exposing ``predict_partial_hazard``.
        test_df (pd.DataFrame): Test data. Must contain 'ID', 'race_group'
            and the four numerical feature columns.
        label_encoders: Label encoders for categorical features, keyed by
            feature name.
        features: List of features used in training.
        scaler: Scaler for numerical features (already fitted).

    Returns:
        pd.Series: Predicted risk scores, indexed by the 'ID' column.
    """
    # Scaling the caller's frame in place would make a second call re-scale
    # already scaled columns, so work on a private copy.
    test_df = test_df.copy()

    # Preprocess categorical features
    categorical_features = ['race_group']
    for feature in categorical_features:
        test_df[feature + '_encoded'] = label_encoders[feature].transform(test_df[feature])

    # Preprocess numerical features: coerce non-numeric entries to NaN.
    numerical_features = ['age_at_hct', 'donor_age', 'comorbidity_score', 'karnofsky_score']
    numeric = test_df[numerical_features].apply(pd.to_numeric, errors='coerce')

    # Per-feature training means, if the scaler exposes them; used only when
    # a column has no usable value at all.
    # NOTE(review): assumes the scaler was fitted on these columns in this
    # order, as train_model does — confirm if the scaler comes from elsewhere.
    train_means = getattr(scaler, 'mean_', None)
    if train_means is not None and len(train_means) != len(numerical_features):
        train_means = None

    # Fill NaN values with the column median, handling all-missing columns
    for position, feature in enumerate(numerical_features):
        fill_value = numeric[feature].median()
        if pd.isna(fill_value):
            # A raw 0 would turn into an extreme value after standardizing;
            # the training mean standardizes to 0 instead.
            fill_value = train_means[position] if train_means is not None else 0
        numeric[feature] = numeric[feature].fillna(fill_value)

    scaled = np.asarray(scaler.transform(numeric))
    for position, feature in enumerate(numerical_features):
        test_df[feature] = scaled[:, position]

    risk_scores = model.predict_partial_hazard(test_df[features])
    risk_scores.index = test_df['ID'] # Set the index to the ID column
    return risk_scores


In [48]:
def evaluate_model(model, test_df, label_encoders, features, scaler):
    """
    Evaluates the model using stratified concordance index.

    Args:
        model: Trained model.
        test_df (pd.DataFrame): Test data. Must also contain the 'efs_time'
            and 'race_group' columns used for scoring.
        label_encoders: Label encoders for categorical features.
        features: List of features used in training.
        scaler: Scaler for numerical features.

    Returns:
        float: Stratified concordance index.
    """
    risk_scores = predict_risk(model, test_df, label_encoders, features, scaler)

    # predict_risk returns its scores indexed by 'ID', whereas the columns
    # passed below keep test_df's own index. The rows are in the same order,
    # so re-attach test_df's index to keep all three inputs aligned.
    aligned_scores = pd.Series(
        np.asarray(risk_scores, dtype=float).ravel(),
        index=test_df.index,
    )

    stratified_c_index = stratified_concordance_index(
        test_df['efs_time'], aligned_scores, test_df['race_group']
    )
    return stratified_c_index

In [49]:
def main(train_path="train.csv", test_path="test.csv", submission_path="submission.csv"):
    """
    Trains the survival model, scores the test set and writes a submission file.

    Args:
        train_path (str): CSV file with the training data.
        test_path (str): CSV file with the rows to score.
        submission_path (str): Destination CSV; receives the columns 'ID'
            and 'prediction'.
    """
    # Load data
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Train model
    model, label_encoders, features, scaler = train_model(train_df)

    # evaluate_model() is intentionally not called here: it reads an
    # 'efs_time' column from the frame it scores.
    # NOTE(review): presumably the test file has no outcome columns — confirm
    # before enabling an evaluation step on it.

    # Predict risk scores
    risk_scores = predict_risk(model, test_df, label_encoders, features, scaler)

    # Create submission file (ravel() also accepts a single-column frame)
    submission_df = pd.DataFrame({
        'ID': risk_scores.index,
        'prediction': np.asarray(risk_scores).ravel(),
    })
    submission_df.to_csv(submission_path, index=False)
    print(f"Submission file created: {submission_path}")


# Run the full pipeline only when this code is executed as the top-level
# program (not when it is imported as a module).
if __name__ == "__main__":
    main()

Submission file created: submission.csv
