In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer 


In [2]:
"""
To evaluate the equitable prediction of transplant survival outcomes,
we use the concordance index (C-index) between a series of event
times and a predicted score across each race group.
 
It represents the global assessment of the model discrimination power:
this is the model’s ability to correctly provide a reliable ranking
of the survival times based on the individual risk scores.
 
The concordance index is a value between 0 and 1 where:
 
0.5 is the expected result from random predictions,
1.0 is perfect concordance (with no censoring, otherwise <1.0),
0.0 is perfect anti-concordance (with no censoring, otherwise >0.0)

"""

import pandas as pd
import pandas.api.types
import numpy as np
from lifelines.utils import concordance_index

class ParticipantVisibleError(Exception):
    """Raised when a submission is malformed in a way that should be reported to its author."""


def calc_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str = 'ID') -> float:
    """Score predictions with a race-stratified concordance index.

    For every value of ``race_group`` a concordance index is computed with
    ``efs_time`` as the event time, the *negated* ``prediction`` as the
    predicted score and ``efs`` as the event indicator. The result is the mean
    of those per-group values minus their standard deviation (``ddof=0``).

    Args:
        solution: Ground truth; must contain ``row_id_column_name``, ``efs``,
            ``efs_time`` and ``race_group``.
        submission: Predictions; must contain ``row_id_column_name`` and
            ``prediction``. Every column other than the id must be numeric.
        row_id_column_name: Name of the id column, which is ignored for scoring.

    Returns:
        ``mean(c_index per group) - std(c_index per group)`` as a float.

    Raises:
        ParticipantVisibleError: If a submission column is not numeric.
        KeyError: If ``row_id_column_name`` is missing from either frame.

    Note:
        Neither input frame is modified. The two frames are paired by index
        label (``pd.concat(axis=1)``), not by the values of the id column.
    """
    # Work on id-less copies so the caller's frames keep their id column
    # (deleting the column in place would silently alter the caller's data).
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'
    for col in submission.columns:
        if not pandas.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')

    # Side-by-side join of truth and predictions, aligned on the index labels.
    merged_df = pd.concat([solution, submission], axis=1)

    metric_list = []
    for _, merged_df_race in merged_df.groupby('race_group'):
        # The prediction is negated before it is handed to concordance_index,
        # so a larger submitted value is scored as a shorter expected event time.
        c_index_race = concordance_index(
            merged_df_race[interval_label],
            -merged_df_race[prediction_label],
            merged_df_race[event_label])
        metric_list.append(c_index_race)

    # Subtracting the spread penalises models whose quality differs between groups.
    return float(np.mean(metric_list) - np.std(metric_list))

# row_id_column_name = "id"
# y_pred = {'prediction': {0: 1.0, 1: 0.0, 2: 1.0}}
# y_pred = pd.DataFrame(y_pred)
# y_pred.insert(0, row_id_column_name, range(len(y_pred)))
# y_true = { 'efs': {0: 1.0, 1: 0.0, 2: 0.0}, 'efs_time': {0: 25.1234,1: 250.1234,2: 2500.1234}, 'race_group': {0: 'race_group_1', 1: 'race_group_1', 2: 'race_group_1'}}
# y_true = pd.DataFrame(y_true)
# y_true.insert(0, row_id_column_name, range(len(y_true)))
# calc_score(y_true.copy(), y_pred.copy(), row_id_column_name)

In [3]:
def createSubmission(X_test, y_pred):
    """Assemble the submission frame with the columns ``ID`` and ``prediction``.

    Args:
        X_test: Object exposing an ``ID`` attribute (e.g. a DataFrame with an
            ``ID`` column) holding one id per predicted row.
        y_pred: Predicted values; must support unary minus.

    Returns:
        A DataFrame whose ``prediction`` column is ``-y_pred``. The sign is
        flipped here because ``calc_score`` in this notebook negates the
        prediction column again before computing the concordance index.
    """
    submission_columns = {}
    submission_columns['ID'] = X_test.ID
    submission_columns['prediction'] = -y_pred
    return pd.DataFrame(submission_columns)

In [4]:

# Load the data.
# The CSV location can be overridden with the CIBMTR_TRAIN_CSV environment
# variable so the notebook runs on other machines; when the variable is unset
# or empty, the original hard-coded path is used unchanged.
import os

TRAIN_CSV_PATH = os.environ.get('CIBMTR_TRAIN_CSV') or (
    '/home/augusto/projects/kaggle/cibmtr/input/equity-post-HCT-survival-predictions/train.csv'
)
data = pd.read_csv(TRAIN_CSV_PATH)

# Separate the predictor variables (features) from the target variable.
X = data.drop(columns=['efs', 'efs_time', 'ID'])  # Features
y = data['efs_time']  # Target

# Split the data into training and test sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# Show, for every feature column, its dtype and how many unique values it has.
# The output is laid out in visually separate columns: the column name is
# dot-padded to a width of 20 characters (longer names are not truncated) and
# the dtype name is dot-padded to a width of 7.

for col, values in X.items():
    name_cell = f'{col:.<20}'
    dtype_cell = f'{str(values.dtype):.<7}'
    print(name_cell, dtype_cell, values.nunique())
    

dri_score........... object. 11
psych_disturb....... object. 3
cyto_score.......... object. 7
diabetes............ object. 3
hla_match_c_high.... float64 3
hla_high_res_8...... float64 7
tbi_status.......... object. 8
arrhythmia.......... object. 3
hla_low_res_6....... float64 5
graft_type.......... object. 2
vent_hist........... object. 2
renal_issue......... object. 3
pulm_severe......... object. 3
prim_disease_hct.... object. 18
hla_high_res_6...... float64 6
cmv_status.......... object. 4
hla_high_res_10..... float64 8
hla_match_dqb1_high. float64 3
tce_imm_match....... object. 8
hla_nmdp_6.......... float64 5
hla_match_c_low..... float64 3
rituximab........... object. 2
hla_match_drb1_low.. float64 2
hla_match_dqb1_low.. float64 3
prod_type........... object. 2
cyto_score_detail... object. 5
conditioning_intensity object. 6
ethnicity........... object. 3
year_hct............ int64.. 13
obesity............. object. 3
mrd_hct............. object. 2
in_vivo_tcd......... object. 2
tce

In [None]:

# Build the ground-truth frame used for scoring: the rows of `data` whose index
# labels match X_test (calc_score reads efs, efs_time and race_group from it).
solution_test = data.loc[X_test.index]

# Build the preprocessing pipelines, one per kind of column.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# Categories not seen during fit are ignored at prediction time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Numerical columns: fill gaps with the median, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Build the model pipeline.
# NOTE(review): the final step is registered as 'classifier' although it holds a
# RandomForestRegressor; the step name is kept so existing references still resolve.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestRegressor(random_state=42))
])

# Train the model.
pipeline.fit(X_train, y_train)

# Predict on the held-out features.
y_pred = pipeline.predict(X_test)

# solution_test shares X_test's index and already carries the ID column, so it is
# passed to createSubmission directly. Writing an ID column into X_test would mutate
# the feature frame in place and leak the identifier into it for any later cell.
submission = createSubmission(solution_test, y_pred)

# Copies are passed because calc_score may drop the id column from the frames it receives.
score = calc_score(solution_test.copy(), submission.copy())
print(f"Score: {score}")

In [5]:

# print(classification_report(y_test, y_pred))

# Salvar o modelo (opcional)
# import joblib
# joblib.dump(pipeline, 'modelo_efs.joblib') 

# Carregar o modelo (opcional)
# loaded_model = joblib.load('modelo_efs.joblib')

In [6]:
# # Treinar o modelo
# pipeline.fit(X, y)

# X_test = pd.read_csv('data/equity-post-HCT-survival-predictions/test.csv')
# y_pred = pipeline.predict(X_test)

# # Create the submission: ID, prediction
# submission = pd.DataFrame({
#     'ID': X_test.ID, 
#     'prediction': -y_pred  # negated, as in createSubmission (a higher submitted value is scored as a shorter efs_time)
# })

# submission.to_csv('submission.csv', index = False)