In [10]:
from pathlib import Path

import polars as pl
import numpy as np
from lifelines.datasets import load_rossi
from lifelines import CoxPHFitter

In [24]:
data_path = Path("/Users/zacklarsen/Documents/Projects/kaggle-wids-datathon-2020/data/")
competition_path = Path("/Users/zacklarsen/Documents/Projects/kaggle-wids-datathon-2020/")
mlflow_path = Path(competition_path, "mlruns/")
training_v2_path = Path(data_path, "training_v2.csv")

In [3]:
X_train = pl.read_parquet(data_path / "X_train.parquet")
X_test = pl.read_parquet(data_path / "X_test.parquet")
y_train = pl.read_parquet(data_path / "y_train.parquet")
y_test = pl.read_parquet(data_path / "y_test.parquet")

## Example dataset

In [11]:
rossi_dataset = load_rossi()

In [12]:
rossi_dataset

Unnamed: 0,week,arrest,fin,age,race,wexp,mar,paro,prio
0,20,1,0,27,1,0,0,1,3
1,17,1,0,18,1,0,0,1,8
2,25,1,0,19,0,1,0,1,13
3,52,0,1,23,1,1,1,1,1
4,52,0,0,19,0,1,0,1,3
...,...,...,...,...,...,...,...,...,...
427,52,0,1,31,0,1,0,1,3
428,52,0,0,20,1,0,0,1,1
429,52,0,1,20,1,1,1,1,1
430,52,0,0,29,1,1,0,1,3


In [13]:
cph = CoxPHFitter()
cph.fit(rossi_dataset, duration_col='week', event_col='arrest')

cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'week'
event col,'arrest'
baseline estimation,breslow
number of observations,432
number of events observed,114
partial log-likelihood,-658.75
time fit was run,2023-10-21 15:59:14 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
fin,-0.38,0.68,0.19,-0.75,-0.0,0.47,1.0,0.0,-1.98,0.05,4.4
age,-0.06,0.94,0.02,-0.1,-0.01,0.9,0.99,0.0,-2.61,0.01,6.79
race,0.31,1.37,0.31,-0.29,0.92,0.75,2.5,0.0,1.02,0.31,1.7
wexp,-0.15,0.86,0.21,-0.57,0.27,0.57,1.3,0.0,-0.71,0.48,1.06
mar,-0.43,0.65,0.38,-1.18,0.31,0.31,1.37,0.0,-1.14,0.26,1.97
paro,-0.08,0.92,0.2,-0.47,0.3,0.63,1.35,0.0,-0.43,0.66,0.59
prio,0.09,1.1,0.03,0.04,0.15,1.04,1.16,0.0,3.19,<0.005,9.48

0,1
Concordance,0.64
Partial AIC,1331.50
log-likelihood ratio test,33.27 on 7 df
-log2(p) of ll-ratio test,15.37


In [14]:
X_train.glimpse()

Rows: 73370
Columns: 185
$ encounter_id                  <i64> 59954, 63518, 34401, 71581, 29019, 70076, 63091, 40553, 75009, 94808
$ patient_id                    <i64> 90449, 19384, 20558, 112066, 129440, 97970, 42207, 499, 125636, 104142
$ hospital_id                   <i64> 118, 185, 188, 62, 161, 89, 161, 186, 62, 128
$ age                           <f32> 86.0, 72.0, 36.0, 60.0, 27.0, 71.0, 35.0, 75.0, 72.0, 50.0
$ bmi                           <str> '28.16897519', '36.63508791', '27.45968419', '21.97735112', '19.960244', '21.31254263', '29.34003963', '29.1796875', '24.24339532', '22.3046875'
$ elective_surgery              <i64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ ethnicity                     <str> 'Caucasian', 'Caucasian', 'African American', 'Caucasian', 'Caucasian', 'Other/Unknown', 'Hispanic', 'Caucasian', 'Caucasian', 'Caucasian'
$ gender                        <str> 'M', 'M', 'M', 'M', 'F', 'F', 'F', 'F', 'M', 'F'
$ height                        <str> '170.2', '170.1', '162.6',

In [15]:
y_train.glimpse()

Rows: 73370
Columns: 1
$ hospital_death <i64> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0



In [18]:
X_train.schema

{'encounter_id': Int64,
 'patient_id': Int64,
 'hospital_id': Int64,
 'age': Float32,
 'bmi': Utf8,
 'elective_surgery': Int64,
 'ethnicity': Utf8,
 'gender': Utf8,
 'height': Utf8,
 'hospital_admit_source': Utf8,
 'icu_admit_source': Utf8,
 'icu_id': Int64,
 'icu_stay_type': Utf8,
 'icu_type': Utf8,
 'pre_icu_los_days': Float64,
 'readmission_status': Int64,
 'weight': Utf8,
 'albumin_apache': Utf8,
 'apache_2_diagnosis': Utf8,
 'apache_3j_diagnosis': Utf8,
 'apache_post_operative': Int64,
 'arf_apache': Utf8,
 'bilirubin_apache': Utf8,
 'bun_apache': Utf8,
 'creatinine_apache': Utf8,
 'fio2_apache': Utf8,
 'gcs_eyes_apache': Utf8,
 'gcs_motor_apache': Utf8,
 'gcs_unable_apache': Utf8,
 'gcs_verbal_apache': Utf8,
 'glucose_apache': Utf8,
 'heart_rate_apache': Utf8,
 'hematocrit_apache': Utf8,
 'intubated_apache': Utf8,
 'map_apache': Utf8,
 'paco2_apache': Utf8,
 'paco2_for_ph_apache': Utf8,
 'pao2_apache': Utf8,
 'ph_apache': Utf8,
 'resprate_apache': Utf8,
 'sodium_apache': Utf8,
 '

In [23]:
y_train

hospital_death
i64
0
1
0
0
0
0
0
0
0
0


In [26]:
data = pl.read_csv(training_v2_path, infer_schema_length=10000, dtypes={"age": pl.Float32}, ignore_errors=True)

In [28]:
data.glimpse()

Rows: 91713
Columns: 186
$ encounter_id                  <i64> 66154, 114252, 119783, 79267, 92056, 33181, 82208, 120995, 80471, 42871
$ patient_id                    <i64> 25312, 59342, 50777, 46918, 34377, 74489, 49526, 50129, 10577, 90749
$ hospital_id                   <i64> 118, 81, 118, 118, 33, 83, 83, 33, 118, 118
$ hospital_death                <i64> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0
$ age                           <f32> 68.0, 77.0, 25.0, 81.0, 19.0, 67.0, 59.0, 70.0, 45.0, 50.0
$ bmi                           <str> '22.73', '27.42', '31.95', '22.64', 'NA', '27.56', '57.45', 'NA', 'NA', '25.71'
$ elective_surgery              <i64> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0
$ ethnicity                     <str> 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', None
$ gender                        <str> 'M', 'F', 'F', 'F', 'M', 'M', 'F', 'M', 'M', 'M'
$ height                        <str> '180.3', '160', '172.7', '165.1', '188', '

In [37]:
pd_train_df = data.select(pl.col(["patient_id", "age", "weight", "ethnicity", "gender", "hospital_death"])).to_pandas()

In [38]:
pd_train_df.head(5)

Unnamed: 0,patient_id,age,weight,ethnicity,gender,hospital_death
0,25312,68.0,73.9,Caucasian,M,0
1,59342,77.0,70.2,Caucasian,F,0
2,50777,25.0,95.3,Caucasian,F,0
3,46918,81.0,61.7,Caucasian,F,0
4,34377,19.0,,Caucasian,M,0


In [40]:
cph = CoxPHFitter()
cph.fit(pd_train_df, event_col='hospital_death')

KeyError: None

In [None]:
cph.print_summary()