In [1]:
import numpy as np, os, sys
import pandas as pd
from tqdm import tqdm

## Load all the data so we can explore it.
# NOTE(security): pd.read_pickle can execute arbitrary code embedded in the
# file, so only load pickles that come from a trusted source.
pfile = '../Data/training_2024-11-04.pickle'
pfile_test = '../Data/testing_2024-11-04.pickle'
pfile_scoring = '../Data/evaluation_2024-11-04.pickle'

# Training cohort. The scoring cell uses SEPSISdat unconditionally, so a
# missing file is reported here instead of surfacing later as a NameError
# (or, worse, as a silently reused variable from an earlier kernel state).
if os.path.isfile(pfile):
    SEPSISdat = pd.DataFrame.from_dict(pd.read_pickle(pfile))
    print(len(SEPSISdat.patient)) # should be n=200112
else:
    print(f'WARNING: training file not found: {pfile} (SEPSISdat was not loaded)', file=sys.stderr)

# Testing cohort. Also used unconditionally by the scoring cell.
if os.path.isfile(pfile_test):
    SEPSISdat_test = pd.DataFrame.from_dict(pd.read_pickle(pfile_test))
    print(len(SEPSISdat_test.patient)) # should be n=41993
else:
    print(f'WARNING: testing file not found: {pfile_test} (SEPSISdat_test was not loaded)', file=sys.stderr)

# Evaluation cohort. This file is hidden, so its absence is expected and
# deliberately silent; the scoring cell guards on the same isfile() check.
if os.path.isfile(pfile_scoring):
    SEPSISdat_scoring = pd.DataFrame.from_dict(pd.read_pickle(pfile_scoring))
    print(len(SEPSISdat_scoring.patient)) # should be n=391288, but you don't have this hidden file

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


200112
41993


In [2]:
import evaluate_sepsis_score as ev
import time
from get_sepsis_score_lr import load_sepsis_model, get_sepsis_score
# from get_sepsis_score_gbm import load_sepsis_model, get_sepsis_score
# Load the model once at notebook level. score_cohort (defined below) reads
# this global `model` and passes it to get_sepsis_score on every call.
print('Loading sepsis model...')
model = load_sepsis_model()

# Progress banner only: nothing is scored here. The predictions are made by
# the score_cohort(...) calls further down in this cell.
print('Predicting sepsis labels...')

def score_cohort(cohort):
    """Score every row of ``cohort`` patient by patient and compute the utility.

    For each patient the rows are replayed in their original order, and
    ``get_sepsis_score`` is called on the growing prefix ``data[:t+1]``, so the
    model is only ever given the rows up to and including the current one.

    Parameters
    ----------
    cohort : pandas.DataFrame
        Must contain a ``patient`` column (grouping key) and a ``SepsisLabel``
        column (passed to ``ev.evaluate_utility``). Every other column is
        handed to the model as a feature.

    Returns
    -------
    scores : numpy.ndarray
        One float per row of ``cohort``, in the same row order as ``cohort``.
    labels : numpy.ndarray
        One boolean per row of ``cohort``, in the same row order as ``cohort``.
    util
        Whatever ``ev.evaluate_utility(cohort.patient, cohort.SepsisLabel,
        labels)`` returns.
    elapsed : int
        Wall-clock seconds spent in this function before the utility call.

    Notes
    -----
    Reads the notebook-level global ``model``. ``get_sepsis_score`` must return
    a ``(score, label)`` pair whose score can be indexed with ``[0]``.
    """
    starttime = time.time()

    total_rows = len(cohort)
    scores = np.zeros(total_rows)
    labels = np.full(total_rows, False)

    # Build the feature matrix once. The label is dropped wherever it sits in
    # the column order (not only when it is the last column), so it can never
    # leak into the model input.
    features = cohort.drop(columns='patient')
    if 'SepsisLabel' in features.columns:
        features = features.drop(columns='SepsisLabel')
    features = features.to_numpy()

    # Group row positions by patient in a single pass instead of masking the
    # whole frame once per patient. The stable sort keeps each patient's rows
    # in their original relative order.
    pats, inverse, counts = np.unique(
        cohort.patient.to_numpy(), return_inverse=True, return_counts=True
    )
    row_order = np.argsort(inverse, kind='stable')
    row_groups = np.split(row_order, np.cumsum(counts)[:-1]) if len(pats) else []

    for rows in tqdm(row_groups):
        data = features[rows]
        for t, row in enumerate(rows):
            current_data = data[:t+1]
            current_score, current_label = get_sepsis_score(current_data, model)
            # Write at the row's own position so scores/labels stay aligned
            # with cohort.patient / cohort.SepsisLabel even when the frame is
            # not sorted by patient.
            scores[row] = current_score[0]
            labels[row] = current_label

    elapsed = round(time.time() - starttime)
    util = ev.evaluate_utility(cohort.patient, cohort.SepsisLabel, labels)

    return scores, labels, util, elapsed

# Score the training cohort. `utility` and `elapsed` are reused for every
# cohort below, so after this cell they hold the values of the last cohort
# that was scored; only the printed lines keep the earlier results.
score_train, labels_train, utility, elapsed = score_cohort(SEPSISdat)
print(utility,elapsed)
# Score the testing cohort.
score_test, labels_test, utility, elapsed = score_cohort(SEPSISdat_test)
print(utility,elapsed)
# The evaluation cohort is scored only when its pickle exists on disk.
if os.path.isfile(pfile_scoring):
    score_eval, labels_eval, utility, elapsed = score_cohort(SEPSISdat_scoring)
    print(utility,elapsed)

Loading sepsis model...
Predicting sepsis labels...


100%|██████████| 5000/5000 [02:21<00:00, 35.33it/s]
100%|██████████| 5000/5000 [03:26<00:00, 24.20it/s]


0.3498 142


100%|██████████| 1000/1000 [00:10<00:00, 93.56it/s]
100%|██████████| 1000/1000 [00:08<00:00, 116.98it/s]

0.3153 11





In [3]:
# Scratch check of negative indexing: a[-1] selects the last row of a 2-D array.
a = np.array([[1,2,3],[4,5,6]])
a[-1]

array([4, 5, 6])