In [1]:
import pandas as pd
import os
import json
import numpy as np

In [2]:
# Mount Google Drive at /content/gdrive (Colab-only); the project paths
# configured in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Root of the shared-drive project; every other location is derived from it.
DATA_HOME = '/content/gdrive/Shareddrives/PROJECT_ROOT_DIR'

# Data locations.
OHDSI_VOCAB_HOME = os.path.join(DATA_HOME, 'ohdsi-vocab')
DS_HOME = os.path.join(DATA_HOME, 'injury-icd-dataset')

# Code location. CODE_DIR and CODE_HOME are two names for the same directory;
# both are kept so that either spelling keeps working.
CODE_DIR = os.path.join(DATA_HOME, "code")
CODE_HOME = CODE_DIR

In [4]:
# Names of the label subsets that are evaluated, in reporting order.
# Code in this notebook checks for the substring "5char" in a subset name to
# switch to the 5-character label data, so the names are load-bearing.
LABEL_SUBSETS = [
    'all',
    'non_sup',
    'top10',
    'top50',
    'non_sup_5char',
    'non_sup_4_and_5char',
]

In [5]:
# Read the tab-separated ICD10_CONCEPT file from the OHDSI vocabulary folder.
# The listed columns are forced to str so pandas does not infer another dtype
# for them.
icd_concept_path = os.path.join(OHDSI_VOCAB_HOME, 'ICD10_CONCEPT.csv.gz')
icd_string_columns = ('standard_concept', 'concept_code', 'invalid_reason')
icd_vocab = pd.read_csv(
    icd_concept_path,
    sep='\t',
    dtype={column: str for column in icd_string_columns},
)

In [6]:
def _read_lines(file_name):
  """Return the non-blank lines of a text file inside DS_HOME, in file order.

  Blank lines are dropped so that a trailing newline (or an empty file) does
  not produce a bogus '' entry, which would otherwise become an extra label
  column or an extra patient row downstream.

  Args:
    file_name: name of the file, relative to DS_HOME.

  Returns:
    list of str, one entry per non-blank line, without the line terminator.
  """
  with open(os.path.join(DS_HOME, file_name)) as f:
    return [line for line in f.read().split('\n') if line.strip()]


cases = pd.read_csv(os.path.join(DS_HOME, 'case.csv'))

# One label list per subset. The position of a label in its list is later used
# as its column index, so file order is preserved.
LABEL_FILES = {
  'all': 'label.txt',
  'non_sup': 'label-non-superficial.txt',
  'non_sup_5char': 'label-non-superficial-5-char.txt',
  'top10': 'label-non-superficial-top10.txt',
  'top50': 'label-non-superficial-top50.txt',
}
labels = {subset: _read_lines(file_name) for subset, file_name in LABEL_FILES.items()}

# Combined subset: the 4-character labels, then the 5-character labels, plus
# one extra code.
# NOTE(review): why "S52.37" is appended by hand is not visible here — confirm.
labels["non_sup_4_and_5char"] = labels["non_sup"] + labels["non_sup_5char"] + ["S52.37"]

# Patient ids of each data split, one id per line.
validation = _read_lines('validation.txt')
train = _read_lines('train.txt')
test = _read_lines('test.txt')

# patient_id is read as str so it compares equal to the ids read from the
# split files above.
case_labels_4char = pd.read_csv(os.path.join(DS_HOME, 'case-labels.csv'), dtype={'patient_id': str})
case_labels_5char = pd.read_csv(os.path.join(DS_HOME, 'case-labels-5-char.csv'), dtype={'patient_id': str})

# Patient id -> row position within each split (used to address matrix rows).
validation_c2i = {patient_id: row for row, patient_id in enumerate(validation)}
train_c2i = {patient_id: row for row, patient_id in enumerate(train)}
test_c2i = {patient_id: row for row, patient_id in enumerate(test)}

# Per label subset: label code -> column position within that subset's list.
# If a code occurs more than once in a list, its last position wins.
label_c2i = {
    k: {code: col for col, code in enumerate(labels[k])}
    for k in LABEL_SUBSETS
}

def _build_target_matrix(case_labels, patient_ids, patient_c2i, label_to_col, n_labels):
    """Build a binary patient-by-label indicator matrix.

    Cell (row, col) is 1.0 when `case_labels` contains a row pairing the
    patient at position `row` with the label at position `col`; all other
    cells are 0.0. Rows of `case_labels` whose patient is not in
    `patient_c2i` or whose label is not in `label_to_col` are ignored.

    Args:
        case_labels: DataFrame with `patient_id` and `label` columns.
        patient_ids: sequence of patient ids of the split (fixes the row count).
        patient_c2i: dict mapping patient id -> row index.
        label_to_col: dict mapping label code -> column index.
        n_labels: number of columns of the result.

    Returns:
        numpy float array of shape (len(patient_ids), n_labels).
    """
    matrix = np.zeros((len(patient_ids), n_labels))
    # Translate ids/codes to positions in one vectorised pass; unknown ones
    # become NaN and are filtered out below.
    row_idx = case_labels['patient_id'].map(patient_c2i)
    col_idx = case_labels['label'].map(label_to_col)
    known = row_idx.notna() & col_idx.notna()
    matrix[row_idx[known].to_numpy(dtype=int), col_idx[known].to_numpy(dtype=int)] = 1.
    return matrix


targets = dict()

for k in LABEL_SUBSETS:
    l = labels[k]
    l_c2i = label_c2i[k]

    # Subsets with "5char" in their name take their ground truth from the
    # 5-character case-label table, all others from the 4-character one.
    # NOTE(review): "non_sup_4_and_5char" also matches, so its 4-character
    # labels are only set if they occur in the 5-character table — confirm
    # this is intended.
    case_labels = case_labels_5char if "5char" in k else case_labels_4char

    validation_targets = _build_target_matrix(case_labels, validation, validation_c2i, l_c2i, len(l))
    train_targets = _build_target_matrix(case_labels, train, train_c2i, l_c2i, len(l))
    test_targets = _build_target_matrix(case_labels, test, test_c2i, l_c2i, len(l))

    targets[k] = {
        'validation_targets': validation_targets,
        'test_targets': test_targets,
        'train_targets': train_targets
    }

In [7]:
# Load the per-case entity rows from 'comprehend-medical-case-entities.csv'.
# NOTE(review): no dtype is specified here, while the case-label CSVs are read
# with patient_id as str; patient_id from this file is only converted with
# astype(str) afterwards — confirm the ids still match (e.g. leading zeros).
entity_icds = pd.read_csv(os.path.join(DS_HOME, 'comprehend-medical-case-entities.csv'))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
# Keep only the entity rows whose negation score is exactly 0, as an
# independent copy so later edits do not touch entity_icds.
not_negated = entity_icds['is_negated_score'].eq(0)
present_entity_icds = entity_icds.loc[not_negated].copy()

In [9]:
# Prediction table derived from the non-negated entities: one row per entity
# with its patient, ICD code, score and source field.
cm_predictions = present_entity_icds[['patient_id', 'icd_code', 'icd_score', 'field']].copy()

# Truncate the code to two granularities: the first 6 characters go into
# icd_code_5char and the first 5 characters replace icd_code.
# .str slicing is used instead of apply(lambda x: x[:n]) so that a missing
# icd_code stays NaN rather than raising a TypeError.
cm_predictions["icd_code_5char"] = cm_predictions["icd_code"].str[:6]
cm_predictions["icd_code"] = cm_predictions["icd_code"].str[:5]

# Patient ids are compared against string ids elsewhere, so store them as str.
cm_predictions["patient_id"] = cm_predictions["patient_id"].astype(str)

In [10]:
# Make the project's 'notebooks' folder the working directory so that the
# local evaluate.py found there can be imported.
# NOTE: this also changes where relative paths are read from / written to for
# the rest of the session.
os.chdir(os.path.join(DATA_HOME, 'notebooks'))
import evaluate

def evaluate_cm(cm_predictions, labels, targets, ds_patient_ids, patient_c2i, label_c2i, threshold=None, use_5_char=False):
  """Score the entity-based ICD predictions of one data split.

  Builds a (patients x labels) score matrix from `cm_predictions` and passes it
  together with `targets` to `evaluate.compute_metrics`.

  Each prediction row is mapped to a label column as follows: when
  `use_5_char` is true and the row's `icd_code_5char` is a known label, that
  label is used; otherwise the row's `icd_code` is used if it is a known
  label; otherwise the row is ignored. When several rows land on the same
  (patient, label) cell, the cell holds the highest score.

  Earlier, duplicates were dropped on (patient_id, icd_code) before the
  mapping. With `use_5_char=True` this discarded every 5-character code of a
  patient except the best-scoring one per shared `icd_code`. Rows are now
  written in ascending score order instead, so nothing is dropped and the
  maximum still wins. Results for `use_5_char=False` are unchanged.

  Args:
    cm_predictions: DataFrame with `patient_id`, `icd_code`, `icd_score` and,
      when `use_5_char` is true, `icd_code_5char` columns.
    labels: sequence of label codes; its length fixes the column count.
    targets: ground-truth matrix handed through to `evaluate.compute_metrics`.
    ds_patient_ids: patient ids of the split; its length fixes the row count.
    patient_c2i: dict mapping patient id -> row index.
    label_c2i: dict mapping label code -> column index.
    threshold: handed through to `evaluate.compute_metrics`.
    use_5_char: prefer `icd_code_5char` over `icd_code` when it is a label.

  Returns:
    Whatever `evaluate.compute_metrics` returns.
  """
  cm_probs = np.zeros((len(ds_patient_ids), len(labels)))
  print("Matrix shape:", cm_probs.shape)

  in_split = cm_predictions[cm_predictions.patient_id.isin(ds_patient_ids)]
  # Ascending order with missing scores first: the last write to a cell is its
  # highest non-missing score.
  ordered = in_split.sort_values('icd_score', ascending=True, na_position='first')
  for _, row in ordered.iterrows():
    if use_5_char and row.icd_code_5char in label_c2i:
      code = row.icd_code_5char
    elif row.icd_code in label_c2i:
      code = row.icd_code
    else:
      continue
    cm_probs[patient_c2i[row.patient_id], label_c2i[code]] = row.icd_score

  scores = evaluate.compute_metrics(targets, cm_probs, threshold=threshold)
  return scores

In [11]:
# Reload the already-imported evaluate module so that edits made to
# evaluate.py are picked up without restarting the kernel.
import importlib
importlib.reload(evaluate)

<module 'evaluate' from '/content/gdrive/Shareddrives/PROJECT_ROOT_DIR/notebooks/evaluate.py'>

In [12]:
from re import I  # NOTE(review): `I` is not used in this cell — looks like a stray auto-import; confirm and remove.

# Only predictions extracted from these two fields are evaluated. The filter
# does not depend on the label subset, so it is computed once up front instead
# of once per evaluate_cm call.
TERTIARY_FIELDS = ['tertiary_impression', 'tertiary_imaging_report']
tertiary_predictions = cm_predictions[cm_predictions['field'].isin(TERTIARY_FIELDS)]

# res maps "<subset>_<split>" to the scores returned by evaluate_cm.
res = dict()
for k in LABEL_SUBSETS:
  print(k)
  use_5_char = "5char" in k

  # The train split is scored without a threshold; the 'threshold' entry of
  # its scores is then reused for the validation and test splits.
  train_score = evaluate_cm(
      tertiary_predictions, labels[k], targets[k]['train_targets'],
      train, train_c2i, label_c2i[k], use_5_char=use_5_char)
  res[k+'_train'] = train_score
  res[k+'_validation'] = evaluate_cm(
      tertiary_predictions, labels[k], targets[k]['validation_targets'],
      validation, validation_c2i, label_c2i[k],
      threshold=train_score['threshold'], use_5_char=use_5_char)
  res[k+'_test'] = evaluate_cm(
      tertiary_predictions, labels[k], targets[k]['test_targets'],
      test, test_c2i, label_c2i[k],
      threshold=train_score['threshold'], use_5_char=use_5_char)

all
Matrix shape: (2435, 197)
Matrix shape: (521, 197)
Matrix shape: (522, 197)
non_sup
Matrix shape: (2435, 170)
Matrix shape: (521, 170)
Matrix shape: (522, 170)
top10
Matrix shape: (2435, 10)
Matrix shape: (521, 10)
Matrix shape: (522, 10)
top50
Matrix shape: (2435, 50)
Matrix shape: (521, 50)
Matrix shape: (522, 50)
non_sup_5char
Matrix shape: (2435, 329)
Matrix shape: (521, 329)
Matrix shape: (522, 329)
non_sup_4_and_5char
Matrix shape: (2435, 500)
Matrix shape: (521, 500)
Matrix shape: (522, 500)


In [14]:
# Save the collected metrics (res is keyed by "<subset>_<split>") as CSV.
# The path is relative, so the file lands in the current working directory.
pd.DataFrame(res).to_csv('awscm_metrics.csv')