In [None]:
import pandas as pd
import os
import json

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive; the data
# paths configured below all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Root folder of the project on the mounted shared drive.
DATA_HOME = '/content/gdrive/Shareddrives/PROJECT_ROOT_DIR'
# Sub-folders of DATA_HOME: the OHDSI vocabulary files and the injury ICD dataset.
OHDSI_VOCAB_HOME, DS_HOME = (
    os.path.join(DATA_HOME, subfolder)
    for subfolder in ('ohdsi-vocab', 'injury-icd-dataset')
)

In [None]:
# Load the tab-separated ICD10 concept table from the vocabulary folder.
# The three listed columns are read as strings so pandas does not infer a
# numeric/float dtype for them.
icd_vocab_path = os.path.join(OHDSI_VOCAB_HOME, 'ICD10_CONCEPT.csv.gz')
icd_vocab = pd.read_csv(
    icd_vocab_path,
    sep='\t',
    dtype=dict.fromkeys(['standard_concept', 'concept_code', 'invalid_reason'], str),
)

In [None]:
# Load the case table and the per-case labels from the dataset folder.
# NOTE(review): patient_id is forced to str only for the labels file; the case
# table keeps whatever dtype pandas infers. A later cell re-reads both files
# with patient_id as str - confirm the mixed dtypes here are intentional.
case_csv_path = os.path.join(DS_HOME, 'case.csv')
case_labels_csv_path = os.path.join(DS_HOME, 'case-labels.csv')
cases = pd.read_csv(case_csv_path)
case_labels = pd.read_csv(case_labels_csv_path, dtype={'patient_id': str})

In [None]:
# Stored Comprehend Medical predictions; a later cell parses the
# icd_comprehend_medical column of this table with json.loads.
cm_raw_csv_path = os.path.join(DS_HOME, 'comprehend-medical-predictions.csv')
cm_raw = pd.read_csv(cm_raw_csv_path)

In [None]:
# Reshape the wide case table into one row per (patient, free-text field),
# in case order and, within a case, in the order of text_columns.
text_columns = ['tertiary_exam', 'tertiary_imaging_report', 'tertiary_impression']
all_texts = pd.DataFrame([
    {'patient_id': case.patient_id, 'field': name, 'text': case[name]}
    for _, case in cases.iterrows()
    for name in text_columns
])

# Drop fields that are empty for a case.
all_texts = all_texts[all_texts.text.notnull()].copy()
# Keep only the first 10000 characters of each text
# (presumably the length of text that was sent to the service - TODO confirm).
all_texts.text = all_texts.text.map(lambda value: value[:10000])

# Attach the stored predictions. No `on=` is given, so pandas joins on every
# column name the two frames share.
all_texts = all_texts.merge(cm_raw, how='left')
# Every remaining text must have a stored prediction.
assert not all_texts.icd_comprehend_medical.isnull().any()
# Decode the stored JSON responses into Python objects.
all_texts.icd_comprehend_medical = all_texts.icd_comprehend_medical.apply(json.loads)

In [None]:
# Flatten the decoded responses into two tables:
#   entities    - one row per detected entity, with its trait scores
#   entity_icds - one row per (entity, candidate ICD concept)

# Trait name in the response -> score column it fills. Traits with any other
# name are ignored; a trait that is absent leaves its column at 0.0.
trait_columns = {
    'SYMPTOM': 'is_symptom_score',
    'DIAGNOSIS': 'is_diagnosis_score',
    'SIGN': 'is_sign_score',
    'NEGATION': 'is_negated_score',
}

entity_rows = []
icd_rows = []
for _, doc in all_texts.iterrows():
  for entity in doc.icd_comprehend_medical['Entities']:
    record = {
        'patient_id': doc.patient_id,
        'field': doc.field,
        'entity_id': entity['Id'],
        'begin': entity['BeginOffset'],
        'end': entity['EndOffset'],
        'entity_score': entity['Score'],
        'entity_text': entity['Text'],
        'entity_type': entity['Type'],
        'entity_category': entity['Category'],
    }
    # Score columns come last and start at 0.0.
    record.update(dict.fromkeys(trait_columns.values(), 0.0))
    for trait in entity.get('Traits', []):
      score_column = trait_columns.get(trait['Name'])
      if score_column is not None:
        record[score_column] = trait['Score']
    entity_rows.append(record)

    # One row per candidate ICD concept linked to this entity.
    icd_rows.extend(
        {
            'patient_id': doc.patient_id,
            'field': doc.field,
            'entity_id': entity['Id'],
            'icd_code': concept['Code'],
            'icd_name': concept['Description'],
            'icd_score': concept['Score'],
            'entity_text': entity['Text'],
        }
        for concept in entity['ICD10CMConcepts']
    )

entities = pd.DataFrame(entity_rows)
entity_icds = pd.DataFrame(icd_rows)

# Keep only codes whose first character is 'S'.
entity_icds = entity_icds[entity_icds.icd_code.str.startswith('S')]

# Sanity checks: every case produced at least one entity, and every retained
# code exists in the vocabulary table.
assert cases.patient_id.isin(entities.patient_id).all()
assert entity_icds.icd_code.isin(icd_vocab.concept_code).all()

# Add the entity-level columns to each ICD row (inner join on the shared
# columns), then the vocabulary columns for the code (left join).
entity_icds = entities.merge(entity_icds)
vocab_by_icd_code = icd_vocab.rename(columns={'concept_code': 'icd_code'})
entity_icds = entity_icds.merge(vocab_by_icd_code, how='left')

In [None]:
# Persist the flattened entity/ICD table in the dataset folder (no index column).
entities_csv_path = os.path.join(DS_HOME, 'comprehend-medical-case-entities.csv')
entity_icds.to_csv(entities_csv_path, index=False)

In [None]:
# Reload the case table and the labels, this time with patient_id read as a
# string in both files so the two frames can be merged on it.
# NOTE(review): this rebinds `cases`, which an earlier cell loaded without the
# str dtype; cells above should not be re-run after this one without reloading.
cases = pd.read_csv(os.path.join(DS_HOME, 'case.csv'), dtype={'patient_id': str})
case_labels = pd.read_csv(os.path.join(DS_HOME, 'case-labels.csv'), dtype={'patient_id': str})

# Collapse to one row per patient holding the sorted list of that patient's labels.
case_labels = case_labels.groupby('patient_id', as_index=False).agg({'label': lambda x: sorted(x)})

# Read the label set, one label per line. Lines are stripped and blank lines
# skipped, so a trailing newline at the end of the file no longer produces a
# spurious empty-string label and stray whitespace cannot break code matching.
with open(os.path.join(DS_HOME, 'label.txt')) as f:
  labels = [line.strip() for line in f if line.strip()]

# Inner join on the shared columns: only cases that have labels are kept.
cases = cases.merge(case_labels)

In [None]:
# Display the merged case table (cases joined with their label lists) as the cell output.
cases

In [None]:
# Lookup from concept code to concept name, built from the vocabulary table.
# If a code occurs more than once, the last occurrence wins.
label_names = {
    code: name
    for code, name in zip(icd_vocab.concept_code, icd_vocab.concept_name)
}

In [None]:
# Build a per-patient comparison of the predicted codes against the reference
# labels: predicted codes and scores, plus true positives (tp), false
# positives (fp) and false negatives (fn), rendered as multi-line strings.
cm_predictions = entity_icds[['patient_id', 'icd_code', 'icd_score', 'is_negated_score']].copy()
# Exclude negated codes: only rows whose negation score is exactly 0 are kept.
cm_predictions = cm_predictions[cm_predictions.is_negated_score == 0].copy()
# Compare at the granularity of the first five characters of the code, and
# only for codes that belong to the label set.
cm_predictions.icd_code = cm_predictions.icd_code.apply(lambda x: x[:5])
cm_predictions = cm_predictions[cm_predictions.icd_code.isin(labels)].copy()
# Keep the highest-scoring row for each (patient, code) pair.
cm_predictions = cm_predictions.sort_values('icd_score', ascending=False).drop_duplicates(['patient_id', 'icd_code'])
cm_predictions = cm_predictions.groupby('patient_id', as_index=False).agg({'icd_code': list, 'icd_score': list})
# `cases` was loaded with patient_id as str, so align the key dtype before merging.
cm_predictions.patient_id = cm_predictions.patient_id.astype(str)

# Left merge: a patient for whom no prediction survived the filters above must
# stay in the table, otherwise all of their labels (false negatives) would
# silently disappear from the comparison.
cm_predictions = cases.merge(cm_predictions, how='left')
# Patients without predictions have NaN after the left merge; use empty lists.
for col in ['icd_code', 'icd_score']:
  cm_predictions[col] = cm_predictions[col].apply(lambda x: x if isinstance(x, list) else [])

cm_predictions['tp'] = cm_predictions.apply(lambda row: [x for x in row.icd_code if x in row.label], axis=1)
cm_predictions['fp'] = cm_predictions.apply(lambda row: [x for x in row.icd_code if x not in row.label], axis=1)
cm_predictions['fn'] = cm_predictions.apply(lambda row: [x for x in row.label if x not in row.icd_code], axis=1)

# Render the list columns as newline-separated text for the CSV export.
cm_predictions.icd_score = cm_predictions.icd_score.apply(lambda x: '\n'.join([str(round(y, 2)) for y in x]))
for col in ['label', 'icd_code', 'tp', 'fp', 'fn']:
  # Codes missing from the vocabulary lookup are shown as UNKNOWN instead of
  # aborting the whole export with a KeyError.
  cm_predictions[col] = cm_predictions[col].apply(
      lambda x: '\n'.join([f'{y} - {label_names.get(y, "UNKNOWN")}' for y in x]))

In [None]:
# Write the comparison table without the index column.
# NOTE(review): the path is relative, so the file lands in the current working
# directory rather than in DS_HOME like the entity export - confirm this is intended.
predictions_csv_path = 'cm_predictions.csv'
cm_predictions.to_csv(predictions_csv_path, index=False)