### Training & Evaluation of TraumaICDBERT
This is a demo for training a TraumaICDBERT model to predict injury ICD-10 codes using Google Colab GPUs.

- This notebook is intended to run on Google Colab. Running it on Colab relies on Google Drive: you need access to the shared "TraumaICDBERT" Google Drive (contact the authors for access), which contains the codebase and the ICD-10 code vocabulary required to train the model.

- If you would like to run this code locally instead of using Google Colab, that is easy too. Please download this notebook and modify this notebook to read/write a local file system instead of Google Drive.

- We have hidden the dataset with patient health records. In order to run the training and evaluation, please swap in your own dataset by following the comments in the code block named "### SET UP ###" below

- If you have questions, please reach out to the author (yifu.chen@stanford.edu) for technical support

In [1]:
!pip install datasets -q
!pip install transformers -q -U
!pip install wandb -q -U
!pip install openai
!pip install accelerate -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import pandas as pd
import os
from collections import defaultdict
import datasets
import sys, importlib
import random
import shutil
import openai
import json
from tqdm import tqdm

from sklearn.metrics.pairwise import cosine_similarity
# Do not truncate wide text columns when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)
# Prefer the OPENAI_API_KEY environment variable so the key does not have to be
# saved inside the notebook; if it is not set, fall back to the placeholder below,
# which must then be replaced by hand.
openai.api_key = os.environ.get("OPENAI_API_KEY", "...") ### OR PASTE YOUR OPENAI API KEY HERE, FREE CREDIT WILL BE GRANTED FOR NEW ACCOUNTS

In [3]:
### SET UP ###
# Mounts Google Drive and defines every path the rest of the notebook reads or writes.

import os
from google.colab import drive

drive.mount('/content/gdrive', force_remount=True)

# Locations inside the shared "TraumaICDBERT" drive.
PROJECT_ROOT_DIR = "TraumaICDBERT"
CODE_HOME = f'/content/gdrive/Shareddrives/{PROJECT_ROOT_DIR}/code'
MODEL_HOME = f'/content/gdrive/Shareddrives/{PROJECT_ROOT_DIR}/models'

# THE DIRECTORY WHERE YOUR PATIENT DATA IS STORED
# (intermediate files such as the splits and label lists are also written here)
SECRET_HOME = '/content/gdrive/MyDrive/Academics/Publications/ICD10_Project/'

# YOUR OWN PATIENT DATA CSV FILE HERE. Example Format: https://docs.google.com/spreadsheets/d/19PKbWvzFohSQhzaMaz9lvfDuOqMZI8ZJM7aqzZ57Xeg/edit#gid=0
RAW_DATA_PATH = os.path.join(SECRET_HOME,"injury_icd_dataset.csv")

VOCAB_HOME = f'/content/gdrive/Shareddrives/{PROJECT_ROOT_DIR}/icd-codes'
TMP_DIR = '/content/tmp'

# Create the scratch directory if needed; exist_ok makes re-running this cell safe.
os.makedirs(TMP_DIR, exist_ok=True)

# The training cell runs `python train.py` with a relative path, so the working
# directory must be the code folder.
os.chdir(CODE_HOME)

Mounted at /content/gdrive


In [4]:
### READ THE INJURY CODE VOCABULARY ###
# Builds `injuries` (short "S" codes) and `injuries_4_char` (the label vocabulary),
# and saves both as CSV files next to the source vocabulary.

icd10_concepts = pd.read_csv(
    os.path.join(VOCAB_HOME, "injury_codes_ICD10CM.csv"),
    low_memory=False,
)

# Keep concept codes of at most five characters that begin with "S".
injuries = icd10_concepts.loc[
    icd10_concepts["concept_code"].map(lambda code: len(code)<=5 and code.startswith("S"))
]
injuries.to_csv(os.path.join(VOCAB_HOME, 'injury_ICD10.csv'))

# Restrict to the '4-char nonbill code' concept class and expose the code and its
# name under the `label` / `label_name` columns used by later cells.
injuries_4_char = injuries.loc[injuries["concept_class_id"] == '4-char nonbill code'].assign(
    label_name=lambda frame: frame["concept_name"],
    label=lambda frame: frame["concept_code"],
)
injuries_4_char.to_csv(os.path.join(VOCAB_HOME, 'injury_ICD10_4_char.csv'))

In [5]:
### READ THE ANNOTATED PATIENT DATA ###
# Produces two tables:
#   case_icd_codes - one row per (patient, ICD code), joined to the label vocabulary
#   cases          - one row per patient with the free-text tertiary survey fields

# NOTE: on_bad_lines='skip' silently drops malformed CSV rows.
raw_data = pd.read_csv(RAW_DATA_PATH, on_bad_lines='skip')

# Diagnosis rows; rows without an ICD code carry no label and are dropped.
case_icd_codes = raw_data[['patient_id', 'icd_code', 'icd_name', 'diagnosis_region', 'ais_code', 'ais_name']].copy()
case_icd_codes = case_icd_codes[case_icd_codes.icd_code.notnull()]
# The first five characters of the code string are matched against the vocabulary below.
case_icd_codes["icd_code_4_char"] = case_icd_codes.icd_code.apply(lambda x: str(x)[:5])
# NOTE(review): this truncates the *name* to five characters as well, which looks
# unintended; the column is not used later in this notebook - confirm before relying on it.
case_icd_codes["icd_name_4_char"] = case_icd_codes.icd_name.apply(lambda x: str(x)[:5])

# Keep only diagnoses whose truncated code exists in the 4-character vocabulary.
# The inner merge joins on the columns the two frames share
# (presumably only icd_code_4_char - verify against the vocabulary schema).
case_icd_codes = case_icd_codes.merge(injuries_4_char.rename(columns={'concept_code': 'icd_code_4_char'}), how='inner')

# Prepare the mapping from the patient id to the free text trauma notes.
# read_csv parses empty fields as NaN, and `NaN != ''` is True, so a plain
# comparison against '' keeps every row. Treat missing and whitespace-only values
# as empty so only patients with an impression or an exam are retained.
cases = raw_data.loc[
    (raw_data.tertiary_impression.fillna('').astype(str).str.strip() != '')
    | (raw_data.tertiary_exam.fillna('').astype(str).str.strip() != ''),
    ['patient_id', 'tertiary_exam', 'tertiary_imaging_report', 'tertiary_impression'],
].copy()

# Combined character count of the three text fields; missing fields count as 0
# (str(NaN) would otherwise add 3 characters for "nan").
cases['total_text_len'] = (
    cases[['tertiary_exam', 'tertiary_imaging_report', 'tertiary_impression']]
    .fillna('')
    .astype(str)
    .apply(lambda column: column.str.len())
    .sum(axis=1)
)

# raw_data repeats a patient's text on each of their rows; keep one row per patient.
cases = cases.drop_duplicates(subset=["patient_id"], keep='first')
cases.to_csv(os.path.join(SECRET_HOME, 'case.csv'))

In [6]:
### EXPLORATORY DATA ANALYSIS ###
# Count diagnosis rows per 4-character code and keep codes with more than 5 rows.
# `count` tallies rows with a non-null patient_id, so a patient listed twice under
# the same code contributes twice.

n_cases_by_4_char_code = (
    case_icd_codes
    .groupby('icd_code_4_char', as_index=False)['patient_id']
    .count()
    .rename(columns={'patient_id': 'n_cases'})
    .loc[lambda counts: counts['n_cases'] > 5]
)
n_cases_by_4_char_code.to_csv(os.path.join(SECRET_HOME, 'n_cases_by_4_char_code.csv'))

In [7]:
### FILTER & DEFINE GROUND TRUTH LABELS ###
# The codes that passed the frequency filter become the label set.
ground_truth_labels = n_cases_by_4_char_code['icd_code_4_char'].tolist()

# Persist the labels one per line, with no trailing newline.
with open(os.path.join(SECRET_HOME, 'label-with-superficial.txt'), 'w') as label_file:
    label_file.write('\n'.join(ground_truth_labels))

In [8]:
### GENERATE GPT-3 EMBEDDINGS FOR LABELS ###
# Two cached artifacts are produced in SECRET_HOME; each step is skipped when its
# file already exists:
#   1. one embedding per ground-truth label (stored as a JSON string per row)
#   2. the cosine similarity of every ordered pair of distinct labels, plus a
#      min-max scaled copy in the `sim` column

embeddings_path = os.path.join(SECRET_HOME, 'icd-name-davinci-001-embeddings.csv')
similarities_path = os.path.join(SECRET_HOME, 'icd-name-davinci-001-simularity-scores.csv')

if not os.path.exists(embeddings_path):
  model_id = 'text-similarity-davinci-001'
  # NOTE(review): the strings sent for embedding are the entries of
  # ground_truth_labels (the codes), although the cache file is named "icd-name-...".
  # Confirm that embedding codes rather than concept names is intended.
  res = openai.Embedding.create(input=ground_truth_labels, engine=model_id)
  embeddings = pd.DataFrame(
      [
          {
              # d['index'] is the position of the corresponding input string.
              'label': ground_truth_labels[d['index']],
              # Serialize the vector as JSON so it survives the CSV round trip.
              'embedding': json.dumps(d['embedding']),
          }
          for d in res['data']
      ]
  )
  # Attach the vocabulary columns (label_name, concept_name, ...) to each embedding.
  embeddings = injuries_4_char.merge(embeddings, on='label')
  embeddings.to_csv(embeddings_path, index=False)
else:
  embeddings = pd.read_csv(embeddings_path)

if not os.path.exists(similarities_path):
  # The vectors were written with json.dumps, so decode them with json.loads;
  # eval() would execute whatever text happens to be in the CSV.
  # Each vector is decoded once and all pairwise similarities come from one call.
  pair_labels = embeddings.label.tolist()
  pair_names = embeddings.label_name.tolist()
  vectors = [json.loads(serialized) for serialized in embeddings.embedding]
  similarity_matrix = cosine_similarity(vectors)

  # One row per ordered pair of rows whose labels differ, in row-major order.
  label_sim = pd.DataFrame(
      [
          {
              'label_1': pair_labels[i],
              'label_2': pair_labels[j],
              'label_name_1': pair_names[i],
              'label_name_2': pair_names[j],
              'davinci_cosine_similarity': similarity_matrix[i][j],
          }
          for i in range(len(pair_labels))
          for j in range(len(pair_labels))
          if pair_labels[i] != pair_labels[j]
      ],
      columns=['label_1', 'label_2', 'label_name_1', 'label_name_2', 'davinci_cosine_similarity'],
  )

  # Min-max scale the similarities to [0, 1].
  min_sim = label_sim.davinci_cosine_similarity.min()
  max_sim = label_sim.davinci_cosine_similarity.max()
  sim_range = max_sim - min_sim
  if sim_range > 0:
    label_sim['sim'] = (label_sim.davinci_cosine_similarity - min_sim) / sim_range
  else:
    # All similarities identical (or no pairs at all): avoid dividing by zero.
    label_sim['sim'] = 0.0
  label_sim.to_csv(similarities_path, index=False)
else:
  label_sim = pd.read_csv(similarities_path)


In [9]:
### PREPARE THE ANNOTATED PATIENT DATA ###

# Map each case to its ground-truth codes: keep the diagnosis rows whose
# 4-character code is in the label set, and expose code and concept name under
# the `label` / `label_name` column names.
case_labels = (
    case_icd_codes
    .loc[
        case_icd_codes['icd_code_4_char'].isin(ground_truth_labels),
        ['patient_id', 'icd_code_4_char', 'concept_name'],
    ]
    .rename(columns={'icd_code_4_char': 'label', 'concept_name': 'label_name'})
)
case_labels.to_csv(os.path.join(SECRET_HOME, 'case-labels-with-superficial.csv'), index=False)


In [10]:
### SPLIT PATIENTS INTO TRAIN, VALIDATION, TEST SETS ###
# Patients (not individual rows) are shuffled with a fixed seed and cut into
# consecutive train / validation / test slices, so a patient appears in one split only.

patient_ids = cases.patient_id.sample(frac=1, random_state=42).unique()

# test_ratio is informational: the test split takes whatever remains after the
# train and validation slices.
train_ratio, validation_ratio, test_ratio = 0.7, 0.15, 0.15

train_end = int(len(patient_ids)*train_ratio)
validation_end = int(len(patient_ids)*(train_ratio+validation_ratio))

train = patient_ids[:train_end]
validation = patient_ids[train_end:validation_end]
test = patient_ids[validation_end:]

# Write one patient id per line to train.txt / validation.txt / test.txt.
for split_name, split_ids in (('train', train), ('validation', validation), ('test', test)):
  with open(os.path.join(SECRET_HOME, f'{split_name}.txt'), 'w') as f:
    f.write('\n'.join([str(x) for x in split_ids]))

# Guard against leakage: no patient may appear in more than one split.
# Set operations keep this linear instead of scanning an array per patient.
train_ids, validation_ids, test_ids = set(train.tolist()), set(validation.tolist()), set(test.tolist())
assert train_ids.isdisjoint(validation_ids), "train and validation splits share patients"
assert train_ids.isdisjoint(test_ids), "train and test splits share patients"
assert validation_ids.isdisjoint(test_ids), "validation and test splits share patients"

In [12]:
### DEFINE TRAINING HYPERPARAMETERS ###
# Colab form fields (#@param). The values are forwarded to train.py by the training
# cell; `evaluate_only` additionally gates the evaluation cell at the end.

valid_labels = "4-char-with-superficial"                          #@param ["4-char", "4-char-top50", "4-char-top10", "5-char", "4-and-5-char", "4-char-with-superficial"]
experiment_name = "4-char-with-superficial-20230720"   #@param {type:"string",  allow-input: true} ["non-sup", "non-sup-pretrain", "non-sup-tune-after-pretrain", "non-sup-4and5-char-train-on-full", "non-sup-train-on-full", "4-char-with-superficial"]
model_name = "michiyasunaga/BioLinkBERT-base"         #@param ["michiyasunaga/BioLinkBERT-base", "michiyasunaga/BioLinkBERT-large", "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"]
metric_for_best_model = "eval_f1_score_weighted"          #@param ["eval_f1_score_macro", "eval_f1_score_micro", "eval_f1_score_weighted", "eval_auc_score_macro", "eval_auc_score_micro", "eval_auc_score_weighted"]
num_epochs = "6"                 #@param [3, 6, 10, 20, 30]
train_on_full = False            #@param {type:"boolean", desc: "Whether to train on both train and val dataset, and do final eval on holdout test set"}
evaluate_only = True           #@param {type:"boolean"}
learning_rate = "0.00002"        #@param [2e-5, 1e-5, 7e-6, 2e-6]
warmup_steps =  5000            #@param [1000, 2000, 3000, 5000]
per_device_train_batch_size = "16" #@param [512, 256, 128, 64, 32, 24, 16, 8, 6, 4]
per_device_eval_batch_size = "32" #@param [512, 256, 128, 64, 48, 32, 16, 12, 8]

# Several form fields hold their values as strings; normalize every numeric
# parameter to a real number so Python-side use behaves as expected.
num_epochs = int(num_epochs)
learning_rate = float(learning_rate)
warmup_steps = int(warmup_steps)
per_device_train_batch_size = int(per_device_train_batch_size)
per_device_eval_batch_size = int(per_device_eval_batch_size)

Main function to train TraumaICDBERT


```
--model_name: Name of the Huggingface model to be trained.
--data_dir: A directory where the data is stored (referred to as $SECRET_HOME).
--model_dir: A directory where the trained model will be stored ($MODEL_HOME).
--experiment_name: A unique name to identify each experimental run.
--valid_labels: The valid ICD-10 labels, such as "4-char" or "4-char-top50".
--is_evaluate: A boolean that when set to true only evaluates the model and does not train it.
--train_on_full: A boolean; when set to true, the model trains on both the train and validation sets and the final evaluation is done on the held-out test set.
--num_train_epochs: The total number of training epochs to perform.
--metric_for_best_model: The metric used to determine the best model.
--learning_rate: Learning rate for the model.
--warmup_steps: Number of warmup steps for the training model.
--per_device_train_batch_size: Batch size per device for training.
--per_device_eval_batch_size: Batch size per device for evaluation.
Note: To evaluate the trained model, once the training completes, set $evaluate_only to True and re-run the notebook. This will only conduct evaluation and not perform any training. The performance of the model can then be analyzed.
```



In [13]:
!python train.py --model_name=$model_name \
                 --data_dir=$SECRET_HOME \
                 --model_dir=$MODEL_HOME \
                 --experiment_name=$experiment_name \
                 --valid_labels=$valid_labels \
                 --is_evaluate=$evaluate_only \
                 --train_on_full=$train_on_full \
                 --num_train_epochs=$num_epochs \
                 --metric_for_best_model=$metric_for_best_model \
                 --learning_rate=$learning_rate \
                 --warmup_steps=$warmup_steps \
                 --per_device_train_batch_size=$per_device_train_batch_size \
                 --per_device_eval_batch_size=$per_device_eval_batch_size

# TO EVALUATE THE TRAINED MODEL:
# ONCE TRAINING IS FINISHED, SET $evaluate_only TO TRUE AND RE-RUN THIS NOTEBOOK TO EVALUATE THE FINAL MODEL

In [27]:
#### EVALUATION CODE ####

if evaluate_only:
  import pandas as pd
  from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
  import numpy as np
  import pickle

  # Read the evaluation pickle of the configured model and experiment, from the
  # same directory the per-label CSV is written to, instead of a fixed path that
  # goes stale as soon as model_name or experiment_name is changed.
  path = os.path.join(MODEL_HOME, model_name, experiment_name, "eval_results.pickle")

  # NOTE: pickle.load can execute arbitrary code; only load files produced by
  # your own runs.
  with open(path, 'rb') as f:
    eval_results = pickle.load(f)

  # Map every ground-truth label to its concept name, preserving the order of
  # ground_truth_labels. If a label occurs more than once in the vocabulary the
  # first occurrence is used.
  vocabulary_names = injuries_4_char.drop_duplicates(subset='label', keep='first')
  concept_name_by_label = dict(zip(vocabulary_names.label, vocabulary_names.concept_name))
  label_to_name = {label: concept_name_by_label[label] for label in ground_truth_labels}

  def calculate_metrics(y_true, y_pred, y_prob):
      """Compute binary classification metrics for a single label.

      Args:
          y_true: ground-truth values for the label.
          y_pred: hard predictions for the label.
          y_prob: predicted scores for the label; used only for AUC.

      Returns:
          dict with the keys "accuracy", "AUC", "precision", "recall" and "f1".
          When ``y_true`` contains exactly one distinct value, every entry
          (accuracy included) is NaN.
      """
      # A label whose targets are all identical is reported as NaN across the board.
      if len(np.unique(y_true)) == 1:
          return {
              metric_name: np.nan
              for metric_name in ("accuracy", "AUC", "precision", "recall", "f1")
          }

      # zero_division=0 makes precision/recall/F1 come out as 0 instead of
      # warning when their denominator is empty.
      return {
          "accuracy": accuracy_score(y_true, y_pred),
          "AUC": roc_auc_score(y_true, y_prob),
          "precision": precision_score(y_true, y_pred, zero_division=0),
          "recall": recall_score(y_true, y_pred, zero_division=0),
          "f1": f1_score(y_true, y_pred, zero_division=0),
      }

  def create_label_indices_mapping(label_to_name):
      """Return a dict giving each label its position in ``label_to_name``'s key order."""
      label_to_index = {}
      for position, label in enumerate(label_to_name.keys()):
          label_to_index[label] = position
      return label_to_index

  def evaluate_classifier(eval_results, label_to_name):
      """Compute per-label metrics from the saved evaluation outputs.

      Args:
          eval_results: mapping with "probs" and "targets", two 2-D arrays that
              are indexed as [:, label_index] (presumably one row per evaluated
              case - TODO confirm against train.py).
          label_to_name: dict mapping label -> concept name. The position of a
              label in this dict is used as its column index.
              NOTE(review): this assumes the column order of the arrays matches
              the order of ``label_to_name``; verify against train.py.

      Returns:
          (metrics_df, y_true_label, y_pred_label, y_prob_label) where
          ``metrics_df`` has one row per label with the columns label,
          concept_name, accuracy, AUC, precision, recall and f1, and the three
          arrays belong to the last label that was evaluated (None when no
          label was evaluated).
      """
      y_prob = eval_results["probs"]
      y_true = eval_results["targets"]
      label_to_index = create_label_indices_mapping(label_to_name)

      num_labels = y_true.shape[1]
      # Collect plain records and build the frame once instead of concatenating
      # inside the loop.
      records = []
      # Defined up front so the return statement works even if no label matches.
      y_true_label = y_pred_label = y_prob_label = None
      for label, concept_name in label_to_name.items():
          label_index = label_to_index.get(label, -1)
          if 0 <= label_index < num_labels:
              y_true_label = y_true[:, label_index]
              y_prob_label = y_prob[:, label_index]
              # Threshold at 0.5 (np.round sends a score of exactly 0.5 to 0).
              y_pred_label = np.round(y_prob_label)
              metrics = calculate_metrics(y_true_label, y_pred_label, y_prob_label)
              records.append({"label": label, "concept_name": concept_name, **metrics})
          else:
              print(f"Label {label} ({concept_name}) not found in eval_results.")

      # The column names match the keys produced by calculate_metrics ("f1", not
      # "F1"), so no permanently empty column is created.
      metrics_df = pd.DataFrame(
          records,
          columns=["label", "concept_name", "accuracy", "AUC", "precision", "recall", "f1"],
      )

      return metrics_df, y_true_label, y_pred_label, y_prob_label

  # Score every label. The three arrays returned next to the table are those of
  # the last label processed inside evaluate_classifier.
  (
      metrics_df,
      y_true_label,
      y_pred_label,
      y_prob_label,
  ) = evaluate_classifier(eval_results, label_to_name)

  # Save the per-label table in the experiment's model directory.
  eval_metrics_path = os.path.join(
      MODEL_HOME,
      model_name,
      experiment_name,
      "eval_results.csv",
  )
  metrics_df.to_csv(eval_metrics_path, index=False)
  print(f"Evaluation results for each ICD code have been saved to {eval_metrics_path}")

  # Aggregate metrics stored in the pickle, followed by the per-label table.
  print(eval_results['metrics'])
  display(metrics_df)

Evaluation results for each ICD code have been saved to /content/gdrive/Shareddrives/TraumaICDBERT/models/michiyasunaga/BioLinkBERT-base/4-char-with-superficial-20230720/eval_results.csv
