https://huggingface.co/docs/transformers/en/tasks/sequence_classification

In [106]:
import pandas as pd
import datasets
import evaluate
import numpy as np

In [91]:
df1 = pd.read_csv('../data/xray_and_ekg_report/EKGnegCTA.csv')
df2 = pd.read_csv('../data/xray_and_ekg_report/EKGposCTA.csv')

# convert 'PE' from bool to int
df1['PE'] = 0
df2['PE'] = 1

# some text=NaN drop these rows
df1 = df1.dropna(subset=['text_EKG'])
df2 = df2.dropna(subset=['text_EKG'])

# some text=ECG interpreted by ordering physician, contains no information about the EKG, remove
interp_by_doctor = lambda x: 'ECG interpreted by ordering physician' in x
df1 = df1[~df1['text_EKG'].apply(interp_by_doctor)]
df2 = df2[~df2['text_EKG'].apply(interp_by_doctor)]


Np = len(df2)
df1 = df1.sample(n=Np, random_state=42)

df = pd.concat([df1, df2], axis=0)
cols = {'text_EKG': 'text', 'PE': 'label'}
df = df[list(cols.keys())].rename(columns=cols)
len(df)

2306

In [102]:

ds = datasets.Dataset.from_pandas(df, preserve_index=False)
ds = ds.train_test_split(test_size=0.3)
ds['train'][:3]


{'text': ['Sinus rhythm. Normal ECG.. Compared to the ___ the\nrate has decreased. Other findings are similar.\nTRACING #2\n\n',
  'Sinus rhythm\nConsider biatrial abnormality\nLeft ventricular hypertrophy with ST-T abnormalities\nSince previous tracing of the same date, ST-T wave changes decreased\n\n',
  'Sinus rhythm.  Since the previous tracing ST-T wave abnormalities are less\nprominent.  Q wave in lead aVF is more apparent but of uncertain significance.\nClinical correlation is suggested.\nTRACING #2\n\n'],
 'label': [0, 0, 1]}

In [132]:
model_name = "distilbert/distilbert-base-uncased"
model_name = "emilyalsentzer/Bio_ClinicalBERT"

In [133]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding


def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)
train_ds = ds.map(preprocess_function, batched=True)


Map:   0%|          | 0/1614 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 1614/1614 [00:00<00:00, 11960.22 examples/s]
Map: 100%|██████████| 692/692 [00:00<00:00, 12941.17 examples/s]


In [135]:
train_ds['train'][0]

{'text': 'Sinus rhythm. Normal ECG.. Compared to the ___ the\nrate has decreased. Other findings are similar.\nTRACING #2\n\n',
 'label': 0,
 'input_ids': [101,
  11850,
  1361,
  6795,
  119,
  2999,
  174,
  1665,
  1403,
  119,
  119,
  3402,
  1106,
  1103,
  168,
  168,
  168,
  1103,
  2603,
  1144,
  10558,
  119,
  1168,
  9505,
  1132,
  1861,
  119,
  19225,
  108,
  123,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [138]:

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [139]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [140]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [143]:
training_args = TrainingArguments(
    output_dir="results/note_text_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds["train"],
    eval_dataset=train_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

train_result = trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.671022,0.550578
2,No log,0.690701,0.553468
3,No log,0.690328,0.544798
4,No log,0.719718,0.556358
5,No log,0.753955,0.544798
6,No log,0.819104,0.547688
7,No log,0.829636,0.550578
8,No log,0.892638,0.528902
9,No log,0.902773,0.531792
10,No log,0.910393,0.527457


Checkpoint destination directory results/note_text_classifier/checkpoint-26 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory results/note_text_classifier/checkpoint-52 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory results/note_text_classifier/checkpoint-78 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory results/note_text_classifier/checkpoint-104 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory results/note_text_classifier/checkpoint-130 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [115]:
train_result.metrics

{'train_runtime': 14.3559,
 'train_samples_per_second': 224.855,
 'train_steps_per_second': 3.622,
 'total_flos': 111897695408448.0,
 'train_loss': 0.6809530991774339,
 'epoch': 2.0}

In [118]:

metrics = trainer.evaluate(eval_dataset=train_ds['test'])
metrics



{'eval_loss': 0.6830005049705505,
 'eval_accuracy': 0.5476878612716763,
 'eval_runtime': 0.7101,
 'eval_samples_per_second': 974.444,
 'eval_steps_per_second': 15.49,
 'epoch': 5.0}