https://huggingface.co/docs/transformers/en/tasks/sequence_classification

In [1]:
import pandas as pd
import datasets
import evaluate
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os

In [4]:
os.environ['CUDA_VISIBLE_DEVICES']

'0'

In [5]:
!nvidia-smi

Fri Mar 29 17:14:25 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:3B:00.0 Off |                  N/A |
| 32%   31C    P8              19W / 250W |      1MiB / 11264MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:5E:0

In [None]:
# Loading huggingface 

In [6]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

In [17]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [10]:
df1 = pd.read_csv('../data/Modeling/EKGneg48.csv')
df2 = pd.read_csv('../data/Modeling/EKGpos48.csv')

# convert 'PE' from bool to int
df1['PE'] = 0
df2['PE'] = 1

# some text=NaN drop these rows
df1 = df1.dropna(subset=['text_y'])
df2 = df2.dropna(subset=['text_y'])

# some text=ECG interpreted by ordering physician, contains no information about the EKG, remove
interp_by_doctor = lambda x: 'ECG interpreted by ordering physician' in x
df1 = df1[~df1['text_y'].apply(interp_by_doctor)]
df2 = df2[~df2['text_y'].apply(interp_by_doctor)]


Np = len(df2)
df1 = df1.sample(n=Np, random_state=42)

df = pd.concat([df1, df2], axis=0)
cols = {'text_y': 'text', 'PE': 'label'}
df = df[list(cols.keys())].rename(columns=cols)
len(df)

2392

In [11]:
# This converts your dataframe to datasets and shows you the first 3 rows
ds = datasets.Dataset.from_pandas(df, preserve_index=False)
ds = ds.train_test_split(test_size=0.3)
ds['train'][:3]

{'text': ['Sinus tachycardia.  Left ventricular hypertrophy with secondary repolarization\nabnormalities.  Lateral myocardial infarction of indeterminate age.  Compared\nto the previous tracing evidence of left ventricular hypertrophy is new.  The\nsinus rate has increased.  The Q-T interval has shortened.  \nTRACING #4\n\n',
  'Sinus rhythm. Left axis deviation. Left anterior fascicular block.\nNon-diagnostic repolarization abnormalities. Compared to the ___ there is no diagnostic change.\nTRACING #1\n\n',
  'Probable atypical atrial flutter with variable conduction and occasional\nuniform ventricular premature beats.  There is low limb lead voltage.\nOthewise, normal intervals and normal axis.  Compared to the ___ the atial flutter persists but now conducts in a variable fashion.\n\n'],
 'label': [1, 0, 0]}

In [12]:
# This identifies which model you want to use
model_name = "distilbert/distilbert-base-uncased"
model_name = "emilyalsentzer/Bio_ClinicalBERT"

In [13]:
# This converts text to tokens
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)
train_ds = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/1674 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 1674/1674 [00:00<00:00, 9203.07 examples/s]
Map: 100%|██████████| 718/718 [00:00<00:00, 12525.67 examples/s]


In [14]:
# Showing the tokenization of the first row in the text column
train_ds['train'][0]

{'text': 'Sinus tachycardia.  Left ventricular hypertrophy with secondary repolarization\nabnormalities.  Lateral myocardial infarction of indeterminate age.  Compared\nto the previous tracing evidence of left ventricular hypertrophy is new.  The\nsinus rate has increased.  The Q-T interval has shortened.  \nTRACING #4\n\n',
 'label': 1,
 'input_ids': [101,
  11850,
  1361,
  27629,
  8992,
  10542,
  1465,
  119,
  1286,
  21828,
  4907,
  5552,
  177,
  24312,
  8005,
  22192,
  1114,
  3718,
  1231,
  23043,
  7710,
  8569,
  22832,
  4233,
  119,
  11937,
  1139,
  13335,
  2881,
  2916,
  1107,
  14794,
  5796,
  1104,
  1107,
  26514,
  1200,
  17379,
  1425,
  119,
  3402,
  1106,
  1103,
  2166,
  19225,
  2554,
  1104,
  1286,
  21828,
  4907,
  5552,
  177,
  24312,
  8005,
  22192,
  1110,
  1207,
  119,
  1103,
  11850,
  1361,
  2603,
  1144,
  2569,
  119,
  1103,
  186,
  118,
  189,
  14235,
  1144,
  12898,
  119,
  19225,
  108,
  125,
  102],
 'token_type_ids': [0,
 

In [15]:
# Pulls your data together into your batch size as input into the model
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
# Set which evaluation metrics you want to use
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 8.55MB/s]


In [18]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
training_args = TrainingArguments(
    output_dir="results/note_text_classifier",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds["train"],
    eval_dataset=train_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

train_result = trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.698152,0.502786
2,No log,0.692021,0.519499
3,No log,0.726923,0.520891
4,No log,0.714685,0.536212
5,0.657800,0.745918,0.564067
6,0.657800,1.020749,0.545961
7,0.657800,1.323951,0.54039
8,0.657800,1.525181,0.527855
9,0.657800,1.715344,0.520891
10,0.304200,2.119939,0.548747


Checkpoint destination directory results/note_text_classifier/checkpoint-105 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory results/note_text_classifier/checkpoint-210 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory results/note_text_classifier/checkpoint-315 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory results/note_text_classifier/checkpoint-420 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory results/note_text_classifier/checkpoint-525 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory results/note_text_classifier/checkpoint-630 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint desti

In [23]:
train_result.metrics

{'train_runtime': 213.6955,
 'train_samples_per_second': 94.003,
 'train_steps_per_second': 5.896,
 'total_flos': 1121302234627320.0,
 'train_loss': 0.43640918428935704,
 'epoch': 12.0}

In [24]:
metrics = trainer.evaluate(eval_dataset=train_ds['test'])
metrics

{'eval_loss': 0.7217142581939697,
 'eval_accuracy': 0.5292479108635098,
 'eval_runtime': 1.7269,
 'eval_samples_per_second': 415.782,
 'eval_steps_per_second': 26.059,
 'epoch': 12.0}