In [2]:
! export CUDA_VISIBLE_DEVICES=5

import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="5, 6"

In [6]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import pickle

import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from pytorch_pretrained_bert.optimization import BertAdam
from tqdm.notebook import tqdm as tqdm
from tqdm import trange, tqdm
from torch.utils.data import TensorDataset
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import logging
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

# Load the data

In [2]:
train = pd.read_csv("/data/datasets/mimic/processed/3days/train.csv")
test = pd.read_csv("/data/datasets/mimic/processed/3days/test.csv")
val = pd.read_csv("/data/datasets/mimic/processed/3days/val.csv")
val['Label'] = val['Label'].astype(int)
train['Label'] = train['Label'].astype(int)
test['Label'] = test['Label'].astype(int)

# Load the tokenizer and the model

In [8]:
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "_name_or_path": "emilyalsentzer/Bio_ClinicalBERT",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.32.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

# Tokenize and preprocess the inputs

In [5]:
X_train, X_val, y_train, y_val = list(train['TEXT']), list(val['TEXT']), list(train['Label']), list(val['Label'])
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [9]:
X = list(train['TEXT'])
y = list(train['Label'])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [6]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [7]:
len(train_dataset), len(val_dataset)

(49800, 6026)

# Fine Tuning

In [8]:
from transformers import EarlyStoppingCallback
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

args = TrainingArguments(
    output_dir="train_checkpoint",
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=0,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Train pre-trained model
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.7024,0.682607,0.576668,0.580912,0.653735,0.615176
1000,0.6843,0.666792,0.599734,0.604679,0.654697,0.628695
1500,0.69,0.696346,0.482244,0.0,0.0,0.0
2000,0.7027,0.699822,0.48241,0.0,0.0,0.0
2500,0.7045,0.699898,0.48241,0.0,0.0,0.0


TrainOutput(global_step=2500, training_loss=0.6967821044921875, metrics={'train_runtime': 838.2203, 'train_samples_per_second': 178.235, 'train_steps_per_second': 11.141, 'total_flos': 1.05244422144e+16, 'train_loss': 0.6967821044921875, 'epoch': 0.8})

In [10]:
from transformers import EarlyStoppingCallback
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

args = TrainingArguments(
    output_dir="train_lr_checkpoints",
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-05,
    num_train_epochs=3,
    seed=42,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Train pre-trained model
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.6512,0.696117,0.560405,0.546042,0.893556,0.677855
1000,0.6452,0.690446,0.571026,0.595836,0.532222,0.562235
1500,0.6289,0.689817,0.567707,0.583768,0.574223,0.578956
2000,0.6324,0.735378,0.580153,0.594786,0.592498,0.59364
2500,0.6334,0.727101,0.569698,0.577949,0.6252,0.600647
3000,0.5993,0.764785,0.576502,0.563636,0.805066,0.663058


TrainOutput(global_step=3000, training_loss=0.6317243957519532, metrics={'train_runtime': 994.8851, 'train_samples_per_second': 150.168, 'train_steps_per_second': 9.387, 'total_flos': 1.262933065728e+16, 'train_loss': 0.6317243957519532, 'epoch': 0.96})

# Predictions

In [7]:
! export CUDA_VISIBLE_DEVICES=7

In [8]:
from transformers import BertModel, BertConfig

# Load model configuration
config = BertConfig.from_pretrained("train_lr_checkpoints/checkpoint-1000/config.json", output_hidden_states=True)

# Load the model
model = BertForSequenceClassification.from_pretrained("train_lr_checkpoints/checkpoint-1000/pytorch_model.bin", config=config)

In [14]:
inputs = tokenizer(list(test['TEXT'][:1000].values), 
                             padding=True,
                             truncation=True,
                             return_tensors="pt",
                             max_length=512)

with torch.no_grad():
    batch_outputs = model(**inputs)

In [15]:
logits = batch_outputs.logits
predictions = logits.argmax(dim=1)
print("accuracy: ", accuracy_score(predictions, test['Label'][:1000]))

accuracy:  0.658


---
# CLS Tokens
---

In [16]:
num_hidden_states = config.hidden_size
num_hidden_states

768

In [17]:
last_hidden_states = batch_outputs.hidden_states[0]
features = last_hidden_states[:,0,:].detach().numpy()
features.shape

(1000, 768)

In [18]:
labels = test['Label'][:1000]
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [19]:
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 0.0001}
best scrores:  0.6186666666666667


In [20]:
lr_clf = LogisticRegression(C=0.0001)
lr_clf.fit(train_features, train_labels)

In [21]:
lr_clf.score(test_features, test_labels)

0.628