### Load and combine datasets

In [16]:
import pandas as pd
import numpy as np


In [50]:
df_161 = pd.read_csv('/content/Colorectal_161_Summarized.csv', usecols=['summary', 'target_label'])
df_201 = pd.read_csv('/content/Colorectal_201_Summarized.csv', usecols=['summary', 'target_label'])
df_309 = pd.read_csv('/content/Colorectal_309_Summarized.csv', usecols=['summary', 'target_label'])
df_213 = pd.read_csv('/content/Esophagael_213_Summarized.csv', usecols=['summary', 'target_label'])
df_218 = pd.read_csv('/content/Lung_218_Summarized.csv', usecols=['summary', 'target_label'])
df_261 = pd.read_csv('/content/Lung_261_Summarized.csv', usecols=['summary', 'target_label'])

# Combine the dataframes into a single dataframe
datasets = [df_161, df_201, df_309, df_213, df_218, df_261]
#datasets = [df_161, df_201, df_309, df_213, df_261]
combined_df = pd.concat(datasets).reset_index(drop=True)

# Shuffle the combined dataframe to ensure data is mixed well
combined_df = combined_df.sample(frac=1).reset_index(drop=True)

combined_df.head()

Unnamed: 0,target_label,summary
0,0,The patient is a female who underwent laparosc...
1,0,The patient is a 62-year-old male of a race ca...
2,0,The patient is a 57-year-old white male with a...
3,0,The patient is a 62-year-old white female with...
4,1,The patient is a female who underwent conventi...


### Analyze and split the data

In [51]:
from sklearn.model_selection import train_test_split

# Count the occurrences of each class in the 'target_label' column
class_counts = combined_df['target_label'].value_counts()
print(class_counts)

# Split data into train and validation/test
train_df, temp_df = train_test_split(combined_df, test_size=0.3, random_state=42)

# Split the remaining data into validation and test sets
validation_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(len(train_df), len(validation_df), len(test_df))


target_label
0    3725
1    2277
Name: count, dtype: int64
4201 900 901


### Clinical BERT model define

In [52]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

class ClinicalBERTClassifier(nn.Module):
    def __init__(self, bert_model='emilyalsentzer/Bio_ClinicalBERT', num_labels=2, device='cuda'):
        super(ClinicalBERTClassifier, self).__init__()
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        self.bert = AutoModel.from_pretrained(bert_model)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)  # Classifier layer
        self.to(self.device)

    def forward(self, input_ids, attention_mask):
        # Get the output from BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)  # Apply dropout
        logits = self.classifier(pooled_output)  # Pass through the classifier
        return logits


### Data tokenization

In [53]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')

def tokenize_data(df):
    # Tokenize text to convert to BERT's input format
    # max_lenght 200 instead of default 512
    return tokenizer(df['summary'].tolist(), padding='max_length', truncation=True, max_length=150, return_tensors="pt")




### Get the right max_length

In [54]:
# Tokenize without truncation and find the maximum length
token_lens = []
for txt in combined_df['summary']:
    tokens = tokenizer.encode(txt, add_special_tokens=True)
    token_lens.append(len(tokens))

print('Max length: ', max(token_lens))

Max length:  139


### DataLoaders

In [60]:
from torch.utils.data import DataLoader, TensorDataset

def create_data_loader(df, tokenizer, batch_size=64):
    # Tokenize the text data
    encodings = tokenize_data(df)
    labels = torch.tensor(df['target_label'].values)

    # Create a dataset from the encodings and labels
    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)
    # Create a DataLoader
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=2)

# Assuming train_df, validation_df, and test_df are already defined
train_loader = create_data_loader(train_df, tokenizer)
validation_loader = create_data_loader(validation_df, tokenizer)
test_loader = create_data_loader(test_df, tokenizer)


### Metrics

In [56]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calculate_metrics(preds, labels):
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='binary')
    recall = recall_score(labels, preds, average='binary')
    f1 = f1_score(labels, preds, average='binary')
    return accuracy, precision, recall, f1

### Train and Evaluate

In [57]:
def train(model, data_loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for i, batch in enumerate(data_loader):
        input_ids, attention_mask, label = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, label)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # if i % 30 == 0:
        #   print(f"Batch {i}, Current Train Loss: {loss.item()}")
    return total_loss / len(data_loader)  # Return the average loss

def evaluate(model, data_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    predictions, actual_labels = [], []

    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            input_ids, attention_mask, labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()
            preds = outputs.argmax(dim=1)  # predicted classes
            predictions.append(preds)
            actual_labels.append(labels)

    predictions = torch.cat(predictions).cpu().numpy()
    actual_labels = torch.cat(actual_labels).cpu().numpy()
    average_loss = total_loss / len(data_loader)
    accuracy, precision, recall, f1 = calculate_metrics(predictions, actual_labels)  # additional metrics
    return average_loss, accuracy, precision, recall, f1


### Initialization + Training + Validating the model

In [58]:
# Initialize model
model = ClinicalBERTClassifier()

In [62]:
from sklearn.utils.class_weight import compute_class_weight
import torch.optim as optim
from torch.optim import AdamW
import torch

num_epochs = 7
learning_rate = 2e-5
weight_decay = 3e-4

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(combined_df['target_label']), y=combined_df['target_label'].values)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(model.device)

# Use these weights in your loss function
#loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
loss_fn = nn.CrossEntropyLoss()

# Optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Use a scheduler to reduce learning rate when a metric has stopped improving
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

# Early stopping criteria
best_val_loss = float('inf')
patience = 2
trigger_times = 0

for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, loss_fn, model.device)
    val_loss, val_acc, val_prec, val_recall, val_f1 = evaluate(model, validation_loader, loss_fn, model.device)
    scheduler.step()

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

    print(f"Epoch {epoch + 1}:")
    print(f"  Train Loss: {train_loss}")
    print(f"  Validation Loss: {val_loss}")
    print(f"  Validation Accuracy: {val_acc}")
    print(f"  Validation Precision: {val_prec}")
    print(f"  Validation Recall: {val_recall}")
    print(f"  Validation F1: {val_f1}\n")


Epoch 1:
  Train Loss: 0.44519587990009424
  Validation Loss: 0.5031834701697032
  Validation Accuracy: 0.7944444444444444
  Validation Precision: 0.7865168539325843
  Validation Recall: 0.621301775147929
  Validation F1: 0.6942148760330578

Epoch 2:
  Train Loss: 0.43670621378855273
  Validation Loss: 0.4607324242591858
  Validation Accuracy: 0.7988888888888889
  Validation Precision: 0.7223796033994334
  Validation Recall: 0.7544378698224852
  Validation F1: 0.7380607814761215

Epoch 3:
  Train Loss: 0.42551373171083856
  Validation Loss: 0.4441918889681498
  Validation Accuracy: 0.7788888888888889
  Validation Precision: 0.7907949790794979
  Validation Recall: 0.5591715976331361
  Validation F1: 0.6551126516464472

Epoch 4:
  Train Loss: 0.41105386208404193
  Validation Loss: 0.4449521799882253
  Validation Accuracy: 0.8044444444444444
  Validation Precision: 0.7425149700598802
  Validation Recall: 0.7337278106508875
  Validation F1: 0.7380952380952381

Early stopping at epoch 5


### Save and load the model

In [63]:
path = '/content/drive/My Drive/Colab Notebooks/clinical_bert_classifier2.pth'

# Save the model state
torch.save(model.state_dict(), path)

In [32]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# To load the model state into a model's instance:
model = ClinicalBERTClassifier()  # Initialize the model again
model.load_state_dict(torch.load('clinical_bert_classifier.pth'))
model = model.to(model.device)  # Move model to appropriate device

### Test the model

In [64]:
def test(model, data_loader, loss_fn, device):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    predictions, labels = [], []

    with torch.no_grad():  # No gradient needed for evaluation
        for batch in data_loader:
            input_ids, attention_mask, label = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, label)
            total_loss += loss.item()

            preds = outputs.argmax(dim=1)
            predictions.extend(preds.cpu().numpy())
            labels.extend(label.cpu().numpy())

    average_loss = total_loss / len(data_loader)
    accuracy, precision, recall, f1 = calculate_metrics(predictions, labels)
    return average_loss, accuracy, precision, recall, f1

# Assuming you have a DataLoader for your test data
test_loss, test_acc, test_prec, test_recall, test_f1 = test(model, test_loader, loss_fn, model.device)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_acc}")
print(f"Test Precision: {test_prec}")
print(f"Test Recall: {test_recall}")
print(f"Test F1: {test_f1}")

Test Loss: 0.47705430785814923
Test Accuracy: 0.781354051054384
Test Precision: 0.6916666666666667
Test Recall: 0.7432835820895523
Test F1: 0.7165467625899281
