In [None]:
# Mount Google Drive at /content/drive; the dataset files and the model
# checkpoint used by later cells are read from / written to paths under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
import torch.nn.functional as F
from tqdm import tqdm
import sklearn.utils
import json
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from psutil import virtual_memory

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Total system memory as reported by psutil, converted from bytes to
# decimal gigabytes (1e9 bytes).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Below 20 GB the runtime is reported as a standard (non high-RAM) one.
print(
    'Not using a high-RAM runtime'
    if ram_gb < 20
    else 'You are using a high-RAM runtime!'
)

In [None]:
def load_pubmed_sentences(file_path):
    """Read a tab-separated ``LABEL<TAB>sentence`` file into two parallel lists.

    Every line is whitespace-stripped first. Lines that are empty or contain
    no tab after stripping are skipped silently (presumably this covers the
    per-abstract header lines of the dataset -- confirm against the files).
    Only the first tab splits the line, so any further tabs stay inside the
    sentence text.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A ``(sentences, labels)`` tuple of equally long lists of strings.
    """
    sentences, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            tag, separator, text = raw_line.strip().partition('\t')
            # An empty separator means the stripped line had no tab at all
            # (this also covers blank lines), so there is no label/sentence pair.
            if not separator:
                continue
            labels.append(tag)
            sentences.append(text)
    return sentences, labels


In [None]:
# Read the three dataset splits (train / dev / test) from the mounted Drive folder.
train_sents, train_labs = load_pubmed_sentences(
    '/content/drive/MyDrive/PubMed/pubmed-rct-master/PubMed_20k_RCT/train.txt'
)
dev_sents, dev_labs = load_pubmed_sentences(
    '/content/drive/MyDrive/PubMed/pubmed-rct-master/PubMed_20k_RCT/dev.txt'
)
test_sents, test_labs = load_pubmed_sentences(
    '/content/drive/MyDrive/PubMed/pubmed-rct-master/PubMed_20k_RCT/test.txt'
)

# The string -> integer label mapping is learned from the training labels
# only and then reused unchanged for the dev and test labels.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labs)
dev_labels_encoded, test_labels_encoded = (
    label_encoder.transform(split_labels) for split_labels in (dev_labs, test_labs)
)

# One frame per split with a 'text' column and an integer 'label' column.
df_train = pd.DataFrame(dict(text=train_sents, label=train_labels_encoded))
df_dev = pd.DataFrame(dict(text=dev_sents, label=dev_labels_encoded))
df_test = pd.DataFrame(dict(text=test_sents, label=test_labels_encoded))

In [None]:
# Show the class names known to the encoder and the row count of each split.
print("Classes:", label_encoder.classes_)
for caption, frame in (
    ("Train size:", df_train),
    ("Validation size:", df_dev),
    ("Test size:", df_test),
):
    print(caption, len(frame))


In [None]:
# Distinct encoded labels in the training split and how often each occurs,
# obtained from a single np.unique pass.
lbls = df_train['label'].values
labels, labelCounts = np.unique(lbls, return_counts=True)

print("Labels ", labels)
print("Label counts ", labelCounts)

# Pie chart of the training class distribution, one Paired-colormap colour per class.
plt.figure(figsize=(10, 6))
plt.title("Class Counts (Training Set)")
slice_colors = plt.cm.Paired(range(len(labels)))
plt.pie(
    labelCounts,
    labels=labels,
    autopct='%1.1f%%',
    startangle=140,
    colors=slice_colors,
)
plt.axis('equal')
plt.show()

# Same numbers as the chart, spelled out one class per line.
for label, count in zip(labels, labelCounts):
    print("The label", label, "has", count, "records in the training dataset.")

In [None]:
# Load the tokenizer published under the BioBERT v1.2 checkpoint name; the
# classifier created later in this notebook is loaded from the same name.
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.2")

In [None]:
def encode_dataset(df, tokenizer, max_length=128):
    """Tokenize a text/label frame into a ``TensorDataset``.

    All rows of ``df['text']`` are tokenized in one call, truncated and padded
    to exactly ``max_length`` tokens, and returned as PyTorch tensors.

    Args:
        df: DataFrame with a ``'text'`` column (strings) and a ``'label'``
            column (integer class ids).
        tokenizer: Callable accepting a list of strings plus the keyword
            arguments ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors``, and returning a mapping with ``'input_ids'``
            and ``'attention_mask'`` entries.
        max_length: Fixed sequence length every example is padded/truncated to.

    Returns:
        ``TensorDataset(input_ids, attention_mask, labels)`` where ``labels``
        is always an int64 (``torch.long``) tensor.
    """
    inputs = tokenizer(
        df['text'].tolist(),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    # Force int64 explicitly: the label column may hold a narrower integer
    # dtype (e.g. int32), and classification losses expect long targets.
    labels = torch.tensor(df['label'].to_numpy(), dtype=torch.long)
    return TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)

In [None]:
# Training split: tokenized once, then served in a random order each epoch.
train_dataset = encode_dataset(df_train, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    sampler=RandomSampler(train_dataset),
)

# Validation split: fixed order.
val_dataset = encode_dataset(df_dev, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    sampler=SequentialSampler(val_dataset),
)

# Test split: fixed order, so predictions line up with the rows of df_test.
test_dataset = encode_dataset(df_test, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    sampler=SequentialSampler(test_dataset),
)


In [None]:
def _micro_f1_on_loader(model, data_loader, device):
    """Return the micro-averaged F1 of the model's argmax predictions on a loader.

    Puts ``model`` in eval mode, disables gradients, and expects every batch
    to unpack into ``(input_ids, attention_mask, labels)``.
    """
    model.eval()
    preds, true = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids, input_mask, labels = (t.to(device) for t in batch)
            logits = model(input_ids=input_ids, attention_mask=input_mask).logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true.extend(labels.cpu().numpy())
    return f1_score(true, preds, average='micro')


def train_model(model, train_loader, val_loader, save_path, epochs=20,
                learning_rates=(2e-5, 5e-6, 1e-6, 2e-6), patience=7):
    """Fine-tune ``model`` in consecutive learning-rate stages with early stopping.

    For each value in ``learning_rates`` a fresh AdamW optimizer and a linear
    warmup/decay schedule (10% of the stage's steps as warmup) are created,
    and the *same* model instance continues training for up to ``epochs``
    epochs. After every epoch the micro F1 on ``val_loader`` is computed;
    whenever it beats the best score seen so far (shared across all stages)
    the model's state dict is written to ``save_path``. A stage ends early
    after ``patience`` consecutive epochs without improvement.

    NOTE(review): the model is neither re-initialised nor reset to the best
    checkpoint between stages, so this is staged training rather than a
    learning-rate search -- confirm that is the intent.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``
            and optionally ``labels`` and returns an object exposing ``.loss``
            and ``.logits``.
        train_loader: Iterable with ``len()`` yielding
            ``(input_ids, attention_mask, labels)`` batches.
        val_loader: Iterable of batches in the same layout, used for model
            selection.
        save_path: Destination passed to ``torch.save`` for the best state dict.
        epochs: Maximum number of epochs per learning-rate stage.
        learning_rates: Learning rates to run, one stage each, in order.
        patience: Epochs without validation improvement tolerated per stage.

    Returns:
        The best validation micro F1 observed, or ``float('-inf')`` if no
        epoch was run.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Create the checkpoint directory up front so a missing folder cannot
    # make torch.save fail only after a full epoch of training.
    if isinstance(save_path, (str, os.PathLike)):
        checkpoint_dir = os.path.dirname(os.fspath(save_path))
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)

    # Start below any reachable score so the first evaluated epoch always
    # writes a checkpoint, even if its F1 is exactly 0.0.
    best_val_f1 = float('-inf')

    for lr in learning_rates:
        print(f"\nTraining with learning rate: {lr}")
        optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
        total_steps = len(train_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(0.1 * total_steps),
            num_training_steps=total_steps
        )

        # Patience is counted per stage; the best score is not reset.
        patience_counter = 0

        for epoch in range(epochs):
            model.train()
            total_loss = 0

            for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch"):
                input_ids, input_mask, labels = (t.to(device) for t in batch)

                optimizer.zero_grad()
                outputs = model(input_ids=input_ids, attention_mask=input_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                total_loss += loss.item()
                # Clip the global gradient norm to 1.0 before the update.
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                scheduler.step()

            avg_train_loss = total_loss / len(train_loader)
            print(f"Epoch {epoch + 1} Train Loss: {avg_train_loss:.4f}")

            # ----- Validation -----
            f1 = _micro_f1_on_loader(model, val_loader, device)
            print(f"Epoch {epoch + 1}, Validation Micro F1: {f1:.4f}")

            if f1 > best_val_f1:
                best_val_f1 = f1
                patience_counter = 0
                torch.save(model.state_dict(), save_path)
                print("Best model saved.")
            else:
                patience_counter += 1
                print(f"No improvement. Patience: {patience_counter}/{patience}")
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break

    return best_val_f1


In [None]:
# One output unit per class known to the label encoder.
num_classes = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.2", num_labels=num_classes
)

# Fine-tune on the training loader; the validation loader drives
# checkpointing and early stopping inside train_model.
train_model(
    model,
    train_loader,
    val_loader,
    save_path="/content/drive/MyDrive/PubMed/best_biobert_pubmed.pt",
    epochs=20,
    learning_rates=[2e-5, 5e-6, 1e-6, 2e-6],
    patience=7,
)


In [None]:
# Restore the best checkpoint written during training.
# map_location="cpu" lets the file be read on a runtime without a GPU even if
# it was saved from one; load_state_dict then copies the values into the
# model's existing parameters.
best_state = torch.load(
    "/content/drive/MyDrive/PubMed/best_biobert_pubmed.pt",
    map_location="cpu",
)
model.load_state_dict(best_state)
# The trailing ';' stops the notebook from echoing the full module repr.
model.eval();


In [None]:
# Score the model on the test loader using the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

test_preds, test_true = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating on Test Set"):
        input_ids, input_mask, labels = (t.to(device) for t in batch)

        # The predicted class is the index of the largest logit per example.
        logits = model(input_ids=input_ids, attention_mask=input_mask).logits
        test_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        test_true.extend(labels.cpu().numpy())

# Map integer class ids back to the original string labels for the report.
test_preds_labels = label_encoder.inverse_transform(test_preds)
test_true_labels = label_encoder.inverse_transform(test_true)

print("\nClassification Report on Test Set:")
print(classification_report(test_true_labels, test_preds_labels))