In [None]:

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)


In [None]:
from psutil import virtual_memory

# Total memory of the runtime, expressed in decimal gigabytes (bytes / 1e9).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to tell standard and high-RAM runtimes apart.
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

In [None]:
# Mount Google Drive at /content/drive. The dataset files are read from, and the
# fine-tuned model is later saved to, the MyDrive/WoS folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup, get_scheduler
from torch.optim import AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
import torch.nn.functional as F
from tqdm import trange, tqdm
import matplotlib.pyplot as plt
import sklearn.utils
import time
import re
from google.colab import runtime

In [None]:
# Short experiment key -> Hugging Face checkpoint identifier.
MODEL_MAP = {
    "bert": "bert-base-uncased",
    "scibert": "allenai/scibert_scivocab_uncased",
    "biobert": "dmis-lab/biobert-base-cased-v1.2",
    "bluebert": "bionlp/bluebert_pubmed_uncased_L-24_H-1024_A-16"
}

# Directory holding the WoS TSV files. It is anchored at the absolute Drive mount
# point (/content/drive), like the model save path, so that loading does not
# depend on the notebook's current working directory.
DRIVE_DATA_DIR = "/content/drive/MyDrive/WoS/data"

# Dataset key (number of documents in the WoS variant) -> TSV file path.
DATASET_MAP = {
    size: f"{DRIVE_DATA_DIR}/WoSDataset_{size}.tsv"
    for size in ("46985", "11967", "5736")
}

In [None]:
# Experiment selection: edit these two keys to switch checkpoint / dataset.
selected_model_key = "scibert"  # one of: "bert", "scibert", "biobert", "bluebert"
selected_dataset_key = "5736"  # one of: "46985", "11967", "5736"

# Resolve the keys to a checkpoint name and a TSV path (KeyError on an unknown key).
model_name, dataset_path = MODEL_MAP[selected_model_key], DATASET_MAP[selected_dataset_key]

print(f"Using model: {model_name}", f"Using dataset: {dataset_path}", sep="\n")

In [None]:
# Load the tab-separated dataset; column "Y" holds the class labels.
df = pd.read_csv(dataset_path, delimiter="\t")

# Number of distinct (non-missing) labels; used to size the classification head.
num_labels = len(df["Y"].dropna().unique())

print(df.head())

In [None]:
# 80/20 split of the full dataset into a training set and a held-out set.
train_data, holdout_data = train_test_split(df, test_size=0.2, random_state=42)

# Split the held-out set into DISJOINT test (80%) and validation (20%) parts.
# Previously the validation rows were drawn from the test set while the test set
# was kept whole, so every validation example was also a test example. With the
# same random_state the validation rows are unchanged; the test set is now the
# remainder of the held-out data.
test_data, val_data = train_test_split(holdout_data, test_size=0.2, random_state=42)

# Every row must land in exactly one of the three splits.
assert len(train_data) + len(test_data) + len(val_data) == len(df)

print(np.shape(train_data))
print(np.shape(test_data))
print(np.shape(val_data))


In [None]:
# Load the tokenizer that belongs to the selected checkpoint (`model_name`).
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def encode_data(data, tokenizer, max_length=128):
    """Tokenize the texts of a split and bundle them with their labels.

    Args:
        data: Table with a text column "X" (the abstracts) and a label column
            "Y" whose values can be cast to int.
        tokenizer: Callable invoked as ``tokenizer(texts, **options)`` that
            returns a mapping with 'input_ids' and 'attention_mask' tensors.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A TensorDataset yielding (input_ids, attention_mask, label) triples.
    """
    tokenizer_options = dict(
        max_length=max_length,
        padding='max_length',  # pad every sequence to exactly max_length
        truncation=True,       # cut sequences longer than max_length
        return_tensors="pt",
    )
    encoded = tokenizer(data["X"].tolist(), **tokenizer_options)
    targets = torch.tensor(data["Y"].astype(int).tolist())
    return TensorDataset(encoded['input_ids'], encoded['attention_mask'], targets)

In [None]:
# Tokenize each split once up front, in the order train -> val -> test.
train_dataset, val_dataset, test_dataset = (
    encode_data(split, tokenizer) for split in (train_data, val_data, test_data)
)

In [None]:
# Batches of 32 for every split. Training batches are drawn in random order;
# validation and test batches keep the dataset order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    sampler=RandomSampler(train_dataset),
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=32,
    sampler=SequentialSampler(val_dataset),
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=32,
    sampler=SequentialSampler(test_dataset),
)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Selected checkpoint with a classification head sized to the dataset's label count.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)
model.to(device)

In [None]:
# AdamW optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The scheduler is stepped once per training batch, so the linear decay must span
# (number of epochs) x (batches per epoch). Keep `epochs` equal to the value used
# by the training loop, otherwise the learning rate reaches zero too early or
# never finishes decaying.
epochs = 20
num_training_steps = epochs * len(train_dataloader)

# `num_warmup_steps` is a count of optimizer steps, i.e. a non-negative integer.
# The previous value (1e-4) was a fractional float: with the linear schedule it
# gave no real warm-up and only zeroed the learning rate of the very first update.
# 0 states the "no warm-up" intent explicitly; use e.g.
# int(0.1 * num_training_steps) if a warm-up phase is wanted.
num_warmup_steps = 0

lr_scheduler = get_scheduler(
    "linear",
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
def calculate_metrics(valType, predictions, true_labels):
    """Compute the micro-averaged F1 score, with a detailed report for the test set.

    Args:
        valType: Evaluation phase. When equal to 'test', the confusion matrix and
            the per-class classification report are printed; any other value
            (e.g. 'val') skips the printing.
        predictions: Sequence of predicted class ids.
        true_labels: Sequence of ground-truth class ids, same length as
            `predictions`.

    Returns:
        The micro-averaged F1 score of `predictions` against `true_labels`.
    """
    if valType == 'test':
        # Use the class ids that actually occur in either sequence. Building the
        # names as 0..k-1 from the true labels alone mislabels the rows when ids
        # are not contiguous, and breaks when the model predicts a class that is
        # absent from `true_labels` (name count no longer matches class count).
        labels = np.unique(
            np.concatenate([np.asarray(true_labels).ravel(), np.asarray(predictions).ravel()])
        )
        cm = confusion_matrix(true_labels, predictions, labels=labels)
        print("Confusion Matrix:\n", cm)
        report = classification_report(
            true_labels,
            predictions,
            labels=labels,
            target_names=[str(label) for label in labels],
        )
        print("\nClassification Report:\n", report)

    f1_micro = f1_score(true_labels, predictions, average='micro')
    return f1_micro

In [None]:
# Fine-tuning loop: each epoch makes one pass over the training batches and then
# evaluates on the validation batches, reporting average training loss, validation
# micro-F1 and wall-clock time.
# NOTE: the scheduler's num_training_steps was computed for 20 epochs; keep
# `epochs` in sync with it.
epochs = 20
for epoch in range(epochs):
    # Put the model in training mode for the optimisation pass.
    model.train()
    total_train_loss = 0
    start_time = time.time()

    for step, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch")):
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, labels = batch

        # Clear gradients from the previous step before the new backward pass.
        model.zero_grad()
        # Labels are passed in so the loss can be read from the model output.
        outputs = model(input_ids=input_ids, attention_mask=input_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch, not once per epoch.
        lr_scheduler.step()

    # Mean loss per batch over this epoch.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Validation: switch to evaluation mode and collect predictions and labels.
    model.eval()
    predictions, true_labels = [], []
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, labels = batch
        # Forward pass only; no gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=input_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        batch_predictions = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(batch_predictions)
        true_labels.extend(labels.cpu().numpy())

    # With 'val' the helper prints nothing and only returns the micro-F1.
    val_micro_f1 = calculate_metrics('val', predictions, true_labels)
    # Elapsed time covers both the training pass and the validation pass.
    elapsed_time = time.time() - start_time

    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"  Train Loss: {avg_train_loss}")
    print(f"  Val Micro F1: {val_micro_f1}")
    print(f"  Time: {elapsed_time // 60:.0f}m {elapsed_time % 60:.0f}s")

In [None]:
# Final evaluation on the test batches: collect predictions and labels in
# evaluation mode, without tracking gradients.
model.eval()
predictions, true_labels = [], []
for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, labels = batch
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=input_mask)
    logits = outputs.logits
    # Predicted class = index of the largest logit in each row.
    batch_predictions = np.argmax(logits.detach().cpu().numpy(), axis=1)
    predictions.extend(batch_predictions)
    true_labels.extend(labels.cpu().numpy())

# With 'test' the helper prints the confusion matrix and the classification report
# and returns the micro-F1. The return value used to be discarded (it is not the
# cell's last expression, so the notebook never displayed it); report it explicitly.
test_micro_f1 = calculate_metrics('test', predictions, true_labels)
print(f"Test Micro F1: {test_micro_f1}")

# Persist the fine-tuned weights and config to Google Drive.
save_path = f"/content/drive/MyDrive/WoS/{selected_model_key.upper()}_WoS_Abstracts_{selected_dataset_key}"
model.save_pretrained(save_path)

# Flush outstanding Drive writes before the VM is released. Releasing the runtime
# immediately after saving risks losing files that have not been synced yet.
# `drive` is the module imported by the Drive-mount cell.
drive.flush_and_unmount()

# Release the Colab runtime so it stops consuming compute once the run is finished.
runtime.unassign()