In [1]:

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)


Fri Sep  5 13:08:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   29C    P0             45W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [3]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup, get_scheduler
from torch.optim import AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
import torch.nn.functional as F
from tqdm import trange, tqdm
import matplotlib.pyplot as plt
import sklearn.utils
import time
import re
from google.colab import runtime

In [5]:
MODEL_MAP = {
    "bert": "bert-base-uncased",
    "scibert": "allenai/scibert_scivocab_uncased",
    "biobert": "dmis-lab/biobert-base-cased-v1.2",
    "bluebert": "bionlp/bluebert_pubmed_uncased_L-24_H-1024_A-16"
}
DATASET_MAP = {
    "46985": "drive/MyDrive/WoS/data/WoSDataset_46985.tsv",
    "11967": "drive/MyDrive/WoS/data/WoSDataset_11967.tsv",
    "5736":  "drive/MyDrive/WoS/data/WoSDataset_5736.tsv"
}

In [6]:
selected_model_key = "scibert"      # select: "bert", "scibert", "biobert", "bluebert"
selected_dataset_key = "5736"   # select: "46985", "11967", "5736"

model_name = MODEL_MAP[selected_model_key]
dataset_path = DATASET_MAP[selected_dataset_key]

print(f"Using model: {model_name}")
print(f"Using dataset: {dataset_path}")

Using model: allenai/scibert_scivocab_uncased
Using dataset: drive/MyDrive/WoS/data/WoSDataset_5736.tsv


In [8]:
df = pd.read_csv(dataset_path, sep="\t")
num_labels = df["Y"].nunique()

print(df.head())

   Y        domain                                           keywords  \
0  2  biochemistry   candidatus phytoplasma    16s rrna biosecurit...   
1  2  biochemistry   alphabisabolol alphabisabolol synthase mevalo...   
2  2  biochemistry   rna virus cirv yeast programmed cell death ac...   
3  2  biochemistry   1 2dicloropropane dichloromethane gpt delta r...   
4  0           ECE   130 nm cmos pa programmable pa 24 ghz pa           

                                                   X  
0  phytoplasmas are insectvectored bacteria that ...  
1  background alphabisabolol also known as levome...  
2  a universal feature of the replication of posi...  
3  12dichloropropane 12dcp and dichloromethane dc...  
4  this paper presents the simulation results of ...  


In [9]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
_, val_data = train_test_split(test_data, test_size=0.2, random_state=42)

print(np.shape(train_data))
print(np.shape(test_data))
print(np.shape(val_data))


(4588, 4)
(1148, 4)
(230, 4)


In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

In [11]:
def encode_data(data, tokenizer, max_length=128):
    inputs = tokenizer(
        data["X"].tolist(), # abstracts
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    labels = torch.tensor(data["Y"].astype(int).tolist())
    return TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)

In [12]:
train_dataset = encode_data(train_data, tokenizer)
val_dataset = encode_data(val_data, tokenizer)
test_dataset = encode_data(test_data, tokenizer)

In [13]:
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=32)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=32)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=32)

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model.to(device)

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [18]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
num_training_steps = 20 * len(train_dataloader)
num_warmup_steps = 1e-4
lr_scheduler = get_scheduler("linear", optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

In [19]:
def calculate_metrics(valType, predictions, true_labels):
    if valType == 'test':
        cm = confusion_matrix(true_labels, predictions)
        print("Confusion Matrix:\n", cm)
        report = classification_report(true_labels, predictions, target_names=[str(i) for i in range(len(np.unique(true_labels)))])
        print("\nClassification Report:\n", report)

    f1_micro = f1_score(true_labels, predictions, average='micro')
    return f1_micro

In [21]:
epochs = 20
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    start_time = time.time()

    for step, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, labels = batch

        model.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=input_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        lr_scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

     # Validation
    model.eval()
    predictions, true_labels = [], []
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, labels = batch
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=input_mask)
        logits = outputs.logits
        batch_predictions = np.argmax(logits.detach().cpu().numpy(), axis=1)
        predictions.extend(batch_predictions)
        true_labels.extend(labels.cpu().numpy())

    val_micro_f1 = calculate_metrics('val', predictions, true_labels)
    elapsed_time = time.time() - start_time

    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"  Train Loss: {avg_train_loss}")
    print(f"  Val Micro F1: {val_micro_f1}")
    print(f"  Time: {elapsed_time // 60:.0f}m {elapsed_time % 60:.0f}s")

Epoch 1/20: 100%|██████████| 144/144 [00:21<00:00,  6.68batch/s]


Epoch 1/20
  Train Loss: 0.2656355656153109
  Val Micro F1: 0.9434782608695652
  Time: 0m 22s


Epoch 2/20: 100%|██████████| 144/144 [00:21<00:00,  6.71batch/s]


Epoch 2/20
  Train Loss: 0.10108996551005273
  Val Micro F1: 0.9521739130434783
  Time: 0m 22s


Epoch 3/20: 100%|██████████| 144/144 [00:21<00:00,  6.71batch/s]


Epoch 3/20
  Train Loss: 0.0430173337663291
  Val Micro F1: 0.9608695652173913
  Time: 0m 22s


Epoch 4/20: 100%|██████████| 144/144 [00:21<00:00,  6.71batch/s]


Epoch 4/20
  Train Loss: 0.020898166461039282
  Val Micro F1: 0.9695652173913043
  Time: 0m 22s


Epoch 5/20: 100%|██████████| 144/144 [00:21<00:00,  6.71batch/s]


Epoch 5/20
  Train Loss: 0.011006308822187647
  Val Micro F1: 0.9739130434782609
  Time: 0m 22s


Epoch 6/20: 100%|██████████| 144/144 [00:21<00:00,  6.70batch/s]


Epoch 6/20
  Train Loss: 0.004205113243086898
  Val Micro F1: 0.9695652173913043
  Time: 0m 22s


Epoch 7/20: 100%|██████████| 144/144 [00:21<00:00,  6.70batch/s]


Epoch 7/20
  Train Loss: 0.002666577974398226
  Val Micro F1: 0.9652173913043478
  Time: 0m 22s


Epoch 8/20: 100%|██████████| 144/144 [00:21<00:00,  6.71batch/s]


Epoch 8/20
  Train Loss: 0.0019156294518122901
  Val Micro F1: 0.9695652173913043
  Time: 0m 22s


Epoch 9/20: 100%|██████████| 144/144 [00:21<00:00,  6.69batch/s]


Epoch 9/20
  Train Loss: 0.00044934657802918484
  Val Micro F1: 0.9739130434782609
  Time: 0m 22s


Epoch 10/20: 100%|██████████| 144/144 [00:21<00:00,  6.69batch/s]


Epoch 10/20
  Train Loss: 0.00013442305847396105
  Val Micro F1: 0.9739130434782609
  Time: 0m 22s


Epoch 11/20: 100%|██████████| 144/144 [00:21<00:00,  6.69batch/s]


Epoch 11/20
  Train Loss: 0.00011794041190821897
  Val Micro F1: 0.9739130434782609
  Time: 0m 22s


Epoch 12/20: 100%|██████████| 144/144 [00:21<00:00,  6.69batch/s]


Epoch 12/20
  Train Loss: 0.00010395210807069411
  Val Micro F1: 0.9739130434782609
  Time: 0m 22s


Epoch 13/20: 100%|██████████| 144/144 [00:21<00:00,  6.69batch/s]


Epoch 13/20
  Train Loss: 9.518056539390172e-05
  Val Micro F1: 0.9739130434782609
  Time: 0m 22s


Epoch 14/20: 100%|██████████| 144/144 [00:21<00:00,  6.69batch/s]


Epoch 14/20
  Train Loss: 8.739819193376914e-05
  Val Micro F1: 0.9695652173913043
  Time: 0m 22s


Epoch 15/20: 100%|██████████| 144/144 [00:21<00:00,  6.69batch/s]


Epoch 15/20
  Train Loss: 8.022503612260657e-05
  Val Micro F1: 0.9695652173913043
  Time: 0m 22s


Epoch 16/20: 100%|██████████| 144/144 [00:21<00:00,  6.69batch/s]


Epoch 16/20
  Train Loss: 7.642302959235774e-05
  Val Micro F1: 0.9695652173913043
  Time: 0m 22s


Epoch 17/20: 100%|██████████| 144/144 [00:21<00:00,  6.69batch/s]


Epoch 17/20
  Train Loss: 7.213177402971066e-05
  Val Micro F1: 0.9695652173913043
  Time: 0m 22s


Epoch 18/20: 100%|██████████| 144/144 [00:21<00:00,  6.69batch/s]


Epoch 18/20
  Train Loss: 6.91505047143437e-05
  Val Micro F1: 0.9695652173913043
  Time: 0m 22s


Epoch 19/20: 100%|██████████| 144/144 [00:21<00:00,  6.69batch/s]


Epoch 19/20
  Train Loss: 6.794425697383606e-05
  Val Micro F1: 0.9695652173913043
  Time: 0m 22s


Epoch 20/20: 100%|██████████| 144/144 [00:21<00:00,  6.70batch/s]


Epoch 20/20
  Train Loss: 6.646136908885738e-05
  Val Micro F1: 0.9695652173913043
  Time: 0m 22s


In [22]:
model.eval()
predictions, true_labels = [], []
for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, labels = batch
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=input_mask)
    logits = outputs.logits
    batch_predictions = np.argmax(logits.detach().cpu().numpy(), axis=1)
    predictions.extend(batch_predictions)
    true_labels.extend(labels.cpu().numpy())

calculate_metrics('test', predictions, true_labels)

save_path = f"/content/drive/MyDrive/WoS/{selected_model_key.upper()}_WoS_Abstracts_{selected_dataset_key}"
model.save_pretrained(save_path)

runtime.unassign()

Confusion Matrix:
 [[231   7   3]
 [  5 324  14]
 [  0   4 560]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97       241
           1       0.97      0.94      0.96       343
           2       0.97      0.99      0.98       564

    accuracy                           0.97      1148
   macro avg       0.97      0.97      0.97      1148
weighted avg       0.97      0.97      0.97      1148



0.9712543554006968