In [None]:
#Start

#### Code Summary

This 2nd approach where focuses on adapting the pretrained PubMedBERT model to the RSID domain by further pretraining on RSID sentences. It first installs the Transformers library and sets up dependencies. The RSID sentences and corresponding vocabulary tokens are loaded from JSON and pickle files. Next, a PubMedBERT tokenizer and pretrained model are initialized. The RSID tokens are added to the tokenizer vocabulary and the model embeddings are resized to fit the new tokens. The updated tokenizer and model are saved.

The RSID sentences are tokenized in batches using the BertTokenizerFast and padded/truncated to a max length of 512 tokens. This tokenized data is saved for later reuse. A 20% subset of the full data is taken to allow faster experimentation as proff of concept as this computationally very intense task. This subset is split 80/20 into train and evaluation sets. Pytorch DataLoaders are prepared for this subset data to enable batch training. This model is traned using high-end Google Colab A100 GPU with 40GB GPU and 128GB RAM
The model and AdamW optimizer are moved to GPU if available. Custom training and evaluation loops are defined to train the model on the RSID data using a masked LM objective.

The model is trained for 3 epochs with gradient accumulation for stability. After the last epoch it is evaluated on the evaluation set. The fine-tuned model and tokenizer are saved. The subset data, tokenizers, and models are copied to Google Drive for persistence. The adapting a pretrained model's tokenizer and weights to the new domain, preparing batched data, training with masked LM, and saving the adapted model. The goal is to specialize PubMedBERT to RSID which can then be used in downstream tasks.

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m86.9 MB/s[0m eta [36m0:00:00[0m
Co

In [None]:
import torch
import tqdm
import json
import pickle
import warnings
import logging
import random
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertModel
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
!cp "/content/gdrive/My Drive/Ver S/rsid_sentences.json" "./"
!cp "/content/gdrive/My Drive/Ver S/rsid_tokens.pkl" "./"

In [None]:
# Constants for paths
RSID_TOKENS_PATH = "./rsid_tokens.pkl"
RSID_SENTENCES_PATH = "./rsid_sentences.json"
SAVED_TOKENIZED_PATH = "./tokenized_rsid_sentences.pkl"

# Constants for paths
TOKENIZER_PATH = './updated_tokenizer/'
MODEL_PATH = './updated_model/'

In [None]:
# Load RSID tokens and update tokenizer and model
with open(RSID_TOKENS_PATH, 'rb') as f:
    rsid_tokens = pickle.load(f)

In [None]:
# Load RSID sentences
rsid_sentences_dict = {}

with open(RSID_SENTENCES_PATH, "r") as f:
    for line in f:
        line_data = json.loads(line)
        rsid_sentences_dict.update(line_data)

logger.info(f"Loaded {len(rsid_sentences_dict)} RSID sentences")

In [None]:
# Initialize tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")
model = BertModel.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Add RSID tokens to the tokenizer
tokenizer.add_tokens(rsid_tokens)
model.resize_token_embeddings(len(tokenizer))

Embedding(189044, 768)

In [None]:
# Save the updated tokenizer and model
tokenizer.save_pretrained(TOKENIZER_PATH)
model.save_pretrained(MODEL_PATH)

In [None]:
# Tokenize RSID sentences in batches
def tokenize_batch(batch_sentences):
    return tokenizer(batch_sentences, add_special_tokens=True, max_length=512, truncation=True, padding='max_length')

tokenized_rsid_sentences = {}
batch_size = 500
pmids = list(rsid_sentences_dict.keys())

for i in tqdm.tqdm(range(0, len(pmids), batch_size), desc="Tokenizing RSID sentences in batches"):
    batch_pmids = pmids[i:i+batch_size]
    batch_sentences = [sentence for pmid in batch_pmids for sentence in rsid_sentences_dict[pmid]]
    tokenized_batch = tokenize_batch(batch_sentences)

    start_idx = 0
    for pmid in batch_pmids:
        end_idx = start_idx + len(rsid_sentences_dict[pmid])
        tokenized_rsid_sentences[pmid] = {
            "input_ids": tokenized_batch["input_ids"][start_idx:end_idx],
            "attention_mask": tokenized_batch["attention_mask"][start_idx:end_idx]
        }
        start_idx = end_idx

Tokenizing RSID sentences in batches: 100%|██████████| 216/216 [00:24<00:00,  8.98it/s]


In [None]:
# Save the tokenized data
with open(SAVED_TOKENIZED_PATH, 'wb') as f:
    pickle.dump(tokenized_rsid_sentences, f)

logger.info(f"Tokenized data saved to {SAVED_TOKENIZED_PATH}")

In [None]:
#Train

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertForMaskedLM, AdamW, get_linear_schedule_with_warmup
import random
import tqdm

In [None]:
torch.cuda.empty_cache()
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb=10'

In [None]:
# Constants
BATCH_SIZE = 16
EPOCHS = 3
LR = 5e-5
MASK_TOKEN = tokenizer.mask_token_id
MASK_PROB = 0.15
GRADIENT_ACCUMULATION_STEPS = 4

In [None]:
# Load tokenized data
with open("./tokenized_rsid_sentences.pkl", 'rb') as f:
    tokenized_data = pickle.load(f)

In [None]:
# Use a 20% subset from the data for proof of concept
subset_data = {}
num_samples = int(0.2 * len(tokenized_data))
all_pmids = list(tokenized_data.keys())
selected_pmids = random.sample(all_pmids, num_samples)

for pmid in selected_pmids:
    subset_data[pmid] = tokenized_data[pmid]

# Split this subset data into training and evaluation sets (80% train, 20% eval)
train_pmids = random.sample(selected_pmids, int(0.8 * num_samples))
eval_pmids = list(set(selected_pmids) - set(train_pmids))

train_data = {pmid: subset_data[pmid] for pmid in train_pmids}
eval_data = {pmid: subset_data[pmid] for pmid in eval_pmids}

In [None]:
# Prepare dataset
class RSIDDataset(Dataset):
    def __init__(self, data_subset):
        self.data = []
        for pmid, values in data_subset.items():
            for input_ids, attention_mask in zip(values["input_ids"], values["attention_mask"]):
                self.data.append((input_ids, attention_mask))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [None]:
train_dataset = RSIDDataset(train_data)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

eval_dataset = RSIDDataset(eval_data)
eval_dataloader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Prepare model and optimizer
model = BertForMaskedLM.from_pretrained(MODEL_PATH)
model.to(device)
optimizer = AdamW(model.parameters(), lr=LR)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                 num_training_steps=(len(train_dataloader) // GRADIENT_ACCUMULATION_STEPS) * EPOCHS)

Some weights of BertForMaskedLM were not initialized from the model checkpoint at ./updated_model/ and are newly initialized: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def train(model, dataloader, optimizer, scheduler):
    model.train()
    total_loss = 0.0

    optimizer.zero_grad()  # Moved up for gradient accumulation

    for step, batch in enumerate(tqdm.tqdm(dataloader, desc="Training...")):
        input_ids, attention_mask = batch
        input_ids = torch.stack(input_ids).to(device)
        attention_mask = torch.stack(attention_mask).to(device)

        # Masking logic remains unchanged ...

        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS  # Normalize the loss
        total_loss += loss.item()
        loss.backward()

        # Gradient accumulation
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Moved inside loop
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [None]:
# Evaluation function
with torch.no_grad():
  def evaluate(model, dataloader):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc="Evaluating..."):
            input_ids, attention_mask = batch
            input_ids = torch.stack(input_ids).to(device)
            attention_mask = torch.stack(attention_mask).to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss
            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [None]:
!nvidia-smi

Sun Oct 22 05:21:51 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    49W / 400W |  16359MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Fine-tuning
for epoch in range(EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, scheduler)
    print(f"Epoch {epoch + 1}/{EPOCHS} - Train Loss: {train_loss:.4f}")

    if epoch == EPOCHS - 1:
        eval_loss = evaluate(model, eval_dataloader)
        print(f"Final Evaluation after Epoch {epoch + 1} - Eval Loss: {eval_loss:.4f}")

Training...: 100%|██████████| 2158/2158 [26:22<00:00,  1.36it/s]


Epoch 1/3 - Train Loss: 0.0662


Training...: 100%|██████████| 2158/2158 [26:25<00:00,  1.36it/s]


Epoch 2/3 - Train Loss: 0.0069


Training...: 100%|██████████| 2158/2158 [26:25<00:00,  1.36it/s]


Epoch 3/3 - Train Loss: 0.0058


Evaluating...: 100%|██████████| 535/535 [02:13<00:00,  4.01it/s]

Final Evaluation after Epoch 3 - Eval Loss: 0.0192





In [None]:
# Save the fine-tuned model
save_model = "./trained_model/"
save_tokenizer = "./trained_tokenizer/"
model.save_pretrained(save_model)
tokenizer.save_pretrained(save_tokenizer)

('./trained_tokenizer/tokenizer_config.json',
 './trained_tokenizer/special_tokens_map.json',
 './trained_tokenizer/vocab.txt',
 './trained_tokenizer/added_tokens.json',
 './trained_tokenizer/tokenizer.json')

In [None]:
#Save subset data
with open('subset_data.pkl', 'wb') as f:
  pickle.dump(subset_data, f)

In [None]:
!cp "subset_data.pkl" "/content/gdrive/My Drive/Ver S/New/"
!cp -r "trained_tokenizer" "/content/gdrive/My Drive/Ver S/New/"
!cp -r "trained_model" "/content/gdrive/My Drive/Ver S/New/"

In [None]:
!cp -r "updated_tokenizer" "/content/gdrive/My Drive/Ver S/New/"
!cp -r "updated_model" "/content/gdrive/My Drive/Ver S/New/"

In [None]:
!cp -r "tokenized_rsid_sentences.pkl" "/content/gdrive/My Drive/Ver S/New/"

In [None]:
#End.