# Pretrain BERT Model on full length data 

### load data

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
from Bio import SeqIO
 
def is_common_aa_sequence(sequence):
    common_amino_acids = set('ACDEFGHIKLMNPQRSTVWY')
    return all(aa in common_amino_acids for aa in sequence)

fasta_file = "./data/human_uniprot-reviewed_yes+taxonomy_9606.fasta"
 
seqs = []
for record in SeqIO.parse(fasta_file, "fasta"):
    seq = str(record.seq)
    if is_common_aa_sequence(seq): # make sure that the sequence contains only common amino acids
        seqs.append(str(record.seq))
    

### tokenizer

In [2]:
# tokenizer
from transformers import BertTokenizer

#  Initial Tokenizer
AMINO_ACIDS = list("ACDEFGHIKLMNPQRSTVWY") + ["X"]  
SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
VOCAB = SPECIAL_TOKENS + AMINO_ACIDS  

print(VOCAB)  
with open("vocab.txt", "w") as f:
    for token in VOCAB:
        f.write(token + "\n")

with open("vocab.txt", "r") as f:
    file_vocab = [line.strip() for line in f.readlines()]

tokenizer = BertTokenizer(
    vocab_file="vocab.txt",
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
    do_lower_case=False,  
    tokenize_chinese_chars=False  
)

  from .autonotebook import tqdm as notebook_tqdm


['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'X']


### training

In [None]:
import torch
from pathlib import Path
from transformers import (BertConfig, BertForMaskedLM, Trainer, TrainingArguments, 
                          DataCollatorForLanguageModeling, TrainerCallback)
from torch.utils.data import Dataset, random_split


class BestModelSaver(TrainerCallback):
    """ only save when val_loss is smaller """
    def __init__(self, save_path):
        self.best_val_loss = float("inf")
        self.save_path = save_path

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics is None or "eval_loss" not in metrics:
            return  

        val_loss = metrics["eval_loss"]
        if val_loss < self.best_val_loss:
            self.best_val_loss = val_loss
            print(f"new best val_loss: {val_loss:.4f} save model...")
            trainer.save_model(self.save_path)


class ProteinDataset(Dataset):
    def __init__(self, seqs, tokenizer, max_length=1024): 
        self.sequences = seqs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = ' '.join(self.sequences[idx])
        tokens = self.tokenizer.encode(sequence, truncation=True, max_length=self.max_length, padding="max_length")
        return {"input_ids": torch.tensor(tokens, dtype=torch.long)}
        # return {key: val.squeeze(0) for key, val in tokens.items()}


dataset = ProteinDataset(seqs[:1000], tokenizer)

train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.3
)

config = BertConfig(
    vocab_size=len(VOCAB),
    hidden_size=768,
    num_hidden_layers=4, # transformer layers number
    num_attention_heads=16,
    intermediate_size=3072,
    max_position_embeddings=1024,
    type_vocab_size=1,
)

model = BertForMaskedLM(config)

training_args = TrainingArguments(
    output_dir="./model", # save model path
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    save_strategy="no",  
    num_train_epochs=30,
    logging_dir="./logs",
    logging_strategy="epoch",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[BestModelSaver("./step1_pretrain_bert_with_layer4")],  
)

trainer.train()


2025-04-07 22:49:34.825633: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch,Training Loss,Validation Loss
1,2.812,2.752431
2,2.7342,2.737322
3,2.7238,2.706582
4,2.7076,2.717886
5,2.7048,2.717429
6,2.6968,2.703012
7,2.6909,2.703109
8,2.6926,2.705687
9,2.679,2.697639
10,2.6831,2.709753


new best val_loss: 2.7524 save model...
new best val_loss: 2.7373 save model...
new best val_loss: 2.7066 save model...
new best val_loss: 2.7030 save model...
new best val_loss: 2.6976 save model...
new best val_loss: 2.6949 save model...
new best val_loss: 2.6936 save model...
new best val_loss: 2.6862 save model...
new best val_loss: 2.6856 save model...
new best val_loss: 2.6770 save model...


TrainOutput(global_step=1500, training_loss=2.657573232014974, metrics={'train_runtime': 166.7432, 'train_samples_per_second': 143.934, 'train_steps_per_second': 8.996, 'total_flos': 4268140167168000.0, 'train_loss': 2.657573232014974, 'epoch': 30.0})