In [None]:
!pip install datasets transformers



In [None]:
# Step 1: Import Libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm.auto import tqdm
import random
from sklearn.model_selection import train_test_split
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders
from sklearn.metrics import accuracy_score
from transformers import RobertaTokenizer
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline
from datasets import Dataset

In [None]:
# Locations of the CSV files on the mounted Drive.
# NOTE(review): the folder name ' SwissprotDatasets' starts with a space —
# confirm this matches the actual directory name.
_DATA_DIR = '/content/drive/MyDrive/PLM/ SwissprotDatasets/BalancedSwissprot'
train_data_path = _DATA_DIR + '/train.csv'
test_data_path = _DATA_DIR + '/valid.csv'  # valid.csv is used as the test split

In [None]:
# Read the train and test CSV files into DataFrames (same order as the paths).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [None]:
# Drop rows whose 'Sequence' is 20 characters or shorter.
train_df = train_df.loc[train_df['Sequence'].str.len().gt(20)]
test_df = test_df.loc[test_df['Sequence'].str.len().gt(20)]

In [None]:
# Keep only the first 5000 training rows and the first 2000 test rows.
train_df = train_df.head(5000)
test_df = test_df.head(2000)

In [None]:
# Display (rows, columns) for the train and test frames.
tuple(frame.shape for frame in (train_df, test_df))

((5000, 8), (2000, 8))

In [None]:
# === Format sequences with spaces between amino acids ===
def format_sequence(seq):
    """Return `seq` stripped of surrounding whitespace, with one space between characters."""
    trimmed = seq.strip()
    return ' '.join(trimmed)

# Add the space-separated form of every sequence as a new column on both frames.
for _frame in (train_df, test_df):
    _frame['spaced_sequence'] = _frame['Sequence'].apply(format_sequence)

In [None]:
# Display the newly added column of the training frame.
train_df.loc[:, 'spaced_sequence']

Unnamed: 0,spaced_sequence
0,A T G T C G G A C G G C G C G G T G G T A C G ...
1,A T G A G A G C A G T T A G A T T A G T A G A ...
2,A T G A C C G C A A T G A T G A A A G C C G C ...
3,G T G A A A G C A G C A G T A G T T A A C G A ...
4,A T G A A A A C C A C C G C G G C G G T A C T ...
...,...
4995,A T G C A A T T A G A T G A A C A A C G T C T ...
4996,A T G C A A G T A G A T G A A C A A C G T C T ...
4997,A T G C A A G T A G A T G A A C A A C G T C T ...
4998,A T G C A A G T A G A T G A A C A A C G T C T ...


In [None]:
# === Convert to HuggingFace Dataset ===
# Only the 'spaced_sequence' column is carried over into each Dataset.
dataset_train, dataset_test = (
    Dataset.from_pandas(frame[['spaced_sequence']])
    for frame in (train_df, test_df)
)

In [None]:
# Tokenizer shipped with the ProtBERT checkpoint; lower-casing is switched off.
tokenizer = AutoTokenizer.from_pretrained(
    "Rostlab/prot_bert",
    do_lower_case=False,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# === Tokenize sequences ===
def tokenize_fn(examples):
    """Tokenize the 'spaced_sequence' entries of a batch.

    Every example is padded to, and truncated at, 512 tokens. Relies on the
    module-level `tokenizer`.
    """
    sequences = examples['spaced_sequence']
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(sequences, **encode_options)

# Tokenize both splits in batches (train first, then test).
_map_options = {'batched': True}
tokenized_dataset_train = dataset_train.map(tokenize_fn, **_map_options)
tokenized_dataset_test = dataset_test.map(tokenize_fn, **_map_options)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# === Data Collator for 15% Random Masking ===
# Masked-language-model collation: 15% of the tokens are selected for masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer,
    mlm_probability=0.15,
    mlm=True,
)

In [None]:
# Display the collator's repr to inspect its configuration.
data_collator

DataCollatorForLanguageModeling(tokenizer=BertTokenizerFast(name_or_path='Rostlab/prot_bert', vocab_size=30, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), mlm=True, mlm_probability=0.15, mask_replace_prob=0.8, random_repla

In [None]:
# === Load pre-trained model ===
# Masked-LM model initialised from the ProtBERT checkpoint.
model = AutoModelForMaskedLM.from_pretrained("Rostlab/prot_bert")

# === Training Arguments ===
training_args = TrainingArguments(
    # Output locations.
    output_dir="./results",
    logging_dir="./logs",
    # Training schedule.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # Logging / checkpoint / evaluation cadence.
    logging_steps=20,
    save_strategy="epoch",
    eval_strategy="no",
)

# === Trainer ===
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer.__init__ is deprecated (it emits a FutureWarning) in favour of
# `processing_class`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    processing_class=tokenizer,
    data_collator=data_collator
)

# === Train ===
trainer.train()

# === Final evaluation ===
# No evaluation runs during training (eval_strategy="no"), so the model is
# evaluated once here on `eval_dataset`.
eval_results = trainer.evaluate()
print(f"\n✅ Final eval loss: {eval_results['eval_loss']:.4f}")

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mamartyahatua[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
20,1.6003
40,1.3144
60,1.2952
80,1.2757
100,1.3044
120,1.2816
140,1.2735
160,1.2601
180,1.2753
200,1.2619



✅ Final eval loss: 1.1326


In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
# NOTE(review): the directory name says "roberta" although the checkpoint loaded
# above is Rostlab/prot_bert — confirm the name is intentional.
_finetuned_dir = "/content/drive/MyDrive/PLM/finetuned_protein_roberta_SwissprotDatasets_BalancedSwissprot"
trainer.save_model(_finetuned_dir)
tokenizer.save_pretrained(_finetuned_dir)

('/content/drive/MyDrive/PLM/finetuned_protein_roberta_SwissprotDatasets_BalancedSwissprot/tokenizer_config.json',
 '/content/drive/MyDrive/PLM/finetuned_protein_roberta_SwissprotDatasets_BalancedSwissprot/special_tokens_map.json',
 '/content/drive/MyDrive/PLM/finetuned_protein_roberta_SwissprotDatasets_BalancedSwissprot/vocab.txt',
 '/content/drive/MyDrive/PLM/finetuned_protein_roberta_SwissprotDatasets_BalancedSwissprot/added_tokens.json',
 '/content/drive/MyDrive/PLM/finetuned_protein_roberta_SwissprotDatasets_BalancedSwissprot/tokenizer.json')

### Evaluation

In [None]:
# === Evaluate on test set ===
# Fill-mask pipeline built from the in-memory model and tokenizer.
fill_mask = pipeline(task="fill-mask", model=model, tokenizer=tokenizer)

# Counters initialised to zero.
correct = total = 0

Device set to use cuda:0


In [None]:
# Masked-token recovery on the first 500 test sequences: one randomly chosen
# interior position per sequence is replaced by the mask token, and the model's
# candidates are compared with the true character (top-1 and top-5).
# NOTE(review): the sequences displayed earlier in this notebook use only the
# letters A/C/G/T; with so few distinct symbols a top-5 hit is almost
# guaranteed — confirm that top-5 is a meaningful metric for this data.
top1_correct = 0
top5_correct = 0
total = 0
skipped = 0          # sequences for which the pipeline raised an error
last_error = None    # most recent pipeline error, reported after the loop

# Dedicated seeded generator so the masked positions, and therefore the
# reported accuracies, are the same on every run.
mask_rng = torch.Generator().manual_seed(42)

for seq in test_df['Sequence'].iloc[:500]:
    if len(seq) < 5:
        continue
    # `high` is exclusive: the first and last characters are never masked.
    pos = torch.randint(1, len(seq) - 1, (1,), generator=mask_rng).item()
    true_token = seq[pos]
    seq_masked = list(seq)
    seq_masked[pos] = tokenizer.mask_token
    # [CLS]/[SEP] are NOT added by hand: the pipeline's tokenizer call already
    # adds the special tokens, so wrapping the text here duplicated them.
    masked_input = " ".join(seq_masked)

    try:
        preds = fill_mask(masked_input, top_k=5)
    except Exception as exc:
        # Best-effort evaluation: skip this sequence, but keep track of the
        # failure instead of hiding it (and let KeyboardInterrupt propagate).
        skipped += 1
        last_error = exc
        continue

    top_preds = [p['token_str'].strip() for p in preds]
    if true_token == top_preds[0]:
        top1_correct += 1
    if true_token in top_preds:
        top5_correct += 1
    total += 1

if skipped:
    print(f"Skipped {skipped} sequence(s) due to pipeline errors; last error: {last_error!r}")

if total:
    print(f"Top-1 Accuracy: {top1_correct / total:.4f}")
    print(f"Top-5 Accuracy: {top5_correct / total:.4f}")
else:
    # Avoid ZeroDivisionError when nothing could be evaluated.
    print("No sequences were evaluated; accuracy is undefined.")


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Top-1 Accuracy: 0.3140
Top-5 Accuracy: 1.0000


In [None]:
# Display the value last assigned to `masked_input` by the evaluation loop.
masked_input

'[CLS] G T G C A C T T A C A G A C A A G G G C A G C C G T G G T G A C C G G C G C G G C C A G C G G C A T C G G C C T C G C C C T C A G C G C C C G C T T C G C A C G G G C C G G C G C C G G C G T C G T C A T G G C G G A C G T C G A C G G C G A T G C A C T G C A C C G C A G G G C C G C C G A A C T C A C C G C G C G G G G C G C T C A G G T C A C C G C G G T G A C C G C C G A C C T G A C C G A C C C G G A G G T C G T C G A A C G G C T C G C G G A C A C G G C G T T C G A C C A G C T C G G C G A C A T C G A C G T G G T G T G C A A C A A C G C C G G G G T C C T C G G C C C C G T G G G A C A G C C G C T G T G G G A G G T G C C G C T G G A G C G G A T G C G G C A G G T C T T C G A G G T C A A C C A C T G G G C G C A C G T C C T G G T G G C C C G C G C C T T C G T C C C C C G A C T C C T G G A G C G C G G C C G G C C C G C C C A T C T G A T C C A C A C C G C C T C G A T G T C C G C C T T C G T C G T C G G C G C C G G C A G C G C C G C C T A C G C C G C C T C C A A G C A C G C C G A C C T C G C