<a href="https://colab.research.google.com/github/anshuman1808/Single-Price-Grid/blob/main/ProtBERT_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **ProtBERT Preprocessing**

1: Upload the FASTA File

In [None]:
from google.colab import files

# Ask for the FASTA file to be uploaded; the preprocessing cell later opens it by file name.
uploaded = files.upload()


Saving uniprotkb_AND_reviewed_true_AND_model_o_2025_04_07.fasta to uniprotkb_AND_reviewed_true_AND_model_o_2025_04_07.fasta


2: Install the Required Packages

In [None]:
!pip install transformers biopython torch

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux20

3: Full Preprocessing Code

In [None]:
from Bio import SeqIO
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset

# Load the ProtBERT tokenizer from the "Rostlab/prot_bert" checkpoint.
# do_lower_case=False is presumably there to keep the upper-case residue
# letters that appear in the token dump later in this notebook.
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)

# Step 1: Parse FASTA
def parse_fasta(fasta_path):
    """Read a FASTA file and return its usable protein sequences.

    A record is kept only when its sequence is non-empty and consists solely
    of the 20 standard amino-acid letters. Sequences are upper-cased before
    the check, so lower-case FASTA entries are normalised instead of being
    silently discarded.

    Args:
        fasta_path: FASTA source handed unchanged to ``SeqIO.parse``.

    Returns:
        list[str]: The accepted sequences, upper-case, in file order.
    """
    # Built once per call rather than once per record.
    standard_residues = frozenset("ACDEFGHIKLMNPQRSTVWY")
    sequences = []
    for record in SeqIO.parse(fasta_path, "fasta"):
        sequence = str(record.seq).upper()
        # An empty string is a subset of any alphabet, so it has to be
        # rejected explicitly or it would end up as a residue-free sample.
        if sequence and standard_residues.issuperset(sequence):
            sequences.append(sequence)
    return sequences

# Step 2: Preprocess (add spaces)
def preprocess_sequence(seq):
    """Return ``seq`` with a single space placed between consecutive characters."""
    spaced = " ".join(seq)
    return spaced

# Step 3: Tokenize
def tokenize_sequences(sequences, tokenizer, max_length=512):
    """Tokenize a collection of protein sequences in one tokenizer call.

    Every sequence is first run through ``preprocess_sequence``; the resulting
    list is then passed to ``tokenizer`` with PyTorch tensors requested and
    padding and truncation switched on.

    Args:
        sequences: Iterable of raw sequences.
        tokenizer: Callable tokenizer that accepts the keyword options below.
        max_length: Value forwarded as the tokenizer's ``max_length``.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    spaced_batch = list(map(preprocess_sequence, sequences))
    encoding_options = {
        'return_tensors': 'pt',
        'padding': True,
        'truncation': True,
        'max_length': max_length,
    }
    return tokenizer(spaced_batch, **encoding_options)

# Step 4: PyTorch Dataset
class ProteinSequenceDataset(Dataset):
    """Map-style dataset over a batch of tokenized protein sequences.

    All sequences are tokenized once, up front, through ``tokenize_sequences``;
    indexing afterwards only slices the stored tensors.
    """

    def __init__(self, sequences, tokenizer, max_length=512):
        """Tokenize ``sequences`` and keep the resulting batch in ``self.data``.

        Args:
            sequences: Raw sequences forwarded to ``tokenize_sequences``.
            tokenizer: Tokenizer forwarded to ``tokenize_sequences``.
            max_length: Length cap forwarded to ``tokenize_sequences``. The
                default of 512 matches the value that was used implicitly
                before this parameter existed.
        """
        self.data = tokenize_sequences(sequences, tokenizer, max_length=max_length)

    def __len__(self):
        """Return the number of rows in the stored ``input_ids`` tensor."""
        return self.data['input_ids'].shape[0]

    def __getitem__(self, idx):
        """Return the ``input_ids`` and ``attention_mask`` entries at ``idx``."""
        return {
            'input_ids': self.data['input_ids'][idx],
            'attention_mask': self.data['attention_mask'][idx]
        }

# === Load and Process ===
fasta_file = "uniprotkb_AND_reviewed_true_AND_model_o_2025_04_07.fasta"

# Cap on how many sequences are tokenized while testing; set to None to use all.
MAX_SEQUENCES = 100

all_sequences = parse_fasta(fasta_file)
# Fail early with a clear message instead of an opaque tokenizer or indexing
# error when the file yields nothing usable.
if not all_sequences:
    raise ValueError(f"No valid protein sequences were found in {fasta_file!r}")

# Optional: reduce size for testing (a slice bound of None keeps everything)
sequences = all_sequences[:MAX_SEQUENCES]

# Build dataset
dataset = ProteinSequenceDataset(sequences, tokenizer)

# Check one sample
sample = dataset[0]
print("Input IDs shape:", sample['input_ids'].shape)
print("Attention mask shape:", sample['attention_mask'].shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

Input IDs shape: torch.Size([512])
Attention mask shape: torch.Size([512])


4: Visualize Tokens

In [None]:
# Map the sample's token IDs back to their vocabulary strings and show them
token_ids = sample['input_ids'].tolist()
tokens = tokenizer.convert_ids_to_tokens(token_ids)
print(tokens)


['[CLS]', 'M', 'G', 'L', 'E', 'A', 'L', 'V', 'P', 'L', 'A', 'M', 'I', 'V', 'A', 'I', 'F', 'L', 'L', 'L', 'V', 'D', 'L', 'M', 'H', 'R', 'H', 'Q', 'R', 'W', 'A', 'A', 'R', 'Y', 'P', 'P', 'G', 'P', 'L', 'P', 'L', 'P', 'G', 'L', 'G', 'N', 'L', 'L', 'H', 'V', 'D', 'F', 'Q', 'N', 'T', 'P', 'Y', 'C', 'F', 'D', 'Q', 'L', 'R', 'R', 'R', 'F', 'G', 'D', 'V', 'F', 'S', 'L', 'Q', 'L', 'A', 'W', 'T', 'P', 'V', 'V', 'V', 'L', 'N', 'G', 'L', 'A', 'A', 'V', 'R', 'E', 'A', 'M', 'V', 'T', 'R', 'G', 'E', 'D', 'T', 'A', 'D', 'R', 'P', 'P', 'A', 'P', 'I', 'Y', 'Q', 'V', 'L', 'G', 'F', 'G', 'P', 'R', 'S', 'Q', 'G', 'V', 'I', 'L', 'S', 'R', 'Y', 'G', 'P', 'A', 'W', 'R', 'E', 'Q', 'R', 'R', 'F', 'S', 'V', 'S', 'T', 'L', 'R', 'N', 'L', 'G', 'L', 'G', 'K', 'K', 'S', 'L', 'E', 'Q', 'W', 'V', 'T', 'E', 'E', 'A', 'A', 'C', 'L', 'C', 'A', 'A', 'F', 'A', 'D', 'Q', 'A', 'G', 'R', 'P', 'F', 'R', 'P', 'N', 'G', 'L', 'L', 'D', 'K', 'A', 'V', 'S', 'N', 'V', 'I', 'A', 'S', 'L', 'T', 'C', 'G', 'R', 'R', 'F', 'E', 'Y', 'D', 