<a href="https://colab.research.google.com/github/archiebenn/BIOLM0050_kaggle/blob/master/protein_embedding_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q torch transformers sentencepiece h5py

In [21]:
import h5py
import numpy as np
import pandas as pd
import os
import torch

from transformers import T5EncoderModel, T5Tokenizer

## workflow aim:
protein sequence → chunk → ProtT5 embeddings per chunk → mean pool → single vector → classifier


## Load ProtT5

In [73]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ProtT5 tokeniser (do_lower_case=False is passed through unchanged).
tokeniser = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50", do_lower_case=False)

# Encoder-only ProtT5 model, moved onto the selected device.
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50")
model = model.to(device)

# Put the model in evaluation mode; as the last expression of the cell this
# also displays the module structure.
model.eval()

Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

T5EncoderModel LOAD REPORT from: Rostlab/prot_t5_xl_uniref50
Key            | Status     |  | 
---------------+------------+--+-
lm_head.weight | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


T5EncoderModel(
  (shared): Embedding(128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=4096, bias=False)
              (k): Linear(in_features=1024, out_features=4096, bias=False)
              (v): Linear(in_features=1024, out_features=4096, bias=False)
              (o): Linear(in_features=4096, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 32)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=16384, bias=False)
              (wo): Linear(in_features=16384, out_features=1024, bias=False)
              (dropout): Dropo

In [18]:
# import training csv
from google.colab import files

# Only prompt for an upload when train.csv is not already in the working
# directory. Uploading again makes Colab save a renamed copy
# ("train (1).csv") while the loading cell keeps reading the older
# "train.csv". Delete train.csv first if you want to replace it.
if os.path.exists("train.csv"):
    uploaded = {}
else:
    uploaded = files.upload()


Saving train.csv to train (1).csv


In [56]:
# Data setup: read train.csv from the working directory into df_train.
# NOTE(review): the recorded upload output shows the file being saved as
# "train (1).csv", so this reads whichever "train.csv" was already present -
# confirm it is the intended version.
df_train = pd.read_csv("train.csv")

# Print the number of rows, then show the first rows as the cell output.
print(f'Train DF length: {len(df_train)}')
df_train.head()


Train DF length: 16077


Unnamed: 0,Id,acc,partition,cytoplasm,nucleus,extracellular,cell_surface,mitochondrion,endom,sequence,...,aa_frac_M,aa_frac_N,aa_frac_P,aa_frac_Q,aa_frac_R,aa_frac_S,aa_frac_T,aa_frac_V,aa_frac_W,aa_frac_Y
0,0,P61966,0,0,0,0,0,0,1,MMRFMLLFSRQGKLRLQKWYLATSDKERKKMVRELMQVVLARKPKM...,...,0.051,0.013,0.013,0.044,0.063,0.057,0.019,0.063,0.013,0.044
1,1,Q9VTK2,0,0,0,0,0,0,1,MSATYTNTITQRRKTAKVRQQQQHQWTGSDLSGESNERLHFRSRST...,...,0.028,0.032,0.044,0.043,0.068,0.08,0.063,0.059,0.025,0.038
2,2,O95858,3,0,0,0,1,0,1,MPRGDSEQVRYCARFSYLWLKFSLIIYSTVFWLIGALVLSVGIYAE...,...,0.034,0.041,0.031,0.027,0.044,0.044,0.051,0.078,0.014,0.058
3,3,Q9WUX5,0,1,0,0,0,0,1,MGRSLTCPFGISPACGAQASWSIFGVGTAEVPGTHSHSNQAAAMPH...,...,0.023,0.036,0.089,0.051,0.05,0.117,0.044,0.058,0.008,0.011
4,4,Q9NQC3-3,1,0,0,0,0,0,1,MDGQKKNWKDKVVDLLYWRDIKKTGVVFGASLFLLLSLTVFSIVSV...,...,0.015,0.03,0.015,0.035,0.035,0.07,0.035,0.101,0.015,0.04


In [112]:
# Keep just the "Id" and "sequence" columns; these are the only two columns
# the embedding export loop reads.
df_seq_id = df_train[["Id", "sequence"]]
df_seq_id.head()

Unnamed: 0,Id,sequence
0,0,MMRFMLLFSRQGKLRLQKWYLATSDKERKKMVRELMQVVLARKPKM...
1,1,MSATYTNTITQRRKTAKVRQQQQHQWTGSDLSGESNERLHFRSRST...
2,2,MPRGDSEQVRYCARFSYLWLKFSLIIYSTVFWLIGALVLSVGIYAE...
3,3,MGRSLTCPFGISPACGAQASWSIFGVGTAEVPGTHSHSNQAAAMPH...
4,4,MDGQKKNWKDKVVDLLYWRDIKKTGVVFGASLFLLLSLTVFSIVSV...


In [67]:
# Inspect the longest sequences in train.csv.
# Compute the length of every sequence (cast to str first).
# NOTE: this adds a "sequence_lengths" column to df_train in place.
df_train["sequence_lengths"] = df_train["sequence"].astype(str).str.len()

# Sort into a new frame, longest sequence first.
df_sorted = df_train.sort_values("sequence_lengths", ascending=False)

# Show the 25 largest lengths as the cell output.
df_sorted["sequence_lengths"].head(25)

Unnamed: 0,sequence_lengths
5598,34350
11943,18562
2346,18141
10752,13100
8148,8886
4725,8545
7632,8081
15527,7570
9759,7570
193,7388


## embedding protein function

In [94]:
def embed_protein(seq):
  """
  Embed one protein sequence with ProtT5 and mean-pool it into one vector.

  Uses the notebook-level `tokeniser`, `model` and `device` objects.

  Args:
    seq: amino-acid sequence in one-letter codes; whitespace is ignored.

  Returns:
    1-D numpy array with one value per hidden dimension (the mean over the
    residue positions), or None when the sequence is empty.
  """
  # Drop any whitespace so that every remaining character is one residue.
  seq = "".join(str(seq).split())
  if not seq:
    # Both callers in this notebook treat None as "nothing to embed".
    return None

  # ProtT5 usage maps the rare/ambiguous residues U, Z, O and B to X.
  seq = seq.translate(str.maketrans("UZOB", "XXXX"))

  # reformat sequences to be space separated AAs:
  residues = list(seq)
  spaced = ' '.join(residues)

  # `tokeniser` is the name defined in the model-loading cell.
  inputs = tokeniser(spaced, return_tensors="pt")
  inputs = {k: v.to(device) for k, v in inputs.items()}

  with torch.no_grad():
    outputs = model(**inputs)

  # Single-sequence batch -> (tokens, hidden_dim).
  hidden = outputs.last_hidden_state[0]

  # Keep only the residue positions: the tokeniser appends an end-of-sequence
  # token that should not contribute to the per-protein mean.
  emb = hidden[:len(residues)].cpu().numpy()

  return emb.mean(axis=0)


### Chunking and pooling sequences
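As some of the sequences are far too long for ProtT5 to embed in one pass (up to ~34k AAs), the function below instead splits each sequence into overlapping chunks, embeds every chunk with ProtT5, and mean-pools the chunk vectors into a single vector: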

## embedding chunked protein function


In [88]:
from IPython.terminal.embed import embed
def embed_chunky_protein(seq, chunk_size=1024, overlap=50):
    """
    Embed a protein sequence in overlapping chunks and pool into one vector.

    The sequence is cut into windows of at most `chunk_size` residues, where
    consecutive windows share `overlap` residues. Each window is embedded with
    `embed_protein` and the resulting vectors are averaged with equal weight
    per chunk.

    Args:
        seq: protein sequence (converted with str()).
        chunk_size: maximum number of residues per chunk; must be positive.
        overlap: residues shared by neighbouring chunks; must satisfy
            0 <= overlap < chunk_size.

    Returns:
        1-D numpy array of shape [hidden_dim], or None when the sequence is
        empty or no chunk could be embedded.

    Raises:
        ValueError: if chunk_size / overlap would prevent the window from
            advancing (the loop would otherwise never terminate).
    """
    seq_str = str(seq)

    # Skip empty sequences
    if len(seq_str) == 0:
        return None

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    # Chunk sequence. Stop as soon as a chunk reaches the end of the sequence,
    # so no extra chunk lying entirely inside the last overlap is emitted.
    step = chunk_size - overlap
    chunks = []
    start = 0
    while True:
        end = min(start + chunk_size, len(seq_str))
        chunks.append(seq_str[start:end])
        if end == len(seq_str):
            break
        start += step

    # embed each chunk
    chunk_vectors = []
    for chunk in chunks:
        try:
            emb = embed_protein(chunk)
            if emb is not None:
                chunk_vectors.append(np.asarray(emb))
        except RuntimeError as e:
            # Best effort for runtime failures (e.g. GPU out of memory) only;
            # programming errors are allowed to propagate.
            print(f"Skipping a chunk due to error: {e}")
        finally:
            # free GPU memory if needed
            torch.cuda.empty_cache()

    if len(chunk_vectors) == 0:
        return None

    # mean pool over chunks to get one vector per protein:
    return np.stack(chunk_vectors).mean(axis=0)

## Truncating sequences
Chunking and pooling takes too long, and the chunks from the middle of a sequence could contribute unhelpful information. Instead, keep only the first and last 128 AAs of each sequence (so the longest input is 256 AAs), as the two ends will likely contain the most useful 'postcode' information.

In [115]:
# truncate function to 128aa either end
def truncate_prot(seq, chunk_len = 128):
  """
  Keep only the two ends of a protein sequence.

  Args:
    seq: protein sequence (converted with str()).
    chunk_len: number of residues to keep from each end; must be >= 0.

  Returns:
    The whole sequence when it has at most 2 * chunk_len residues, otherwise
    the first chunk_len residues followed by the last chunk_len residues.

  Raises:
    ValueError: if chunk_len is negative.
  """
  seq = str(seq)
  if chunk_len < 0:
    raise ValueError("chunk_len must be non-negative")

  if len(seq) <= chunk_len*2:
    # short seq, return whole
    return seq

  # Take the first and last chunk_len amino acids. The tail is sliced from
  # len(seq) - chunk_len rather than -chunk_len, because seq[-0:] would
  # return the entire sequence when chunk_len is 0.
  return seq[:chunk_len] + seq[len(seq) - chunk_len:]


In [116]:
# Testing aid: keep only the first 100 rows so the embedding loop runs quickly.
# NOTE(review): while this line is active, the HDF5 export that iterates over
# df_seq_id only covers these 100 proteins - remove it for a full run.
df_seq_id = df_seq_id.head(100)

In [117]:
# write out the .h5 file:
# one dataset per protein, keyed by its Id, holding the float32 embedding.

with h5py.File("train_embeddings.h5", "w") as f:
  for raw_id, raw_seq in zip(df_seq_id["Id"], df_seq_id["sequence"]):

    # h5py dataset names must be strings
    prot_id = str(raw_id)

    # keep only the two ends of the protein before embedding
    seq = truncate_prot(raw_seq)

    # placeholder so the cleanup below can always unbind the name
    emb = None
    try:
      emb = embed_protein(seq)

      if emb is None:
        print(f"Skipping {prot_id}, empty sequence")
      else:
        # store with a fixed numeric type
        f.create_dataset(prot_id, data=np.array(emb, dtype=np.float32))

    except RuntimeError as e:
      print(f"Skipping {prot_id} due to runtime error: {e}")

    finally:
      # drop the reference and release cached GPU memory after every protein
      del emb
      torch.cuda.empty_cache()



