<a href="https://colab.research.google.com/github/archiebenn/BIOLM0050_kaggle/blob/master/protein_embedding_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q torch transformers sentencepiece h5py

In [2]:
import h5py
import numpy as np
import pandas as pd
import os
import torch

from transformers import T5EncoderModel, T5Tokenizer

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"


## workflow aim:
protein sequence → truncate to shorten but keep N and C terminals for localisation info → ProtT5 embedding → single vector → classifier (other script)


## Load ProtT5 half model

In [3]:
# set tokeniser
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# using half prott5
tokeniser = T5Tokenizer.from_pretrained(
    "Rostlab/prot_t5_xl_half_uniref50-enc",
    do_lower_case=False
)

model = T5EncoderModel.from_pretrained(
    "Rostlab/prot_t5_xl_half_uniref50-enc"
).to(device)

model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



spiece.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.42G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.42G [00:00<?, ?B/s]



T5EncoderModel(
  (shared): Embedding(128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=4096, bias=False)
              (k): Linear(in_features=1024, out_features=4096, bias=False)
              (v): Linear(in_features=1024, out_features=4096, bias=False)
              (o): Linear(in_features=4096, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 32)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=16384, bias=False)
              (wo): Linear(in_features=16384, out_features=1024, bias=False)
              (dropout): Dropo

In [4]:
# import training csv
from google.colab import files
uploaded = files.upload()


Saving train_trimmed.csv to train_trimmed.csv


In [5]:
# data setup
df_train = pd.read_csv("train_trimmed.csv")

print(f'Train DF length: {len(df_train)}')
df_train.head(10)


Train DF length: 13398


Unnamed: 0.1,Unnamed: 0,Id,acc,partition,cytoplasm,nucleus,extracellular,cell_surface,mitochondrion,endom,...,aa_frac_M,aa_frac_N,aa_frac_P,aa_frac_Q,aa_frac_R,aa_frac_S,aa_frac_T,aa_frac_V,aa_frac_W,aa_frac_Y
0,0,0,P61966,0,0,0,0,0,0,1,...,0.051,0.013,0.013,0.044,0.063,0.057,0.019,0.063,0.013,0.044
1,1,1,Q9VTK2,0,0,0,0,0,0,1,...,0.028,0.032,0.044,0.043,0.068,0.08,0.063,0.059,0.025,0.038
2,2,2,O95858,3,0,0,0,1,0,1,...,0.034,0.041,0.031,0.027,0.044,0.044,0.051,0.078,0.014,0.058
3,3,3,Q9WUX5,0,1,0,0,0,0,1,...,0.023,0.036,0.089,0.051,0.05,0.117,0.044,0.058,0.008,0.011
4,4,4,Q9NQC3-3,1,0,0,0,0,0,1,...,0.015,0.03,0.015,0.035,0.035,0.07,0.035,0.101,0.015,0.04
5,5,6,O00204,3,1,1,0,0,0,1,...,0.025,0.022,0.104,0.038,0.055,0.093,0.044,0.041,0.025,0.033
6,6,7,O35095,1,1,0,0,0,0,1,...,0.016,0.018,0.062,0.049,0.053,0.062,0.051,0.058,0.015,0.016
7,7,8,Q7KRW1,1,0,0,0,0,0,1,...,0.049,0.026,0.043,0.038,0.047,0.099,0.063,0.077,0.017,0.03
8,8,10,O45405,1,1,0,0,0,0,1,...,0.025,0.034,0.046,0.021,0.059,0.059,0.076,0.059,0.008,0.029
9,9,11,Q9H4L5,0,1,1,0,1,0,1,...,0.019,0.046,0.042,0.041,0.061,0.127,0.033,0.053,0.019,0.027


## subsetting df and checking for repeats

In [6]:
# keep just the Id and sequence in the data frame
df_seq_id = df_train[["Id", "sequence"]]
df_seq_id.head()

# Get the counts of each ID
id_counts = df_seq_id["Id"].value_counts()

# checking for any duplicate ids:
df_seq_id["Id"].value_counts().head(50)

# How many IDs are duplicated (appear more than once)
num_duplicated_ids = (id_counts > 1).sum()
print("Number of duplicated IDs:", num_duplicated_ids)

# group by Id and count unique sequences per ID
seq_per_id = df_seq_id.groupby("Id")["sequence"].nunique()

# how many Ids have >1 unique sequence?
num_ids_with_multiple_sequences = (seq_per_id > 1).sum()
print("IDs with multiple sequences:", num_ids_with_multiple_sequences)



Number of duplicated IDs: 0
IDs with multiple sequences: 0


In [7]:
# sanity check for any duplicate ids:
df_seq_id["Id"].value_counts().head(10)

Unnamed: 0_level_0,count
Id,Unnamed: 1_level_1
16740,1
0,1
1,1
2,1
3,1
4,1
16721,1
16720,1
16719,1
16717,1


In [8]:
# testing for longest sequence in train.csv:
# Compute string lengths
df_train["sequence_lengths"] = df_train["sequence"].astype(str).str.len()

# sort in new df by seq length
df_sorted = df_train.sort_values("sequence_lengths", ascending=False)

df_sorted["sequence_lengths"].head(25)

Unnamed: 0,sequence_lengths
5598,34350
11943,18562
2346,18141
10752,13100
8148,8886
4725,8545
7632,8081
9759,7570
193,7388
5742,7158


## embedding protein function

In [9]:
def embed_protein(seq):

  # reformat sequences to be space separated AAs:
  seq = ' '.join(list(seq))

  inputs = tokeniser(seq, return_tensors="pt")
  inputs = {k: v.to(device) for k, v in inputs.items()}

  with torch.no_grad():
    outputs = model(**inputs)

  emb = outputs.last_hidden_state.squeeze(0).cpu().numpy()

  return emb.mean(axis=0)


### Chunking and pooling sequences
As some of the sequences are fa too large (up to 34k AAs) for ProtT5, will attempt to chunk and pool the sequences into ProtT5 instead:

## Truncating sequences
Chunking and pooling taking too long and could contain unhelpful information contained within the middle of the sequences. Instead will try to keep the 1024AAs at start and end (longest seq = 2048AAs) as this will likely contain the most useful 'postcode' information.

In [10]:
# truncate function to 2048aa either end
def truncate_prot(seq, chunk_len = 2048):
  seq = str(seq)
  if len(seq) <= chunk_len*2:

    # short seq, return whole
    return seq

  else:
    # take first 2048 and last 2048 amino acids of sequence
    return seq[:chunk_len] + seq[-chunk_len:]


In [11]:
# just adding to shorten dataset for testing
#df_seq_id = df_seq_id.head(100)

In [14]:
# write out the .h5 file:
# this is using the half protT5 model and truncated to a maximum length og 4096aa (first and last 2048aa based off testing longer and shorter lengths with the given GPU memory)

with h5py.File("protT5_half_embeddings_2048aa.h5", "a") as f:
  for _, row in df_seq_id.iterrows():

    # 1. set prot_id as a string for h5py to work (for h5 file column)
    prot_id = str(row["Id"])

    # check if dataset already exists to handle duplicate IDs in input data
    if prot_id in f:
        print(f"Skipping {prot_id}, dataset already exists in the file (duplicate ID in input data).")
        continue

    # 2. run embed protein function on each sequence:
    seq = row["sequence"]

    # truncate protein to max. 256AAs
    seq = truncate_prot(seq)

    try:
            emb = embed_protein(seq)

            if emb is None:
                print(f"Skipping {prot_id}, empty sequence")
                continue

            # convert to fixed numeric type for HDF5
            emb = np.array(emb, dtype=np.float32)

            # write to HDF5
            f.create_dataset(prot_id, data=emb)

    except RuntimeError as e:
            print(f"Skipping {prot_id} due to runtime error: {e}")

    finally:
            # completion message
            print(f'Finished Id {prot_id}')
            # free GPU memory
            if 'emb' in locals():
                del emb
            torch.cuda.empty_cache()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Finished Id 10453
Finished Id 10454
Finished Id 10455
Finished Id 10456
Finished Id 10457
Finished Id 10458
Finished Id 10459
Finished Id 10460
Finished Id 10461
Finished Id 10464
Finished Id 10465
Finished Id 10466
Finished Id 10467
Finished Id 10470
Finished Id 10473
Finished Id 10474
Finished Id 10475
Finished Id 10476
Finished Id 10478
Finished Id 10479
Finished Id 10480
Finished Id 10481
Finished Id 10482
Finished Id 10483
Finished Id 10484
Finished Id 10485
Finished Id 10487
Finished Id 10488
Finished Id 10489
Finished Id 10490
Finished Id 10491
Finished Id 10492
Finished Id 10493
Finished Id 10494
Finished Id 10495
Finished Id 10496
Finished Id 10497
Finished Id 10498
Finished Id 10499
Finished Id 10500
Finished Id 10501
Finished Id 10502
Finished Id 10503
Finished Id 10504
Finished Id 10505
Finished Id 10506
Finished Id 10507
Finished Id 10508
Finished Id 10509
Finished Id 10510
Finished Id 10511
Finished Id 10514