In [5]:
!pip install -q SentencePiece transformers

In [8]:
!pip install torch

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting torch
  Downloading torch-2.0.0-cp39-cp39-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m134.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting triton==2.0.0
  Downloading triton-2.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (63.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 MB[0m [31m126.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting nvidia-cusparse-cu11==11.7.4.91
  Downloading nvidia_cusparse_cu11-11.7.4.91-py3-none-manylinux1_x86_64.whl (173.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.2/173.2 MB[0m [31m118.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting networkx
  Downloading networkx-3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [3

In [1]:
from transformers import T5Tokenizer, T5Model, T5EncoderModel
import re
import torch


# Tokenizer for the Rostlab/prot_t5_xl_bfd checkpoint. do_lower_case=False is
# forwarded to the tokenizer (presumably so residue letters keep their case —
# TODO confirm against the tokenizer implementation).
tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_bfd', do_lower_case=False)

# Encoder-only model from the same checkpoint. The load message listing unused
# "decoder.*" weights is a consequence of instantiating T5EncoderModel.
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_bfd")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at Rostlab/prot_t5_xl_bfd were not used when initializing T5EncoderModel: ['decoder.block.23.layer.1.EncDecAttention.o.weight', 'decoder.block.5.layer.1.EncDecAttention.q.weight', 'decoder.block.8.layer.0.SelfAttention.q.weight', 'decoder.block.22.layer.2.DenseReluDense.wi.weight', 'decoder.block.5.layer.2.layer_norm.weight', 'decoder.block.13.layer.1.layer_norm.weight', 'decoder.block.18.layer.1.EncDecAttention.o.weight', 'decoder.block.11.layer.1.EncDecAttention.q.weight', 'decoder.block.1.layer.0.SelfAttention.o.weight', 'decoder.block.4.layer.0.SelfAttention.v.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.4.layer.0.SelfAttention.o.weight', 'decoder.block.13.layer.2.DenseReluDense.wi.weight', 'decoder.block.14.layer.1.EncDecAttention.q.weight', 'decoder.block.13.layer.0.SelfAttention.o.weight', 'decoder.block.5.layer.0.SelfAttention.o.weight', 'decoder.block.3.layer.1

In [83]:
import pandas as pd
import numpy as np
import h5py

# NOTE(review): n_seqs is not referenced by any code visible in this notebook —
# confirm it is unused before removing.
n_seqs = 500
# Number of sequences per model forward pass; read as a global by extract_embed.
batch_size = 100

def extract_embed(path, h5_path):
    """Embed the sequences of a CSV file and store one pooled vector per sequence.

    Reads the ``Sequence`` column of the comma-separated file at ``path``,
    removes ``*`` characters, maps the residues U, Z, O, J and B to X, runs the
    sequences through the module-level ``tokenizer`` and ``model`` in chunks of
    the module-level ``batch_size``, mean-pools the per-token embeddings of
    each sequence and writes the result to ``h5_path``.

    Args:
        path: Path of the input CSV file; it must contain a ``Sequence`` column.
        h5_path: Path of the HDF5 file to create (an existing file is
            overwritten). The pooled vectors are stored in the dataset
            ``embedding`` with shape (number of sequences, hidden size).

    Returns:
        None. The result is written to ``h5_path`` and printed.
    """
    seq_df = pd.read_csv(path, sep=",")

    # Drop "*" characters and separate the residues with single spaces.
    sequences = [" ".join(re.sub(r"\*", "", seq)) for seq in seq_df["Sequence"]]

    # Map rarely occurring amino acids (U, Z, O, J, B) to X.
    print("mapping to rare amino acids...")
    sequences = [re.sub(r"[UZOJB]", "X", sequence) for sequence in sequences]

    # Tokenize all sequences at once so they share a single padded length.
    print("tokenization...")
    ids = tokenizer.batch_encode_plus(
        sequences, add_special_tokens=True, padding=True
    )
    input_ids = torch.tensor(ids["input_ids"])
    attention_mask = torch.tensor(ids["attention_mask"])

    print("computing embedding...")
    # Run the inputs on whatever device the model lives on (CPU or GPU).
    device = model.device
    agg_features = []
    # Step through *all* sequences; the last chunk may be shorter than
    # batch_size and must not be dropped.
    for start in range(0, len(sequences), batch_size):
        stop = start + batch_size
        batch_mask = attention_mask[start:stop]
        with torch.no_grad():
            output = model(
                input_ids=input_ids[start:stop].to(device),
                attention_mask=batch_mask.to(device),
            )
        hidden = output.last_hidden_state.cpu().numpy()

        # Pool per batch so the full per-token embeddings of the whole data set
        # never have to be held in memory at the same time.
        lengths = (batch_mask == 1).sum(dim=1).tolist()
        for token_embeddings, length in zip(hidden, lengths):
            # Ignore padding and the last attended token (presumably the
            # end-of-sequence token added by add_special_tokens=True).
            agg_features.append(np.mean(token_embeddings[: length - 1], axis=0))
    print("embedding computed")

    X = np.array(agg_features)
    print(X.shape, X)
    # The context manager closes the file even if writing the dataset fails.
    with h5py.File(h5_path, "w") as hf:
        hf.create_dataset("embedding", data=X)

In [None]:
# Embed the sequences in train/dataframe/positive_sites.csv and write the pooled features next to it.
extract_embed("../data/train/dataframe/positive_sites.csv", "../data/train/dataframe/prot_t5_xl_bfd_features_pos.h5")

mapping to rare amino acids...
tokenization...
computing embedding...


In [None]:
# Embed the sequences in train/dataframe/negative_sites.csv and write the pooled features next to it.
extract_embed("../data/train/dataframe/negative_sites.csv", "../data/train/dataframe/prot_t5_xl_bfd_features_neg.h5")

In [None]:
# Embed the sequences in test/dataframe/positive_sites.csv and write the pooled features next to it.
extract_embed("../data/test/dataframe/positive_sites.csv", "../data/test/dataframe/prot_t5_xl_bfd_features_pos.h5")

In [None]:
# Embed the sequences in test/dataframe/negative_sites.csv and write the pooled features next to it.
extract_embed("../data/test/dataframe/negative_sites.csv", "../data/test/dataframe/prot_t5_xl_bfd_features_neg.h5")