## read sequences from fasta

In [5]:
from tqdm import tqdm

In [1]:
def read_fasta( fasta_path, split_char="!", id_field=0):
    '''
        Reads in fasta file containing multiple sequences.
        Split_char and id_field allow to control identifier extraction from header.
        E.g.: set split_char="|" and id_field=1 for SwissProt/UniProt Headers.
        Returns dictionary holding multiple sequences or only single
        sequence, depending on input file.
    '''

    seqs = dict()
    with open( fasta_path, 'r' ) as fasta_f:
        count = 0
        for line in fasta_f:
            # get uniprot ID from header and create new entry
            if line.startswith('>'):
                uniprot_id = line.replace('>', '').strip().split(split_char)[id_field]
                # replace tokens that are mis-interpreted when loading h5
                #uniprot_id = uniprot_id.replace("/","_").replace(".","_")
                uniprot_id = uniprot_id + "_" + str(count)
                count += 1
                seqs[ uniprot_id ] = ''
            else:
                # repl. all whie-space chars and join seqs spanning multiple lines, drop gaps and cast to upper-case
                seq= ''.join( line.split() ).upper().replace("-","")
                # repl. all non-standard AAs and map them to unknown/X
                seq = seq.replace('U','X').replace('Z','X').replace('O','X')
                seqs[ uniprot_id ] += seq
    example_id=next(iter(seqs))
    print("Read {} sequences.".format(len(seqs)))
    print("Example:\n{}\n{}".format(example_id,seqs[example_id]))

    return seqs

In [2]:
# read pickle file
path = "../datasets/aligned_seq.fasta"
all_seqs = read_fasta(path)

Read 260 sequences.
Example:
d1a2za__0
YSNITVERATLPVRAITKTLRDNGIPATISYSAYPLKAGFIHVPYTPDQVVNKFFLLGKNTPSMCLEAEIKAIELAVKVSLDYLEKDRDDIKIPL


## load model (using prostt5 as an example)

In [3]:
from transformers import T5Tokenizer, T5EncoderModel
import torch
import re

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)
model = T5EncoderModel.from_pretrained("Rostlab/ProstT5").to(device)
model.float() if device.type=='cpu' else model.half()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5EncoderModel(
  (shared): Embedding(150, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(150, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=4096, bias=False)
              (k): Linear(in_features=1024, out_features=4096, bias=False)
              (v): Linear(in_features=1024, out_features=4096, bias=False)
              (o): Linear(in_features=4096, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 32)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=16384, bias=False)
              (wo): Linear(in_features=16384, out_features=1024, bias=False)
              (dropout): Dropo

## get embeddings (using prostt5 as an example)

In [6]:
# key and value to two separate lists
keys = list(all_seqs.keys())
values = list(all_seqs.values())

In [7]:
seq_embeddings = []
for i in tqdm(values):
    sequences = [i]
    length = len(sequences[0])
    sequences = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequences]
    sequences = [ "<AA2fold>" + " " + s if s.isupper() else "<fold2AA>" + " " + s # this expects 3Di sequences to be already lower-case
                      for s in sequences
                    ]
    ids = tokenizer.batch_encode_plus(sequences,
                                  add_special_tokens=True,
                                  padding="longest",
                                  return_tensors='pt').to(device)
    with torch.no_grad():
      embedding_repr = model(
              ids.input_ids, 
              attention_mask=ids.attention_mask
              )
    emb = embedding_repr.last_hidden_state[0, 1 : length + 1].cpu().numpy()
    seq_embeddings.append(emb)

100%|██████████| 260/260 [00:09<00:00, 28.70it/s]


In [8]:
len(seq_embeddings), len(seq_embeddings[0]), seq_embeddings[0].shape

(260, 95, (95, 1024))

In [None]:
# save to pickle
import pickle
output_dict = dict(zip(keys, seq_embeddings))
with open("../embeddings/prostt5_aligned_embs.pkl", "wb") as f:
    pickle.dump(output_dict, f, protocol=4) 