# Tasks Assessing Protein Embeddings (TAPE)
https://github.com/songlab-cal/tape

In [None]:
# Colab only: mount Google Drive at /content/gdrive/.
# This snippet is needed in each notebook that mounts the drive.
from google.colab import drive  
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
%%capture
# Install the TAPE package; %%capture hides pip's output for this cell.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
!pip install tape_proteins

In [None]:
import numpy as np
import torch
from tape import ProteinBertModel, TAPETokenizer

# Padding budget, measured in residues of the raw sequence. The encoded input is
# longer than the raw string (36 residues -> 38 ids in this notebook), so the
# padded tensor ends up wider than MAX_PROT_LEN by that same difference.
MAX_PROT_LEN = 810
# Fill value for padded positions (the original zero-padding used the same value).
# NOTE(review): presumably the id of TAPE's <pad> token — confirm against the vocab.
PAD_TOKEN_ID = 0

model = ProteinBertModel.from_pretrained('bert-base')
tokenizer = TAPETokenizer(vocab='iupac')  # iupac is the vocab for TAPE models, use unirep for the UniRep model
#tokenizer = TAPETokenizer(vocab='unirep')

# Pfam Family: Hexapep, Clan: CL0536
sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
#sequence = 'MAASGKTSKSEPNHVIFKKISRDKSVTIYLGNRDYIDHVSQVQPVDGVVLVDPDLVKGKKVYVTLTCAFRYGQEDIDVIGLTFRRDLYFSRVQVYPPVGAASTPTKLQESLLKKLGSNTYPFLLTFPDYLPCSVMLQPAPQDSGKSCGVDFEVKAFATDSTDAEEDKIPKKSSVRLLIRKVQHAPLEMGPQPRAEAAWQFFMSDKPLHLAVSLNKEIYFHGEPIPVTVTVTNNTEKTVKKIKAFVEQVANVVLYSSDYYVKPVAMEEAQEKVPPNSTLTKTLTLLPLLANNRERRGIALDGKIKHEDTNLASSTIIKEGIDRTVLGILVSYQIKVKLTVSGFLGELTSSEVATEVPFRLMHPQPEDPAKESYQDANLVFEEFARHNLKDAGEAEEGKRDKNDVDE'

# Encode once as a 1-D int64 tensor; its length is the number of real (unpadded) tokens.
encoded_ids = torch.as_tensor(tokenizer.encode(sequence), dtype=torch.long)
num_real_tokens = encoded_ids.size(0)

# Same pad amount as before (MAX_PROT_LEN - len(sequence)), so downstream shapes are unchanged.
num_pad = MAX_PROT_LEN - len(sequence)
if num_pad < 0:
    # torch's pad would silently crop on a negative amount, so fail loudly instead.
    raise ValueError(
        f"sequence has {len(sequence)} residues, which exceeds MAX_PROT_LEN={MAX_PROT_LEN}"
    )

# Right-pad directly in torch, then add the batch dimension -> shape (1, num_real_tokens + num_pad).
token_ids = torch.nn.functional.pad(encoded_ids, (0, num_pad), value=PAD_TOKEN_ID).unsqueeze(0)

# 1 for real tokens, 0 for padding. Without this mask the model treats every padded
# position as a real token, and attention over them changes the embeddings of the
# actual residues.
input_mask = torch.zeros_like(token_ids)
input_mask[:, :num_real_tokens] = 1

# Inference only: skip building the autograd graph.
with torch.no_grad():
    output = model(token_ids, input_mask=input_mask)
sequence_output = output[0]

In [None]:
# Length of the raw sequence string, i.e. before tokenization or padding.
print(len(sequence))

36


In [None]:
# Shape of the model's first output for the padded input.
print(sequence_output.size())

torch.Size([1, 812, 768])


In [None]:
# Keep only the first half of the positions along dim 1 (its size floor-divided by 2);
# dims 0 and 2 are left untouched.
x = sequence_output[:,:sequence_output.size(1)//2,:]
x.size()

torch.Size([1, 406, 768])

In [None]:
# NOTE: this rebinds `token_ids` — previously the padded tensor fed to the model —
# to the unpadded encoding of `sequence` returned by the tokenizer.
token_ids = tokenizer.encode(sequence)

In [None]:
# Drop the first and last ids, leaving one id per residue of `sequence`.
# Presumably the dropped ids are the tokenizer's start/stop tokens — confirm against TAPETokenizer.
token_ids[1:-1]

array([11,  7, 23, 25,  9,  8, 21,  7, 15, 13, 11, 16, 11,  5, 13, 15, 15,
       17, 11,  7, 25, 13, 11, 22, 11, 22, 15, 25,  5,  5, 11,  5, 15, 13,
       23, 20])

In [None]:
# Shape of the first element of the model output (the value `sequence_output` was assigned from).
output[0].size()

torch.Size([1, 812, 768])

# Untrained Model

In [None]:
def test_basic():
    """Embed one fixed sequence with a randomly initialised ProteinBertModel.

    Builds a model from a fresh ``ProteinBertConfig`` (hidden size 480,
    intermediate size 1920, 12 hidden layers) without loading pretrained
    weights, encodes a hard-coded sequence with the 'iupac' vocabulary and
    runs a single forward pass.

    Returns:
        The first element of the model output. For the built-in sequence this
        notebook reports a size of ``[1, 38, 480]``.

    Note:
        Weights are randomly initialised and no seed is set here, so values
        differ between calls unless the caller seeds torch beforehand.
    """
    import torch
    from tape import ProteinBertModel, ProteinBertConfig, TAPETokenizer  # type: ignore

    config = ProteinBertConfig(hidden_size=480, intermediate_size=480 * 4, num_hidden_layers=12)
    model = ProteinBertModel(config)
    # A freshly constructed nn.Module is in training mode; switch to eval so
    # dropout does not add extra randomness to the returned embeddings.
    model.eval()
    tokenizer = TAPETokenizer(vocab='iupac')

    sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
    # Batch of one: add a leading dimension to the encoded ids.
    token_ids = torch.as_tensor(tokenizer.encode(sequence)).unsqueeze(0)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(token_ids)

    # output[1] (the pooled output) is intentionally not used here.
    return output[0]

In [None]:
# NOTE: this rebinds `output` — earlier the pretrained model's return value —
# to the single tensor returned by test_basic().
output = test_basic()
print(output.size())

torch.Size([1, 38, 480])
