In [164]:
import torch
import esm
import os
import sys
sys.path.append("../src")

from utils import get_pseudo_likelihood
# Load ESM-2 model


In [3]:
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()  # disables dropout for deterministic results

# Prepare data (first 2 sequences from ESMStructuralSplitDataset superfamily / 4)
data = [
    ("protein1", "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"),
    ("protein2", "KALTARQQEVFDLIRDHISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE"),
    ("protein2 with mask","KALTARQQEVFDLIRD<mask>ISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE"),
    ("protein3",  "K A <mask> I S Q"),
]
batch_labels, batch_strs, batch_tokens = batch_converter(data)
batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

# Extract per-residue representations (on CPU)


In [74]:
sum(batch_tokens[0] != alphabet.padding_idx)

tensor(67)

In [8]:
with torch.no_grad():
    results = model(batch_tokens, repr_layers=[33], return_contacts=True)
token_representations = results["representations"][33]

# Generate per-sequence representations via averaging
# NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1.
sequence_representations = []
for i, tokens_len in enumerate(batch_lens):
    sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0))

# Look at the unsupervised self-attention map contact predictions


In [None]:
sequences = ["MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
             "KALTARQQEVFDLIRDHISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE",
             "KALTARQQEVFDLIRDISQTGMPPTRAEIAKALTFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE",
             "KAISQ"]

### ProtBERT Exploration

In [80]:
from transformers import BertModel, BertTokenizer, BertForMaskedLM, pipeline
import pandas as pd
import numpy as np
from tqdm import tqdm 
import torch
import re
import protbert

tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
model = BertModel.from_pretrained("Rostlab/prot_bert")

def prepare_sequence(sequences):
    sequences = [add_space(sequence) for sequence in sequences]
    return sequences

def add_space(row):
    if not isinstance(row, float):
        row = " ".join(row)
    return row









In [59]:
sequences = prepare_sequence(sequences)

for sequence,_ in zip(enumerate(sequences), tqdm(range(len(sequences)))):
            if not isinstance(sequence[1], float):
                tokenized_sequences = tokenizer(sequence[1], return_tensors= 'pt') #return tensors using pytorch
                output = model(**tokenized_sequences)

 75%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 3/4 [01:09<00:23, 23.15s/it]


### ProtBERT likelihood computation

In [84]:
### attempt 1

mask_model = BertForMaskedLM.from_pretrained("Rostlab/prot_bert")
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
unmasker = pipeline('fill-mask', model=mask_model, tokenizer=tokenizer, top_k = 21)

def prepare_sequence_MLM(seq_tokens, pos):
    x = seq_tokens.copy()
    x[pos] = "[MASK]"
    return ' '.join(x)

probs = []
for sequence in tqdm(sequences):
    probs_seq = []
    seq_tokens = list(sequence)
    for pos in tqdm(range(len(sequence))):
        prep_seq = prepare_sequence_MLM(seq_tokens, pos)
        scores = unmasker(prep_seq)
        scores_dict = {dict["token_str"]:dict["score"] for dict in scores}
        probs_seq.append(scores_dict)
    probs.append(pd.DataFrame(probs_seq))

                


Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
### attempt 2

import scipy
probs = []
for sequence in tqdm(sequences):
    seq_tokens = ' '.join(list(sequence))
    seq_tokens = tokenizer(seq_tokens, return_tensors='pt')
    logits = mask_model(**seq_tokens).logits[0].detach().numpy()
    prob = scipy.special.softmax(logits,axis = 1)
    df = pd.DataFrame(prob, columns = tokenizer.vocab)
    df = df.iloc[:,5:-5]
    df = df.loc[:, df.columns.isin(["U","Z","O","B","X"]) == False]
    #removing CLS and SEP
    df = df.iloc[1:-1,:]
    df = df.reindex(sorted(df.columns), axis=1)
    probs.append(df)
    


### Antiberty likelihood Computation

In [146]:
import antiberty

antiberty = antiberty.AntiBERTyRunner()

pll = antiberty.pseudo_log_likelihood(sequences,batch_size=1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 65/65 [01:28<00:00,  1.37s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71/71 [01:51<00:00,  1.57s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70/70 [01:50<00:00,  1.57s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  5.99it/s]


In [150]:
pll

tensor([-3.6782, -3.5895, -3.4208, -4.2961])

### Ablang likelihood Computation



In [152]:
import ablang

heavy_ablang = ablang.pretrained("heavy")
heavy_ablang.freeze()



Downloading model ...


In [165]:
probs = []
for sequence in tqdm(sequences):
    logits = heavy_ablang(sequence, mode="likelihood")[0]
    prob = scipy.special.softmax(logits,axis = 1)
    df = pd.DataFrame(prob, columns = list(heavy_ablang.tokenizer.vocab_to_aa.values())[4:])
    #removing CLS and SEP
    df = df.iloc[1:-1,:]
    df = df.reindex(sorted(df.columns), axis=1)
    probs.append(df)

likelihoods = get_pseudo_likelihood(probs, sequences) 


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:16<00:00,  4.14s/it]


In [None]:
tcrbert = BertModel.from_pretrained("wukevin/tcr-bert")
tcrbert_tokenizer = BertTokenizer.from_pretrained("wukevin/tcr-bert")

for sequence in tqdm(sequences):
    seq_tokens = ' '.join(sequence)
    seq_tokens = tcrbert_tokenizer(seq_tokens, return_tensors='pt')
    logits = tcrbert(**seq_tokens).logits[0].detach().numpy()
    prob = scipy.special.softmax(logits,axis = 1)
    df = pd.DataFrame(prob, columns = tcrbert_tokenizer.vocab)
    df = df.iloc[:,5:-5]
    df = df.loc[:, df.columns.isin(["U","Z","O","B","X"]) == False]
    #removing CLS and SEP
    df = df.iloc[1:-1,:]
    df = df.reindex(sorted(df.columns), axis=1)
    probs.append(df)

In [206]:
import torch
import pandas as pd
import numpy as np
import skorch
from transformers import BertModel, BertTokenizer


class TCRBert():
    """
    Class for the TCRBert Language Model
    """

    def __init__(self, method='average', file_name = 'TCRBert'):
        """
        Creates the instance of the language model instance, loads tokenizer and model
        """
        self.model = BertModel.from_pretrained("wukevin/tcr-bert-mlm-only")
        self.tokenizer = BertTokenizer.from_pretrained("wukevin/tcr-bert-mlm-only")
        self.method = method
        self.file = file_name
        
    def fit_transform(self, sequences:list):
        """
        Fits the model and outputs the embeddings.
        parameters
        ----------
        sequences: `list` 
        List with sequences to be transformed
        ------
        None, saved the embeddings in the embeddings.csv
        """
        embeddings = []
        print("Using '"+self.method+"' Method")
        for sequence in sequences:
            sequence = ' '.join(sequence)
            token = self.tokenizer(sequence, return_tensors="pt")
            print(token)
            output = self.model(**token)
            if self.method == "average":
                output = torch.mean(output.last_hidden_state, axis = 1)[0]
                    
            elif self.method == "pooler":
                output = output.pooler_output[0]
                
            elif self.method == "first":
                output = output.last_hidden_state[0,0,:]

            elif self.method == "last":
                output = output.last_hidden_state[0,-1,:]
            
            embeddings.append(output.tolist())
        
        pd.DataFrame(embeddings).to_csv("outfiles/"+self.file+"/embeddings.csv")


In [207]:
tcrbert = TCRBert()

Some weights of BertModel were not initialized from the model checkpoint at wukevin/tcr-bert-mlm-only and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [208]:
tcrbert.fit_transform(sequences)

Using 'average' Method
{'input_ids': tensor([[25, 17,  2,  6, 14,  0,  8,  4,  0, 16,  2,  5, 15, 14,  0, 15, 16,  4,
          0,  5,  2,  4, 12, 14,  5, 11, 13,  8, 16, 13,  4,  4, 16,  5, 14,  5,
          0,  8, 14, 15, 14,  8,  3, 15, 13, 19, 16,  0,  5, 16, 11, 19,  7, 15,
         14, 13,  6, 12,  0, 11, 19, 14, 16, 13, 11, 11, 24]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


RuntimeError: The size of tensor a (67) must match the size of tensor b (64) at non-singleton dimension 1

In [214]:
pd.DataFrame({"a":["xyz"]}).to_numpy()


array([['xyz']], dtype=object)