In [1]:
import torch
import esm
import os
import sys
sys.path.append("../src")

# Load ESM-2 model


In [20]:
from utils import get_pseudo_likelihood

In [3]:
# Load the pretrained ESM-2 checkpoint (esm2_t33_650M_UR50D) together with its alphabet,
# and build the converter that turns (label, sequence) pairs into a token batch.
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()  # disables dropout for deterministic results

# Prepare data as (label, sequence) pairs.
# The first two entries are the sequences from ESMStructuralSplitDataset superfamily / 4;
# the last two entries contain a literal `<mask>` token inside the sequence string.
data = [
    ("protein1", "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"),
    ("protein2", "KALTARQQEVFDLIRDHISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE"),
    ("protein2 with mask","KALTARQQEVFDLIRD<mask>ISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE"),
    ("protein3",  "K A <mask> I S Q"),
]
batch_labels, batch_strs, batch_tokens = batch_converter(data)
# Number of non-padding tokens in each row of the batch.
batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

# Extract per-residue representations (on CPU)


In [8]:
# Forward pass without gradient tracking; asks for the layer-33 representations
# and for contact predictions (the contacts are not read anywhere in this cell).
with torch.no_grad():
    results = model(batch_tokens, repr_layers=[33], return_contacts=True)
token_representations = results["representations"][33]

# Generate per-sequence representations via averaging
# NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1.
# The slice below also leaves out the last counted token of each row
# (presumably the end-of-sequence token -- confirm against the ESM alphabet).
sequence_representations = []
for i, tokens_len in enumerate(batch_lens):
    sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0))

# Look at the unsupervised self-attention map contact predictions


In [2]:
# Raw (un-spaced) amino-acid strings consumed by the language-model cells below.
# The first two strings are identical to protein1/protein2 in the ESM `data` list;
# none of the entries contains a mask token.
sequences = ["MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
             "KALTARQQEVFDLIRDHISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE",
             "KALTARQQEVFDLIRDISQTGMPPTRAEIAKALTFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE",
             "KAISQ"]

### ProtBERT Exploration

In [3]:
from transformers import BertModel, BertTokenizer, BertForMaskedLM, pipeline
import pandas as pd
import numpy as np
from tqdm import tqdm 
import torch
import re
import protbert

# ProtBERT tokenizer (input case is preserved) and the bare ProtBERT encoder.
# NOTE: this rebinds `model`, which the ESM cells above use for the ESM-2 network;
# re-running those cells after this one will pick up the ProtBERT model instead.
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
model = BertModel.from_pretrained("Rostlab/prot_bert")

def prepare_sequence(sequences):
    """
    Return a new list in which every entry of ``sequences`` has been passed
    through ``add_space`` (entries keep their original order).
    """
    return list(map(add_space, sequences))

def add_space(row):
    """
    Insert a single space between the elements of ``row``.

    Float inputs (e.g. a NaN placeholder for a missing sequence) are handed
    back untouched; anything else is joined element-by-element with " ".
    """
    if isinstance(row, float):
        # Nothing to join for a float placeholder.
        return row
    return " ".join(row)









In [59]:
# Build the space-separated ProtBERT inputs under a NEW name.
# Overwriting `sequences` here made this cell non-idempotent (a re-run would
# insert spaces between the spaces) and handed space-separated strings to the
# later cells, which index `sequences` character by character.
spaced_sequences = prepare_sequence(sequences)

# Run every sequence through the ProtBERT encoder.
# Inference only, so gradient tracking is switched off.
# `output` is overwritten on each iteration: after the loop it holds the
# result for the last non-float entry only.
with torch.no_grad():
    for sequence in tqdm(spaced_sequences):
        if isinstance(sequence, float):
            # Float placeholders (left untouched by add_space) cannot be tokenized.
            continue
        tokenized_sequences = tokenizer(sequence, return_tensors="pt")  # PyTorch tensors
        output = model(**tokenized_sequences)

 75%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 3/4 [01:09<00:23, 23.15s/it]


### ProtBERT likelihood computation

In [84]:
### attempt 1

# ProtBERT with its masked-language-model head, plus the matching tokenizer.
mask_model = BertForMaskedLM.from_pretrained("Rostlab/prot_bert")
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
# Fill-mask pipeline; top_k = 21 asks for 21 candidate tokens per masked position
# (presumably chosen to cover the amino-acid alphabet -- confirm the intended value).
unmasker = pipeline('fill-mask', model=mask_model, tokenizer=tokenizer, top_k = 21)

def prepare_sequence_MLM(seq_tokens, pos):
    """
    Build the space-separated model input with the token at index ``pos``
    replaced by "[MASK]".

    ``seq_tokens`` itself is left unchanged; the replacement is done on a copy.
    """
    masked_tokens = seq_tokens.copy()
    masked_tokens[pos] = "[MASK]"
    return " ".join(masked_tokens)

# One DataFrame per sequence: row = masked position, columns = candidate tokens
# returned by the fill-mask pipeline, values = their scores.
# NOTE(review): the pipeline only returns its top_k candidates, so a token that
# is missing from a position's candidates shows up as NaN in that row.
probs = []
for sequence in tqdm(sequences):
    probs_seq = []
    # Ignore whitespace so the loop masks residues only. `sequences` may already
    # be space-separated if the embedding cell above has rewritten it; with
    # `list(sequence)` every space would have been masked as if it were a residue.
    seq_tokens = [residue for residue in sequence if not residue.isspace()]
    for pos in tqdm(range(len(seq_tokens))):
        prep_seq = prepare_sequence_MLM(seq_tokens, pos)
        scores = unmasker(prep_seq)
        # `candidate` instead of `dict`: do not shadow the builtin.
        scores_dict = {candidate["token_str"]: candidate["score"] for candidate in scores}
        probs_seq.append(scores_dict)
    probs.append(pd.DataFrame(probs_seq))

                


Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
### attempt 2
device = "cuda:0" if torch.cuda.is_available() else "cpu"

mask_model = BertForMaskedLM.from_pretrained("Rostlab/prot_bert").to(device)
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)

# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.special` is loaded on every SciPy version. This still binds the name
# `scipy`, so later cells that call `scipy.special.softmax` keep working.
import scipy.special
import pandas as pd

# One probability table per sequence (rows = residues, columns = amino acids),
# obtained from a single un-masked forward pass.
probs = []

for sequence in tqdm(sequences):
    seq_tokens = ' '.join(list(sequence))
    seq_tokens = tokenizer(seq_tokens, return_tensors='pt').to(device)
    # Inference only: skip building the autograd graph instead of detaching afterwards.
    with torch.no_grad():
        logits = mask_model(**seq_tokens).logits[0].cpu().numpy()
    prob = scipy.special.softmax(logits, axis=1)
    df = pd.DataFrame(prob, columns=tokenizer.vocab)
    # Positional trim of the first and last five vocab columns -- assumes the
    # special tokens come first and the rare residue codes last; TODO confirm
    # against the ProtBERT vocab file.
    df = df.iloc[:, 5:-5]
    # Name-based safety net for the rare/ambiguous residue codes.
    df = df.loc[:, ~df.columns.isin(["U", "Z", "O", "B", "X"])]
    # removing CLS and SEP
    df = df.iloc[1:-1, :]
    # Alphabetical column order.
    df = df.reindex(sorted(df.columns), axis=1)
    probs.append(df)
    


Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.73s/it]


In [39]:
def get_pseudo_likelihood(probs, sequences):
    """
    Average log-probability of each sequence's own residues.

    parameters
    ----------
    probs: `list`
    One table per sequence, indexed by position (rows) and residue label (columns).
    sequences: `list`
    The sequences the tables were computed for, in the same order as ``probs``.

    returns
    -------
    `list` with one mean log-probability per entry of ``probs``.
    """
    mean_log_probs = []
    for seq_idx, prob_table in enumerate(tqdm(probs)):
        residue_log_probs = []
        n_positions = prob_table.shape[0]
        for row in range(n_positions):
            residue = sequences[seq_idx][row]
            # Gap and stop symbols have no column in the table, so they are left out.
            if residue not in ("-", "*"):
                observed_prob = prob_table.iloc[row][residue]
                residue_log_probs.append(np.log(observed_prob))
        mean_log_probs.append(np.average(residue_log_probs))
    return mean_log_probs
    
# Score every table in `probs` against the matching entry of `sequences`.
# NOTE(review): the recorded progress bar for this cell counts 12570 items, so the
# saved output was produced with a different `probs`/`sequences` than the
# four-sequence list defined earlier in this notebook.
pseudo = get_pseudo_likelihood(probs, list(sequences))


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12570/12570 [02:11<00:00, 95.33it/s]


In [40]:
pseudo

[-0.1878769,
 -0.18736279,
 -0.19010155,
 -0.27160725,
 -0.20921014,
 -0.17248185,
 -0.16602251,
 -0.18929744,
 -0.21467482,
 -0.16016108,
 -0.22486228,
 -0.17152075,
 -0.17114305,
 -0.20624895,
 -0.18542437,
 -0.28184122,
 -0.20921014,
 -0.13305043,
 -0.15144818,
 -0.16825531,
 -0.15589093,
 -0.16131198,
 -0.14519814,
 -0.15034878,
 -0.18184976,
 -0.19896989,
 -0.13709423,
 -0.17970262,
 -0.21458061,
 -0.20334244,
 -0.19303684,
 -0.17274092,
 -0.13191147,
 -0.16480674,
 -0.23642138,
 -0.1905893,
 -0.1584998,
 -0.15452017,
 -0.16783366,
 -0.16203225,
 -0.17442696,
 -0.15610264,
 -0.17780803,
 -0.19270894,
 -0.17460558,
 -0.16304336,
 -0.15370403,
 -0.15483429,
 -0.19534646,
 -0.16098097,
 -0.20988026,
 -0.2032551,
 -0.13420674,
 -0.19551572,
 -0.12234064,
 -0.15980375,
 -0.2293673,
 -0.18524839,
 -0.17762665,
 -0.13517477,
 -0.21786273,
 -0.23571877,
 -0.15510426,
 -0.23508891,
 -0.26100925,
 -0.16012886,
 -0.23339401,
 -0.21458061,
 -0.17970262,
 -0.18607841,
 -0.1865574,
 -0.15395415

### AntiBERTy likelihood computation

In [25]:
import antiberty

# NOTE: from here on the name `antiberty` refers to the runner instance, not the
# imported module; the inspection cell below relies on that (`antiberty.tokenizer`).
antiberty = antiberty.AntiBERTyRunner()

# NOTE(review): the recorded run of this call ended in a RuntimeError about tensors
# living on both cuda:0 and cpu. Nothing in this cell moves tensors between devices,
# so the mismatch presumably originates inside the runner -- TODO confirm and pin a
# working AntiBERTy version or run on CPU.
pll = antiberty.pseudo_log_likelihood(sequences)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 83.34it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument target in method wrapper_CUDA__nll_loss2d_forward)

In [28]:
antiberty.tokenizer.encode("M M M", return_tensors="pt").device

device(type='cpu')

### AbLang likelihood computation



In [4]:
import ablang

# Load the pretrained AbLang model selected by the "heavy" identifier.
heavy_ablang = ablang.pretrained("heavy")
# Presumably switches the model to inference mode -- confirm against the AbLang docs.
# NOTE(review): the AttributeError recorded under this cell mentions a `.to` call
# that is not part of the code shown here, so the saved output is stale.
heavy_ablang.freeze()



AttributeError: 'pretrained' object has no attribute 'to'

In [165]:
# Rebuild `probs` from scratch: one probability table per sequence.
# Relies on `scipy`, `pd`, `tqdm` and `get_pseudo_likelihood` from earlier cells.
probs = []
for sequence in tqdm(sequences):
    # First element of the "likelihood"-mode result -- presumably the per-position
    # logits for this single sequence; confirm against the AbLang docs.
    logits = heavy_ablang(sequence, mode="likelihood")[0]
    # Softmax across columns turns every row into a distribution.
    prob = scipy.special.softmax(logits,axis = 1)
    # Column labels are the tokenizer's vocab values with the first four dropped --
    # assumes those four are non-residue tokens absent from the logits; TODO confirm.
    df = pd.DataFrame(prob, columns = list(heavy_ablang.tokenizer.vocab_to_aa.values())[4:])
    #removing CLS and SEP
    df = df.iloc[1:-1,:]
    # Alphabetical column order.
    df = df.reindex(sorted(df.columns), axis=1)
    probs.append(df)

# Mean log-probability of each sequence's own residues under its table.
likelihoods = get_pseudo_likelihood(probs, sequences) 


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:16<00:00,  4.14s/it]


In [None]:
# The masked-LM head is required here: the bare `BertModel` output has no
# `.logits` attribute, so the original cell failed on its first iteration.
# NOTE(review): the TCRBert class further down loads "wukevin/tcr-bert-mlm-only";
# confirm which checkpoint actually ships trained masked-LM head weights.
tcrbert = BertForMaskedLM.from_pretrained("wukevin/tcr-bert")
tcrbert_tokenizer = BertTokenizer.from_pretrained("wukevin/tcr-bert")

# Start from an empty list, as the other likelihood cells do; otherwise the
# tables would be appended to whatever the previous cell left in `probs`.
probs = []

# The 20 standard residues. Selecting columns by name keeps the result independent
# of where this tokenizer places its special tokens (its vocab layout is not
# verified to match ProtBERT's, which the positional slice was copied from).
standard_residues = list("ACDEFGHIKLMNPQRSTVWY")

for sequence in tqdm(sequences):
    seq_tokens = ' '.join(sequence)
    seq_tokens = tcrbert_tokenizer(seq_tokens, return_tensors='pt')
    # Inference only: no autograd graph needed.
    with torch.no_grad():
        logits = tcrbert(**seq_tokens).logits[0].numpy()
    prob = scipy.special.softmax(logits, axis=1)
    df = pd.DataFrame(prob, columns=tcrbert_tokenizer.vocab)
    df = df.loc[:, df.columns.isin(standard_residues)]
    # removing CLS and SEP
    df = df.iloc[1:-1, :]
    # Alphabetical column order.
    df = df.reindex(sorted(df.columns), axis=1)
    probs.append(df)

In [206]:
import torch
import pandas as pd
import numpy as np
import skorch
from transformers import BertModel, BertTokenizer


class TCRBert():
    """
    Class for the TCRBert Language Model

    Loads the "wukevin/tcr-bert-mlm-only" encoder and tokenizer and turns
    sequences into one fixed-size embedding each, written to a CSV file.
    """

    def __init__(self, method='average', file_name = 'TCRBert'):
        """
        Creates the instance of the language model instance, loads tokenizer and model

        parameters
        ----------
        method: `str`
        How the per-token hidden states are reduced to a single vector:
        "average" (mean over all tokens), "pooler" (the model's pooler output),
        "first" (first token) or "last" (last token).
        file_name: `str`
        Sub-directory of ``outfiles/`` that receives ``embeddings.csv``.
        """
        # NOTE(review): the load warning recorded in this notebook says the pooler
        # weights are newly initialized for this checkpoint, so method="pooler"
        # yields embeddings from untrained weights.
        self.model = BertModel.from_pretrained("wukevin/tcr-bert-mlm-only")
        self.tokenizer = BertTokenizer.from_pretrained("wukevin/tcr-bert-mlm-only")
        self.method = method
        self.file = file_name

    def _pool(self, output):
        """
        Reduce one model output to a 1-D embedding according to ``self.method``.

        Raises ``ValueError`` for an unknown method instead of letting the raw
        model output fail later with an unrelated AttributeError.
        """
        if self.method == "average":
            return torch.mean(output.last_hidden_state, dim=1)[0]
        if self.method == "pooler":
            return output.pooler_output[0]
        if self.method == "first":
            return output.last_hidden_state[0,0,:]
        if self.method == "last":
            return output.last_hidden_state[0,-1,:]
        raise ValueError(
            "Unknown method " + repr(self.method)
            + "; expected 'average', 'pooler', 'first' or 'last'"
        )

    def fit_transform(self, sequences:list):
        """
        Fits the model and outputs the embeddings.
        parameters
        ----------
        sequences: `list` 
        List with sequences to be transformed
        returns
        ------
        None, saved the embeddings in outfiles/<file_name>/embeddings.csv
        (one row per sequence; the directory is created if it does not exist)
        raises
        ------
        ValueError if ``self.method`` is not one of the supported pooling methods
        """
        # Local import: the import block of this cell does not bring `os` into scope.
        import os

        embeddings = []
        print("Using '"+self.method+"' Method")
        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            for sequence in sequences:
                # The tokenizer expects residues separated by spaces.
                spaced_sequence = ' '.join(sequence)
                token = self.tokenizer(spaced_sequence, return_tensors="pt")
                output = self.model(**token)
                embeddings.append(self._pool(output).tolist())

        # Create the target directory first; to_csv does not create missing folders.
        out_dir = "outfiles/"+self.file
        os.makedirs(out_dir, exist_ok=True)
        pd.DataFrame(embeddings).to_csv(out_dir+"/embeddings.csv")


In [207]:
# Instantiate the wrapper with its defaults (method='average', file_name='TCRBert').
# NOTE: this rebinds `tcrbert`, which an earlier cell uses for the raw Hugging Face model.
tcrbert = TCRBert()

Some weights of BertModel were not initialized from the model checkpoint at wukevin/tcr-bert-mlm-only and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
