In [1]:
import torch
import torch.nn as nn
import numpy as np
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import ast #used to parse list string
import torch.nn.functional as F

#import esm 
import os
import subprocess
import sys

  from .autonotebook import tqdm as notebook_tqdm


# Example 1: A single prediction

Here, we can make a single prediction from a protein sequence.

### Loading the model

In [16]:
class TPMLPmodel(nn.Module):
    """MLP with three hidden stages that maps a feature vector to one probability.

    Each hidden stage is Linear -> BatchNorm1d -> Dropout -> GELU, with widths
    ``common_dim``, ``common_dim // 2`` and ``common_dim // 4``. A final linear
    layer emits one value per sample, which is squashed with a sigmoid.

    Args:
        input_size: Number of features in each input vector.
        common_dim: Width of the first hidden layer. Float values such as
            ``512.0`` are accepted and truncated to ``int``.
        dropout_rate: Dropout probability used in every hidden stage.
    """

    def __init__(self, input_size=2560, common_dim=1024, dropout_rate=0.2):
        super().__init__()

        # Coerce once so every layer receives an int width. Previously only
        # some layers were cast, so a float common_dim failed in BatchNorm1d.
        common_dim = int(common_dim)
        half_dim = common_dim // 2
        quarter_dim = common_dim // 4

        # Attribute names and registration order are kept unchanged so that
        # previously saved state dicts still load.
        self.fc1 = nn.Linear(input_size, common_dim)
        self.bn1 = nn.BatchNorm1d(common_dim)
        self.dropout1 = nn.Dropout(dropout_rate)

        self.fc2 = nn.Linear(common_dim, half_dim)
        self.bn2 = nn.BatchNorm1d(half_dim)
        self.dropout2 = nn.Dropout(dropout_rate)

        self.fc3 = nn.Linear(half_dim, quarter_dim)
        self.bn3 = nn.BatchNorm1d(quarter_dim)
        self.dropout3 = nn.Dropout(dropout_rate)

        self.fc4 = nn.Linear(quarter_dim, 1)

    def forward(self, x):
        """Run the network on a 2-D batch and return one probability per row.

        Args:
            x: Tensor of shape ``(batch, input_size)``.

        Returns:
            Tensor of shape ``(batch,)`` with values produced by a sigmoid.
        """
        # Dropout is applied before the activation in every stage; this order
        # is deliberately left as it was.
        x = self.dropout1(self.bn1(self.fc1(x)))
        x = F.gelu(x)

        x = self.dropout2(self.bn2(self.fc2(x)))
        x = F.gelu(x)

        x = self.dropout3(self.bn3(self.fc3(x)))
        x = F.gelu(x)

        x = self.fc4(x)
        # fc4 yields (batch, 1); drop the trailing axis after the sigmoid.
        x = torch.sigmoid(x).squeeze(1)

        return x

In [17]:
# Build the classifier for 1536-feature inputs, then restore its saved
# parameters, mapping all tensors onto the CPU.
model1 = TPMLPmodel(input_size=1536, common_dim=512, dropout_rate=0.2)
model1.load_state_dict(
    torch.load("../weights/weights.pth", map_location='cpu')
)
# Switch to evaluation mode for inference; eval() returns the module, so the
# layer summary is shown as this cell's output.
model1.eval()


TPMLPmodel(
  (fc1): Linear(in_features=1536, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout3): Dropout(p=0.2, inplace=False)
  (fc4): Linear(in_features=128, out_features=1, bias=True)
)

### Running on a single sequence and applying a cutoff


Define our sequence

In [2]:
# Protein sequence to classify, given as a plain string of residue letters.
sequence = "MRAAYACDPMATRGRAVVEEESAHRSPFQRDRDRIIHSSAFRRLKH"


We first need to generate an Ankh embedding.

In [7]:
import ankh

In [8]:

# Use the GPU only when one is actually usable. torch.device("cuda") merely
# builds a device handle and does not raise when no GPU is present, so a
# try/except around it never falls back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load ankh model and place it on the selected device.
model_ankh, tokenizer_ankh = ankh.load_large_model()
model_ankh = model_ankh.to(device)
model_ankh.eval()


def get_ankh(sequence, max_length=1024):
    """Return the mean-pooled Ankh embedding of a single protein sequence.

    Args:
        sequence: Protein sequence as a string.
        max_length: Maximum number of characters kept; anything beyond this
            is truncated before embedding. Defaults to 1024.

    Returns:
        A list of floats: the encoder's last hidden state for this sequence,
        averaged over all tokens the tokenizer produced (special tokens
        included, since ``add_special_tokens=True``).

    Raises:
        ValueError: If the sequence is empty after truncation.
    """
    # Limit the sequence length.
    sequence = sequence[:max_length]
    if not sequence:
        raise ValueError("sequence must be a non-empty string")
    # TODO: validate that the sequence contains only letters.

    # A batch of one sequence.
    # NOTE(review): the whole sequence is passed as one pre-split "word";
    # confirm this matches how the embeddings used to train the downstream
    # classifier were produced before changing it.
    protein_sequences = [[sequence]]
    outputs = tokenizer_ankh.batch_encode_plus(protein_sequences,
                                        add_special_tokens=True,
                                        padding=True,
                                        is_split_into_words=True,
                                        return_tensors="pt")
    with torch.no_grad():
        # Inputs must live on the same device as the model.
        embeddings = model_ankh(input_ids=outputs['input_ids'].to(device),
                                attention_mask=outputs['attention_mask'].to(device))
    # Select the single batch element explicitly. squeeze() would also drop
    # the token axis when only one token is present, making mean(0) average
    # over the hidden dimension instead.
    token_embeddings = embeddings["last_hidden_state"][0]
    true_tensor = token_embeddings.mean(0).cpu().tolist()
    return true_tensor

Some weights of the model checkpoint at ElnaggarLab/ankh-large were not used when initializing T5EncoderModel: ['decoder.block.10.layer.0.SelfAttention.v.weight', 'decoder.block.17.layer.1.EncDecAttention.o.weight', 'decoder.block.19.layer.2.DenseReluDense.wo.weight', 'decoder.block.19.layer.1.EncDecAttention.k.weight', 'decoder.block.2.layer.1.EncDecAttention.o.weight', 'decoder.block.1.layer.1.EncDecAttention.k.weight', 'decoder.block.11.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.5.layer.2.DenseReluDense.wo.weight', 'decoder.block.20.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.22.layer.0.SelfAttention.q.weight', 'decoder.block.6.layer.0.SelfAttention.o.weight', 'decoder.block.1.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.17.layer.2.layer_norm.weight', 'decoder.block.0.layer.2.layer_norm.weight', 'decoder.block.2.layer.0.SelfAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.o.weight', 'decoder.b

In [9]:

# Embed the sequence; get_ankh returns the averaged embedding as a Python list.
ankh_embedings = get_ankh(sequence)

Convert the embedding to a 2-D float tensor, the input format the model expects.

In [19]:
# Turn the embedding list into a float32 tensor and prepend a batch axis,
# because the model takes 2-D input.
X_prot_test_tensor = torch.tensor(ankh_embedings, dtype=torch.float32)[None]
print(X_prot_test_tensor.shape)

torch.Size([1, 1536])


Get the predicted value as a probability.

In [21]:
# Forward pass with gradient tracking disabled, converted straight to a
# NumPy array for the thresholding step below.
with torch.no_grad():
    test_pred = model1(X_prot_test_tensor).numpy()
print("predicted value:", test_pred)

predicted value: [0.00079683]


Classify the prediction using our cutoff: 1 means the protein is predicted to be a transporter, 0 means it is not.

In [26]:
# Decision threshold: predictions strictly above this value are labelled 1.
# NOTE(review): how this value was chosen is not shown in this notebook; document its provenance.
ideal_cutoff = 0.6938000000000001

In [27]:
# Threshold the probability into a 0/1 label (1 only when strictly above
# the cutoff), then report it both as a number and as a yes/no answer.
test_classified = np.greater(test_pred, ideal_cutoff).astype(int)
print("final classified value:", test_classified)
print("is our protein a transporter?", "yes" if test_classified != 0 else "no")

final classified value: [0]
is our protein a transporter? no
