<h3>Extracting protein sequences' features using the Electra pretrained model</h3>

<b>1. Load necessary libraries including huggingface transformers</b>

In [1]:
import os
import re
import urllib
import urllib.request

import torch
from transformers import ElectraTokenizer, ElectraForPreTraining, ElectraForMaskedLM, ElectraModel

<b>2. Set the URL locations of the Electra models and the vocabulary file</b>

In [2]:
# Common root of the UniRef100 Electra release; every remote file below lives under it.
_electraBaseUrl = 'ftp://rostlab.org/bio-transformers/models/electra/pytorch/uniref100'

# Model weights.
generatorModelUrl = _electraBaseUrl + '/generator/pytorch_model.bin'
discriminatorModelUrl = _electraBaseUrl + '/discriminator/pytorch_model.bin'

# Model configurations.
generatorConfigUrl = _electraBaseUrl + '/generator/config.json'
discriminatorConfigUrl = _electraBaseUrl + '/discriminator/config.json'

# Vocabulary file, taken from the discriminator folder on the server.
vocabUrl = _electraBaseUrl + '/discriminator/vocab.txt'

<b>3. Download Electra models and vocabulary files</b>

In [3]:
# Root folder for everything this notebook downloads; the path is relative,
# so it is resolved against the current working directory.
downloadFolderPath = 'tmp/electra/'

In [4]:
# One sub-folder per network, so each folder can be handed to `from_pretrained` on its own.
discriminatorFolderPath, generatorFolderPath = (
    os.path.join(downloadFolderPath, subFolder)
    for subFolder in ('discriminator', 'generator')
)

# Weights and configuration files sit inside the matching per-model folder.
discriminatorModelFilePath, discriminatorConfigFilePath = (
    os.path.join(discriminatorFolderPath, fileName)
    for fileName in ('pytorch_model.bin', 'config.json')
)
generatorModelFilePath, generatorConfigFilePath = (
    os.path.join(generatorFolderPath, fileName)
    for fileName in ('pytorch_model.bin', 'config.json')
)

# The vocabulary file is stored directly in the download root, not in a model folder.
vocabFilePath = os.path.join(downloadFolderPath, 'vocab.txt')

In [5]:
# Create both model folders (including missing parents).
# `exist_ok=True` replaces the separate existence check: the cell can be re-run
# safely and there is no window between "check" and "create".
for folderPath in (discriminatorFolderPath, generatorFolderPath):
    os.makedirs(folderPath, exist_ok=True)

In [6]:
def _downloadIfMissing(url, filePath):
    """Download `url` to `filePath` unless `filePath` already exists.

    The payload is written to a temporary ``<filePath>.part`` file and renamed
    to `filePath` only after the transfer has completed. An interrupted
    download therefore never leaves a truncated file that a later run would
    mistake for a finished one.

    Args:
        url: Remote location of the file.
        filePath: Local destination path.

    Raises:
        Whatever `urllib.request.urlretrieve` raises on a failed transfer; the
        partial file is removed before the exception propagates.
    """
    if os.path.exists(filePath):
        return
    partialPath = filePath + '.part'
    try:
        urllib.request.urlretrieve(url, partialPath)
        os.replace(partialPath, filePath)
    finally:
        # After a successful rename the partial file is gone; this only cleans up failures.
        if os.path.exists(partialPath):
            os.remove(partialPath)


# Same files, in the same order, as before: weights, then configs, then vocabulary.
for url, filePath in (
    (generatorModelUrl, generatorModelFilePath),
    (discriminatorModelUrl, discriminatorModelFilePath),
    (generatorConfigUrl, generatorConfigFilePath),
    (discriminatorConfigUrl, discriminatorConfigFilePath),
    (vocabUrl, vocabFilePath),
):
    _downloadIfMissing(url, filePath)

<b>4. Load the vocabulary and the Electra discriminator and generator models</b>

In [7]:
# Build the tokenizer from the downloaded vocabulary file. Despite its name,
# `vocab` is the ElectraTokenizer instance used later to encode the sequences.
# do_lower_case=False is passed explicitly — presumably so the upper-case
# amino-acid letters are kept as-is; confirm against the vocabulary file.
vocab = ElectraTokenizer(vocabFilePath, do_lower_case=False )

In [8]:
# Load the pre-training (discriminator) head from the local discriminator folder.
discriminator = ElectraForPreTraining.from_pretrained(discriminatorFolderPath)

In [9]:
# Load the masked-language-model (generator) head from the local generator folder.
generator = ElectraForMaskedLM.from_pretrained(generatorFolderPath)

In [10]:
# Load the bare ElectraModel from the same folder as `discriminator`;
# this is the model whose output is used for the final `features`.
electra = ElectraModel.from_pretrained(discriminatorFolderPath)

<b>5. Load the models onto the GPU if available and switch to inference mode</b>

In [11]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [12]:
# Move the discriminator to the selected device and put it in evaluation mode.
discriminator = discriminator.to(device).eval()

In [13]:
# Move the generator to the selected device and put it in evaluation mode.
generator = generator.to(device).eval()

In [14]:
# Move the base Electra model to the selected device and put it in evaluation mode.
electra = electra.to(device).eval()

<b>6. Create or load sequences and map rarely occurring amino acids (U,Z,O,B) to (X)</b>

In [15]:
# Two toy sequences with one space between residues. They contain Z and O,
# so the rare-residue mapping in the next step has something to replace.
sequences_Example = ["A E T C Z A O","S K T Z P"]

In [16]:
# Replace every rare residue letter (U, Z, O, B) with the placeholder X.
# NOTE: this rebinds `sequences_Example`, so the original strings are no longer available afterwards.
_rareResiduePattern = re.compile(r"[UZOB]")
sequences_Example = [_rareResiduePattern.sub("X", sequence) for sequence in sequences_Example]

<b>7. Tokenize and encode the sequences, then load them onto the GPU if possible</b>

In [17]:
# Encode the whole batch with special tokens added and pad every sequence to the
# length of the longest one, so the result can be turned into rectangular tensors.
# `padding=True` is the current spelling of the deprecated `pad_to_max_length=True`
# (which, with no `max_length` given, also padded to the longest sequence).
ids = vocab.batch_encode_plus(sequences_Example, add_special_tokens=True, padding=True)

In [18]:
# Turn the token ids and the padding mask into tensors on the selected device.
input_ids, attention_mask = (
    torch.tensor(ids[key]).to(device)
    for key in ('input_ids', 'attention_mask')
)

<b>8. Extract the sequences' features and move them to the CPU if needed</b>

In [19]:
# Forward pass without gradient tracking; only the first output is kept.
# NOTE(review): for ElectraForPreTraining the first output is presumably the
# per-token logits rather than hidden states — confirm against the transformers docs.
with torch.no_grad():
    discriminator_embedding = discriminator(
        input_ids=input_ids,
        attention_mask=attention_mask,
    )[0]

In [20]:
# Copy the result to host memory and convert it to a NumPy array.
# The name is rebound, so re-running this cell alone fails (NumPy arrays have no `.cpu()`);
# re-run the forward-pass cell first.
discriminator_embedding = discriminator_embedding.cpu().numpy()

In [21]:
# Forward pass without gradient tracking; only the first output is kept.
# NOTE(review): for ElectraForMaskedLM the first output is presumably the
# vocabulary prediction scores rather than hidden states — confirm against the transformers docs.
with torch.no_grad():
    generator_embedding = generator(
        input_ids=input_ids,
        attention_mask=attention_mask,
    )[0]

In [22]:
# Copy the result to host memory and convert it to a NumPy array.
# The name is rebound, so re-running this cell alone fails (NumPy arrays have no `.cpu()`);
# re-run the forward-pass cell first.
generator_embedding = generator_embedding.cpu().numpy()

In [23]:
# Forward pass of the base model without gradient tracking; the first output
# is what the later cells slice into per-residue features.
with torch.no_grad():
    electra_embedding = electra(
        input_ids=input_ids,
        attention_mask=attention_mask,
    )[0]

In [24]:
# Copy the result to host memory and convert it to a NumPy array.
# The name is rebound, so re-running this cell alone fails (NumPy arrays have no `.cpu()`);
# re-run the forward-pass cell first.
electra_embedding = electra_embedding.cpu().numpy()

<b>9. Remove padding ([PAD]) and special tokens ([CLS],[SEP]) that are added by the Electra model</b>

In [25]:
# Number of non-padding tokens per sequence, computed once for the whole batch
# and converted to plain Python ints. The original used a 0-dim (possibly CUDA)
# tensor as a NumPy slice bound, forcing an implicit device-to-host conversion
# on every iteration.
seq_lens = (attention_mask == 1).sum(dim=1).tolist()

# For each sequence keep positions 1 .. seq_len-2: this drops the leading special
# token, the trailing special token and everything after it (the padding).
# Assumes padding is appended on the right, as the slicing below requires.
features = []
for seq_num, seq_len in enumerate(seq_lens):
    seq_emd = electra_embedding[seq_num][1:seq_len - 1]
    features.append(seq_emd)

In [26]:
# Show the extracted per-residue features: one array per input sequence.
# NOTE(review): this prints the full arrays; consider showing only shapes for larger inputs.
print(features)

[array([[-3.11757270e-02, -1.18080482e-01, -1.51422888e-01, ...,
        -8.80779102e-02, -2.03648835e-01,  2.34548226e-02],
       [-6.92144260e-02, -7.63375461e-02, -1.78090129e-02, ...,
        -4.15136591e-02, -3.08615640e-02, -8.58286917e-02],
       [ 3.80903780e-02, -1.71692193e-01, -5.64221852e-02, ...,
        -1.18379034e-01, -9.77955908e-02,  2.44729687e-02],
       ...,
       [ 1.27262741e-01, -1.34989783e-01, -3.06518853e-01, ...,
         3.99144813e-02, -4.54520248e-02, -3.57909858e-01],
       [-5.05250208e-02, -9.02514383e-02,  6.78477511e-02, ...,
        -4.76735011e-02, -9.57429931e-02, -1.68221872e-02],
       [ 3.07772551e-02,  7.55244400e-05, -5.32223955e-02, ...,
        -1.47998398e-02, -1.57045141e-01, -9.64659974e-02]], dtype=float32), array([[-6.04736097e-02, -1.60798252e-01, -1.63700730e-01, ...,
        -7.67329112e-02, -1.51252106e-01, -4.52130586e-02],
       [-9.30741653e-02, -5.02011962e-02, -1.62956715e-02, ...,
        -2.65419309e-04, -2.70892959e-