<h3>Extracting protein sequence features using the pretrained Electra model</h3>

<b>1. Load the necessary libraries, including Hugging Face Transformers</b>

In [1]:
import os
import re
import urllib
import urllib.request

import numpy as np
import torch
from transformers import ElectraTokenizer, ElectraForPreTraining, ElectraForMaskedLM, ElectraModel, pipeline

<b>2. Set the URLs of the Electra models and the vocabulary file</b>

In [2]:
# Every remote file sits under the same uniref100 Electra directory on the
# rostlab.org FTP server, so the URLs are derived from one base location.
electraBaseUrl = 'ftp://rostlab.org/bio-transformers/models/electra/pytorch/uniref100'

# Model weights.
generatorModelUrl = electraBaseUrl + '/generator/pytorch_model.bin'
discriminatorModelUrl = electraBaseUrl + '/discriminator/pytorch_model.bin'

# Model configurations.
generatorConfigUrl = electraBaseUrl + '/generator/config.json'
discriminatorConfigUrl = electraBaseUrl + '/discriminator/config.json'

# Vocabulary file (taken from the discriminator directory).
vocabUrl = electraBaseUrl + '/discriminator/vocab.txt'

<b>3. Download the Electra models and the vocabulary file</b>

In [3]:
# Folder (relative to the current working directory) under which the
# downloaded Electra files are stored.
downloadFolderPath = 'tmp/electra/'

In [4]:
# Discriminator: its own sub-folder holding the weights and the config.
discriminatorFolderPath = os.path.join(downloadFolderPath, 'discriminator')
discriminatorModelFilePath = os.path.join(discriminatorFolderPath, 'pytorch_model.bin')
discriminatorConfigFilePath = os.path.join(discriminatorFolderPath, 'config.json')

# Generator: same layout in a sibling sub-folder.
generatorFolderPath = os.path.join(downloadFolderPath, 'generator')
generatorModelFilePath = os.path.join(generatorFolderPath, 'pytorch_model.bin')
generatorConfigFilePath = os.path.join(generatorFolderPath, 'config.json')

# The vocabulary file is stored directly in the download folder.
vocabFilePath = os.path.join(downloadFolderPath, 'vocab.txt')

In [5]:
# Create the per-model download folders (including missing parents).
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the folder and creating it.
for folderPath in (discriminatorFolderPath, generatorFolderPath):
    os.makedirs(folderPath, exist_ok=True)

In [6]:
def _download_if_missing(url, filePath):
    """Download ``url`` to ``filePath`` unless ``filePath`` already exists.

    The data is first written to ``filePath + '.part'`` and renamed to
    ``filePath`` only after the transfer has finished. An interrupted
    download therefore never leaves a truncated file that a later run
    would mistake for a complete one.

    Args:
        url: Remote location of the file.
        filePath: Local destination path; its folder must already exist.
    """
    if os.path.exists(filePath):
        return
    partialPath = filePath + '.part'
    try:
        urllib.request.urlretrieve(url, partialPath)
        os.replace(partialPath, filePath)
    finally:
        # After a successful rename the partial file is gone; this only
        # removes the leftover of a failed transfer.
        if os.path.exists(partialPath):
            os.remove(partialPath)


# Fetch every remote file that is not yet present locally.
for fileUrl, localFilePath in [
    (generatorModelUrl, generatorModelFilePath),
    (discriminatorModelUrl, discriminatorModelFilePath),
    (generatorConfigUrl, generatorConfigFilePath),
    (discriminatorConfigUrl, discriminatorConfigFilePath),
    (vocabUrl, vocabFilePath),
]:
    _download_if_missing(fileUrl, localFilePath)

<b>4. Load the vocabulary and the Electra discriminator and generator models</b>

In [7]:
# Build the tokenizer from the downloaded vocabulary file.
# do_lower_case=False is passed, presumably so that the upper-case residue
# letters are not lower-cased before lookup — TODO confirm.
vocab = ElectraTokenizer(vocabFilePath, do_lower_case=False )

In [8]:
# Load the pre-training (discriminator) model from the local discriminator folder.
discriminator = ElectraForPreTraining.from_pretrained(discriminatorFolderPath)

In [9]:
# Load the masked-language-model (generator) model from the local generator folder.
generator = ElectraForMaskedLM.from_pretrained(generatorFolderPath)

In [10]:
# Load a plain ElectraModel from the same discriminator folder.
electra = ElectraModel.from_pretrained(discriminatorFolderPath)

<b>5. Load the models onto the GPU if available</b>

In [11]:
# Wrap the discriminator in a feature-extraction pipeline. Use the first GPU
# when CUDA is available and fall back to the CPU (device=-1) otherwise,
# instead of failing on machines without a GPU.
# NOTE: this rebinds `discriminator` from the model to the pipeline, so
# re-run the model-loading cell before re-running this one.
discriminator = pipeline('feature-extraction', model=discriminator, tokenizer=vocab,
                         device=0 if torch.cuda.is_available() else -1)

In [12]:
# Wrap the generator in a feature-extraction pipeline. Use the first GPU
# when CUDA is available and fall back to the CPU (device=-1) otherwise,
# instead of failing on machines without a GPU.
# NOTE: this rebinds `generator` from the model to the pipeline, so
# re-run the model-loading cell before re-running this one.
generator = pipeline('feature-extraction', model=generator, tokenizer=vocab,
                     device=0 if torch.cuda.is_available() else -1)

In [13]:
# Wrap the plain Electra model in a feature-extraction pipeline. Use the
# first GPU when CUDA is available and fall back to the CPU (device=-1)
# otherwise, instead of failing on machines without a GPU.
# NOTE: this rebinds `electra` from the model to the pipeline, so
# re-run the model-loading cell before re-running this one.
electra = pipeline('feature-extraction', model=electra, tokenizer=vocab,
                   device=0 if torch.cuda.is_available() else -1)

<b>6. Create or load sequences and map rarely occurring amino acids (U, Z, O, B) to (X)</b>

In [14]:
# Example input: each sequence is one string of single-letter residues
# separated by spaces.
sequences_Example = ["A E T C Z A O","S K T Z P"]

In [15]:
# Replace every U, Z, O or B in the sequences with X.
rareResiduePattern = re.compile(r"[UZOB]")
sequences_Example = [rareResiduePattern.sub("X", seq) for seq in sequences_Example]

<b>7. Extract the sequences' features and convert the output to NumPy if needed</b>

In [16]:
# Run the discriminator pipeline on the example sequences.
discriminator_embedding = discriminator(sequences_Example)

In [17]:
# Run the generator pipeline on the example sequences.
generator_embedding = generator(sequences_Example)

In [18]:
# Run the plain Electra pipeline on the example sequences.
electra_embedding = electra(sequences_Example)

In [19]:
# Convert the pipeline output to a NumPy array.
# NOTE(review): a regular array requires every sequence's output to have the
# same shape — confirm the pipeline pads sequences to a common length.
discriminator_embedding = np.array(discriminator_embedding)

In [20]:
# Convert the pipeline output to a NumPy array.
# NOTE(review): a regular array requires every sequence's output to have the
# same shape — confirm the pipeline pads sequences to a common length.
generator_embedding = np.array(generator_embedding)

In [21]:
# Convert the pipeline output to a NumPy array.
# NOTE(review): a regular array requires every sequence's output to have the
# same shape — confirm the pipeline pads sequences to a common length.
electra_embedding = np.array(electra_embedding)

In [22]:
# Show the discriminator output: one row per sequence, one value per token position.
print(discriminator_embedding)

[[-10.04449844  -1.24294269  -1.15765774  -1.16310215  -1.01715231
   -1.45999193  -1.4637301   -1.45527613 -11.15838051]
 [ -9.85941696  -1.03459871  -0.97092283  -1.22789431  -1.90789998
   -1.42081535 -10.74415016  -0.84459877  -0.76843232]]


In [23]:
# Show the generator output: a vector of values for every token position of every sequence.
print(generator_embedding)

[[[-1.23957548e+01 -1.27599611e+01 -3.73967781e+01 -2.04540806e+01
   -1.84475517e+01  4.21902627e-01 -1.41023815e+00  1.16866863e+00
    6.08512735e+00  1.03796873e+01 -8.59530258e+00  1.14877164e-01
    4.69289351e+00  1.02395687e+01  3.29155445e+00 -5.13703704e-01
   -4.13311672e+00 -6.47000027e+00  1.69130266e+00 -6.73368311e+00
   -2.65443110e+00 -5.94074917e+00 -3.86547065e+00 -4.43839407e+00
    7.09535122e+00  2.54481030e+00 -1.27812929e+01 -1.27303667e+01
   -1.22453709e+01 -1.29130554e+01]
  [-1.28436508e+01 -1.32133713e+01 -3.39164391e+01 -1.80228424e+01
   -1.99971008e+01 -8.79037976e-01  2.14121227e+01  3.00555801e+00
    6.51582050e+00  6.91750669e+00 -6.56447983e+00 -2.84737062e+00
    4.84939146e+00  7.71299076e+00 -3.31786299e+00  2.78463006e+00
    8.20150757e+00 -1.02165117e+01  2.70752668e+00 -2.60612249e+00
   -1.54998672e+00 -3.94209599e+00 -5.67770195e+00 -8.36739731e+00
   -7.96068847e-01  8.07460976e+00 -1.29483919e+01 -1.21770697e+01
   -1.29219017e+01 -1.2915

In [24]:
# Show the plain Electra output: a feature vector for every token position of every sequence.
print(electra_embedding)

[[[-1.04451753e-01  1.96046561e-01  7.24662021e-02 ...  4.70826142e-02
   -1.38892919e-01 -1.83729574e-01]
  [-3.11757270e-02 -1.18080482e-01 -1.51422888e-01 ... -8.80779102e-02
   -2.03648835e-01  2.34548226e-02]
  [-6.92144260e-02 -7.63375461e-02 -1.78090129e-02 ... -4.15136591e-02
   -3.08615640e-02 -8.58286917e-02]
  ...
  [-5.05250208e-02 -9.02514383e-02  6.78477511e-02 ... -4.76735011e-02
   -9.57429931e-02 -1.68221872e-02]
  [ 3.07772551e-02  7.55244400e-05 -5.32223955e-02 ... -1.47998398e-02
   -1.57045141e-01 -9.64659974e-02]
  [ 2.91315955e-03 -3.36663313e-02  1.97645389e-02 ...  1.61298245e-01
   -1.03283875e-01 -1.35708630e-01]]

 [[-2.07917169e-01  1.58023492e-01  4.76760976e-02 ...  6.73385412e-02
   -1.69237435e-01 -1.67796254e-01]
  [-6.04736097e-02 -1.60798252e-01 -1.63700730e-01 ... -7.67329112e-02
   -1.51252106e-01 -4.52130586e-02]
  [-9.30741653e-02 -5.02011962e-02 -1.62956715e-02 ... -2.65419309e-04
   -2.70892959e-03 -2.37736460e-02]
  ...
  [-1.82417214e-01 -3.1

<b>Optional: Remove the padding ([PAD]) and special tokens ([CLS], [SEP]) that are added by the Electra model</b>

In [25]:
# For each sequence keep only the rows at positions 1 .. number of residues:
# position 0 is skipped and everything after the last residue is dropped.
features = []

for seq_idx, embedding in enumerate(electra_embedding):
    # Residues are space-separated, so the residue count is the string
    # length once the spaces are removed.
    residue_count = len(sequences_Example[seq_idx].replace(" ", ""))
    features.append(embedding[1:residue_count + 1])

In [26]:
# Show the trimmed per-sequence feature arrays.
print(features)

[array([[-3.11757270e-02, -1.18080482e-01, -1.51422888e-01, ...,
        -8.80779102e-02, -2.03648835e-01,  2.34548226e-02],
       [-6.92144260e-02, -7.63375461e-02, -1.78090129e-02, ...,
        -4.15136591e-02, -3.08615640e-02, -8.58286917e-02],
       [ 3.80903780e-02, -1.71692193e-01, -5.64221852e-02, ...,
        -1.18379034e-01, -9.77955908e-02,  2.44729687e-02],
       ...,
       [ 1.27262741e-01, -1.34989783e-01, -3.06518853e-01, ...,
         3.99144813e-02, -4.54520248e-02, -3.57909858e-01],
       [-5.05250208e-02, -9.02514383e-02,  6.78477511e-02, ...,
        -4.76735011e-02, -9.57429931e-02, -1.68221872e-02],
       [ 3.07772551e-02,  7.55244400e-05, -5.32223955e-02, ...,
        -1.47998398e-02, -1.57045141e-01, -9.64659974e-02]]), array([[-6.04736097e-02, -1.60798252e-01, -1.63700730e-01, ...,
        -7.67329112e-02, -1.51252106e-01, -4.52130586e-02],
       [-9.30741653e-02, -5.02011962e-02, -1.62956715e-02, ...,
        -2.65419309e-04, -2.70892959e-03, -2.37736460