<a href="https://colab.research.google.com/github/agemagician/Prot-Transformers/blob/master/Embedding/Advanced/Electra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h3>Extracting protein sequence features using the pretrained Electra model</h3>

<b>1. Load the necessary libraries, including Hugging Face Transformers</b>

In [1]:
!pip install -q transformers
!pip install -q gdown

[K     |████████████████████████████████| 675kB 4.7MB/s 
[K     |████████████████████████████████| 3.8MB 13.4MB/s 
[K     |████████████████████████████████| 890kB 41.3MB/s 
[K     |████████████████████████████████| 1.1MB 47.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
import torch
from transformers import ElectraTokenizer, ElectraForPreTraining, ElectraForMaskedLM, ElectraModel
import re
import gdown
import os

<b>2. Set the URLs of the Electra models, their configuration files, and the vocabulary file</b>

In [3]:
# Google Drive links to the model weights (saved later as pytorch_model.bin).
generatorModelUrl, discriminatorModelUrl = (
    'https://drive.google.com/uc?export=download&confirm=BTQ_&id=1vaB80ioD8MNFB3zE_5AD-QJtNy0389jg',
    'https://drive.google.com/uc?export=download&confirm=BTQ_&id=1xMUwFYs4tgD7qIs7XrrqQ6tKabH7ZyS9',
)

# Google Drive links to the matching model configurations (saved later as config.json).
generatorConfigUrl, discriminatorConfigUrl = (
    'https://drive.google.com/uc?export=download&confirm=BTQ_&id=1SBtS-9_Wy26vZDjXBEos9KuiQc7TChhT',
    'https://drive.google.com/uc?export=download&confirm=BTQ_&id=1jZQLHL4TTMK5eoWL-JhihiVRVoUepC_B',
)

# Google Drive link to the vocabulary file used by the tokenizer.
vocabUrl = 'https://drive.google.com/uc?export=download&confirm=BTQ_&id=1vuAP1zRvN1c6EHoSQMVC2ivZMTpzYR0P'

<b>3. Download the Electra models, configuration files, and vocabulary file</b>

In [4]:
# Root folder (relative to the current working directory) under which the
# model folders and the vocabulary file are stored.
downloadFolderPath = 'models/electra/'

In [5]:
# One sub-folder per model, both directly under the download root.
discriminatorFolderPath, generatorFolderPath = (
    os.path.join(downloadFolderPath, folder_name)
    for folder_name in ('discriminator', 'generator')
)

# Weights and configuration sit side by side inside each model folder.
discriminatorModelFilePath, discriminatorConfigFilePath = (
    os.path.join(discriminatorFolderPath, file_name)
    for file_name in ('pytorch_model.bin', 'config.json')
)
generatorModelFilePath, generatorConfigFilePath = (
    os.path.join(generatorFolderPath, file_name)
    for file_name in ('pytorch_model.bin', 'config.json')
)

# The vocabulary file is stored once, at the download root.
vocabFilePath = os.path.join(downloadFolderPath, 'vocab.txt')

In [6]:
# Create both model folders, including any missing parent folders.
# exist_ok=True makes the cell safe to re-run and removes the window between a
# separate os.path.exists() check and the creation itself.
os.makedirs(discriminatorFolderPath, exist_ok=True)
os.makedirs(generatorFolderPath, exist_ok=True)

In [7]:
def download_file(url, filename, max_retries=5):
    """Download `url` to `filename` with gdown unless the file already exists.

    The download is attempted until `filename` exists on disk, but at most
    `max_retries` times, so a permanently failing download cannot keep the
    notebook in an endless loop.

    Args:
        url: Address handed to `gdown.download`.
        filename: Destination path; its existence is the success criterion.
        max_retries: Upper bound on the number of download attempts.

    Raises:
        RuntimeError: If `filename` still does not exist after `max_retries`
            attempts.
    """
    for _ in range(max_retries):
        if os.path.exists(filename):
            return
        gdown.download(url, filename, quiet=False)

    # The last attempt may have succeeded, so check once more before failing.
    if not os.path.exists(filename):
        raise RuntimeError(
            'Could not download {} to {} after {} attempt(s)'.format(
                url, filename, max_retries
            )
        )

In [8]:
# Fetch every artifact through the same helper instead of five copy-pasted
# stanzas. No separate existence check is needed here: download_file() only
# downloads while the target file is missing.
for file_url, file_path in (
    (generatorModelUrl, generatorModelFilePath),
    (discriminatorModelUrl, discriminatorModelFilePath),
    (generatorConfigUrl, generatorConfigFilePath),
    (discriminatorConfigUrl, discriminatorConfigFilePath),
    (vocabUrl, vocabFilePath),
):
    download_file(file_url, file_path)

Downloading...
From: https://drive.google.com/uc?export=download&confirm=BTQ_&id=1vaB80ioD8MNFB3zE_5AD-QJtNy0389jg
To: /content/models/electra/generator/pytorch_model.bin
261MB [00:02, 94.4MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&confirm=BTQ_&id=1xMUwFYs4tgD7qIs7XrrqQ6tKabH7ZyS9
To: /content/models/electra/discriminator/pytorch_model.bin
1.68GB [00:17, 93.7MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&confirm=BTQ_&id=1SBtS-9_Wy26vZDjXBEos9KuiQc7TChhT
To: /content/models/electra/generator/config.json
100%|██████████| 463/463 [00:00<00:00, 840kB/s]
Downloading...
From: https://drive.google.com/uc?export=download&confirm=BTQ_&id=1jZQLHL4TTMK5eoWL-JhihiVRVoUepC_B
To: /content/models/electra/discriminator/config.json
100%|██████████| 468/468 [00:00<00:00, 669kB/s]
Downloading...
From: https://drive.google.com/uc?export=download&confirm=BTQ_&id=1vuAP1zRvN1c6EHoSQMVC2ivZMTpzYR0P
To: /content/models/electra/vocab.txt
100%|██████████| 81.0/

<b>4. Load the vocabulary and the Electra discriminator and generator models</b>

In [9]:
# Build the tokenizer from the downloaded vocabulary file. do_lower_case=False
# asks it not to lower-case the input, so the upper-case residue letters are
# passed through as written.
vocab = ElectraTokenizer(vocabFilePath, do_lower_case=False )

In [10]:
# Load the discriminator as ElectraForPreTraining from the folder that holds
# its downloaded config.json and pytorch_model.bin.
discriminator = ElectraForPreTraining.from_pretrained(discriminatorFolderPath)

In [11]:
# Load the generator as ElectraForMaskedLM from the folder that holds its
# downloaded config.json and pytorch_model.bin.
generator = ElectraForMaskedLM.from_pretrained(generatorFolderPath)

In [12]:
# Load a plain ElectraModel from the same discriminator folder. Its output is
# the one sliced into per-sequence features at the end of the notebook.
electra = ElectraModel.from_pretrained(discriminatorFolderPath)

<b>5. Load the models onto the GPU if available and switch to inference mode</b>

In [13]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [14]:
# Move the discriminator to the selected device and switch it to evaluation mode.
discriminator = discriminator.to(device).eval()

In [15]:
# Move the generator to the selected device and switch it to evaluation mode.
generator = generator.to(device).eval()

In [16]:
# Move the plain Electra model to the selected device and switch it to evaluation mode.
electra = electra.to(device).eval()

<b>6. Create or load sequences and map rarely occurring amino acids (U, Z, O, B) to (X)</b>

In [17]:
# Example input: one string per protein, with residues written as single
# upper-case letters separated by spaces. Both examples contain letters (Z, O)
# that the next step maps to X.
sequences_Example = ["A E T C Z A O","S K T Z P"]

In [18]:
# Replace every U, Z, O or B with X, one sequence at a time.
sequences_Example = list(
    map(lambda sequence: re.sub(r"[UZOB]", "X", sequence), sequences_Example)
)

<b>7. Tokenize and encode the sequences, and load them onto the GPU if possible</b>

In [19]:
# Tokenize all sequences in one call. padding=True pads every sequence to the
# longest one in the batch; it replaces the deprecated pad_to_max_length=True
# flag, and calling the tokenizer directly is the current batch-encoding API.
ids = vocab(sequences_Example, add_special_tokens=True, padding=True)

In [20]:
# Turn the tokenizer's token ids and attention mask into tensors and place
# both on the selected device.
input_ids, attention_mask = (
    torch.tensor(ids[key]).to(device) for key in ('input_ids', 'attention_mask')
)

<b>8. Extract the sequences' features and move them to the CPU if needed</b>

In [21]:
# Run the discriminator without gradient tracking and keep the first element
# of its output.
# NOTE(review): for ElectraForPreTraining the first output is presumably the
# per-token logits rather than hidden-state embeddings — confirm against the
# transformers documentation before using it as features.
with torch.no_grad():
    discriminator_embedding = discriminator(input_ids=input_ids,attention_mask=attention_mask)[0]

In [22]:
# Copy the result to the CPU and convert it to a NumPy array.
# NOTE(review): the name is rebound from a tensor to an array, so re-running
# only this cell would call .cpu() on the array; re-run the previous cell first.
discriminator_embedding = discriminator_embedding.cpu().numpy()

In [23]:
# Run the generator without gradient tracking and keep the first element of
# its output.
# NOTE(review): for ElectraForMaskedLM the first output is presumably the
# per-token vocabulary prediction scores rather than hidden-state embeddings —
# confirm against the transformers documentation before using it as features.
with torch.no_grad():
    generator_embedding = generator(input_ids=input_ids,attention_mask=attention_mask)[0]

In [24]:
# Copy the result to the CPU and convert it to a NumPy array.
# NOTE(review): the name is rebound from a tensor to an array, so re-running
# only this cell would call .cpu() on the array; re-run the previous cell first.
generator_embedding = generator_embedding.cpu().numpy()

In [25]:
# Run the plain Electra model without gradient tracking and keep the first
# element of its output; this is the array the feature-extraction step slices.
# NOTE(review): presumably the last-layer hidden states, one vector per token
# — confirm against the transformers documentation.
with torch.no_grad():
    electra_embedding = electra(input_ids=input_ids,attention_mask=attention_mask)[0]

In [26]:
# Copy the result to the CPU and convert it to a NumPy array.
# NOTE(review): the name is rebound from a tensor to an array, so re-running
# only this cell would call .cpu() on the array; re-run the previous cell first.
electra_embedding = electra_embedding.cpu().numpy()

<b>9. Remove padding ([PAD]) and special tokens ([CLS], [SEP]) that are added by the Electra model</b>

In [27]:
# For each sequence, count the positions whose attention mask is 1, then keep
# the rows between the first and the last of them; per the heading above these
# two are the [CLS] and [SEP] tokens.
# NOTE(review): the slice assumes padding comes after the real tokens — confirm
# the tokenizer's padding side.
features = [
    seq_embedding[1:int((seq_mask == 1).sum()) - 1]
    for seq_embedding, seq_mask in zip(electra_embedding, attention_mask)
]

In [28]:
# Print the list of per-sequence feature arrays (NumPy abbreviates large
# arrays with "..." in the printed text).
print(features)

[array([[-3.11754458e-02, -1.18080616e-01, -1.51422679e-01, ...,
        -8.80782455e-02, -2.03649044e-01,  2.34545898e-02],
       [-6.92143589e-02, -7.63380080e-02, -1.78088211e-02, ...,
        -4.15132381e-02, -3.08615528e-02, -8.58288854e-02],
       [ 3.80904488e-02, -1.71692267e-01, -5.64219430e-02, ...,
        -1.18378937e-01, -9.77956504e-02,  2.44725216e-02],
       ...,
       [ 1.27263516e-01, -1.34989679e-01, -3.06518644e-01, ...,
         3.99149172e-02, -4.54527065e-02, -3.57910693e-01],
       [-5.05245999e-02, -9.02514085e-02,  6.78477362e-02, ...,
        -4.76730466e-02, -9.57428291e-02, -1.68221351e-02],
       [ 3.07775717e-02,  7.57525049e-05, -5.32222912e-02, ...,
        -1.47995083e-02, -1.57044619e-01, -9.64660496e-02]], dtype=float32), array([[-6.04737513e-02, -1.60797983e-01, -1.63700715e-01, ...,
        -7.67330825e-02, -1.51252389e-01, -4.52133343e-02],
       [-9.30745900e-02, -5.02012298e-02, -1.62957162e-02, ...,
        -2.65192648e-04, -2.70886812e-