<a href="https://colab.research.google.com/github/agemagician/Prot-Transformers/blob/master/Embedding/Advanced/XLNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h3>Extracting protein sequences' features using the ProtXLNet pretrained model</h3>

<b>1. Load necessary libraries including Hugging Face transformers</b>

In [1]:
!pip install -q transformers

[K     |████████████████████████████████| 675kB 2.6MB/s 
[K     |████████████████████████████████| 1.1MB 8.8MB/s 
[K     |████████████████████████████████| 3.8MB 16.3MB/s 
[K     |████████████████████████████████| 890kB 31.9MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
import torch
from transformers import XLNetModel, XLNetTokenizer
import re
import os
import requests
from tqdm.auto import tqdm

<b>2. Set the URL locations of the ProtXLNet model, its config and the vocabulary file</b>

In [3]:
# Remote locations of the three ProtXLNet artifacts downloaded further down:
# the model weights, the model configuration and the tokenizer model file.
modelUrl = "https://www.dropbox.com/s/z0i0z01d2wm19ap/pytorch_model.bin?dl=1"
configUrl = "https://www.dropbox.com/s/to876ivj48wylkj/config.json?dl=1"
tokenizerUrl = "https://www.dropbox.com/s/mvypdtedpuz0yxg/spm_model.model?dl=1"

<b>3. Download the ProtXLNet model and vocabulary files</b>

In [4]:
# Local directory (relative to the working directory) that receives the downloads.
downloadFolderPath = "models/ProtXLNet/"

In [5]:
# The model is loaded from the same folder the files are downloaded into.
modelFolderPath = downloadFolderPath

# Local destination of each artifact inside the model folder.
modelFilePath, configFilePath, tokenizerFilePath = (
    os.path.join(modelFolderPath, artifact_name)
    for artifact_name in ('pytorch_model.bin', 'config.json', 'spm_model.model')
)

In [6]:
# Create the model folder (and any missing parents). exist_ok=True makes this a
# no-op when the folder already exists, without a separate existence check.
os.makedirs(modelFolderPath, exist_ok=True)

In [7]:
def download_file(url, filename):
  """Stream the content at `url` into the local file `filename` with a progress bar.

  The data is first written to `filename + '.part'` and only renamed to
  `filename` once the whole body has been received, so an interrupted or
  failed download never leaves a truncated file at the final path (callers
  in this notebook skip the download when `filename` already exists).

  Args:
    url: HTTP(S) location to fetch.
    filename: Destination path; its parent directory must already exist.

  Raises:
    requests.HTTPError: If the server answers with an error status.
  """
  partial_path = filename + '.part'
  # The response is used as a context manager so the connection is released
  # even if writing fails part-way through.
  with requests.get(url, stream=True) as response:
    # Fail loudly instead of saving an HTTP error page as the model file.
    response.raise_for_status()
    total_bytes = int(response.headers.get('content-length', 0))
    # The file gets its own `with` so it is closed deterministically rather
    # than relying on the progress wrapper or garbage collection.
    with open(partial_path, "wb") as raw_file, \
         tqdm.wrapattr(raw_file, "write", miniters=1,
                       total=total_bytes, desc=filename) as fout:
      for chunk in response.iter_content(chunk_size=4096):
        fout.write(chunk)
  # Publish the finished download under its final name.
  os.replace(partial_path, filename)

In [8]:
# Fetch each artifact only when it is not already present locally.
for source_url, target_path in (
    (modelUrl, modelFilePath),
    (configUrl, configFilePath),
    (tokenizerUrl, tokenizerFilePath),
):
    if os.path.exists(target_path):
        continue
    download_file(source_url, target_path)

HBox(children=(FloatProgress(value=0.0, description='models/ProtXLNet/pytorch_model.bin', max=1637757076.0, st…




HBox(children=(FloatProgress(value=0.0, description='models/ProtXLNet/config.json', max=1351.0, style=Progress…




HBox(children=(FloatProgress(value=0.0, description='models/ProtXLNet/spm_model.model', max=238192.0, style=Pr…




<b>4. Load the vocabulary and the ProtXLNet model</b>

In [9]:
# Build the tokenizer from the downloaded vocabulary file. do_lower_case=False is
# passed explicitly; the example sequences used later are upper-case letters.
tokenizer = XLNetTokenizer(tokenizerFilePath, do_lower_case=False)

In [10]:
# Value handed to the model as `mem_len` when it is loaded.
# NOTE(review): the name looks like a typo for "mem_len"; it is kept because
# the model-loading cell refers to it by this name.
xlnet_men_len = 512

In [11]:
# Load config and weights from the local model folder, overriding mem_len.
model = XLNetModel.from_pretrained(
    modelFolderPath,
    mem_len=xlnet_men_len,
)

<b>5. Load the model onto the GPU if available and switch to inference mode</b>

In [12]:
# Prefer the first CUDA device; fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [13]:
# Move the weights to the selected device, then switch to evaluation mode.
model = model.to(device).eval()

<b>6. Create or load sequences and map rarely occurring amino acids (U,Z,O,B) to (X)</b>

In [14]:
# Two toy protein sequences; residues are single letters separated by spaces.
sequences_Example = [
    "A E T C Z A O",
    "S K T Z P",
]

In [15]:
# Replace every U, Z, O or B residue with the placeholder X.
rare_residue_pattern = re.compile(r"[UZOB]")
sequences_Example = [rare_residue_pattern.sub("X", seq) for seq in sequences_Example]

<b>7. Tokenize and encode the sequences, and load them onto the GPU if possible</b>

In [16]:
# Encode the whole batch at once, adding the special tokens and padding so
# that all rows have the same length.
ids = tokenizer.batch_encode_plus(
    sequences_Example,
    add_special_tokens=True,
    pad_to_max_length=True,
)

In [17]:
# Turn the encoded id lists and their attention masks into tensors on `device`.
input_ids, attention_mask = (
    torch.tensor(ids[field]).to(device)
    for field in ('input_ids', 'attention_mask')
)

<b>8. Extract the sequences' features and move them to the CPU if needed</b>

In [18]:
# Run the forward pass without building the autograd graph.
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, mems=None)

# Index the result positionally instead of tuple-unpacking it: this works both
# when the model returns a plain tuple and when it returns a ModelOutput
# object (which supports integer indexing but cannot be unpacked directly).
embedding, memory = outputs[0], outputs[1]

In [19]:
# Copy the embeddings to host memory and convert them to a NumPy array.
# NOTE(review): this rebinds `embedding` from a tensor to an array, so the cell
# is not re-runnable on its own (an ndarray has no .cpu()); re-run the forward
# pass cell first.
embedding = embedding.cpu().numpy()

<b>9. Remove the padding ([PAD]) and special tokens ([CLS],[SEP]) that are added by the ProtXLNet model</b>

In [20]:
# For every sequence keep only the rows of real residues: the slice starts
# after the leading (total - attended) positions and stops two positions
# before the end of the row.
features = []
for row_embedding, row_mask in zip(embedding, attention_mask):
    row_total = len(row_mask)
    row_attended = (row_mask == 1).sum()
    first_kept = row_total - row_attended
    features.append(row_embedding[first_kept:row_total - 2])

In [21]:
# Show the list of per-sequence feature arrays (one array per input sequence).
print(features)

[array([[ 0.48745194, -0.770879  ,  0.99001765, ..., -0.37356165,
        -1.0589758 ,  0.9559981 ],
       [ 0.21325967, -0.5478905 ,  0.61154914, ..., -0.0547137 ,
        -0.8787893 ,  0.24645106],
       [ 0.37891382, -0.63965815,  0.6722441 , ..., -0.14891182,
        -0.6769571 ,  0.34598157],
       ...,
       [ 0.09265058, -0.68101126,  0.5181862 , ..., -0.3275623 ,
        -0.5673123 , -0.16441283],
       [-0.08533144, -0.7438235 ,  0.29890785, ..., -0.2437637 ,
        -0.11153173, -0.7260989 ],
       [-0.4822571 , -0.8381702 ,  0.08214345, ..., -0.25196436,
        -0.03577353, -0.5348941 ]], dtype=float32), array([[ 1.0403053 , -0.95049226,  0.3353453 , ..., -0.23747277,
        -0.27550653,  0.47948802],
       [ 0.50853604, -0.9508459 ,  1.0235127 , ..., -0.05893146,
        -0.9752788 ,  0.08713217],
       [ 0.65626544, -0.7540615 ,  0.43234807, ...,  0.36707586,
        -0.7063245 , -0.5553407 ],
       [ 0.5903388 ,  0.14151968,  0.29578856, ...,  0.20601138,
     