<a href="https://colab.research.google.com/github/agemagician/Prot-Transformers/blob/master/Embedding/Basic/XLNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h3>Extracting protein sequences' features using the ProtXLNet pretrained model</h3>

<b>1. Load the necessary libraries, including Hugging Face transformers</b>

In [1]:
!pip install -q transformers

[K     |████████████████████████████████| 675kB 5.3MB/s 
[K     |████████████████████████████████| 3.8MB 26.7MB/s 
[K     |████████████████████████████████| 890kB 55.3MB/s 
[K     |████████████████████████████████| 1.1MB 54.8MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
import torch
from transformers import XLNetModel, XLNetTokenizer, pipeline
import re
import numpy as np
import os
import requests
from tqdm.auto import tqdm

<b>2. Set the URL locations of the ProtXLNet model, its config, and the vocabulary file</b>

In [3]:
# Remote locations of the three files that are downloaded further below:
# the model weights, the model configuration, and the tokenizer model.
modelUrl = 'https://www.dropbox.com/s/z0i0z01d2wm19ap/pytorch_model.bin?dl=1'
configUrl = 'https://www.dropbox.com/s/to876ivj48wylkj/config.json?dl=1'
tokenizerUrl = 'https://www.dropbox.com/s/mvypdtedpuz0yxg/spm_model.model?dl=1'

<b>3. Download the ProtXLNet model and vocabulary files</b>

In [4]:
# Directory (relative to the working directory) that receives the downloaded files.
downloadFolderPath = 'models/ProtXLNet/'

In [5]:
# The model is loaded from the same folder the files are downloaded into.
modelFolderPath = downloadFolderPath

# Local path of the model weights.
modelFilePath = os.path.join(modelFolderPath, 'pytorch_model.bin')

# Local path of the model configuration.
configFilePath = os.path.join(modelFolderPath, 'config.json')

# Local path of the tokenizer model file.
tokenizerFilePath = os.path.join(modelFolderPath, 'spm_model.model')

In [6]:
# Create the target folder (and any missing parents). exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(modelFolderPath, exist_ok=True)

In [7]:
def download_file(url, filename):
    """Stream ``url`` into ``filename`` while showing a tqdm progress bar.

    The payload is first written to ``<filename>.part`` and only moved to
    ``filename`` once the transfer has finished, so an interrupted download
    never leaves a truncated file at the final path (callers skip the download
    when ``filename`` already exists).

    Args:
        url: HTTP(S) address of the file to fetch.
        filename: Destination path; also used as the progress-bar label.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    partial_path = os.fspath(filename) + ".part"
    # The context manager releases the connection even if the transfer fails.
    with requests.get(url, stream=True) as response:
        # Fail loudly instead of saving an HTML error page as the model file.
        response.raise_for_status()
        total_bytes = int(response.headers.get('content-length', 0))
        # tqdm.wrapattr only wraps the stream and does not close it, so the
        # file handle gets its own `with` block.
        with open(partial_path, "wb") as raw_file:
            with tqdm.wrapattr(raw_file, "write", miniters=1,
                               total=total_bytes,
                               desc=filename) as fout:
                for chunk in response.iter_content(chunk_size=4096):
                    fout.write(chunk)
    # Publish the finished file under its final name.
    os.replace(partial_path, filename)

In [8]:
# Fetch each artifact unless a file already exists at its local path.
artifact_sources = [
    (modelUrl, modelFilePath),
    (configUrl, configFilePath),
    (tokenizerUrl, tokenizerFilePath),
]
for source_url, target_path in artifact_sources:
    if not os.path.exists(target_path):
        download_file(source_url, target_path)

HBox(children=(FloatProgress(value=0.0, description='models/ProtXLNet/pytorch_model.bin', max=1637757076.0, st…




HBox(children=(FloatProgress(value=0.0, description='models/ProtXLNet/config.json', max=1351.0, style=Progress…




HBox(children=(FloatProgress(value=0.0, description='models/ProtXLNet/spm_model.model', max=238192.0, style=Pr…




<b>4. Load the vocabulary and the ProtXLNet model</b>

In [9]:
# Build the tokenizer from the downloaded tokenizer model file.
# do_lower_case=False leaves the input casing untouched (the example sequences
# below use upper-case amino-acid letters).
tokenizer = XLNetTokenizer(tokenizerFilePath, do_lower_case=False)

In [10]:
# Load the model from the local folder holding pytorch_model.bin and config.json.
model = XLNetModel.from_pretrained(modelFolderPath)

<b>5. Load the model onto the GPU if available</b>

In [11]:
# Use the first CUDA device when one is present; otherwise fall back to the CPU
# (device=-1) so this cell also runs on CPU-only runtimes instead of raising.
# Note: `model` is rebound from the bare XLNetModel to the pipeline wrapper.
pipeline_device = 0 if torch.cuda.is_available() else -1
model = pipeline('feature-extraction', model=model, tokenizer=tokenizer, device=pipeline_device)

<b>6. Create or load sequences and map rarely occurring amino acids (U, Z, O, B) to X</b>

In [12]:
# Example input: each sequence is written as single-letter amino-acid codes
# separated by spaces.
sequences_Example = ["A E T C Z A O","S K T Z P"]

In [13]:
# Replace every U, Z, O or B in each sequence with the placeholder X.
rare_residue_pattern = re.compile(r"[UZOB]")
sequences_Example = [rare_residue_pattern.sub("X", seq) for seq in sequences_Example]

<b>7. Extract the sequences' features and convert the output to NumPy if needed</b>

In [14]:
# Run the feature-extraction pipeline on the example sequences.
embedding = model(sequences_Example)

In [15]:
# Convert the pipeline output to a NumPy array (rebinds `embedding`).
embedding = np.array(embedding)

In [16]:
# Show the embeddings; NumPy abbreviates large arrays with "...".
print(embedding)

[[[ 5.69581509e-01 -8.12228858e-01  1.51267886e+00 ... -3.47372681e-01
   -1.97737586e+00  1.02282548e+00]
  [ 2.76618991e-02 -6.71196997e-01  9.98873472e-01 ...  7.27679655e-02
   -1.62625980e+00 -8.44566710e-03]
  [ 2.20987082e-01 -5.26815534e-01  6.64871037e-01 ...  4.78142388e-02
   -1.39787078e+00  3.08237135e-01]
  ...
  [-3.64926189e-01 -8.19321334e-01  4.81532872e-01 ...  2.35715955e-01
   -6.73882365e-01 -1.06030309e+00]
  [ 4.51356888e-01 -8.96942139e-01  4.00962055e-01 ... -1.93732992e-01
   -5.60827136e-01 -2.78552026e-01]
  [ 3.18278044e-01 -1.61192930e+00  4.94406074e-01 ... -2.51359522e-01
   -1.32739976e-01 -1.23092830e-02]]

 [[ 1.91231534e-01  1.84455216e-02 -1.82765443e-03 ... -4.36504632e-01
    2.18422841e-02 -1.59097195e-01]
  [ 2.63838232e-01 -6.02961145e-02 -1.12764817e-02 ... -2.28307739e-01
   -3.21160257e-01  1.10596135e-01]
  [ 8.65126610e-01 -1.61868662e-01 -1.75777614e-01 ...  3.56551766e-01
   -2.34120205e-01  4.93936874e-02]
  ...
  [ 4.84943122e-01  6.7

<b>Optional: Remove the padding ([PAD]) and special tokens ([CLS], [SEP]) that are added by the ProtXLNet model</b>

In [17]:
# For every sequence keep only the rows that belong to its residues.
# The slice drops the last two positions and everything before the final
# `residue_count + 2` positions, i.e. it assumes padding sits at the front and
# two special tokens sit at the end of each embedded sequence.
features = []
for row_idx, token_vectors in enumerate(embedding):
    # Residue count = sequence length once the separating spaces are removed.
    residue_count = len(sequences_Example[row_idx].replace(" ", ""))
    stop = len(token_vectors) - 2
    features.append(token_vectors[stop - residue_count:stop])

In [18]:
# Show the trimmed per-sequence feature arrays.
print(features)

[array([[ 0.56958151, -0.81222886,  1.51267886, ..., -0.34737268,
        -1.97737586,  1.02282548],
       [ 0.0276619 , -0.671197  ,  0.99887347, ...,  0.07276797,
        -1.6262598 , -0.00844567],
       [ 0.22098708, -0.52681553,  0.66487104, ...,  0.04781424,
        -1.39787078,  0.30823714],
       ...,
       [ 0.70799673, -0.66436064,  0.85833871, ..., -0.02473333,
        -1.51670885, -0.21759868],
       [-0.14213681, -0.86483932,  0.81442791, ..., -0.32999074,
        -0.23385319, -1.7195524 ],
       [-0.36492619, -0.81932133,  0.48153287, ...,  0.23571596,
        -0.67388237, -1.06030309]]), array([[ 0.86512661, -0.16186866, -0.17577761, ...,  0.35655177,
        -0.23412021,  0.04939369],
       [ 0.16243157, -0.22352344,  0.53584576, ...,  0.37107557,
        -0.72951031, -0.2967481 ],
       [ 0.64317816, -0.07562806,  0.37602407, ...,  0.80507094,
        -1.03585243, -0.67776829],
       [ 0.67437637,  0.27612588,  0.44298318, ...,  0.13377103,
        -0.96995842,