<a href="https://colab.research.google.com/github/agemagician/ProtTrans/blob/master/Embedding/PyTorch/Basic/ProtXLNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h3> Extracting protein sequences' features using ProtXLNet pretrained-model <h3>

<b>1. Load necessry libraries including huggingface transformers<b>

In [1]:
!pip install -q transformers sentencepiece

In [2]:
import torch
from transformers import XLNetTokenizer, AutoModel, pipeline
import re
import numpy as np
import os
import requests
from tqdm.auto import tqdm

<b>2. Load the vocabulary and ProtXLNet Model<b>

In [3]:
tokenizer = XLNetTokenizer.from_pretrained("Rostlab/prot_xlnet", do_lower_case=False)

In [4]:
model = AutoModel.from_pretrained("Rostlab/prot_xlnet")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1637757076.0, style=ProgressStyle(descr…




<b>3. Load the model into the GPU if avilabile<b>

In [5]:
fe = pipeline('feature-extraction', model=model, tokenizer=tokenizer,device=0)

<b>4. Create or load sequences and map rarely occured amino acids (U,Z,O,B) to (unk)<b>

In [6]:
sequences_Example = ["A E T C Z A O","S K T Z P"]

In [7]:
sequences_Example = [re.sub(r"[UZOB]", "<unk>", sequence) for sequence in sequences_Example]

<b>5. Extracting sequences' features and covert the output to numpy if needed<b>

In [8]:
embedding = fe(sequences_Example)

In [9]:
embedding = np.array(embedding)

In [10]:
print(embedding)

[[[ 5.69581509e-01 -8.12228858e-01  1.51267886e+00 ... -3.47372681e-01
   -1.97737586e+00  1.02282548e+00]
  [ 2.76618991e-02 -6.71196997e-01  9.98873472e-01 ...  7.27679655e-02
   -1.62625980e+00 -8.44566710e-03]
  [ 2.20987082e-01 -5.26815534e-01  6.64871037e-01 ...  4.78142388e-02
   -1.39787078e+00  3.08237135e-01]
  ...
  [-3.64926189e-01 -8.19321334e-01  4.81532872e-01 ...  2.35715955e-01
   -6.73882365e-01 -1.06030309e+00]
  [ 4.51356888e-01 -8.96942139e-01  4.00962055e-01 ... -1.93732992e-01
   -5.60827136e-01 -2.78552026e-01]
  [ 3.18278044e-01 -1.61192930e+00  4.94406074e-01 ... -2.51359522e-01
   -1.32739976e-01 -1.23092830e-02]]

 [[ 1.91231534e-01  1.84455216e-02 -1.82765443e-03 ... -4.36504632e-01
    2.18422841e-02 -1.59097195e-01]
  [ 2.63838232e-01 -6.02961145e-02 -1.12764817e-02 ... -2.28307739e-01
   -3.21160257e-01  1.10596135e-01]
  [ 8.65126610e-01 -1.61868662e-01 -1.75777614e-01 ...  3.56551766e-01
   -2.34120205e-01  4.93936874e-02]
  ...
  [ 4.84943122e-01  6.7

<b>Optional: Remove padding ([PAD]) and special tokens ([CLS],[SEP]) that is added by ProtXLNet model<b>

In [11]:
features = [] 
for seq_num in range(len(embedding)):
    seq_len = len(sequences_Example[seq_num].replace(" ", ""))
    padded_seq_len = len(embedding[seq_num])
    start_Idx = padded_seq_len-seq_len-2
    end_Idx = padded_seq_len-2
    seq_emd = embedding[seq_num][start_Idx:end_Idx]
    features.append(seq_emd)

In [12]:
print(features)

[array([[ 0.0276619 , -0.671197  ,  0.99887347, ...,  0.07276797,
        -1.6262598 , -0.00844567],
       [ 0.22098708, -0.52681553,  0.66487104, ...,  0.04781424,
        -1.39787078,  0.30823714],
       [ 0.98757732, -1.03212166,  0.99680531, ..., -0.33856013,
        -1.51521778,  1.05237114],
       [ 0.70799673, -0.66436064,  0.85833871, ..., -0.02473333,
        -1.51670885, -0.21759868],
       [-0.14213681, -0.86483932,  0.81442791, ..., -0.32999074,
        -0.23385319, -1.7195524 ],
       [-0.36492619, -0.81932133,  0.48153287, ...,  0.23571596,
        -0.67388237, -1.06030309]]), array([], shape=(0, 1024), dtype=float64)]
