<a href="https://colab.research.google.com/github/agemagician/ProtTrans/blob/master/Embedding/PyTorch/Basic/ProtXLNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h3> Extracting protein sequences' features using ProtXLNet pretrained-model <h3>

<b>1. Load necessry libraries including huggingface transformers<b>

In [1]:
!pip install -q transformers

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
import re
import numpy as np
import os
import requests
from tqdm.auto import tqdm

<b>2. Load the vocabulary and ProtXLNet Model<b>

In [3]:
tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_xlnet", do_lower_case=False)



In [4]:
model = AutoModel.from_pretrained("Rostlab/prot_xlnet")



<b>3. Load the model into the GPU if avilabile<b>

In [5]:
fe = pipeline('feature-extraction', model=model, tokenizer=tokenizer,device=0)

<b>4. Create or load sequences and map rarely occured amino acids (U,Z,O,B) to (unk)<b>

In [6]:
sequences_Example = ["A E T C Z A O","S K T Z P"]

In [7]:
sequences_Example = [re.sub(r"[UZOB]", "<unk>", sequence) for sequence in sequences_Example]

<b>5. Extracting sequences' features and covert the output to numpy if needed<b>

In [8]:
embedding = fe(sequences_Example)

  attn_score = (ac + bd + ef) * self.scale


In [9]:
embedding = np.array(embedding)

In [10]:
print(embedding)

[[[ 5.69581151e-01 -8.12228918e-01  1.51267755e+00 ... -3.47373217e-01
   -1.97737479e+00  1.02282667e+00]
  [ 2.76618432e-02 -6.71196342e-01  9.98872995e-01 ...  7.27679431e-02
   -1.62625897e+00 -8.44636653e-03]
  [ 2.20985517e-01 -5.26815653e-01  6.64870739e-01 ...  4.78135161e-02
   -1.39787018e+00  3.08237225e-01]
  ...
  [-3.64925861e-01 -8.19322586e-01  4.81532544e-01 ...  2.35715851e-01
   -6.73881412e-01 -1.06030190e+00]
  [ 4.51357126e-01 -8.96942556e-01  4.00962412e-01 ... -1.93733007e-01
   -5.60827494e-01 -2.78550863e-01]
  [ 3.18279147e-01 -1.61192894e+00  4.94404495e-01 ... -2.51359016e-01
   -1.32738993e-01 -1.23080434e-02]]

 [[ 1.91231221e-01  1.84447747e-02 -1.82828668e-03 ... -4.36504602e-01
    2.18437910e-02 -1.59096792e-01]
  [ 2.63838947e-01 -6.02974333e-02 -1.12759480e-02 ... -2.28307560e-01
   -3.21159273e-01  1.10597484e-01]
  [ 8.65127444e-01 -1.61870778e-01 -1.75775602e-01 ...  3.56552184e-01
   -2.34119326e-01  4.93937097e-02]
  ...
  [ 4.84942824e-01  6.7

<b>Optional: Remove padding ([PAD]) and special tokens ([CLS],[SEP]) that is added by ProtXLNet model<b>

In [11]:
features = [] 
for seq_num in range(len(embedding)):
    seq_len = len(sequences_Example[seq_num].replace(" ", ""))
    padded_seq_len = len(embedding[seq_num])
    start_Idx = padded_seq_len-seq_len-2
    end_Idx = padded_seq_len-2
    seq_emd = embedding[seq_num][start_Idx:end_Idx]
    features.append(seq_emd)

In [12]:
print(features)

[array([[ 0.02766184, -0.67119634,  0.998873  , ...,  0.07276794,
        -1.62625897, -0.00844637],
       [ 0.22098552, -0.52681565,  0.66487074, ...,  0.04781352,
        -1.39787018,  0.30823722],
       [ 0.98757672, -1.03212094,  0.99680483, ..., -0.33855975,
        -1.51521766,  1.05237162],
       [ 0.70799863, -0.66436082,  0.85833853, ..., -0.02473419,
        -1.51670933, -0.21759486],
       [-0.14213729, -0.86483812,  0.81442761, ..., -0.32999155,
        -0.23385341, -1.71955049],
       [-0.36492586, -0.81932259,  0.48153254, ...,  0.23571585,
        -0.67388141, -1.0603019 ]]), array([], shape=(0, 1024), dtype=float64)]
