<h3>Extracting protein sequences' features using the pretrained XLNet model</h3>

<b>1. Load the necessary libraries, including Hugging Face transformers</b>

In [1]:
import torch
from transformers import XLNetModel, XLNetTokenizer, pipeline
import re
import numpy as np

<b>2. Set the file locations of the XLNet model and the vocabulary file</b>

In [2]:
# Directory holding the pretrained XLNet files (machine-specific absolute path; adjust locally).
modelPath = '/media/agemagician/Disk2/projects/bio_google_tpu_project/xlnet_pytorch/'
# The vocabulary file sits inside that directory (modelPath already ends with '/').
vocabPath = modelPath + 'spm_model.model'

<b>3. Load the vocabulary and the XLNet model</b>

In [3]:
# Build the tokenizer from the vocabulary file; do_lower_case=False keeps the
# upper-case amino-acid letters as written.
vocab = XLNetTokenizer(vocab_file=vocabPath, do_lower_case=False)

In [4]:
# Load the pretrained XLNet model from the directory configured in modelPath.
model = XLNetModel.from_pretrained(modelPath)

<b>4. Load the model onto the GPU if available</b>

In [5]:
# Wrap the model and tokenizer in a feature-extraction pipeline.
# `device` is a CUDA ordinal (0 = first GPU) or -1 for CPU; fall back to the CPU
# when CUDA is unavailable instead of failing on a hard-coded device=0.
# NOTE: this rebinds `model` from the XLNetModel to the pipeline, so re-run the
# model-loading cell before re-running this one.
model = pipeline(
    'feature-extraction',
    model=model,
    tokenizer=vocab,
    device=0 if torch.cuda.is_available() else -1,
)

<b>5. Create or load sequences and map rarely occurring amino acids (U, Z, O, B) to (X)</b>

In [6]:
# Example input: residues are separated by single spaces. Z and O are among the
# rare codes that get remapped to X in the next step.
sequences_Example = [
    "A E T C Z A O",
    "S K T Z P",
]

In [7]:
# Replace the rare residue codes U, Z, O and B with the placeholder X in every sequence.
sequences_Example = list(map(lambda seq: re.sub(r"[UZOB]", "X", seq), sequences_Example))

<b>6. Extract the sequences' features and convert the output to NumPy if needed</b>

In [8]:
# `model` is the feature-extraction pipeline at this point, not the bare XLNetModel;
# calling it on the list of sequences yields one embedding entry per sequence.
embedding = model(sequences_Example)

In [9]:
# Convert the pipeline output to a NumPy array.
# NOTE(review): this gives a regular 3-D array only if every sequence's output
# has the same length — confirm the pipeline pads the batch.
embedding = np.array(embedding)

In [10]:
# Show the embeddings (NumPy abbreviates large arrays with "...").
print(embedding)

[[[ 5.69581389e-01 -8.12229156e-01  1.51267779e+00 ... -3.47373039e-01
   -1.97737503e+00  1.02282655e+00]
  [ 2.76620593e-02 -6.71196163e-01  9.98872876e-01 ...  7.27678835e-02
   -1.62625921e+00 -8.44672695e-03]
  [ 2.20985755e-01 -5.26815534e-01  6.64871275e-01 ...  4.78132330e-02
   -1.39787042e+00  3.08236301e-01]
  ...
  [-3.64927173e-01 -8.19321752e-01  4.81532007e-01 ...  2.35715851e-01
   -6.73881233e-01 -1.06030214e+00]
  [ 4.51355964e-01 -8.96942437e-01  4.00961936e-01 ... -1.93732589e-01
   -5.60827076e-01 -2.78551757e-01]
  [ 3.18277806e-01 -1.61193049e+00  4.94404763e-01 ... -2.51358896e-01
   -1.32739350e-01 -1.23085007e-02]]

 [[ 1.91230953e-01  1.84453893e-02 -1.82769471e-03 ... -4.36504394e-01
    2.18427107e-02 -1.59096345e-01]
  [ 2.63838321e-01 -6.02972694e-02 -1.12769334e-02 ... -2.28307098e-01
   -3.21158886e-01  1.10597774e-01]
  [ 8.65127563e-01 -1.61870375e-01 -1.75775871e-01 ...  3.56552184e-01
   -2.34119356e-01  4.93951626e-02]
  ...
  [ 4.84944880e-01  6.7

<b>Optional: Remove padding ([PAD]) and special tokens ([CLS], [SEP]) that are added by the XLNet model</b>

In [11]:
# For each sequence keep only the rows that belong to its residues.
# The window is taken from the end of the padded output: the final two rows are
# dropped and the `n_residues` rows just before them are kept, so anything
# earlier is discarded as well.
# NOTE(review): presumably the two trailing rows are the special tokens and the
# leading rows are padding — confirm against the tokenizer's layout.
features = []
for idx, token_vectors in enumerate(embedding):
    # Residue count = sequence length once the separating spaces are removed.
    n_residues = len(sequences_Example[idx].replace(" ", ""))
    stop = len(token_vectors) - 2
    features.append(token_vectors[stop - n_residues:stop])

In [12]:
# Show the trimmed per-sequence arrays collected in `features`.
print(features)

[array([[ 0.56958139, -0.81222916,  1.51267779, ..., -0.34737304,
        -1.97737503,  1.02282655],
       [ 0.02766206, -0.67119616,  0.99887288, ...,  0.07276788,
        -1.62625921, -0.00844673],
       [ 0.22098576, -0.52681553,  0.66487128, ...,  0.04781323,
        -1.39787042,  0.3082363 ],
       ...,
       [ 0.70799679, -0.66436082,  0.85833895, ..., -0.02473486,
        -1.51670837, -0.21759787],
       [-0.14213672, -0.86483973,  0.81442684, ..., -0.3299906 ,
        -0.23385416, -1.71955085],
       [-0.36492717, -0.81932175,  0.48153201, ...,  0.23571585,
        -0.67388123, -1.06030214]]), array([[ 0.86512756, -0.16187038, -0.17577587, ...,  0.35655218,
        -0.23411936,  0.04939516],
       [ 0.16243289, -0.2235232 ,  0.53584665, ...,  0.37107733,
        -0.72950965, -0.29674751],
       [ 0.64317876, -0.07562793,  0.37602416, ...,  0.80507225,
        -1.03585291, -0.67776859],
       [ 0.67437631,  0.27612686,  0.44298273, ...,  0.13377124,
        -0.96995813,