<h3> Extracting protein sequences' features using XLNet pretrained-model <h3>

<b>1. Load necessry libraries including huggingface transformers<b>

In [1]:
import torch
from transformers import XLNetModel, XLNetTokenizer
import re

<b>2. Set the file location of XLNet and the vocabulary file<b>

In [2]:
modelPath = '/media/agemagician/Disk2/projects/bio_google_tpu_project/xlnet_pytorch/'
vocabPath = '/media/agemagician/Disk2/projects/bio_google_tpu_project/xlnet_pytorch/spm_model.model'

<b>3. Load the vocabulary and XLNet Model<b>

In [3]:
vocab = XLNetTokenizer(vocabPath, do_lower_case=False)

In [4]:
xlnet_men_len = 512

In [5]:
model = XLNetModel.from_pretrained(modelPath,mem_len=xlnet_men_len)

<b>4. Load the model into the GPU if avilabile and switch to inference mode<b>

In [6]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [7]:
model = model.to(device)
model = model.eval()

<b>5. Create or load sequences and map rarely occured amino acids (U,Z,O,B) to (X)<b>

In [9]:
sequences_Example = ["A E T C Z A O","S K T Z P"]

In [10]:
sequences_Example = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequences_Example]

<b>6. Tokenize, encode sequences and load it into the GPU if possibile<b>

In [10]:
ids = vocab.batch_encode_plus(sequences_Example, add_special_tokens=True, pad_to_max_length=True)

In [11]:
input_ids = torch.tensor(ids['input_ids']).to(device)
attention_mask = torch.tensor(ids['attention_mask']).to(device)

<b>7. Extracting sequences' features and load it into the CPU if needed<b>

In [12]:
with torch.no_grad():
    embedding, memory = model(input_ids=input_ids,attention_mask=attention_mask,mems=None)

In [13]:
embedding = embedding.cpu().numpy()

<b>8. Remove padding ([PAD]) and special tokens ([CLS],[SEP]) that is added by XLNet model<b>

In [14]:
features = [] 
for seq_num in range(len(embedding)):
    seq_len = (attention_mask[seq_num] == 1).sum()
    padded_seq_len = len(attention_mask[seq_num])
    seq_emd = embedding[seq_num][padded_seq_len-seq_len:padded_seq_len-2]
    features.append(seq_emd)

In [15]:
print(features)

[array([[ 0.48745167, -0.77087843,  0.9900183 , ..., -0.3735618 ,
        -1.0589756 ,  0.95599765],
       [ 0.2132598 , -0.54789025,  0.6115491 , ..., -0.0547135 ,
        -0.8787895 ,  0.24645136],
       [ 0.37891388, -0.63965726,  0.6722448 , ..., -0.14891215,
        -0.6769581 ,  0.34598234],
       ...,
       [ 0.09264987, -0.68101084,  0.5181861 , ..., -0.32756191,
        -0.5673121 , -0.16441266],
       [-0.08533158, -0.7438229 ,  0.29890764, ..., -0.24376382,
        -0.11153158, -0.72609884],
       [-0.48225796, -0.8381684 ,  0.08214322, ..., -0.2519644 ,
        -0.03577379, -0.5348936 ]], dtype=float32), array([[ 1.0403059 , -0.95049363,  0.33534417, ..., -0.237473  ,
        -0.27550715,  0.47948807],
       [ 0.5085368 , -0.95084655,  1.0235124 , ..., -0.05893078,
        -0.97528106,  0.08713181],
       [ 0.6562669 , -0.7540624 ,  0.4323474 , ...,  0.36707532,
        -0.70632553, -0.5553411 ],
       [ 0.59033877,  0.1415198 ,  0.29578793, ...,  0.20600986,
     

In [18]:
from transformers import pipeline
import numpy as np

In [11]:
nlp = pipeline('feature-extraction', model=model, tokenizer=vocab,device=0)

In [12]:
fet = nlp(sequences_Example)

In [16]:
len(fet[0])

9

In [20]:
embedding = np.array(fet)

In [29]:
features = [] 
for seq_num in range(len(embedding)):
    seq_len = len(sequences_Example[seq_num].replace(" ", ""))
    padded_seq_len = len(embedding[seq_num])
    print(seq_len)
    print(padded_seq_len)
    seq_emd = embedding[seq_num][padded_seq_len-seq_len:padded_seq_len-2]
    features.append(seq_emd)

7
9
5
9


In [22]:
len(sequences_Example[0])

13

In [23]:
len(sequences_Example[0].replace(" ", ""))

7

In [28]:
features[1].shape

(3, 1024)