<h3> Extracting protein sequences' features using a pretrained Bert model </h3>

<b>1. Load the necessary libraries, including Hugging Face transformers</b>

In [1]:
import os
import re

import numpy as np
import torch
from transformers import BertModel, BertTokenizer, pipeline

<b>2. Set the file location of the Bert model and the vocabulary file</b>

In [2]:
# Directory that holds the pretrained Bert checkpoint; edit this for your machine.
modelPath = '/media/agemagician/Disk2/share_files/summit/uniref100/bert/30_layers/'
# The vocabulary file lives inside the model directory, so derive its path from
# modelPath instead of repeating the directory (the two cannot drift apart).
vocabPath = os.path.join(modelPath, 'vocab.txt')

<b>3. Load the vocabulary and the Bert model</b>

In [3]:
# Build the tokenizer from the vocabulary file; lower-casing is disabled so the
# upper-case amino-acid letters are passed through unchanged.
vocab = BertTokenizer(vocab_file=vocabPath, do_lower_case=False)

In [4]:
# Load the pretrained Bert model from the directory configured in modelPath.
model = BertModel.from_pretrained(modelPath)

<b>4. Load the model into the GPU if available</b>

In [5]:
# Wrap the model and tokenizer in a feature-extraction pipeline.
# device=0 selects the first CUDA GPU and -1 selects the CPU, so the notebook
# still runs on machines without a GPU instead of failing on a hard-coded device.
# NOTE: `model` is rebound from the BertModel to the pipeline here, so re-run the
# model-loading cell before re-running this one.
model = pipeline(
    'feature-extraction',
    model=model,
    tokenizer=vocab,
    device=0 if torch.cuda.is_available() else -1,
)

<b>5. Create or load sequences and map rarely occurring amino acids (U, Z, O, B) to (X)</b>

In [6]:
# Example protein sequences, written as space-separated amino-acid letters.
sequences_Example = [
    "A E T C Z A O",
    "S K T Z P",
]

In [7]:
# Replace every rarely occurring amino acid (U, Z, O, B) with the placeholder X.
sequences_Example = [
    re.sub(r"[UZOB]", "X", raw_sequence)
    for raw_sequence in sequences_Example
]

<b>6. Extract the sequences' features and convert the output to numpy if needed</b>

In [8]:
# Run the feature-extraction pipeline (bound to `model` above) on the cleaned sequences.
embedding = model(sequences_Example)

In [9]:
# Convert the pipeline output to a NumPy array so it can be sliced per sequence.
# NOTE(review): presumably the output is rectangular because shorter sequences
# are padded — confirm for the installed transformers version.
embedding = np.array(embedding)

In [10]:
# Show the embeddings; NumPy abbreviates the large array with "..." in the output.
print(embedding)

[[[ 0.43194884 -0.29573825  0.1871604  ... -0.60362345 -0.41244522
   -0.49437129]
  [ 0.34346321 -0.11853541  0.11670393 ...  0.12733918 -0.33663505
   -0.4125227 ]
  [ 0.37897065 -0.23096789  0.00771705 ... -0.44364253 -0.56715494
   -0.63721484]
  ...
  [ 0.69124544 -0.44460136  0.00450774 ... -0.35402179 -0.46115145
   -0.76821858]
  [ 0.59799904 -0.23531727  0.09535453 ... -0.58651459 -0.61789775
   -0.80608463]
  [ 0.51396555 -0.34585887  0.05222348 ... -0.67083639 -0.57604814
   -0.96094573]]

 [[ 0.23120911 -0.60032845  0.39122692 ... -0.5308159  -0.5575766
   -0.55705237]
  [ 0.36907712 -0.51549917  0.25218609 ... -0.50687969 -0.56165153
   -0.61570871]
  [ 0.32650244 -0.58309758  0.19571821 ... -0.57145095 -0.25643727
   -0.47911468]
  ...
  [ 0.44227415 -0.48037705  0.06044348 ... -0.2559799  -0.75932127
   -0.66039222]
  [ 0.28604189 -0.37552303  0.08326415 ... -0.6351983  -0.67632711
   -0.62236679]
  [ 0.29940799 -0.4034704   0.13874586 ... -0.99162865 -0.60535759
   -0.5

<b>Optional: Remove the padding ([PAD]) and special tokens ([CLS], [SEP]) that are added by the Bert model</b>

In [11]:
# Keep only the rows that correspond to real residues in each sequence.
# Row 0 of every embedding is skipped and only the next `residue_count` rows are
# kept, which drops the leading [CLS] and the trailing [SEP]/[PAD] positions.
assert len(embedding) == len(sequences_Example), "expected one embedding per input sequence"

features = []
for sequence, seq_embedding in zip(sequences_Example, embedding):
    # Sequences are space-separated letters, so removing spaces gives the residue count.
    residue_count = len(sequence.replace(" ", ""))
    features.append(seq_embedding[1:residue_count + 1])

In [12]:
# Show the per-sequence feature arrays after removing special tokens and padding.
print(features)

[array([[ 0.34346321, -0.11853541,  0.11670393, ...,  0.12733918,
        -0.33663505, -0.4125227 ],
       [ 0.37897065, -0.23096789,  0.00771705, ..., -0.44364253,
        -0.56715494, -0.63721484],
       [ 0.28404582, -0.56559259,  0.06742072, ..., -0.80574137,
        -0.64692605, -0.66759783],
       ...,
       [ 0.38815227, -0.2500954 ,  0.1672174 , ..., -0.53784174,
        -0.37415683, -0.52703422],
       [ 0.69124544, -0.44460136,  0.00450774, ..., -0.35402179,
        -0.46115145, -0.76821858],
       [ 0.59799904, -0.23531727,  0.09535453, ..., -0.58651459,
        -0.61789775, -0.80608463]]), array([[ 0.36907712, -0.51549917,  0.25218609, ..., -0.50687969,
        -0.56165153, -0.61570871],
       [ 0.32650244, -0.58309758,  0.19571821, ..., -0.57145095,
        -0.25643727, -0.47911468],
       [ 0.0239465 , -0.10554147,  0.27909502, ..., -0.55666339,
        -0.34469691, -0.42556283],
       [ 0.467527  , -0.51940775,  0.0586937 , ..., -0.28577682,
        -0.73576313,