<a href="https://colab.research.google.com/github/agemagician/ProtTrans/blob/master/Embedding/TensorFlow/Advanced/ProtBert-BFD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h3> Extracting protein sequences' features using ProtBert-BFD pretrained-model <h3>

<b>1. Load necessry libraries including huggingface transformers<b>

In [1]:
!pip install -q transformers

[K     |████████████████████████████████| 778kB 2.7MB/s 
[K     |████████████████████████████████| 3.0MB 7.5MB/s 
[K     |████████████████████████████████| 890kB 23.4MB/s 
[K     |████████████████████████████████| 1.1MB 33.9MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer,BertConfig
import re
import os
import requests
from tqdm.auto import tqdm
import numpy as np

<b>2. Set the url location of ProtBert-BFD and the vocabulary file<b>

In [3]:
modelDataUrl = 'https://www.dropbox.com/s/mq6w2ougf0x35ke/model.ckpt-200000.data-00000-of-00001?dl=1'
modelMetaUrl = 'https://www.dropbox.com/s/lskivosjjedhv7b/model.ckpt-200000.meta?dl=1'
modelIndexUrl = 'https://www.dropbox.com/s/z3xwqr6k5d2pjv2/model.ckpt-200000.index?dl=1'
modelCheckpointUrl = 'https://www.dropbox.com/s/lr2fm99x8gpqtee/checkpoint?dl=1'
configUrl = 'https://www.dropbox.com/s/33en5mbl4wf27om/bert_config.json?dl=1'
vocabUrl = 'https://www.dropbox.com/s/tffddoqfubkfcsw/vocab.txt?dl=1'

<b>3. Download ProtBert-BFD models and vocabulary files<b>

In [4]:
downloadFolderPath = 'models/ProtBert-BFD/'

In [5]:
modelFolderPath = downloadFolderPath

modelDataFilePath = os.path.join(modelFolderPath, 'model.ckpt-200000.data-00000-of-00001')
modelMetaFilePath = os.path.join(modelFolderPath, 'model.ckpt-200000.meta')
modelIndexFilePath = os.path.join(modelFolderPath, 'model.ckpt-200000.index')
modelCheckpointFilePath = os.path.join(modelFolderPath, 'checkpoint')

configFilePath = os.path.join(modelFolderPath, 'config.json')

vocabFilePath = os.path.join(modelFolderPath, 'vocab.txt')

In [6]:
if not os.path.exists(modelFolderPath):
    os.makedirs(modelFolderPath)

In [7]:
def download_file(url, filename):
  response = requests.get(url, stream=True)
  with tqdm.wrapattr(open(filename, "wb"), "write", miniters=1,
                    total=int(response.headers.get('content-length', 0)),
                    desc=filename) as fout:
      for chunk in response.iter_content(chunk_size=4096):
          fout.write(chunk)

In [8]:
if not os.path.exists(modelDataFilePath):
    download_file(modelDataUrl, modelDataFilePath)

if not os.path.exists(modelMetaFilePath):
    download_file(modelMetaUrl, modelMetaFilePath)

if not os.path.exists(modelIndexFilePath):
    download_file(modelIndexUrl, modelIndexFilePath)

if not os.path.exists(modelCheckpointFilePath):
    download_file(modelCheckpointUrl, modelCheckpointFilePath)

if not os.path.exists(configFilePath):
    download_file(configUrl, configFilePath)

if not os.path.exists(vocabFilePath):
    download_file(vocabUrl, vocabFilePath)

HBox(children=(FloatProgress(value=0.0, description='models/ProtBert-BFD/model.ckpt-200000.data-00000-of-00001…




HBox(children=(FloatProgress(value=0.0, description='models/ProtBert-BFD/model.ckpt-200000.meta', max=69583027…




HBox(children=(FloatProgress(value=0.0, description='models/ProtBert-BFD/model.ckpt-200000.index', max=21309.0…




HBox(children=(FloatProgress(value=0.0, description='models/ProtBert-BFD/checkpoint', max=283.0, style=Progres…




HBox(children=(FloatProgress(value=0.0, description='models/ProtBert-BFD/config.json', max=313.0, style=Progre…




HBox(children=(FloatProgress(value=0.0, description='models/ProtBert-BFD/vocab.txt', max=81.0, style=ProgressS…




<b>4. Load the vocabulary and ProtBert-BFD Model</b>

In [9]:
tokenizer = BertTokenizer(vocabFilePath, do_lower_case=False )

In [10]:
config = BertConfig.from_json_file(configFilePath)

In [11]:
model = TFBertModel(config,modelCheckpointFilePath )

<b>5. Create or load sequences and map rarely occured amino acids (U,Z,O,B) to (X)<b>

In [12]:
sequences_Example = ["A E T C Z A O","S K T Z P"]

In [13]:
sequences_Example = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequences_Example]

<b>6. Tokenize, encode sequences and load it into the GPU if possibile<b>

In [14]:
ids = tokenizer.batch_encode_plus(sequences_Example, add_special_tokens=True, pad_to_max_length=True, return_tensors="tf")

In [15]:
input_ids = ids['input_ids']
attention_mask = ids['attention_mask']

<b>7. Extracting sequences' features and load it into the CPU if needed<b>

In [16]:
embedding = model(input_ids)[0]

In [17]:
embedding = np.asarray(embedding)

In [18]:
attention_mask = np.asarray(attention_mask)

<b>8. Remove padding ([PAD]) and special tokens ([CLS],[SEP]) that is added by ProtBert-BFD model<b>

In [19]:
features = [] 
for seq_num in range(len(embedding)):
    seq_len = (attention_mask[seq_num] == 1).sum()
    seq_emd = embedding[seq_num][1:seq_len-1]
    features.append(seq_emd)

In [20]:
print(features)

[array([[-0.43126333,  0.13795705,  1.3532674 , ...,  0.66592115,
        -0.4503428 , -1.6323206 ],
       [-0.85842854,  0.23084313,  1.2675498 , ...,  0.42682803,
        -0.47989544, -1.0834988 ],
       [ 0.17322882,  0.24332894,  1.0507231 , ...,  0.44253966,
         0.2878237 , -1.5834416 ],
       ...,
       [-0.6921205 ,  0.26783723,  1.5405961 , ...,  0.36913323,
        -0.02786866, -1.4740794 ],
       [-0.41246465,  0.29625502,  1.6742653 , ...,  0.5257025 ,
        -0.09974476, -1.6049011 ],
       [-0.3720993 , -0.02519915,  0.9137806 , ...,  0.12062608,
        -0.28654727, -0.9148072 ]], dtype=float32), array([[-0.14618629, -0.22533408,  1.4318085 , ..., -0.1517215 ,
        -0.01618051, -1.4807022 ],
       [-0.5721659 , -0.4159835 ,  0.99389946, ..., -0.22691797,
        -0.3114474 , -0.9577436 ],
       [ 0.33151752, -0.16505283,  1.1424873 , ..., -0.20811258,
         0.19781403, -1.6013726 ],
       [ 0.22176996, -0.08336659,  1.3803043 , ..., -0.10119343,
     