<a href="https://colab.research.google.com/github/agemagician/Prot-Transformers/blob/master/Benchmark/Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h3> Benchmark ProtBert Model using GPU or CPU <h3>

<b>1. Load necessry libraries including huggingface transformers<b>

In [3]:
!pip install -q transformers

In [4]:
import torch
from transformers import BertModel
import time
from datetime import timedelta
import os
import requests
from tqdm.auto import tqdm

<b>2. Set the url location of ProtBert and the vocabulary file<b>

In [5]:
modelUrl = 'https://www.dropbox.com/s/dm3m1o0tsv9terq/pytorch_model.bin?dl=1'
configUrl = 'https://www.dropbox.com/s/d3yw7v4tvi5f4sk/bert_config.json?dl=1'
vocabUrl = 'https://www.dropbox.com/s/jvrleji50ql5m5i/vocab.txt?dl=1'

<b>3. Download ProtBert models and vocabulary files<b>

In [6]:
downloadFolderPath = 'models/ProtBert/'

In [7]:
modelFolderPath = downloadFolderPath

modelFilePath = os.path.join(modelFolderPath, 'pytorch_model.bin')

configFilePath = os.path.join(modelFolderPath, 'config.json')

vocabFilePath = os.path.join(modelFolderPath, 'vocab.txt')

In [8]:
if not os.path.exists(modelFolderPath):
    os.makedirs(modelFolderPath)

In [9]:
def download_file(url, filename):
  response = requests.get(url, stream=True)
  with tqdm.wrapattr(open(filename, "wb"), "write", miniters=1,
                    total=int(response.headers.get('content-length', 0)),
                    desc=filename) as fout:
      for chunk in response.iter_content(chunk_size=4096):
          fout.write(chunk)

In [10]:
if not os.path.exists(modelFilePath):
    download_file(modelUrl, modelFilePath)

if not os.path.exists(configFilePath):
    download_file(configUrl, configFilePath)

if not os.path.exists(vocabFilePath):
    download_file(vocabUrl, vocabFilePath)

HBox(children=(FloatProgress(value=0.0, description='models/ProtBert/pytorch_model.bin', max=1684058277.0, sty…




HBox(children=(FloatProgress(value=0.0, description='models/ProtBert/config.json', max=313.0, style=ProgressSt…




HBox(children=(FloatProgress(value=0.0, description='models/ProtBert/vocab.txt', max=81.0, style=ProgressStyle…




<b>4. Load ProtBert Model<b>

In [11]:
model = BertModel.from_pretrained(modelFolderPath)

<b>5. Load the model into the GPU if avilabile and switch to inference mode<b>

In [12]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [13]:
model = model.to(device)
model = model.eval()

<b>6. Benchmark Configuration<b>

In [14]:
min_batch_size = 8
max_batch_size = 32
inc_batch_size = 8

min_sequence_length = 64
max_sequence_length = 512
inc_sequence_length = 64

iterations = 10

<b>7. Start Benchmarking<b>

In [15]:
device_name = torch.cuda.get_device_name(device.index) if device.type == 'cuda' else 'CPU'

with torch.no_grad():
    print((' Benchmarking using ' + device_name + ' ').center(80, '*'))
    print(' Start '.center(80, '*'))
    for sequence_length in range(min_sequence_length,max_sequence_length+1,inc_sequence_length):
        for batch_size in range(min_batch_size,max_batch_size+1,inc_batch_size):
            start = time.time()
            for i in range(iterations):
                input_ids = torch.randint(1, 20, (batch_size,sequence_length)).to(device)
                results = model(input_ids)[0].cpu().numpy()
            end = time.time()
            ms_per_protein = (end-start)/(iterations*batch_size)
            print('Sequence Length: %4d \t Batch Size: %4d \t Ms per protein %4.2f' %(sequence_length,batch_size,ms_per_protein))
        print(' Done '.center(80, '*'))
    print(' Finished '.center(80, '*'))

******************* Benchmarking using Tesla P100-PCIE-16GB ********************
************************************ Start *************************************
Sequence Length:   64 	 Batch Size:    8 	 Ms per protein 0.01
Sequence Length:   64 	 Batch Size:   16 	 Ms per protein 0.01
Sequence Length:   64 	 Batch Size:   24 	 Ms per protein 0.01
Sequence Length:   64 	 Batch Size:   32 	 Ms per protein 0.01
************************************* Done *************************************
Sequence Length:  128 	 Batch Size:    8 	 Ms per protein 0.02
Sequence Length:  128 	 Batch Size:   16 	 Ms per protein 0.02
Sequence Length:  128 	 Batch Size:   24 	 Ms per protein 0.02
Sequence Length:  128 	 Batch Size:   32 	 Ms per protein 0.02
************************************* Done *************************************
Sequence Length:  192 	 Batch Size:    8 	 Ms per protein 0.02
Sequence Length:  192 	 Batch Size:   16 	 Ms per protein 0.02
Sequence Length:  192 	 Batch Size:   24 	 Ms 