With TorchScript, PyTorch aims to create a unified framework from research to production. TorchScript takes our PyTorch modules as input and converts them into a production-friendly format.

To focus on the production use case, PyTorch uses 'Script mode', which has two components: PyTorch JIT and TorchScript.

In [None]:
!pip install transformers

In [None]:
from transformers import BertTokenizer, BertModel
import numpy as np
import torch
from time import perf_counter

In [None]:
def timer(f, *args):
    """Return the wall-clock time of one call ``f(*args)``, in milliseconds.

    CUDA kernels are launched asynchronously, so without a barrier the clock
    would stop as soon as the work is queued rather than when it finishes.
    When a CUDA context is already active, the device is synchronized right
    before the clock starts and right after the call returns, so GPU timings
    cover the whole computation. CPU-only runs are timed exactly as before.

    Args:
        f: Callable to time.
        *args: Positional arguments forwarded to ``f``.

    Returns:
        Elapsed time in milliseconds as a float. The result of ``f`` is
        discarded.
    """
    # Only synchronize once CUDA has been initialized: this avoids creating a
    # CUDA context as a side effect of timing purely CPU work.
    use_cuda_barrier = torch.cuda.is_available() and torch.cuda.is_initialized()

    if use_cuda_barrier:
        torch.cuda.synchronize()  # drain work queued before the measurement
    start = perf_counter()
    f(*args)
    if use_cuda_barrier:
        torch.cuda.synchronize()  # wait for the kernels launched by f
    return 1000 * (perf_counter() - start)

# Pretrained BERT tokenizer and encoder, both loaded with torchscript=True.
script_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", torchscript=True)
script_model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)

# Sample sentence pair, written with the [CLS]/[SEP] markers already in place.
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = script_tokenizer.tokenize(text)

# Replace one token with [MASK], then map every token to its vocabulary id.
masked_index = 8
tokenized_text[masked_index] = "[MASK]"
indexed_tokens = script_tokenizer.convert_tokens_to_ids(tokenized_text)

# Segment ids: seven 0s followed by seven 1s.
segments_ids = [0] * 7 + [1] * 7

# Dummy inputs: each id list is nested in an outer list, giving a leading
# dimension of size 1.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Eager-mode BERT baseline on CPU: load a fresh BertModel and report the mean
# time (in ms) of 100 forward calls on the sample tensors.
native_model = BertModel.from_pretrained("bert-base-uncased")
np.mean(
    [
        timer(native_model, tokens_tensor, segments_tensors)
        for _ in range(100)
    ]
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


88.82381234999514

In [None]:
# Eager-mode BERT on GPU.
# Both the model and the sample tensors have to be on the GPU device for
# inference to take place there.
tokens_tensor_gpu, segments_tensors_gpu = (
    tensor.cuda() for tensor in (tokens_tensor, segments_tensors)
)
native_gpu = native_model.cuda()
np.mean(
    [
        timer(native_gpu, tokens_tensor_gpu, segments_tensors_gpu)
        for _ in range(100)
    ]
)

18.771747219999497

Script mode is invoked by either torch.jit.trace or torch.jit.script. Here, I am using the trace method.

In [None]:
# Trace the torchscript-flagged BERT model on CPU with the sample tensors as
# example inputs, then report the mean time (in ms) of 100 calls.
# NOTE(review): the traced forward() printed by `traced_model.code` names its
# second input `attention_mask`, so segments_tensors appears to be consumed as
# a mask rather than as token type ids — confirm this is intended.
traced_model = torch.jit.trace(
    script_model,
    example_inputs=[tokens_tensor, segments_tensors],
)
np.mean(
    [
        timer(traced_model, tokens_tensor, segments_tensors)
        for _ in range(100)
    ]
)

86.93036488000462


In [None]:
# torch.jit.trace on GPU
# NOTE(review): script_model.cuda() moves script_model itself onto the GPU; the
# CPU-traced `traced_model` may share those parameters — confirm before
# reusing or saving it as a CPU model.
# Copy the example inputs to the device once and reuse them for tracing and for
# every timed call, instead of re-copying them on each of the 100 iterations.
gpu_example_inputs = [tokens_tensor.cuda(), segments_tensors.cuda()]
traced_model_gpu = torch.jit.trace(script_model.cuda(), gpu_example_inputs)
np.mean([timer(traced_model_gpu, *gpu_example_inputs) for _ in range(100)])

9.323610329995063


In [None]:
# Inspect the TorchScript source that tracing generated for the CPU model's forward().
traced_model.code

'def forward(self,\n    input_ids: Tensor,\n    attention_mask: Tensor) -> Tuple[Tensor, Tensor]:\n  pooler = self.pooler\n  encoder = self.encoder\n  embeddings = self.embeddings\n  embeddings0 = self.embeddings\n  token_type_ids = embeddings0.token_type_ids\n  batch_size = ops.prim.NumToTensor(torch.size(input_ids, 0))\n  _0 = int(batch_size)\n  seq_length = ops.prim.NumToTensor(torch.size(input_ids, 1))\n  _1 = int(seq_length)\n  _2 = int(seq_length)\n  _3 = torch.slice(token_type_ids, 0, 0, 9223372036854775807)\n  buffered_token_type_ids = torch.slice(_3, 1, 0, _2)\n  input = torch.expand(buffered_token_type_ids, [_0, _1])\n  _4 = torch.slice(attention_mask, 0, 0, 9223372036854775807)\n  _5 = torch.unsqueeze(torch.unsqueeze(_4, 1), 2)\n  extended_attention_mask = torch.slice(_5, 3, 0, 9223372036854775807)\n  _6 = torch.rsub(torch.to(extended_attention_mask, 6), 1.)\n  attention_mask0 = torch.mul(_6, CONSTANTS.c0)\n  _7 = (embeddings).forward(input_ids, input, )\n  _8 = (encoder).fo

In [None]:
# Inspect the TorchScript source that tracing generated for the GPU model's forward().
traced_model_gpu.code

'def forward(self,\n    input_ids: Tensor,\n    attention_mask: Tensor) -> Tuple[Tensor, Tensor]:\n  pooler = self.pooler\n  encoder = self.encoder\n  embeddings = self.embeddings\n  embeddings0 = self.embeddings\n  token_type_ids = embeddings0.token_type_ids\n  batch_size = ops.prim.NumToTensor(torch.size(input_ids, 0))\n  _0 = int(batch_size)\n  seq_length = ops.prim.NumToTensor(torch.size(input_ids, 1))\n  _1 = int(seq_length)\n  _2 = int(seq_length)\n  _3 = torch.slice(token_type_ids, 0, 0, 9223372036854775807)\n  buffered_token_type_ids = torch.slice(_3, 1, 0, _2)\n  input = torch.expand(buffered_token_type_ids, [_0, _1])\n  _4 = torch.slice(attention_mask, 0, 0, 9223372036854775807)\n  _5 = torch.unsqueeze(torch.unsqueeze(_4, 1), 2)\n  extended_attention_mask = torch.slice(_5, 3, 0, 9223372036854775807)\n  _6 = torch.rsub(torch.to(extended_attention_mask, 6), 1.)\n  attention_mask0 = torch.mul(_6, CONSTANTS.c0)\n  _7 = (embeddings).forward(input_ids, input, )\n  _8 = (encoder).fo

In [None]:
import torchvision
import torch
from time import perf_counter
import numpy as np

def timer(f, *args):
    """Return the wall-clock time of one call ``f(*args)``, in milliseconds.

    CUDA kernels are launched asynchronously, so without a barrier the clock
    would stop as soon as the work is queued rather than when it finishes.
    When a CUDA context is already active, the device is synchronized right
    before the clock starts and right after the call returns, so GPU timings
    cover the whole computation. CPU-only runs are timed exactly as before.

    Args:
        f: Callable to time.
        *args: Positional arguments forwarded to ``f``.

    Returns:
        Elapsed time in milliseconds as a float. The result of ``f`` is
        discarded.
    """
    # Only synchronize once CUDA has been initialized: this avoids creating a
    # CUDA context as a side effect of timing purely CPU work.
    use_cuda_barrier = torch.cuda.is_available() and torch.cuda.is_initialized()

    if use_cuda_barrier:
        torch.cuda.synchronize()  # drain work queued before the measurement
    start = perf_counter()
    f(*args)
    if use_cuda_barrier:
        torch.cuda.synchronize()  # wait for the kernels launched by f
    return 1000 * (perf_counter() - start)

In [None]:
# Eager-mode ResNet-18 on CPU, in eval mode, timed over 10 forward calls (mean, ms).
# NOTE(review): resnet18() is built here without `pretrained`, unlike the GPU
# cell — confirm the difference is intended.
model_ft = torchvision.models.resnet18().eval()
x_ft = torch.rand((1, 3, 224, 224))  # one random input of shape (1, 3, 224, 224)
np.mean(
    [
        timer(model_ft, x_ft)
        for _ in range(10)
    ]
)

92.92151069999545


In [None]:
# Eager-mode ResNet-18 on GPU: built with pretrained=True, moved to the device
# and put in eval mode, then timed over 10 forward calls (mean, ms).
model_ft_gpu = torchvision.models.resnet18(pretrained=True).cuda().eval()
x_ft_gpu = x_ft.cuda()  # GPU copy of the CPU sample input
np.mean(
    [
        timer(model_ft_gpu, x_ft_gpu)
        for _ in range(10)
    ]
)

9.044108600010077


Script mode is invoked by either torch.jit.trace or torch.jit.script. Here, I am using the script method.

In [None]:
# torch.jit.script on CPU
# No example input is passed: `(x_ft)` is not a tuple, and as the second
# positional argument it was bound to the deprecated `optimize` parameter of
# torch.jit.script, which triggered the "`optimize` is deprecated" warning.
script_cell = torch.jit.script(model_ft)
np.mean([timer(script_cell, x_ft) for _ in range(10)])

  "`optimize` is deprecated and has no effect. Use `with torch.jit.optimized_execution() instead"


89.58781770000996

In [None]:
# torch.jit.script on GPU
# No example input is passed: the second positional parameter of
# torch.jit.script is the deprecated `optimize` flag, not a sample tensor.
script_cell_gpu = torch.jit.script(model_ft_gpu)
# Reuse the existing GPU copy of the input instead of copying x_ft to the
# device again on each of the 100 iterations.
np.mean([timer(script_cell_gpu, x_ft_gpu) for _ in range(100)])

2.527740690003384

In [None]:
# Inspect the TorchScript source of the scripted CPU model's forward().
script_cell.code

'def forward(self,\n    x: Tensor) -> Tensor:\n  return (self)._forward_impl(x, )\n'

In [None]:
# Inspect the TorchScript source of the scripted GPU model's forward().
script_cell_gpu.code

'def forward(self,\n    x: Tensor) -> Tensor:\n  return (self)._forward_impl(x, )\n'

In [None]:
# Serialize the traced BERT model to 'traced_bert.pt' (path relative to the working directory).
# NOTE(review): script_model was moved with .cuda() in the GPU tracing cell;
# check which device traced_model's weights are on before shipping this file.
torch.jit.save(traced_model,'traced_bert.pt')

In [None]:
# Load the serialized TorchScript module back from 'traced_bert.pt'.
# NOTE(review): no map_location is given — presumably tensors are restored to
# the device they were saved from; confirm for CPU-only hosts.
loaded = torch.jit.load('traced_bert.pt')