<a href="https://colab.research.google.com/github/agarwalsourabh55/TransformersOptimum/blob/optimum_branch/TransformersOptimum.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#### REFERENCE LINK 
## https://www.philschmid.de/optimize-sentence-transformers

In [3]:
# !pip install "optimum[onnxruntime]==1.3.0" evaluate mkl-include mkl

In [4]:
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer
from pathlib import Path


model_id="sentence-transformers/all-MiniLM-L6-v2"
onnx_path = Path("onnx")

# load vanilla transformers and convert to onnx
model = ORTModelForFeatureExtraction.from_pretrained(model_id, from_transformers=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# save onnx checkpoint and tokenizer
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

('onnx/tokenizer_config.json',
 'onnx/special_tokens_map.json',
 'onnx/vocab.txt',
 'onnx/added_tokens.json',
 'onnx/tokenizer.json')

In [5]:
from transformers import Pipeline
import torch.nn.functional as F
import torch

# copied from the model card
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


class SentenceEmbeddingPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        # we don't have any hyperameters to sanitize
        preprocess_kwargs = {}
        return preprocess_kwargs, {}, {}

    def preprocess(self, inputs):
        encoded_inputs = self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
        return encoded_inputs

    def _forward(self, model_inputs):
        outputs = self.model(**model_inputs)
        return {"outputs": outputs, "attention_mask": model_inputs["attention_mask"]}

    def postprocess(self, model_outputs):
        # Perform pooling
        sentence_embeddings = mean_pooling(model_outputs["outputs"], model_outputs['attention_mask'])
        # Normalize embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings


In [6]:
# init pipeline
vanilla_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)

# run inference
pred = vanilla_emb("Could you assist me in finding my lost card?")

# print an excerpt from the sentence embedding
print(pred[0][:5])


tensor([-0.0631,  0.0426,  0.0037,  0.0377,  0.0414])


In [7]:
pred

tensor([[-6.3117e-02,  4.2629e-02,  3.6944e-03,  3.7706e-02,  4.1427e-02,
          5.9090e-02, -2.5913e-03,  3.6958e-02,  2.4465e-02, -8.9954e-02,
         -4.8769e-03, -4.6063e-02, -9.0055e-03, -5.8913e-02, -5.7958e-02,
         -3.8567e-02, -1.6060e-02,  4.3758e-02, -9.5715e-03,  3.8877e-02,
         -6.0921e-02,  5.3137e-02, -6.2273e-02,  7.7119e-04, -9.4132e-03,
          1.7522e-03, -1.5734e-02, -2.4913e-02, -3.5375e-02, -4.8350e-02,
          7.2593e-02,  4.9164e-02,  4.3899e-02,  3.3858e-02,  1.2083e-01,
         -5.6280e-02, -5.5931e-02,  1.1271e-02,  2.5695e-02, -3.1152e-02,
         -6.5340e-02,  3.6819e-02, -1.1976e-02,  3.2635e-02,  5.0967e-02,
         -9.2405e-03, -3.9800e-02,  5.1867e-02,  1.7502e-02,  1.2167e-02,
         -1.2734e-03, -1.4549e-02, -1.7959e-02,  3.0593e-03, -7.6466e-03,
          6.4590e-02,  4.5230e-02, -3.0241e-02, -5.3600e-02,  2.9540e-02,
          1.2354e-01, -1.4451e-02, -2.8343e-02, -7.5770e-03, -1.1255e-01,
         -3.2812e-02,  3.9370e-02, -5.

In [9]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

# create ORTOptimizer and define optimization configuration
optimizer = ORTOptimizer.from_pretrained(model_id, feature=model.pipeline_task)
optimization_config = OptimizationConfig(optimization_level=99) # enable all optimizations

# apply the optimization configuration to the model
optimizer.export(
    onnx_model_path=onnx_path / "model.onnx",
    onnx_optimized_model_output_path=onnx_path / "model-optimized.onnx",
    optimization_config=optimization_config,
)


PosixPath('onnx/model-optimized.onnx')

In [10]:
from optimum.onnxruntime import ORTModelForFeatureExtraction

# load optimized model
model = ORTModelForFeatureExtraction.from_pretrained(onnx_path, file_name="model-optimized.onnx")

# create optimized pipeline
optimized_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)
pred = optimized_emb("Could you assist me in finding my lost card?")
print(pred[0][:5])


tensor([-0.0631,  0.0426,  0.0037,  0.0377,  0.0414])


In [11]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# create ORTQuantizer and define quantization configuration
dynamic_quantizer = ORTQuantizer.from_pretrained(model_id, feature=model.pipeline_task)
dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

# apply the quantization configuration to the model
model_quantized_path = dynamic_quantizer.export(
    onnx_model_path=onnx_path / "model-optimized.onnx",
    onnx_quantized_model_output_path=onnx_path / "model-quantized.onnx",
    quantization_config=dqconfig,
)

In [12]:
import os

# get model file size
size = os.path.getsize(onnx_path / "model-optimized.onnx")/(1024*1024)
quantized_model = os.path.getsize(onnx_path / "model-quantized.onnx")/(1024*1024)

print(f"Model file size: {size:.2f} MB")
print(f"Quantized Model file size: {quantized_model:.2f} MB")
#  Model file size: 86.66 MB
#  Quantized Model file size: 63.47 MB

Model file size: 86.66 MB
Quantized Model file size: 63.47 MB


In [13]:
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer

model = ORTModelForFeatureExtraction.from_pretrained(onnx_path,file_name="model-quantized.onnx")
tokenizer = AutoTokenizer.from_pretrained(onnx_path)

q8_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)

pred = q8_emb("Could you assist me in finding my lost card?")
print(pred[0][:5])

tensor([-0.0569,  0.0131, -0.0102,  0.0448,  0.0432])


In [14]:
from datasets import load_dataset
from evaluate import load

eval_dataset = load_dataset("glue","stsb",split="validation")
metric = load('glue', 'stsb')

# creating a subset for faster evaluation
# COMMENT IN to run evaluation on a subset of the dataset
# eval_dataset = eval_dataset.select(range(200))


Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/stsb (download: 784.05 KiB, generated: 1.09 MiB, post-processed: Unknown size, total: 1.86 MiB) to /root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/803k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

In [15]:
def compute_sentence_similarity(sentence_1, sentence_2,pipeline):
    embedding_1 = pipeline(sentence_1)
    embedding_2 = pipeline(sentence_2)
    # compute cosine similarity between two sentences
    return torch.nn.functional.cosine_similarity(embedding_1, embedding_2, dim=1)


def evaluate_stsb(example):
  default = compute_sentence_similarity(example["sentence1"], example["sentence2"], vanilla_emb)
  quantized = compute_sentence_similarity(example["sentence1"], example["sentence2"], q8_emb)
  return {
      'reference': (example["label"] - 1) / (5 - 1), # rescale to [0,1]
      'default': float(default),
      'quantized': float(quantized),
      }

# run evaluation
result = eval_dataset.map(evaluate_stsb)

# compute metrics
default_acc = metric.compute(predictions=result["default"], references=result["reference"])
quantized = metric.compute(predictions=result["quantized"], references=result["reference"])

print(f"vanilla model: pearson={default_acc['pearson']}%")
print(f"quantized model: pearson={quantized['pearson']}%")
print(f"The quantized model achieves {round(quantized['pearson']/default_acc['pearson'],2)*100:.2f}% accuracy of the fp32 model")




  0%|          | 0/1500 [00:00<?, ?ex/s]

vanilla model: pearson=0.8696194631201818%
quantized model: pearson=0.8665284798317718%
The quantized model achieves 100.00% accuracy of the fp32 model


In [16]:
from time import perf_counter
import numpy as np

payload="Hello, my name is Philipp and I live in Nuremberg, Germany. Currently I am working as a Technical Lead at Hugging Face to democratize artificial intelligence through open source and open science. In the past I designed and implemented cloud-native machine learning architectures for fin-tech and insurance companies. I found my passion for cloud concepts and machine learning 5 years ago. Since then I never stopped learning. Currently, I am focusing myself in the area NLP and how to leverage models like BERT, Roberta, T5, ViT, and GPT2 to generate business value. I cannot wait to see what is next for me"
print(f'Payload sequence length: {len(tokenizer(payload)["input_ids"])}')

def measure_latency(pipe):
    latencies = []
    # warm up
    for _ in range(10):
        _ = pipe(payload)
    # Timed run
    for _ in range(100):
        start_time = perf_counter()
        _ =  pipe(payload)
        latency = perf_counter() - start_time
        latencies.append(latency)
    # Compute run statistics
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    time_p95_ms = 1000 * np.percentile(latencies,95)
    return f"P95 latency (ms) - {time_p95_ms}; Average latency (ms) - {time_avg_ms:.2f} +\- {time_std_ms:.2f};", time_p95_ms


vanilla_model=measure_latency(vanilla_emb)
quantized_model=measure_latency(q8_emb)

print(f"Vanilla model: {vanilla_model[0]}")
print(f"Quantized model: {quantized_model[0]}")
print(f"Improvement through quantization: {round(vanilla_model[1]/quantized_model[1],2)}x")


Payload sequence length: 128
Vanilla model: P95 latency (ms) - 62.4759662999395; Average latency (ms) - 56.27 +\- 2.64;
Quantized model: P95 latency (ms) - 48.39269269992883; Average latency (ms) - 45.72 +\- 1.79;
Improvement through quantization: 1.29x
