<a href="https://colab.research.google.com/github/akdeniz27/dynamic_quantization/blob/main/Dynamic_Quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers --upgrade
!pip install "optimum[onnxruntime]" --upgrade
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from optimum.onnxruntime import ORTQuantizer, ORTModelForTokenClassification
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from datasets import load_dataset

In [6]:
model_original = "akdeniz27/bert-base-turkish-cased-ner"
dataset_id = load_dataset("/content/drive/MyDrive/Token_classification_for_Turkish_NER/TurkNER/turkner_4.py", split="test")
onnx_path = "/content/drive/MyDrive/Quantization/dynamic_quantized_bert_base_turkish"

# Load PyTorch model and convert to ONNX
model_onnx = ORTModelForTokenClassification.from_pretrained(model_original, from_transformers=True)
tokenizer = AutoTokenizer.from_pretrained(model_original)

# save onnx checkpoint and tokenizer
model_onnx.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

# create ORTQuantizer and define quantization configuration
quantizer = ORTQuantizer.from_pretrained(model_onnx)
dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

model_quantized_path = quantizer.quantize(
    save_dir=onnx_path,
    quantization_config=dqconfig,
)

Downloading and preparing dataset turkner_4/turkner to /root/.cache/huggingface/datasets/turkner_4/turkner/1.0.0/16538dc41f8269d90715b793a831c604b40faf988b058f78c8b62c422374013b...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

   

Extracting data files #2:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset turkner_4 downloaded and prepared to /root/.cache/huggingface/datasets/turkner_4/turkner/1.0.0/16538dc41f8269d90715b793a831c604b40faf988b058f78c8b62c422374013b. Subsequent calls will reuse this data.


Downloading:   0%|          | 0.00/956 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/373 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/251k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/497k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [41]:
text =  """Mustafa Kemal Atatürk, Türk asker, devlet adamı ve Türkiye Cumhuriyeti'nin kurucusudur.
Birinci Dünya Savaşı sırasında, Osmanlı ordusunda görev yapan Atatürk, Çanakkale Cephesi'nde miralaylığa, Sina Cephesi ve Filistin Cephesi'nde ise Yıldırım Orduları komutanlığına atandı.
Savaşın sonunda, Osmanlı İmparatorluğu'nun yenilgisini takiben Kurtuluş Savaşı ile simgelenen Türk Ulusal Hareketi'ne öncülük ve önderlik etti.
Türk Kurtuluş Savaşı sürecinde Ankara Hükümeti'ni kurdu, Türk Orduları Başkomutanı olarak Sakarya Meydan Muharebesi'ndeki başarısından dolayı 19 Eylül 1921 tarihinde "Gazi" unvanını aldı ve mareşallik rütbesine yükseldi.
"""

In [42]:
model_quantized = ORTModelForTokenClassification.from_pretrained(onnx_path, file_name="model_quantized.onnx")
tokenizer_quantized = AutoTokenizer.from_pretrained(onnx_path)

ner_quantized = pipeline("token-classification", model=model_quantized, tokenizer=tokenizer_quantized, aggregation_strategy="first")

In [43]:
ner_quantized(text)

[{'entity_group': 'PER',
  'score': 0.99433225,
  'word': 'Mustafa Kemal Atatürk',
  'start': 0,
  'end': 21},
 {'entity_group': 'LOC',
  'score': 0.9910083,
  'word': 'Türkiye Cumhuriyeti',
  'start': 51,
  'end': 70},
 {'entity_group': 'LOC',
  'score': 0.99219656,
  'word': 'Osmanlı',
  'start': 120,
  'end': 127},
 {'entity_group': 'PER',
  'score': 0.99739945,
  'word': 'Atatürk',
  'start': 150,
  'end': 157},
 {'entity_group': 'LOC',
  'score': 0.79637533,
  'word': 'Çanakkale Cephesi',
  'start': 159,
  'end': 176},
 {'entity_group': 'LOC',
  'score': 0.9767453,
  'word': 'Sina',
  'start': 194,
  'end': 198},
 {'entity_group': 'LOC',
  'score': 0.9912424,
  'word': 'Filistin',
  'start': 210,
  'end': 218},
 {'entity_group': 'LOC',
  'score': 0.98198247,
  'word': 'Osmanlı İmparatorluğu',
  'start': 292,
  'end': 313},
 {'entity_group': 'ORG',
  'score': 0.70012087,
  'word': 'Hareketi',
  'start': 381,
  'end': 389},
 {'entity_group': 'LOC',
  'score': 0.9261657,
  'word': 'A

In [44]:
ner_original = pipeline("token-classification", model=model_original, tokenizer=tokenizer, aggregation_strategy="first")

In [45]:
ner_original(text)

[{'entity_group': 'PER',
  'score': 0.9963313,
  'word': 'Mustafa Kemal Atatürk',
  'start': 0,
  'end': 21},
 {'entity_group': 'LOC',
  'score': 0.9881506,
  'word': 'Türkiye Cumhuriyeti',
  'start': 51,
  'end': 70},
 {'entity_group': 'LOC',
  'score': 0.9959254,
  'word': 'Osmanlı',
  'start': 120,
  'end': 127},
 {'entity_group': 'PER',
  'score': 0.99825114,
  'word': 'Atatürk',
  'start': 150,
  'end': 157},
 {'entity_group': 'LOC',
  'score': 0.81727326,
  'word': 'Çanakkale Cephesi',
  'start': 159,
  'end': 176},
 {'entity_group': 'LOC',
  'score': 0.9796963,
  'word': 'Sina',
  'start': 194,
  'end': 198},
 {'entity_group': 'LOC',
  'score': 0.9911573,
  'word': 'Filistin',
  'start': 210,
  'end': 218},
 {'entity_group': 'LOC',
  'score': 0.9874549,
  'word': 'Osmanlı İmparatorluğu',
  'start': 292,
  'end': 313},
 {'entity_group': 'ORG',
  'score': 0.7687833,
  'word': 'Türk Ulusal Hareketi',
  'start': 369,
  'end': 389},
 {'entity_group': 'LOC',
  'score': 0.7720524,
  'w

In [12]:
from evaluate import evaluator
from datasets import load_dataset

In [13]:
eval = evaluator("token-classification")
eval_dataset = load_dataset("/content/drive/MyDrive/Token_classification_for_Turkish_NER/TurkNER/turkner_4.py", split="test")

ner_quantized_2 = pipeline("token-classification", model=model_quantized, tokenizer=tokenizer_quantized)
ner_original_2 = pipeline("token-classification", model=model_original, tokenizer=tokenizer)

def measure_performance(pipe):
    results = eval.compute(
        model_or_pipeline=pipe,
        data=eval_dataset,
        metric="seqeval", 
    )
    return results

original_model_results=measure_performance(ner_original_2)
quantized_model_results=measure_performance(ner_quantized_2)

print(f"Original model: {original_model_results['overall_f1']*100:.2f}%")
print(f"Quantized model: {quantized_model_results['overall_f1']*100:.2f}%")
print(f"The quantized model achieves {round(quantized_model_results['overall_f1']/original_model_results['overall_f1'])*100:.2f}% overall_f1 of the fp32 model")



  0%|          | 0/2753 [00:00<?, ?ex/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]



Original model: 96.09%
Quantized model: 96.04%
The quantized model achieves 100.00% overall_f1 of the fp32 model


In [47]:
from time import perf_counter
import numpy as np

payload = text
print(f'Payload sequence length: {len(tokenizer(payload)["input_ids"])}')

def measure_latency(pipe):
    latencies = []
    # warm up
    for _ in range(10):
        _ = pipe(payload)
    # Timed run
    for _ in range(300):
        start_time = perf_counter()
        _ =  pipe(payload)
        latency = perf_counter() - start_time
        latencies.append(latency)
    # Compute run statistics
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    time_p95_ms = 1000 * np.percentile(latencies,95)
    return f"P95 latency (ms) - {time_p95_ms}; Average latency (ms) - {time_avg_ms:.2f} +\- {time_std_ms:.2f};", time_p95_ms

original_model=measure_latency(ner_original_2)
quantized_model=measure_latency(ner_quantized_2)

print(f"Original model: {original_model[0]}")
print(f"Quantized model: {quantized_model[0]}")
print(f"Improvement through quantization: {round(original_model[1]/quantized_model[1],2)}x")

Payload sequence length: 128
Original model: P95 latency (ms) - 1372.894465249943; Average latency (ms) - 969.13 +\- 146.91;
Quantized model: P95 latency (ms) - 224.2452960001856; Average latency (ms) - 167.80 +\- 33.36;
Improvement through quantization: 6.12x
