In [1]:
pip install transformers tensorflow tensorflow-addons


Collecting tensorflow-addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.23.0 typeguard-2.13.3


In [2]:
import time
import numpy as np
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer
import tensorflow as tf

# Step 1: Download the flan-t5-base model and tokenizer.
# `model` and `tokenizer` are notebook-level globals; the benchmarking cells
# further down read them directly.
model_name = "google/flan-t5-base"
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 2: Convert the model to TensorFlow Lite format directly from the Keras model
converter = tf.lite.TFLiteConverter.from_keras_model(model)
# Optimize.DEFAULT switches on the converter's default optimizations; this is
# what the rest of the notebook calls the "quantized" model.
# NOTE(review): no representative dataset is configured here, so presumably
# only the weights are quantized (dynamic-range) — confirm against the TFLite docs.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# Enable TensorFlow Select ops so that operations without a TFLite builtin
# implementation can still be converted.
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,  # Enable TensorFlow Lite ops.
    tf.lite.OpsSet.SELECT_TF_OPS     # Enable TensorFlow Select ops.
]
# Run the conversion; the result is written to disk in binary mode below.
tflite_model = converter.convert()

# Save the quantized model. `tflite_model_path` is reused later to load the
# interpreter and to measure the file size.
tflite_model_path = "model_quantized.tflite"
with open(tflite_model_path, "wb") as f:
    f.write(tflite_model)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [3]:

# Load the TFLite model from the file written above and allocate tensors.
# allocate_tensors() is called before the tensor metadata is queried and
# before any set_tensor()/invoke() call in the cells below.
interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
interpreter.allocate_tensors()

# Metadata for the model's inputs/outputs; each entry is indexed below with
# keys such as 'index' and 'shape'. `input_details` is read as a global by
# evaluate_tflite_latency.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()


In [5]:
import os

# Function to evaluate latency
def evaluate_latency(model, tokenizer, input_text, num_runs=10):
    """Return the average wall-clock time of one forward pass of ``model``.

    Args:
        model: Callable invoked as ``model(input_ids, decoder_input_ids=...)``.
        tokenizer: Callable tokenizer; its result must expose ``.input_ids``.
        input_text: Prompt that is tokenized and fed to the model.
        num_runs: Number of timed forward passes to average over; must be > 0.

    Returns:
        Mean duration of a forward pass, in seconds.

    Raises:
        ValueError: If ``num_runs`` is not a positive number.
    """
    # Fail fast instead of dividing by zero (or returning a negative latency)
    # after the expensive warm-up has already run.
    if num_runs <= 0:
        raise ValueError("num_runs must be a positive integer")

    inputs = tokenizer(input_text, return_tensors="tf", padding=True)
    # Feed the decoder exactly one start token. Without add_special_tokens=False
    # the tokenizer may append its own special tokens (T5 tokenizers add an
    # end-of-sequence id), which would make the decoder input longer than the
    # single pad token used by the TFLite benchmark.
    decoder_input_ids = tokenizer(
        "<pad>", return_tensors="tf", add_special_tokens=False
    ).input_ids

    # Warm-up run (not timed) so one-off initialisation cost is excluded.
    model(inputs.input_ids, decoder_input_ids=decoder_input_ids)

    # perf_counter is a monotonic, high-resolution clock intended for timing.
    start_time = time.perf_counter()
    for _ in range(num_runs):
        model(inputs.input_ids, decoder_input_ids=decoder_input_ids)
    end_time = time.perf_counter()

    return (end_time - start_time) / num_runs

# Function to get the total size of the model files in a directory
def get_model_size(directory):
    """Sum the sizes, in bytes, of every file found beneath ``directory``.

    The tree is traversed recursively with ``os.walk``; each file's size is
    taken from ``os.path.getsize``. A directory containing no files yields 0.
    """
    return sum(
        os.path.getsize(os.path.join(root, name))
        for root, _subdirs, names in os.walk(directory)
        for name in names
    )

In [6]:
# Compare normal model and quantized model.
# `input_text` is the shared prompt used for both latency measurements.
input_text = "Translate English to German: How are you?"

# Normal model evaluation: average seconds per forward pass of the original model.
normal_latency = evaluate_latency(model, tokenizer, input_text)
# Write the model to the local "model" directory so its on-disk footprint can
# be measured; get_model_size adds up every file under that directory.
model.save_pretrained("model")
normal_model_size = get_model_size("model")


In [7]:

def _find_tflite_input(details, key, fallback_index):
    """Return the input-detail entry whose tensor name refers to ``key``.

    Tensor names are compared after dropping a ``:N`` suffix and a leading
    ``serving_default_`` prefix, using exact equality so that ``input_ids``
    never matches ``decoder_input_ids``. If no name matches, the entry at
    ``fallback_index`` is returned.
    """
    prefix = "serving_default_"
    for detail in details:
        base = str(detail.get("name", "")).split(":")[0]
        if base.startswith(prefix):
            base = base[len(prefix):]
        if base == key:
            return detail
    return details[fallback_index]


# Function to evaluate latency for TFLite model
def evaluate_tflite_latency(interpreter, input_text, num_runs=10):
    """Return the average wall-clock time of one ``interpreter.invoke()`` call.

    NOTE: only the first token of ``input_text`` is fed to the model (each
    input is reshaped to the tensor shape the interpreter reports), so the
    result is not directly comparable to a forward pass over the whole prompt.

    Args:
        interpreter: TFLite interpreter whose tensors are already allocated.
        input_text: Prompt; tokenized with the notebook-global ``tokenizer``.
        num_runs: Number of timed invocations to average over; must be > 0.

    Returns:
        Mean duration of an invocation, in seconds.

    Raises:
        ValueError: If ``num_runs`` is not a positive number.
    """
    if num_runs <= 0:
        raise ValueError("num_runs must be a positive integer")

    # Ask the interpreter that was passed in, rather than relying on a
    # notebook-global `input_details` that may belong to another interpreter.
    details = interpreter.get_input_details()

    encoded = tokenizer(input_text, return_tensors="np", padding=True)

    # (input name, positional fallback, scalar value). The fallback positions
    # are the ones this notebook originally hard-coded; they are used only
    # when a tensor cannot be identified by name.
    feeds = (
        ("input_ids", 3, encoded["input_ids"][0][0]),
        ("attention_mask", 1, encoded["attention_mask"][0][0]),
        ("decoder_input_ids", 2, tokenizer.pad_token_id),
        ("decoder_attention_mask", 0, 1),
    )
    for key, fallback_index, value in feeds:
        detail = _find_tflite_input(details, key, fallback_index)
        # Match the dtype and shape the model declares for this input.
        tensor = np.array([[value]], dtype=detail["dtype"])
        tensor = np.reshape(tensor, detail["shape"])
        interpreter.set_tensor(detail["index"], tensor)

    # Warm-up run (not timed).
    interpreter.invoke()

    # perf_counter is a monotonic, high-resolution clock intended for timing.
    start_time = time.perf_counter()
    for _ in range(num_runs):
        interpreter.invoke()
    end_time = time.perf_counter()

    return (end_time - start_time) / num_runs

In [8]:

# Quantized model evaluation: time the TFLite interpreter on the same prompt
# and take the size of the single .tflite file.
# NOTE: evaluate_tflite_latency feeds only the first token of the prompt,
# whereas evaluate_latency feeds the whole prompt, so the two latency figures
# are not a like-for-like comparison.
quantized_latency = evaluate_tflite_latency(interpreter, input_text)
quantized_model_size = os.path.getsize(tflite_model_path)

In [9]:

# Print results. Latencies are average seconds per call (4 decimal places);
# sizes are converted from bytes by dividing by 1024 twice (2 decimal places).
print(f"Normal Model Latency: {normal_latency:.4f} seconds")
print(f"Quantized Model Latency: {quantized_latency:.4f} seconds")
print(f"Normal Model Size: {normal_model_size / 1024 / 1024:.2f} MB")
print(f"Quantized Model Size: {quantized_model_size / 1024 / 1024:.2f} MB")


Normal Model Latency: 0.4523 seconds
Quantized Model Latency: 0.0309 seconds
Normal Model Size: 1133.24 MB
Quantized Model Size: 236.85 MB
