<a href="https://colab.research.google.com/github/Yapping72/ICT3102-e-mc2-assignment-1/blob/main/ICT3102_Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# `Project Dependencies`

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install datasets

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/1a/06/3817f9bb923437ead9a794f0ac0d03b8b5e0478ab112db4c413dd37c09da/transformers-4.33.2-py3-none-any.whl.metadata
  Downloading transformers-4.33.2-py3-none-any.whl.metadata (119 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock (from transformers)
  Obtaining dependency information for filelock from https://files.pythonhosted.org/packages/5e/5d/97afbafd9d584ff1b45fcb354a479a3609bd97f912f8f1f6c563cb1fae21/filelock-3.12.4-py3-none-any.whl.metadata
  Downloading filelock-3.12.4-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.15.1 from https://files.pythonhosted.org/packages/72/21/51cddb8850ed3f4dbc21e57c3dabc49e64d5577857ddda7b2eb0ffc2ec0e/huggingfac

# `Project Code`

In [18]:
from transformers import AutoTokenizer
from datasets import load_dataset

# byte_pair_tokenization = ["openai-gpt", "gpt2", "NousResearch/Llama-2-13b-hf"]
# unigram_tokenization = ["google/bigbird-roberta-base", "facebook/mbart-large-50-many-to-many-mmt" , "albert-base-v2" , "xlnet-base-cased"]
# wordpiece_tokenization = ['distilbert-base-uncased','google/mobilebert-uncased','funnel-transformer/small-base','sentence-transformers/all-mpnet-base-v2']
# sentencepiece_tokenization = ["google/flan-t5-base"]

corpus = ["I have a new GPU!", "I wonder how fast the model will train on this.", "Car park there"]


dataset = load_dataset("HuggingFaceH4/self-instruct-seed")
# corpus = dataset['train']['instruction']

def initialize_model(model_name:str):
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  return tokenizer

def time_model(tokenizer_object, corpus):
    """Calculate timing for encode_plus"""
    # Capture the timeit result
    total_time = 0
    for text in corpus:
        # Capture the timeit result for each text
        timeit_result = %timeit -n 0 -r 1 -o tokenizer.encode_plus(text)
        total_time += timeit_result.average
    # Return the average time in milliseconds
    return total_time * 1e3

def time_model_batch(tokenizer, corpus):
    """Calculate timing for batch_encode_plus"""
    # Capture the timeit result
    timeit_result = %timeit -n 0 -r 1 -o tokenizer.batch_encode_plus(corpus)
    # Return the average time in milliseconds
    return timeit_result.average * 1e3

def analyse_encode_plus(tokenizers: list, corpus: list) -> dict:
    results = {}
    results['method'] = "Unbatched"
    for hugging_face_tokenizer in tokenizers:
        try:
          tokenizer = initialize_model(hugging_face_tokenizer)
          formattedCorpus = tokenizer(corpus, truncation=True)
          # print(formattedCorpus)
          average_time = time_model(tokenizer, formattedCorpus)

          # Extract tokenizer name or path for dictionary key
          tokenizer_name = tokenizer.name_or_path
          results[tokenizer_name] = average_time
        except Exception as e:
          print(f"Error occured for {hugging_face_tokenizer}: {e}")
          continue

    return results

def analyse_batch(tokenizers: list, corpus: list) -> dict:
    results = {}
    results['method'] = "Batched"
    for hugging_face_tokenizer in tokenizers:
        try:
          tokenizer = initialize_model(hugging_face_tokenizer)
          average_time = time_model_batch(tokenizer, corpus)

          # Extract tokenizer name or path for dictionary key
          tokenizer_name = tokenizer.name_or_path
          results[tokenizer_name] = average_time
        except Exception as e:
          print(f"Error occured for {hugging_face_tokenizer}: {e}")
          continue

    return results

In [19]:
byte_pair = ["openai-gpt", "gpt2", "NousResearch/Llama-2-13b-hf"]
byte_pair_timing_unbatched = analyse_encode_plus(byte_pair, corpus)
byte_pair_timing_batched = analyse_batch(byte_pair,corpus)

69.8 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 10000 loops each)
63.9 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 10000 loops each)
34.9 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 10000 loops each)
38 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 10000 loops each)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


36.4 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 10000 loops each)
67.4 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 10000 loops each)
197 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1000 loops each)
262 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1000 loops each)
224 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1000 loops each)


In [11]:
# Get response time for word_piece models
word_piece =['distilbert-base-uncased','google/mobilebert-uncased','funnel-transformer/small-base','sentence-transformers/all-mpnet-base-v2']
word_piece_timing_unbatched = analyse_encode_plus(word_piece, corpus)
word_piece_timing_batched = analyse_batch(word_piece,corpus)

137 µs ± 33.2 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
101 µs ± 10.6 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
237 µs ± 49.3 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
176 µs ± 17.8 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
189 µs ± 10.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
205 µs ± 18.6 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
193 µs ± 12 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
179 µs ± 11 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [12]:
sentence_piece = ["google/flan-t5-base"]
sentence_piece_timing_unbatched = analyse_encode_plus(sentence_piece, corpus)
sentence_piece_timing_batched = analyse_batch(sentence_piece,corpus)

88.8 µs ± 26.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
156 µs ± 11 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [13]:
unigram = ["google/bigbird-roberta-base", "facebook/mbart-large-50-many-to-many-mmt" , "albert-base-v2" , "xlnet-base-cased"]
unigram_timing_unbatched = analyse_encode_plus(unigram, corpus)
unigram_timing_batched = analyse_batch(unigram,corpus)

79.3 µs ± 16.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
74.2 µs ± 1.01 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
132 µs ± 38.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
105 µs ± 26.9 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
143 µs ± 2.98 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
156 µs ± 4.26 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
200 µs ± 11.8 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
188 µs ± 12.6 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [22]:
print(byte_pair_timing_unbatched)
print(byte_pair_timing_batched)
print(word_piece_timing_unbatched)
print(word_piece_timing_batched)
print(sentence_piece_timing_unbatched)
print(sentence_piece_timing_batched)
print(unigram_timing_unbatched)
print(unigram_timing_batched)

{'method': 'Unbatched', 'openai-gpt': 0.13366224120002243, 'gpt2': 0.07297341069997855, 'NousResearch/Llama-2-13b-hf': 0.10379259730000286}
{'method': 'Batched', 'openai-gpt': 0.19661517999998068, 'gpt2': 0.26245640100000855, 'NousResearch/Llama-2-13b-hf': 0.22394624000003205}
