# `Project Dependencies`

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install datasets

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
Colle

# `Project Code`

Updated project code: measures both tokenisation and detokenisation time for each tokenizer.

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset
import time

# byte_pair_tokenization = ["openai-gpt", "gpt2", "NousResearch/Llama-2-13b-hf"]
# unigram_tokenization = ["google/bigbird-roberta-base", "facebook/mbart-large-50-many-to-many-mmt" , "albert-base-v2" , "xlnet-base-cased"]
# wordpiece_tokenization = ['distilbert-base-uncased','google/mobilebert-uncased','funnel-transformer/small-base','sentence-transformers/all-mpnet-base-v2']
# sentencepiece_tokenization = ["google/flan-t5-base"]

#corpus = ["I have a new GPU!", "I wonder how fast the model will train on this.","Hello there"]

# Benchmark corpus: the "instruction" column of the train split of the
# self-instruct seed dataset.
dataset = load_dataset("HuggingFaceH4/self-instruct-seed")
corpus = dataset['train']['instruction']
# Second corpus ("prompt" column); it is loaded here but not referenced by the
# cells visible in this notebook.
dataset2 = load_dataset("fka/awesome-chatgpt-prompts")
corpus2 = dataset2['train']['prompt']

def initialize_model(model_name: str):
    """Return the tokenizer loaded via ``AutoTokenizer.from_pretrained(model_name)``.

    Despite the function name, only a tokenizer is created here; no model is loaded.
    """
    return AutoTokenizer.from_pretrained(model_name)

#Unbatched data
def time_model(tokenizer_object, text, tokenize=True):
    """Time one tokenization or one detokenization call for a single text.

    Args:
        tokenizer_object: Object exposing ``encode_plus(text)`` and
            ``decode(ids, skip_special_tokens=...)``; only these two methods
            are used here.
        text: Input handed to ``encode_plus``.
        tokenize: When True, ``encode_plus`` is the timed call. When False,
            the text is encoded first (untimed) and only ``decode`` of the
            resulting ``input_ids`` is timed.

    Returns:
        Elapsed time of the timed call in milliseconds, as a float.
    """
    # perf_counter() is monotonic and high resolution; time.time() is a wall
    # clock that can jump (e.g. NTP adjustments) and is too coarse for
    # millisecond-scale measurements on some platforms.
    if tokenize:
        start_time = time.perf_counter()
        tokenizer_object.encode_plus(text)
    else:
        # Encode before starting the clock so that only decoding is measured.
        tokens = tokenizer_object.encode_plus(text)
        start_time = time.perf_counter()
        tokenizer_object.decode(tokens['input_ids'], skip_special_tokens=True)

    end_time = time.perf_counter()
    # Convert seconds to milliseconds.
    return (end_time - start_time) * 1e3

def analyse_encode_plus(tokenizers: list, corpus: list) -> dict:
    """Benchmark per-text (unbatched) tokenization and detokenization.

    For every name in ``tokenizers`` a tokenizer is loaded with
    ``initialize_model`` and ``time_model`` is called twice per text in
    ``corpus`` (encode, then decode); the per-text times are summed.

    Returns:
        A dict holding ``'method': "Unbatched"`` plus, keyed by each
        tokenizer's ``name_or_path``, a dict with the summed
        ``"Tokenization"`` and ``"Detokenization"`` times in milliseconds.
        Tokenizers that raise are reported on stdout and left out.
    """
    results = {'method': "Unbatched"}

    for checkpoint in tokenizers:
        try:
            tokenizer = initialize_model(checkpoint)

            # Running totals (ms) over the whole corpus.
            encode_total_ms = 0
            decode_total_ms = 0
            for text in corpus:
                encode_total_ms += time_model(tokenizer, text, tokenize=True)
                decode_total_ms += time_model(tokenizer, text, tokenize=False)

            # The tokenizer's own name/path is used as the result key.
            tokenizer_name = tokenizer.name_or_path
            results[tokenizer_name] = {
                "Tokenization": encode_total_ms,
                "Detokenization": decode_total_ms,
            }

            print(f"{tokenizer_name}:")
            print(f"Tokenization Time: {encode_total_ms} ms")
            print(f"Detokenization Time: {decode_total_ms} ms")
        except Exception as e:
            # Best effort: report the failure and move on to the next tokenizer.
            print(f"Error occurred for {checkpoint}: {e}")

    return results

def time_model_batch(tokenizer, corpus,tokenize=True):
    """Time one batched tokenization or detokenization call over ``corpus``.

    Args:
        tokenizer: Object exposing ``batch_encode_plus(corpus)`` and
            ``batch_decode(ids, skip_special_tokens=...)``; only these two
            methods are used here.
        corpus: The collection of texts passed to ``batch_encode_plus`` in a
            single call.
        tokenize: When True, ``batch_encode_plus`` is the timed call. When
            False, the corpus is encoded first (untimed) and only
            ``batch_decode`` of the resulting ``input_ids`` is timed.

    Returns:
        Elapsed time of the timed call in milliseconds, as a float.
    """
    # perf_counter() is monotonic and high resolution; time.time() is a wall
    # clock that can jump and is too coarse for millisecond-scale timings on
    # some platforms.
    if tokenize:
        start_time = time.perf_counter()
        tokenizer.batch_encode_plus(corpus)
    else:
        # Encode before starting the clock so that only decoding is measured.
        tokens = tokenizer.batch_encode_plus(corpus)
        start_time = time.perf_counter()
        tokenizer.batch_decode(tokens['input_ids'], skip_special_tokens=True)

    end_time = time.perf_counter()
    # Convert seconds to milliseconds.
    return (end_time - start_time) * 1e3

def analyse_batch(tokenizers: list, corpus: list) -> dict:
    """Benchmark batched tokenization and detokenization of ``corpus``.

    For every name in ``tokenizers`` a tokenizer is loaded with
    ``initialize_model`` and ``time_model_batch`` is called once for encoding
    and once for decoding the whole corpus.

    Returns:
        A dict holding ``'method': "Batched"`` plus, keyed by each
        tokenizer's ``name_or_path``, a dict with the ``"Tokenization"`` and
        ``"Detokenization"`` times in milliseconds. Tokenizers that raise
        (including while loading) are reported on stdout and left out.
    """
    results = {}
    results['method'] = "Batched"
    for hugging_face_tokenizer in tokenizers:
        try:
            # Loading happens inside the try block so that one tokenizer that
            # fails to load is skipped instead of aborting the whole
            # benchmark (same behaviour as analyse_encode_plus).
            tokenizer = initialize_model(hugging_face_tokenizer)

            # Time for tokenizing the whole corpus in one call.
            tokenization_time = time_model_batch(tokenizer, corpus, tokenize=True)
            # Time for detokenizing the whole corpus in one call.
            detokenization_time = time_model_batch(tokenizer, corpus, tokenize=False)

            # The tokenizer's own name/path is used as the result key.
            tokenizer_name = tokenizer.name_or_path
            results[tokenizer_name] = {
                "Tokenization": tokenization_time,
                "Detokenization": detokenization_time
            }

            print(f"{tokenizer_name}:")
            print(f"Tokenization Time: {tokenization_time} ms")
            print(f"Detokenization Time: {detokenization_time} ms")
        except Exception as e:
            # Best effort: report the failure and move on to the next tokenizer.
            print(f"Error occured for {hugging_face_tokenizer}: {e}")
            continue

    return results


In [None]:
# Benchmark the byte-pair tokenizers: per-text (unbatched) timing first,
# then a single batched call over the same corpus.
byte_pair = ["openai-gpt", "gpt2", "NousResearch/Llama-2-13b-hf"]
print("--------------------------Unbatched Data-------------------------------")
byte_pair_timing_unbatched = analyse_encode_plus(byte_pair, corpus)
print("--------------------------Batched Data-------------------------------")
byte_pair_timing_batched = analyse_batch(byte_pair,corpus)

--------------------------Unbatched Data-------------------------------
openai-gpt:
Tokenization Time: 24.712324142456055 ms
Detokenization Time: 24.597883224487305 ms
gpt2:
Tokenization Time: 28.779983520507812 ms
Detokenization Time: 25.022029876708984 ms
NousResearch/Llama-2-13b-hf:
Tokenization Time: 22.885560989379883 ms
Detokenization Time: 30.135154724121094 ms
--------------------------Batched Data-------------------------------
openai-gpt:
Tokenization Time: 14.763593673706055 ms
Detokenization Time: 28.710603713989258 ms
gpt2:
Tokenization Time: 11.205911636352539 ms
Detokenization Time: 26.978492736816406 ms
NousResearch/Llama-2-13b-hf:
Tokenization Time: 10.309696197509766 ms
Detokenization Time: 25.06566047668457 ms


In [None]:
# Benchmark the word-piece tokenizers: per-text (unbatched) timing first,
# then a single batched call over the same corpus.
word_piece =['distilbert-base-uncased','google/mobilebert-uncased','funnel-transformer/small-base','sentence-transformers/all-mpnet-base-v2']
print("--------------------------Unbatched Data-------------------------------")
word_piece_timing_unbatched = analyse_encode_plus(word_piece, corpus)
print("--------------------------Batched Data-------------------------------")
word_piece_timing_batched = analyse_batch(word_piece,corpus)

--------------------------Unbatched Data-------------------------------
distilbert-base-uncased:
Tokenization Time: 24.35016632080078 ms
Detokenization Time: 31.602859497070312 ms
google/mobilebert-uncased:
Tokenization Time: 28.52153778076172 ms
Detokenization Time: 36.28993034362793 ms
funnel-transformer/small-base:
Tokenization Time: 25.632143020629883 ms
Detokenization Time: 35.263776779174805 ms
sentence-transformers/all-mpnet-base-v2:
Tokenization Time: 25.9096622467041 ms
Detokenization Time: 34.56592559814453 ms
--------------------------Batched Data-------------------------------
distilbert-base-uncased:
Tokenization Time: 13.002872467041016 ms
Detokenization Time: 33.32972526550293 ms
google/mobilebert-uncased:
Tokenization Time: 12.912988662719727 ms
Detokenization Time: 29.02817726135254 ms
funnel-transformer/small-base:
Tokenization Time: 13.851165771484375 ms
Detokenization Time: 26.905536651611328 ms
sentence-transformers/all-mpnet-base-v2:
Tokenization Time: 11.75808906

In [None]:
# Benchmark the sentence-piece tokenizer. The section banners match the other
# benchmark cells so the unbatched and batched result blocks can be told
# apart in the cell output.
sentence_piece = ["google/flan-t5-base"]
print("--------------------------Unbatched Data-------------------------------")
sentence_piece_timing_unbatched = analyse_encode_plus(sentence_piece, corpus)
print("--------------------------Batched Data-------------------------------")
sentence_piece_timing_batched = analyse_batch(sentence_piece,corpus)

google/flan-t5-base:
Tokenization Time: 29.726028442382812 ms
Detokenization Time: 30.49778938293457 ms
google/flan-t5-base:
Tokenization Time: 14.184236526489258 ms
Detokenization Time: 25.858163833618164 ms


In [None]:
# Benchmark the unigram tokenizers: per-text (unbatched) timing first,
# then a single batched call over the same corpus.
unigram = ["google/bigbird-roberta-base", "facebook/mbart-large-50-many-to-many-mmt" , "albert-base-v2" , "xlnet-base-cased"]
print("--------------------------Unbatched Data-------------------------------")
unigram_timing_unbatched = analyse_encode_plus(unigram, corpus)
print("--------------------------Batched Data-------------------------------")
unigram_timing_batched = analyse_batch(unigram,corpus)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/846k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

google/bigbird-roberta-base: 44.766902923583984 ms


Downloading (…)okenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

facebook/mbart-large-50-many-to-many-mmt: 28.438806533813477 ms


Downloading (…)lve/main/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

albert-base-v2: 26.40247344970703 ms


Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

xlnet-base-cased: 24.908065795898438 ms
google/bigbird-roberta-base: 0.12436458042689733 ms
facebook/mbart-large-50-many-to-many-mmt: 0.2441052028111049 ms
albert-base-v2: 0.2801050458635603 ms
xlnet-base-cased: 0.25938442775181364 ms


In [3]:
from transformers import M2M100ForConditionalGeneration, AutoTokenizer

def translate(corpus, language, src_lang="en"):
    """Translate every sentence of ``corpus`` with facebook/m2m100_418M.

    Args:
        corpus: Iterable of sentences; each one is tokenized, translated and
            decoded on its own.
        language: Target language code, resolved to a token id through
            ``tokenizer.get_lang_id`` (this notebook passes "zh").
        src_lang: Source language code handed to the tokenizer. Defaults to
            "en", the value that used to be hard-coded.

    Returns:
        List with one translated sentence per input sentence, in input order.

    Side effects:
        Prints each translated sentence as soon as it is produced.
    """
    # Load the pre-trained tokenizer for the requested source language.
    tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M", src_lang=src_lang)

    # The target-language id does not change between sentences, so resolve it
    # once. Doing it before the model is loaded also surfaces an unsupported
    # language code before the large model weights are fetched.
    target_lang_id = tokenizer.get_lang_id(language)

    model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

    translated_corpus = []
    for sentence in corpus:
        # Tokenize and encode the source sentence.
        encoded_source = tokenizer(sentence, return_tensors="pt")

        # Generate the translation; the forced first token selects the target language.
        generated_tokens = model.generate(**encoded_source, forced_bos_token_id=target_lang_id)

        # Decode the generated tokens back to text.
        translated_sentence = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

        print(translated_sentence)
        translated_corpus.append(translated_sentence)

    return translated_corpus

# Translate every entry of `corpus` with target language code "zh".
chinese_corpus = translate(corpus,"zh")


# # decodes the translated tokens
# zh_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

['有什么我可以吃的早餐不包含鸡蛋,但包含蛋白质,大约有700至1000卡路里?', '这些夫妇之间的关系是什么?', '为下列各人创建一个单词描述。', '描述某种情况,在某种情况下,某种类型可能会伤害你。', '为以下电子邮件创建适当的主观标题:', '你如何在工作面试中回答这个问题?', 'Brainstorm 列出可能的新年决议。', '向我解释下面的字符,并试着给我一些例子。', '创建一个haiku 使用以下单词:', '建议我在周末观看电影,并解释为什么。', '建议一些游戏,可以由一个群体的人玩。', '为健康的食物制作一份菜单。', '解释人类的行为。', '取决于句子 输出取决于行,每个行格式为(当前词 ->其头词)。', '使用所提供的事实,写一个问题答案对。', '要让夫妇有相同的类似性,写下第四个字。', '考虑到症状的描述,确定可能的疾病,并建议一些药物。', '创建生日计划检查列表。', '回复电子邮件,拒绝邀请。', '在您的观点上,您将如何解释该主题与没有背景的人的关系?', '你需要写一个创意的开幕场景一个恐怖电影。', '为孩子们创造一个有趣的数学问题。', '考虑到一个数字集,找到所有可能的子集,以计算到一个数字。', '请给我下面的食物的菜谱。', '计划一个每周的午餐菜单给一个学校 写下一个主要菜肴,一个碳水化合物侧菜,一个蔬菜侧菜,和甜点为每一天。', '找到最小的四个完美的数字。', '将句子中的所有实体链接到一个维基百科页面. 对于每个实体,你应该输出维基百科页面标题,或者输出没有,如果你不知道。', '你比大多数红头更聪明吗?', '在此函数中添加评论,使代码对人类更容易阅读。', '考虑到事实,把它们组成一个一致和迷人的故事。', '用柔软的纺织制作一份盐的菜肴列表。', '建议完成下面的Python代码。', '考虑到一个练习的名字,解释如何做到这一点。', '创建一个关于吃饭和烹饪习惯的调查。', '我和我的女朋友分手,我感到如此悲伤,你能给我一些建议吗?', '解决方程式并找到 X 的值 显示您的步骤。', '写下代表所提供的宗教的标志。', '使用评论作为指南,将代码中的“一切”评论替换为正确的代码。', '让我们来谈谈这个词的同义词。', '在段落中提取所有国家名称,并列出它们由commas分开。', 

In [4]:
# Persist the translated sentences as UTF-8 text, one sentence per line.
with open("translated_corpus.txt", "w", encoding="utf-8") as file:
    file.writelines(translation + "\n" for translation in chinese_corpus)

print("Translations saved to 'translated_corpus.txt'.")

Translations saved to 'translated_corpus.txt'.


In [None]:
# Print every collected timing dictionary, unbatched before batched for each
# tokenizer family.
for timing_results in (
    byte_pair_timing_unbatched,
    byte_pair_timing_batched,
    word_piece_timing_unbatched,
    word_piece_timing_batched,
    sentence_piece_timing_unbatched,
    sentence_piece_timing_batched,
    unigram_timing_unbatched,
    unigram_timing_batched,
):
    print(timing_results)

{'method': 'Unbatched', 'openai-gpt': {'Tokenization': 24.712324142456055, 'Detokenization': 24.597883224487305}, 'gpt2': {'Tokenization': 28.779983520507812, 'Detokenization': 25.022029876708984}, 'NousResearch/Llama-2-13b-hf': {'Tokenization': 22.885560989379883, 'Detokenization': 30.135154724121094}}
{'method': 'Batched', 'openai-gpt': {'Tokenization': 14.763593673706055, 'Detokenization': 28.710603713989258}, 'gpt2': {'Tokenization': 11.205911636352539, 'Detokenization': 26.978492736816406}, 'NousResearch/Llama-2-13b-hf': {'Tokenization': 10.309696197509766, 'Detokenization': 25.06566047668457}}
{'method': 'Unbatched', 'distilbert-base-uncased': {'Tokenization': 24.35016632080078, 'Detokenization': 31.602859497070312}, 'google/mobilebert-uncased': {'Tokenization': 28.52153778076172, 'Detokenization': 36.28993034362793}, 'funnel-transformer/small-base': {'Tokenization': 25.632143020629883, 'Detokenization': 35.263776779174805}, 'sentence-transformers/all-mpnet-base-v2': {'Tokenizati