In [3]:
# read first 10 lines of the csv file

"""
with open('en-fr.csv',encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for i in range(20):
        next(reader.split(','))
"""

"\nwith open('en-fr.csv',encoding='utf-8') as csvfile:\n    reader = csv.reader(csvfile)\n    for i in range(20):\n        next(reader.split(','))\n"

In [65]:
# parse the whole csv file and store all unique english words in a set

import csv
import re

split_pattern = r"([\$\t,;:?.!'\"%-])"

unique_words_en = set()
unique_words_fr= set()

with open('eng_-french.csv', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for count,row in enumerate(reader):
        row[0] = re.sub(split_pattern, r' \1 ', row[0]).strip().lower()
        row[1] = re.sub(split_pattern, r' \1 ', row[1]).strip().lower()
        for word in row[0].split():
            unique_words_en.add(word)
        for word in row[1].split():
            unique_words_fr.add(word)
        if count %100000==0 :
            print(count) # print the count of rows processed    
print(f"Total unique English words: {len(unique_words_en)}")
print(f"Total unique French words: {len(unique_words_fr)}")

0
100000
Total unique English words: 14145
Total unique French words: 24517


In [66]:
# save the file english tokens to disk
with open('unique_words_en.txt', 'w', encoding='utf-8') as f:
    for word in unique_words_en:
        f.write(word + '\n')

# save the file french tokens to disk
with open('unique_words_fr.txt', 'w', encoding='utf-8') as f:
    for word in unique_words_fr:
        f.write(word + '\n')

In [56]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

import csv
# Step 1: Initialize the BPE Tokenizer
tokenizer = Tokenizer(models.BPE())  # Using Byte Pair Encoding (BPE) model

# Step 2: Pre-tokenization and special tokens handling
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel() # Split by spaces

# Step 3: Setup decoder to decode tokens back to text
tokenizer.decoder = decoders.ByteLevel()

# Convert dataset into a list of file-like objects for the trainer
with open('eng_-french.csv','r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for count,row in enumerate(reader):
        with open("sentences_en.txt", "a", encoding='utf-8') as f1:
            f1.write(row[0] + "\n")
        with open("sentences_fr.txt", "a", encoding='utf-8') as f2:
            f2.write(row[1] + "\n")


#Step 5: Setup the BPE Trainer
trainer = trainers.BpeTrainer(
    vocab_size=3000,  # You can set the vocabulary size
    min_frequency=2,  # Minimum frequency of tokens to be included in vocab
    special_tokens=[
        "[BOS]",  # Beginning of sequence
        "[EOS]",  # End of sequence
        "[PAD]",  # Padding
    ]
)

# Step 6: Train the tokenizer on the dataset
tokenizer.train(files=["sentences_en.txt","sentences_fr.txt"], trainer=trainer)

# Step 7: Test the tokenizer
encoded_output = tokenizer.encode("Hello my name is Adnane")
print(f"Tokens: {encoded_output.tokens}")
print(f"Token IDs: {encoded_output.ids}")

# Step 8: Decode back the tokens into the original sentence
decoded_output = tokenizer.decode(encoded_output.ids)
print(f"Decoded sentence: {decoded_output}")

# Step 9: Save the tokenizer to a file (for future use)
tokenizer.save("bpe_tokenizer.json")





Tokens: ['ĠHe', 'll', 'o', 'Ġmy', 'Ġname', 'Ġis', 'ĠA', 'd', 'n', 'an', 'e']
Token IDs: [282, 148, 69, 343, 1567, 216, 263, 58, 68, 139, 59]
Decoded sentence:  Hello my name is Adnane


In [4]:
# Step 7: Test the tokenizer
encoded_output = tokenizer.encode("Hello my name is Adnane")
print(f"Tokens: {encoded_output.tokens}")
print(f"Token IDs: {encoded_output.ids}")

# Step 8: Decode back the tokens into the original sentence
decoded_output = tokenizer.decode(encoded_output.ids)
print(f"Decoded sentence: {decoded_output}")

# Step 9: Save the tokenizer to a file (for future use)
tokenizer.save("bpe_tokenizer.json")

Tokens: ['He', 'll', 'o', 'my', 'name', 'is', 'A', 'd', 'n', 'an', 'e']
Token IDs: [254, 158, 73, 309, 1416, 131, 33, 62, 72, 129, 63]
Decoded sentence: HellomynameisAdnane


In [8]:
# Load the tokenizer from the saved file
loaded_tokenizer = Tokenizer.from_file("bpe_tokenizer.json")

# Print all the tokens in the tokenizer
print(loaded_tokenizer.get_vocab().keys())

dict_keys(['or', 'prof', 'say', 'sol', 'soon', 'termin', 'ut', 'professeur', 'ence', 'right', 'while', 'midi', 'che', 'v', 'poli', 'prison', 'ake', 'roit', 'partie', 'hô', 'amie', 'alk', 'iture', 'père', 'pass', 'ic', 'uit', 'danger', 'Pen', 'hier', 'déjà', 'erreur', 'journ', 'neig', 'sant', 'Regar', '6', 'jam', 'pend', 'cri', 'ure', 'contre', 'where', 'If', 'espère', 'peuvent', 'inte', 'drai', 'ill', 'as', 'bras', 'ally', 'faim', 'pos', 'avons', 'saw', 'situation', '’', 'se', 'rester', 'out', 'tant', '2', 'ped', 'hap', 'good', 'sister', 'Where', 'isn', 'suc', 'ried', 'travaille', 'dents', 'f', 'este', 'elles', 'don', 'travai', '[SEP]', 'endroit', 'même', 'restaur', 'inté', 'lear', 'ir', 'w', 'Si', 'all', 'sleep', 'gla', 'plus', 'fenêtre', 'Elles', 'pian', 'aimer', 'give', 'sortir', 'pays', 'adore', 'ét', 'U', 'table', 'maison', 'eux', 'compte', 'difficile', 'Ces', 'tout', 'cen', 'ue', 'posé', 'comprendre', 'ni', 'nat', 'secret', 'enten', 'u', 'ille', 'sois', 'bout', 'fore', 'understan