In [1]:
import pandas as pd
import numpy  as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from typing_extensions import Sequence
import os
import json
import typing
from tqdm import tqdm

class CustomTokenizer:
  """Tokenizer that converts text into integer id sequences and back.

  A line is split either into single characters (``char_level=True``) or into
  word pieces, where every filter character (and every digit when
  ``filter_nums`` is set) becomes a token of its own and the ``split``
  separator is kept as a token between the pieces.

  Ids are assigned by ``sort()``: tokens are ordered with ``sorted()`` and
  numbered consecutively beginning at ``start``. ``detokenizer`` treats id 0
  as padding, so with ``start=0`` the first sorted token would be read as
  padding.

  Args:
    split: Separator used to cut a line into pieces in word-level mode.
    char_level: When True, every character of the line is one token.
    lower: When True, lines are lower-cased before tokenizing.
    start_token: Token prepended by ``text_to_sequences``.
    end_token: Token appended by ``text_to_sequences``.
    filters: Characters that become stand-alone tokens. ``None`` selects a
      private copy of the built-in punctuation/whitespace list.
    filter_nums: When True, digits are also emitted as stand-alone tokens.
    start: Id given to the first token in sorted order.
  """

  # Built-in filter characters; kept immutable so instances never share a list.
  _DEFAULT_FILTERS = (
      '!', "'", '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.',
      '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
      '{', '|', '}', '~', '\t', '\n',
  )

  def __init__(self,
               split: str = " ",
               char_level: bool = False,
               lower: bool = True,
               start_token: str = "<start>",
               end_token: str = "<end>",
               filters: typing.Optional[typing.List[str]] = None,
               filter_nums: bool = True,
               start: int = 1
               ):
    self.split = split
    self.char_level = char_level
    self.lower = lower
    self.start_token = start_token
    self.end_token = end_token
    # Copy so neither the default nor a caller-owned list is shared.
    self.filters = list(self._DEFAULT_FILTERS) if filters is None else list(filters)
    self.filter_nums = filter_nums
    self.start = start
    self.max_length = 0
    self.word_index = {}
    self.index_word = {}

  def start_token_index(self):
    """Return the id of the start token (KeyError if it is not in the vocabulary)."""
    return self.word_index[self.start_token]

  def end_token_index(self):
    """Return the id of the end token (KeyError if it is not in the vocabulary)."""
    return self.word_index[self.end_token]

  def sort(self):
    """Rebuild ``word_index`` and ``index_word`` with ids in sorted token order.

    Ids are consecutive and begin at ``self.start``. Every id is reassigned,
    so ids of existing tokens can change when new tokens have been added.
    """
    all_words = sorted(self.word_index.keys())

    self.word_index = {}
    self.index_word = {}
    for current_index, word in enumerate(all_words, start=self.start):
      self.word_index[word] = current_index
      self.index_word[current_index] = word

  def split_line(self, line: str) -> typing.List[str]:
    """Split one line into tokens.

    Args:
      line: Text to tokenize.

    Returns:
      The list of tokens. In character mode this is every character of the
      (optionally lower-cased) line. In word mode each piece between
      separators is cut at filter characters / digits, which are emitted as
      their own tokens, and the separator is emitted between pieces. Empty
      tokens are never returned.
    """
    if self.lower:
      line = line.lower()

    if self.char_level:
      return list(line)

    filter_chars = set(self.filters)
    pieces = line.split(self.split)
    last_piece = len(pieces) - 1

    tokens = []
    for piece_index, piece in enumerate(pieces):
      current = ''
      for char in piece:
        if char in filter_chars or (self.filter_nums and char.isdigit()):
          # Flush the word collected so far, then emit the filter char alone.
          if current:
            tokens.append(current)
            current = ''
          tokens.append(char)
        else:
          current += char
      if current:
        tokens.append(current)

      # Re-insert the separator once per gap between pieces.
      if piece_index != last_piece:
        tokens.append(self.split)

    return tokens

  def fit_on_texts(self, text: typing.List[str]):
    """Build the vocabulary from scratch from a list of lines.

    The vocabulary always contains the start token, end token, separator and
    all filter characters, plus every token found in ``text``.
    ``max_length`` is raised to the longest line's token count + 2 (room for
    the start and end tokens); it is not reset by this call.

    Args:
      text: Lines to fit the tokenizer on.
    """
    base_tokens = [self.start_token, self.end_token, self.split] + list(self.filters)
    # The ids stored here are placeholders; sort() assigns the final ids.
    self.word_index = {word: index for index, word in enumerate(base_tokens)}

    for line in tqdm(text, desc="fitting tokenizer"):
      line_tokens = self.split_line(line)
      self.max_length = max(self.max_length, len(line_tokens) + 2)
      for token in line_tokens:
        if token not in self.word_index:
          self.word_index[token] = len(self.word_index)

    self.sort()

  def update(self, lines: typing.List[str]):
    """Add the tokens of new lines to the existing vocabulary.

    Updates ``word_index``, ``index_word`` and ``max_length`` and prints the
    number of tokens added. Because ``sort()`` renumbers the whole
    vocabulary, ids of previously known tokens may change.

    Args:
      lines: Lines of text to update the tokenizer with.
    """
    new_tokens = 0
    for line in tqdm(lines, desc="Updating tokenizer"):
      line_tokens = self.split_line(line)
      # +2 for start and end tokens
      self.max_length = max(self.max_length, len(line_tokens) + 2)
      for token in line_tokens:
        if token not in self.word_index:
          self.word_index[token] = len(self.word_index)
          new_tokens += 1

    self.sort()
    print(f"Added {new_tokens} new tokens")

  def detokenizer(self, sequences: typing.List[typing.List[int]], remove_start_end: bool = True):
    """Convert id sequences back into text.

    Args:
      sequences: List of id sequences.
      remove_start_end: When True, start and end token ids are skipped.

    Returns:
      One string per sequence, built by concatenating the tokens. Decoding of
      a sequence stops at the first id 0 (treated as padding).

    Raises:
      KeyError: If an id is not present in ``index_word``.
    """
    lines = []
    for sequence in sequences:
      pieces = []
      for token in sequence:
        if token == 0:
          break
        if remove_start_end and (token == self.start_token_index() or token == self.end_token_index()):
          continue
        pieces.append(self.index_word[token])
      lines.append("".join(pieces))
    return lines

  def text_to_sequences(self, text: typing.List[str], add_start_end: bool = True):
    """Convert lines of text into id sequences.

    Tokens that are not in the vocabulary are dropped silently.

    Args:
      text: Lines to encode.
      add_start_end: When True, wrap each sequence in the start/end token ids.

    Returns:
      A list with one list of ids per input line.
    """
    sequences = []
    for line in text:
      token_ids = [self.word_index[word] for word in self.split_line(line) if word in self.word_index]
      if add_start_end:
        token_ids = [self.word_index[self.start_token]] + token_ids + [self.word_index[self.end_token]]
      sequences.append(token_ids)
    return sequences

  def save(self, path: str, type: str = "json"):
    """Save the tokenizer to a file.

    Args:
      path: Path to save the tokenizer to; missing parent directories are created.
      type: File type. Only "json" is supported. Defaults to "json".

    Raises:
      ValueError: If ``type`` is not "json" (previously nothing was written).
    """
    if type != "json":
      raise ValueError(f"Unsupported tokenizer file type: {type!r}")

    directory = os.path.dirname(path)
    if directory:
      os.makedirs(directory, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
      json.dump(self.dict(), f)

  def dict(self) -> typing.Dict[str, typing.Any]:
    """Return the tokenizer settings and ``index_word`` as a dictionary.

    ``word_index`` is not included; ``load`` rebuilds it from ``index_word``.
    """
    return {
        "split": self.split,
        "lower": self.lower,
        "char_level": self.char_level,
        "index_word": self.index_word,
        "max_length": self.max_length,
        "start_token": self.start_token,
        "end_token": self.end_token,
        "filters": self.filters,
        "filter_nums": self.filter_nums,
        "start": self.start
    }

  @staticmethod
  def load(path: typing.Union[str, typing.Dict[str, typing.Any]], type: str = "json"):
    """Load a tokenizer from a file or from a dictionary.

    Args:
      path: Path of a file written by ``save``, or a dictionary shaped like
        the result of ``dict()``.
      type: File type when ``path`` is a string. Only "json" is supported.

    Returns:
      CustomTokenizer: The restored tokenizer.

    Raises:
      ValueError: If ``path`` is a string and ``type`` is not "json".
      TypeError: If ``path`` is neither a string nor a dictionary.
    """
    if isinstance(path, dict):
      load_dict = path
    elif isinstance(path, str):
      if type != "json":
        raise ValueError(f"Unsupported tokenizer file type: {type!r}")
      with open(path, "r", encoding="utf-8") as f:
        load_dict = json.load(f)
    else:
      # `type` is shadowed by the parameter, so use __class__ for the message.
      raise TypeError(f"path must be a str or dict, got {path.__class__.__name__}")

    tokenizer = CustomTokenizer()
    tokenizer.split = load_dict["split"]
    tokenizer.lower = load_dict["lower"]
    tokenizer.char_level = load_dict["char_level"]
    # JSON turns the integer keys into strings; convert them back.
    tokenizer.index_word = {int(k): v for k, v in load_dict["index_word"].items()}
    tokenizer.max_length = load_dict["max_length"]
    tokenizer.start_token = load_dict["start_token"]
    tokenizer.end_token = load_dict["end_token"]
    tokenizer.filters = list(load_dict["filters"])
    tokenizer.filter_nums = bool(load_dict["filter_nums"])
    tokenizer.start = load_dict["start"]
    # Reconstruct word_index from the loaded index_word
    tokenizer.word_index = {v: k for k, v in tokenizer.index_word.items()}

    return tokenizer

  def __len__(self):
    """Return the number of tokens in the vocabulary."""
    return len(self.index_word)




In [3]:
import zipfile
zip_path='/content/eng_-french.csv.zip'
# Extract into the same absolute directory that the following cells list and
# read from, so the notebook does not depend on the current working directory.
extract_dir='/content/tokenizer_eng_to_french'
with zipfile.ZipFile(zip_path,'r') as zip_ref:
  zip_ref.extractall(extract_dir)

In [4]:
import os
# Print the name of every entry in the extraction directory to confirm what
# the archive contained.
for i in os.listdir('/content/tokenizer_eng_to_french'):
  print(i)

eng_-french.csv


In [5]:
# Load the extracted CSV of English/French sentence pairs into a DataFrame.
df=pd.read_csv('/content/tokenizer_eng_to_french/eng_-french.csv')

In [6]:
# Keep only the first 500 rows; the tokenizers below are fitted on this subset.
new_df=df.head(500)

In [12]:
# Preview five randomly chosen rows; no random_state is passed, so the rows
# displayed differ from run to run.
new_df.sample(5)

Unnamed: 0,English words/sentences,French words/sentences
264,Get lost!,Va voir ailleurs si j'y suis !
366,I'm safe.,Je suis en sécurité.
28,Cheers!,Merci !
282,He tries.,Il essaye.
21,I won!,Je l'ai emporté !


In [None]:
# English column of the full DataFrame (not the 500-row subset).
# NOTE(review): this cell has no execution count and `train_eng` is not used
# by any later cell visible here - confirm whether it is still needed.
train_eng=df['English words/sentences']

In [8]:
# English sentences of the 500-row subset as a plain Python list.
eng_train=new_df['English words/sentences'].tolist()

In [9]:
# French sentences of the 500-row subset as a plain Python list.
fre_train=new_df['French words/sentences'].tolist()

In [10]:
# Fit a character-level tokenizer on the English sentences and save it as
# JSON in the current working directory.
tokenize = CustomTokenizer(char_level=True)
tokenize.fit_on_texts(eng_train)
tokenize.save("tokenizer_eng.json")

fitting tokenizer: 100%|██████████| 500/500 [00:00<00:00, 606463.85it/s]


In [11]:
# Fit a separate character-level tokenizer on the French sentences and save
# it as JSON in the current working directory.
tokenize1=CustomTokenizer(char_level=True)
tokenize1.fit_on_texts(fre_train)
tokenize1.save("tokenizer_fre.json")

fitting tokenizer: 100%|██████████| 500/500 [00:00<00:00, 208216.04it/s]


In [13]:
# Round-trip check with the English tokenizer: encode two sentences to id
# sequences (start/end ids are added by default) and print them.
tokenize_sen=tokenize.text_to_sequences(["Hang on."])[0]
tokenize_sen2=tokenize.text_to_sequences(["Get lost!"])[0]
print(tokenize_sen)
print(tokenize_sen2)

# Decode the ids back to text, keeping the start/end tokens so they are
# visible in the printed result.
detokinze_sen=tokenize.detokenizer([tokenize_sen],remove_start_end=False)
detokinze_sen2=tokenize.detokenizer([tokenize_sen2],remove_start_end=False)
print(detokinze_sen)
print(detokinze_sen2)



[25, 43, 36, 49, 42, 3, 50, 49, 17, 24]
[25, 42, 40, 55, 3, 47, 50, 54, 55, 4, 24]
['<start>hang on.<end>']
['<start>get lost!<end>']
