In [None]:
import os
import shutil
from typing import Optional

class base_bpe_tokenizer:
    """
        Base class for byte-pair encoding (BPE) tokenizers.

        What this class does itself:
            1. stores the constructor arguments on the instance
            2. validates / prepares the on-disk folder that holds the tokenizer dictionaries

        What is expected elsewhere (not defined in this class body - presumably in a child class):
            1. training tokenization - compress a given token sequence with the byte-pair encoding algorithm
                details at https://en.wikipedia.org/wiki/Byte_pair_encoding
            2. encode and decode functions
            3. updating or retraining the tokenization based on given text
            4. saving the dictionaries into the folder prepared here

        Args:
            text: str or None, actual text, or None if the user uses inference mode only
            title: str, name of the text the tokenizer is trained on - used to name the folder,
                the vocab_dict file and the merge_history file
            mode: str, "train" or "infer", default = "infer". Only the value "infer" is checked
                explicitly; every other value is handled as training mode.
            retrain: bool, whether to retrain the tokenizer from scratch, default = False.
                Only honoured in training mode.

        Raises:
            FileNotFoundError: in "infer" mode when the two dictionary files are not both present.

        Folder/Title Naming Convention:
            "book_title_tok_folder" - all lower case, connected with underscore

        Dictionary File Naming Convention:
            "book_title_vocab_dict.pkl"
            "book_title_merge_history.pkl"
    """

    def __init__(self, text: Optional[str], title: str, mode: str = "infer", retrain: bool = False) -> None:
        self.mode = mode
        self.title = title
        self.text = text
        self.retrain = retrain
        # Runs filesystem checks right away: may create/delete files or raise FileNotFoundError.
        self._inspect_dict_folder()

    def _inspect_dict_folder(self) -> None:
        """
            Check (and in training mode prepare) the dictionary folder "<title>_tok_folder".

            The folder path is relative, so it is resolved against the current working directory.

            - both dictionary files exist:
                training mode with retrain=True  -> delete both files (the folder is kept)
                anything else                    -> leave everything untouched
            - at least one dictionary file is missing:
                mode = "infer" -> raise FileNotFoundError
                otherwise      -> wipe the folder if it exists and create a fresh, empty one

            Raises:
                FileNotFoundError: mode is "infer" and the trained dictionaries are not both on disk.
        """
        folder = self.title + "_tok_folder"
        vocab_path = os.path.join(folder, self.title + "_vocab_dict.pkl")
        merge_path = os.path.join(folder, self.title + "_merge_history.pkl")
        inferring = self.mode == "infer"

        if os.path.exists(vocab_path) and os.path.exists(merge_path):
            # Never delete trained dictionaries from an inference-only instance:
            # doing so would destroy exactly the files inference needs to load.
            if self.retrain and not inferring:
                os.remove(vocab_path)
                os.remove(merge_path)
            return

        if inferring:
            raise FileNotFoundError(f"cannot find tokenization trained with text from {self.title}")

        # Training without a complete dictionary pair: start from a clean folder so a
        # leftover partial file cannot be mistaken for a trained tokenizer later.
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.makedirs(folder)

            





