In [None]:
import os
import shutil
from typing import Optional

class ConfigError(Exception):
    """Raised when the tokenizer class is given an invalid user configuration."""

class BaseBPETokenizer:

    """
        Base class for byte-pair encoding (BPE) tokenization.

        Intended feature set of the tokenizer family:
            1. training tokenization - compress given token sequence based on the byte-pair encoding algorithm, - use child class
                details at https://en.wikipedia.org/wiki/Byte_pair_encoding#:~:text=Byte%20pair%20encoding%20(also%20known,for%20use%20in%20downstream%20modeling.
            2. provide encode and decode functions
            3. abilities to update or retrain tokenization based on given text
            4. save the dictionaries to a directory and named after a unique name given based on the text training one

        This base class itself only validates the configuration, prepares the
        dictionary folder and provides the shared pair-counting / pair-merging
        helpers used by the BPE algorithm.

        Args:
            text: str, actual text or None if user uses inference mode only
            title: str, name of the text file user wishes to train the tokenizer on - will be used to name the vocab_dict and merge_history
            mode: str, takes value of "train" or "infer" - train allows training a tokenizer from scratch or keep training a tokenizer (must use the same text, this is when user wishes to increase the vocab_size)
            retrain: bool, whether to retrain the tokenizer, default = False; only honoured in "train" mode

        Raises:
            ConfigError: if mode is not "train"/"infer", or if mode is "train" and text is None
            FileNotFoundError: if mode is "infer" and the saved dictionaries for `title` cannot be found

        Folder/Title Naming Convention:
            "book_title_tok_folder" - all lower case, connected with underscore

        Dictionary File Naming Convention:
            "book_title_vocab_dict.pkl"
            "book_title_merge_history.pkl"
    """

    def __init__(self, text: Optional[str], title: str, mode: str = "infer", retrain: bool = False):
        # validate the whole configuration before touching any state or the file system
        if mode not in ("train", "infer"):
            raise ConfigError("Entered invalid mode, must be one of 'train' or 'infer'")
        if text is None and mode == "train":
            raise ConfigError("Text not provided for training")
        self.mode = mode
        self.title = title
        self.retrain = retrain
        # kept so that _call_subclass can build a subclass with the same configuration
        self._text = text
        self._inspect_dict_folder()
        # additional initialization for training mode
        if self.mode == "train":
            # one vocabulary entry per possible byte value
            self._base_vocab = {i: bytes([i]) for i in range(256)}
            # the text as a list of its UTF-8 byte values (ints in 0..255)
            self._init_tokens = list(text.encode("utf-8"))

    def _create_folder(self, folder_name):
        """
            create an empty folder for the dictionaries produced by training

            NOTE: an already existing folder of the same name is deleted with
            everything inside it before the new, empty one is created
        """
        if os.path.exists(folder_name):
            shutil.rmtree(folder_name)
        os.makedirs(folder_name)

    def _inspect_dict_folder(self):
        """
            check the dictionary folder against the requested mode

            both dictionary files exist:
                mode = "train" and retrain=True -> remove both files so training starts fresh
                otherwise -> leave them untouched (retrain is ignored in "infer" mode,
                             inference must never delete the dictionaries it needs)
            at least one dictionary file is missing:
                mode = "infer" -> raise FileNotFoundError
                mode = "train" -> (re)create an empty folder
        """
        folder = self.title + "_tok_folder"
        vocab_file = os.path.join(folder, self.title + "_vocab_dict.pkl")
        merge_file = os.path.join(folder, self.title + "_merge_history.pkl")
        if os.path.exists(vocab_file) and os.path.exists(merge_file):
            if self.retrain and self.mode == "train":
                os.remove(vocab_file)
                os.remove(merge_file)
            return
        if self.mode == "infer":
            raise FileNotFoundError(f"cannot find tokenization trained with text from {self.title}")
        self._create_folder(folder)

    def _get_pair_counts(self, tokens):
        """
            traverse through the entire encoded text, produce a dictionary with paired occurrences of adjacent tokens
                key: token pairs, e.g., (106, 32)
                value: counts of occurrence of key, e.g., 300
                meaning: token pair (106, 32) occurred 300 times in the text

            returns an empty dictionary when there are fewer than two tokens
        """
        count_dict = {}
        # zip stops at the shorter sequence, so pairing tokens with tokens[1:]
        # yields exactly the adjacent pairs
        for pair in zip(tokens, tokens[1:]):
            count_dict[pair] = count_dict.get(pair, 0) + 1
        return count_dict

    def _merge_top_pair(self, old_tokens, pair, new_token):
        """
            merge a token pair into a new token

            scans left to right and replaces every non-overlapping occurrence of
            `pair` with `new_token`; returns a new list, `old_tokens` is not modified
        """
        new_tokens = []
        i = 0
        last = len(old_tokens) - 1
        while i <= last:
            if i < last and (old_tokens[i], old_tokens[i + 1]) == pair:
                new_tokens.append(new_token)
                i += 2  # both members of the pair are consumed
            else:
                new_tokens.append(old_tokens[i])
                i += 1
        return new_tokens

    def _call_subclass(self):
        """
            build the mode-specific subclass with the same configuration as this instance

            the subclasses inherit __init__, so text/title/mode/retrain must be
            forwarded; constructing the subclass therefore runs the configuration
            checks and the folder inspection again
        """
        subclass = BaseBPETokenizerTrain if self.mode == "train" else BaseBPETokenizerInfer
        return subclass(self._text, self.title, mode=self.mode, retrain=self.retrain)

class BaseBPETokenizerTrain(BaseBPETokenizer):
    """placeholder subclass returned by _call_subclass when mode is "train"; adds nothing to the base class yet"""

class BaseBPETokenizerInfer(BaseBPETokenizer):
    """placeholder subclass returned by _call_subclass when mode is not "train"; adds nothing to the base class yet"""

            







In [33]:
class parent:
    """Toy base class used to experiment with inheritance and super()."""

    def __init__(self, mode, number):
        # "operation" is a fixed label; mode and number come from the caller
        self.operation = "addition"
        self.number = number
        self.mode = mode

    def _print_status(self):
        """Print the current mode with an "ing" suffix, e.g. "I am training"."""
        status = f"I am {self.mode}ing"
        print(status)


class trainclass(parent):
    """Toy subclass of parent: stores ten times the number and remembers the mode."""

    def __init__(self, mode, number):
        # let the parent set mode, number and operation first
        super().__init__(mode, number)
        self.sub_mode = mode
        self.number2 = 10 * self.number

    def __str__(self):
        description = f"I am in train subclass and im with {self.sub_mode} mode"
        return description
    
class inferclass(parent):
    """Toy subclass of parent: stores twenty times the number and remembers the mode.

    Args:
        mode: forwarded to parent and also kept as ``sub_mode``.
        number: forwarded to parent; defaults to 0 so the original
            one-argument call ``inferclass(mode)`` keeps working.
    """

    def __init__(self, mode, number=0):
        # parent.__init__ requires both mode and number; calling it without
        # arguments raised TypeError and left self.number undefined
        super().__init__(mode, number)
        self.number2 = self.number * 20
        self.sub_mode = mode

    def __str__(self):
        return f"I am in infer subclass and im with {self.sub_mode} mode"

In [34]:
a = trainclass("train",30)

In [35]:
a._print_status()

I am training


In [36]:
str(a)

'I am in train subclass and im with train mode'

In [37]:
a.number2

300

In [38]:
a.number

30