In [5]:
# Read the two input files as UTF-8 and concatenate them into one string.
with open('/kaggle/input/lalka/lalka-tom-pierwszy.txt', mode='r', encoding='utf-8') as first_file:
    lalka1 = first_file.read()

with open('/kaggle/input/lalka/lalka-tom-drugi.txt', mode='r', encoding='utf-8') as second_file:
    lalka2 = second_file.read()

# `text` is the first file's contents immediately followed by the second's.
text = "".join((lalka1, lalka2))

In [None]:
from kar

In [76]:
def merge(tokens, pair, idx):
    """Replace every occurrence of `pair` in `tokens` with the single id `idx`.

    The scan runs left to right and matches do not overlap: once two
    neighbouring tokens have been merged, the scan resumes after them.

    Args:
        tokens: sequence of token ids.
        pair: two-element sequence (first id, second id) to look for.
        idx: id that replaces each matched pair.

    Returns:
        A new list; `tokens` itself is not modified.
    """
    merged = []
    last = len(tokens) - 1
    pos = 0
    while pos <= last:
        # A pair can only start here if there is a following token.
        if pos < last and tokens[pos] == pair[0] and tokens[pos + 1] == pair[1]:
            merged.append(idx)
            pos += 2
            continue
        merged.append(tokens[pos])
        pos += 1
    return merged

def get_stats(tokens, update_stats = None):
    """Count how often each pair of adjacent tokens occurs.

    Args:
        tokens: sequence of token ids.
        update_stats: optional dict of existing counts; when given it is
            updated in place and returned, otherwise a new dict is created.

    Returns:
        Dict mapping (left, right) pairs to their number of occurrences.
    """
    counts = update_stats if update_stats is not None else {}
    # Fewer than two tokens means there is no adjacent pair to count.
    if len(tokens) < 2:
        return counts
    for left, right in zip(tokens, tokens[1:]):
        key = (left, right)
        counts[key] = counts.get(key, 0) + 1
    return counts

        
def render_token(t: bytes) -> str:
    """Return token bytes as a printable string.

    Bytes that are not valid UTF-8 are shown as the replacement character
    instead of raising.
    """
    return t.decode(encoding='utf-8', errors='replace')

In [194]:
class Tokenizer:
    """Base class for the byte-level BPE tokenizers defined in this notebook.

    It owns the shared state (merges, split pattern, special tokens, vocab)
    and the save/load logic. Subclasses implement train, encode and decode.
    """

    def __init__(self):
        self.merges = {}          # (id, id) -> id of the merged token
        self.pattern = ""         # split pattern stored in the .model file
        self.special_tokens = {}  # special token string -> id
        self.vocab = self._build_vocab()  # id -> bytes

    def _version(self):
        """Return the format tag written as the first line of the .model file."""
        return 'base_tokenizer_v1'

    def train(self, text, vocab_size, verbose = False):
        """Learn merges from `text`; must be provided by a subclass."""
        raise NotImplementedError

    def encode(self, text):
        """Turn a string into token ids; must be provided by a subclass."""
        raise NotImplementedError

    def decode(self, text):
        """Turn token ids back into a string; must be provided by a subclass."""
        raise NotImplementedError

    def _build_vocab(self):
        """Derive the id -> bytes table from `self.merges` and `self.special_tokens`.

        Merges are replayed in insertion order, so both halves of a merge
        must already be in the table when the merge is reached.
        """
        vocab = {idx: bytes([idx]) for idx in range(256)}
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        for special, idx in self.special_tokens.items():
            vocab[idx] = special.encode('utf8')
        return vocab

    def save(self, path):
        """Write `path`.model (reloadable by `load`) and `path`.vocab (human-readable).

        The .model file holds: the version tag, the pattern, the number of
        special tokens, one "<token> <id>" line per special token, then one
        "<id> <id>" line per merge in insertion order. Merge ids themselves
        are not stored; `load` numbers them from 256 in file order.
        """
        # Explicit UTF-8 so special tokens and patterns with non-ASCII
        # characters are written the same way on every platform.
        with open(path + '.model', 'w', encoding='utf-8') as f:
            f.write(f'{self._version()}\n')
            f.write(f'{self.pattern}\n')
            f.write(f'{len(self.special_tokens)}\n')
            for special, idx in self.special_tokens.items():
                f.write(f'{special} {idx}\n')
            for idx1, idx2 in self.merges:
                f.write(f'{idx1} {idx2}\n')

        inverted_merges = {v: pair for pair, v in self.merges.items()}
        with open(path + '.vocab', 'w', encoding='utf-8') as f:
            for idx, token in self.vocab.items():
                s = render_token(token)
                if idx in inverted_merges:
                    # Show which two tokens were merged to form this one.
                    idx0, idx1 = inverted_merges[idx]
                    s0 = render_token(self.vocab[idx0])
                    s1 = render_token(self.vocab[idx1])
                    f.write(f'[{s0}][{s1}] -> [{s}] {idx}\n')
                else:
                    f.write(f"[{s}] {idx}\n")

    def load(self, path):
        """Restore pattern, special tokens, merges and vocab from `path`.model.

        Raises:
            AssertionError: if the file's version tag differs from `_version()`.
        """
        idx = 256
        merges = {}
        special_tokens = {}
        with open(path + '.model', 'r', encoding='utf-8') as f:
            version_control = f.readline().strip()
            # `_version` has to be called; formatting the bound method itself
            # can never equal the tag stored in the file.
            assert version_control == self._version(), 'Wrong file, tokenizer version does not match'
            self.pattern = f.readline().strip()
            special_tokens_length = int(f.readline().strip())
            for _ in range(special_tokens_length):
                # Split on the last whitespace only: the id is always the last field.
                special, special_idx = f.readline().strip().rsplit(maxsplit=1)
                special_tokens[special] = int(special_idx)
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines, e.g. at the end of the file
                idx1, idx2 = map(int, line.split())
                merges[(idx1, idx2)] = idx
                idx += 1
        self.merges = merges
        self.special_tokens = special_tokens
        self.vocab = self._build_vocab()
            
            
            
        

In [78]:
class BasicTokenizer(Tokenizer):
    """BPE tokenizer that works directly on the UTF-8 bytes of the whole text."""

    def __init__(self):
        super().__init__()

    def train(self, text, vocab_size, verbose = False):
        """Learn up to `vocab_size - 256` merges from `text`.

        Each round merges the most frequent adjacent pair into a new id,
        starting at 256. Training stops early when no pair is left to merge.

        Args:
            text: training string.
            vocab_size: target vocabulary size (256 byte tokens + merges).
            verbose: print every merge as it is learned.
        """
        tokens = list(text.encode('utf-8'))

        idx = 256
        merges = {}
        while idx < vocab_size:
            stats = get_stats(tokens)
            if not stats:
                # Fewer than two tokens left: nothing more can be merged.
                break
            candidate = max(stats, key=stats.get)
            merges[candidate] = idx
            tokens = merge(tokens, candidate, idx)

            if verbose:
                print(f"{candidate[0]} + {candidate[1]} => {idx}")
            idx += 1
        self.merges = merges
        # Rebuild through the base-class helper so special tokens stay in the vocab.
        self.vocab = self._build_vocab()

    def encode(self, text):
        """Encode `text` into token ids by applying the learned merges.

        Merges are applied in the order they were learned (lowest id first)
        until no adjacent pair in the sequence has a known merge.
        """
        tokens = list(text.encode('utf-8'))

        while len(tokens) >= 2:
            stats = get_stats(tokens)
            # Pairs without a merge rank as +inf, so they are only picked
            # when no mergeable pair is left.
            candidate = min(stats, key=lambda pair: self.merges.get(pair, float('inf')))
            if candidate not in self.merges:
                break
            tokens = merge(tokens, candidate, self.merges[candidate])
        return tokens

    def decode(self, tokens):
        """Decode token ids into a string; invalid UTF-8 becomes U+FFFD."""
        text_bytes = b"".join(self.vocab[idx] for idx in tokens)
        text = text_bytes.decode('utf-8', errors = 'replace')
        return text

In [79]:
import regex as re

In [80]:
# Regex patterns used to cut text into chunks before BPE merging is applied
# to each chunk. They use \p{...} Unicode classes (and, in the GPT4 one,
# possessive quantifiers), so they need the `regex` package that this notebook
# imports as `re` rather than the standard-library `re`.
# NOTE(review): presumably these mirror the GPT-2 / GPT-4 pre-tokenization
# patterns -- confirm against the upstream sources.
GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
# Used by RegexTokenizer when no pattern is passed to its constructor.
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

class RegexTokenizer(BasicTokenizer):
    """BPE tokenizer that splits text with a regex first and supports special tokens.

    Merges are learned and applied inside each regex chunk only, so a merged
    token never spans a chunk boundary.
    """

    def __init__(self, pattern = None):
        """Create a tokenizer using `pattern`, or GPT4_SPLIT_PATTERN when it is None."""
        super().__init__()
        self.regex_pattern = GPT4_SPLIT_PATTERN if pattern is None else pattern
        # Mirror the pattern into the base-class attribute: that is the one
        # the inherited save() writes to the .model file.
        self.pattern = self.regex_pattern
        self.compiled_pattern = re.compile(self.regex_pattern)
        self.special_tokens = {}         # special token string -> id
        self.invers_special_tokens = {}  # id -> special token string

    def train(self, text, vocab_size, verbose = False):
        """Learn up to `vocab_size - 256` merges from `text`, chunk by chunk.

        Pair counts are accumulated over all chunks; each round merges the
        most frequent pair into a new id starting at 256. Training stops
        early when no chunk has a pair left to merge.
        """
        chunked_text = self.compiled_pattern.findall(text)
        chunked_tokens = [list(chunk.encode('utf-8')) for chunk in chunked_text]

        idx = 256
        merges = {}
        while idx < vocab_size:
            # Count adjacent pairs across every chunk into one shared dict.
            stats = {}
            for tokens in chunked_tokens:
                get_stats(tokens, stats)
            if not stats:
                # Every chunk is down to a single token: nothing left to merge.
                break

            candidate = max(stats, key=stats.get)
            chunked_tokens = [merge(tokens, candidate, idx) for tokens in chunked_tokens]
            merges[candidate] = idx

            if verbose:
                print(f"{candidate[0]} + {candidate[1]} => {idx}")
            idx += 1
        self.merges = merges
        # Rebuild through the base-class helper so special tokens registered
        # before training are still present in the vocab afterwards.
        self.vocab = self._build_vocab()

    # special tokens in the form of vocab dictionary {<ENDOFTEXT>:50000,....}
    def special_token_registry(self, special = None):
        """Register special tokens given as {token string: id}; None is a no-op."""
        if special is not None:
            self.special_tokens = dict(special)
            self.invers_special_tokens = {v: k for k, v in special.items()}
            # Store bytes like every other vocab entry, so code that treats
            # vocab values as bytes (e.g. the inherited save()) keeps working.
            for token, idx in special.items():
                self.vocab[idx] = token.encode('utf-8')

    # encode pure utf-8 chunk of bytes
    def _encode_chunk(self, text_bytes):
        """Apply the learned merges to the bytes of one chunk; return token ids."""
        tokens = list(text_bytes)
        while len(tokens) >= 2:
            stats = get_stats(tokens)
            # Lowest merge id first; unknown pairs rank as +inf.
            candidate = min(stats, key=lambda pair: self.merges.get(pair, float('inf')))
            if candidate not in self.merges:
                break
            tokens = merge(tokens, candidate, self.merges[candidate])
        return tokens

    # encode text without special tokens
    def _encode_ordinary(self, text):
        """Encode `text`, treating any special-token strings as plain text."""
        tokens = []
        for chunk in self.compiled_pattern.findall(text):
            tokens.extend(self._encode_chunk(chunk.encode('utf-8')))
        return tokens

    def encode(self, text, allowed_special = 'none_raise'):
        """Encode `text` into token ids, optionally recognising special tokens.

        Args:
            text: string to encode.
            allowed_special: 'all' to recognise every registered special
                token, 'none' to treat them as plain text, 'none_raise'
                (default) to treat them as plain text but fail if one occurs
                in `text`, or a set of special token strings to recognise.

        Raises:
            AssertionError: 'none_raise' and `text` contains a special token.
            ValueError: `allowed_special` is none of the supported values.
        """
        if allowed_special == 'all':
            special = self.special_tokens
        elif allowed_special == 'none':
            special = {}
        elif allowed_special == 'none_raise':
            special = {}
            assert all(token not in text for token in self.special_tokens), \
                "text contains a special token but allowed_special='none_raise'"
        elif isinstance(allowed_special, (set, frozenset)):
            special = {k: v for k, v in self.special_tokens.items() if k in allowed_special}
        else:
            raise ValueError(f"allowed_special={allowed_special} not understood")

        if not special:
            return self._encode_ordinary(text)

        # The capturing group makes split() keep the special tokens in the
        # result. Longest first, so a token that is a prefix of another one
        # cannot shadow it.
        ordered = sorted(special, key=len, reverse=True)
        special_pattern = '(' + "|".join(re.escape(k) for k in ordered) + ")"
        special_chunked = re.split(special_pattern, text)

        tokens = []
        for chunk in special_chunked:
            if chunk in special:
                tokens.append(special[chunk])
            else:
                tokens.extend(self._encode_ordinary(chunk))
        return tokens

    def decode(self, tokens):
        """Decode token ids, special tokens included, into a string.

        Invalid UTF-8 in the joined bytes is replaced with U+FFFD.

        Raises:
            KeyError: an id is neither a special token nor in the vocab.
        """
        parts = []
        for idx in tokens:
            if idx in self.invers_special_tokens:
                # Resolve through the registry so special tokens decode even
                # when the vocab was rebuilt without them.
                parts.append(self.invers_special_tokens[idx].encode('utf-8'))
            else:
                parts.append(self.vocab[idx])
        text = b"".join(parts).decode('utf-8', errors = 'replace')
        return text

In [81]:
!pip install tiktoken



In [82]:
import tiktoken


In [83]:
def bpe(mergeable_ranks, token, max_rank = None):
    """Split `token` into the parts produced by replaying BPE merges on its bytes.

    Starting from single bytes, the adjacent pair whose concatenation has
    the lowest rank is merged repeatedly.

    Args:
        mergeable_ranks: dict mapping token bytes to their rank.
        token: bytes to split.
        max_rank: when given, only merges with a rank strictly below it are
            applied; None (default) applies every possible merge.

    Returns:
        List of bytes objects whose concatenation equals `token`.
    """
    parts = [bytes([b]) for b in token]
    while True:
        min_rank = None
        min_idx = None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_rank = rank
                min_idx = i
        # Stop when nothing can be merged, or when the best merge is not
        # below the cap. The cap is only compared when one was supplied.
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
    return parts

def recover_merges(mergeable_ranks):
    """Rebuild the (id, id) -> id merge table from a bytes -> rank table.

    For every multi-byte token, `bpe` is run with the token's own rank as
    the cap, which yields the parts the token was built from. The ranks of
    the first two parts form the key and the token's rank is the value.
    Single-byte tokens are skipped because they are not the result of a merge.
    """
    recovered = {}
    for token_bytes, rank in mergeable_ranks.items():
        if len(token_bytes) >= 2:
            parts = tuple(bpe(mergeable_ranks, token_bytes, rank))
            left_id = mergeable_ranks[parts[0]]
            right_id = mergeable_ranks[parts[1]]
            recovered[(left_id, right_id)] = rank
    return recovered

In [197]:
# Split pattern handed to RegexTokenizer by GPT4Tokenizer. This re-declares,
# with the same value, the GPT4_SPLIT_PATTERN defined earlier in the notebook.
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
# Special token string -> token id, installed by GPT4Tokenizer.
# NOTE(review): presumably these ids match tiktoken's cl100k_base special
# tokens -- confirm against tiktoken.
GPT4_SPECIAL_TOKENS = {
    '<|endoftext|>': 100257,
    '<|fim_prefix|>': 100258,
    '<|fim_middle|>': 100259,
    '<|fim_suffix|>': 100260,
    '<|endofprompt|>': 100276
}

class GPT4Tokenizer(RegexTokenizer):
    """RegexTokenizer preloaded with the merges of tiktoken's "cl100k_base" encoding.

    In the tiktoken table a single byte's id is its rank, not its byte value.
    Internally the merges are rewritten so that ids 0-255 mean raw byte
    values; encode() and decode() translate between the two numberings.
    """

    def __init__(self):
        super().__init__(pattern=GPT4_SPLIT_PATTERN)
        encoding = tiktoken.get_encoding("cl100k_base")
        self.mergeable_ranks = encoding._mergeable_ranks
        self.merges = recover_merges(self.mergeable_ranks)

        # rank of each single-byte token -> raw byte value, and the inverse
        self.byte_shuffle = {self.mergeable_ranks[bytes([raw])]: raw for raw in range(256)}
        self.byte_reshuffle = {raw: rank for rank, raw in self.byte_shuffle.items()}

        # Re-key the merges so ids below 256 are raw byte values, which is
        # what the inherited byte-level encoding produces.
        self.merges = self.shuffle_merges()

        self.special_tokens = GPT4_SPECIAL_TOKENS
        self.vocab = self._build_vocab()

    def shuffle_merges(self):
        """Return the merges with single-byte ranks replaced by raw byte values.

        Merge ids are reassigned as 256, 257, ... following the iteration
        order of `self.merges`.
        """
        def to_raw(token):
            return self.byte_shuffle[token] if token <= 255 else token

        remapped_pairs = [(to_raw(p0), to_raw(p1)) for p0, p1 in self.merges]
        return {pair: 256 + position for position, pair in enumerate(remapped_pairs)}

    def train(self, text, vocab_size, verbose=False):
        """Not supported; always raises NotImplementedError."""
        raise NotImplementedError

    def encode(self, text, allowed_special = 'none_raise'):
        """Encode `text`, then map raw byte ids (0-255) to their ranks."""
        raw_space_ids = super().encode(text, allowed_special)
        to_rank = self.byte_reshuffle
        return [to_rank[tok] if tok <= 255 else tok for tok in raw_space_ids]

    def decode(self, tokens):
        """Map single-byte ranks (0-255) back to raw byte values, then decode."""
        to_raw = self.byte_shuffle
        raw_space_ids = [to_raw[tok] if tok <= 255 else tok for tok in tokens]
        return super().decode(raw_space_ids)

    def save(self, path):
        """Not supported; always raises NotImplementedError."""
        raise NotImplementedError("GPT4Tokenizer is loaded from tiktoken so saving is useless.")

    def load(self, path):
        """Not supported; always raises NotImplementedError."""
        raise NotImplementedError("GPT4Tokenizer is loaded from tiktoken at initialization ")

In [198]:
# Build the tokenizer; its constructor fetches the "cl100k_base" encoding through tiktoken.
gpt4tok = GPT4Tokenizer()

In [187]:
# Round-trip check: decoding the encoded sentence should reproduce the input string.
gpt4tok.decode(gpt4tok.encode('You can define a dictionary using curly brackets.'))

'You can define a dictionary using curly brackets.'

In [1]:
import sentencepiece

In [3]:
import sentencepiece as spm
# Create a SentencePiece processor; no model file is loaded into it in this cell.
s = spm.SentencePieceProcessor()

In [7]:
# Create a processor and load the tokenizer model file into it
# (the recorded cell output for the load call is True).
sp = spm.SentencePieceProcessor()
sp.load("/kaggle/input/tokenizer/tokenizer.model")

True

In [9]:
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as model_pb2

# Load the SentencePiece model
sp = spm.SentencePieceProcessor()
sp.Load("/kaggle/input/tokenizer/tokenizer.model")

# Parse the serialized model into a ModelProto so the stored specs can be read
model_proto = model_pb2.ModelProto()
model_proto.ParseFromString(sp.serialized_model_proto())

# Print a selection of the training parameters stored in the model
print(f"Vocab size: {model_proto.trainer_spec.vocab_size}")
print(f"Character coverage: {model_proto.trainer_spec.character_coverage}")
print(f"Model type: {model_proto.trainer_spec.model_type}")
print(f"Input sentence size: {model_proto.trainer_spec.input_sentence_size}")
print(f"Max sentence length: {model_proto.trainer_spec.max_sentence_length}")
print(f"Mining sentence size: {model_proto.trainer_spec.mining_sentence_size}")
# The trainer spec has no `training_algorithm` field: reading it raised
# AttributeError and aborted the cell before the line below could run.
# The algorithm is reported by `model_type` above.
print(f"Normalization rule name: {model_proto.normalizer_spec.name}")


Vocab size: 32000
Character coverage: 0.9999499917030334
Model type: 2
Input sentence size: 200000000
Max sentence length: 4192
Mining sentence size: 0


AttributeError: training_algorithm

In [12]:
# Print the whole trainer spec to list every training parameter stored in the model.
print( model_proto.trainer_spec)

input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
model_type: BPE
vocab_size: 32000
self_test_sample_size: 0
input_format: "text"
character_coverage: 0.9999499917030334
input_sentence_size: 200000000
seed_sentencepiece_size: 1000000
shrinking_factor: 0.75
num_threads: 80
num_sub_iterations: 2
max_sentence_length: 4192
shuffle_input_sentence: true
max_sentencepiece_length: 16
split_by_unicode_script: true
split_by_whitespace: true
split_by_number: true
treat_whitespace_as_suffix: false
split_digits: true
allow_whitespace_only_pieces: true
vocabulary_output_piece_score: true
hard_vocab_limit: true
use_all_vocab: false
byte_fallback: true
required_chars: ""
unk_id: 0
bos_id: 1
eos_id: 2
pad_id: -1
unk_surface: " \342\201\207 "
unk_piece: "<unk>"
bos_piece: "<s>"
eos_piece: "</s>"
pad_piece: "<pad>"
train_extremely_large_corpus: false
enable_differential_privacy: false
differential_privacy_noise_level: 0.0
dif