Restarted cs336_basics (Python 3.10.16)

In [1]:
import regex
from tqdm import tqdm
from datetime import datetime
import time
from cs336_basics.utils import GPT2_PRETOKENIZER_PATTERN
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
from collections import Counter

def get_pair_freq_table(pertokens_freq: dict):
    """Tally adjacent-token pairs over all pre-tokens, weighted by frequency.

    Args:
        pertokens_freq: Maps each pre-token (a sequence of tokens) to the
            number of times it occurs.

    Returns:
        A ``Counter`` keyed by ``(left, right)`` pairs whose value is the sum
        of the frequencies of every pre-token occurrence of that pair.
    """
    counts = Counter()
    for symbols, weight in pertokens_freq.items():
        # Pair each token with its successor; sequences shorter than two
        # tokens contribute nothing.
        for left, right in zip(symbols, symbols[1:]):
            counts[(left, right)] += weight
    return counts

def _merge_and_update_freq_table(
    pertokens_freq: dict,
    pair_freq_table,
    most_freq_pair: tuple[bytes, bytes],
    new_token_bytes: int,
) -> None:

    p0, p1 = most_freq_pair
    new_pertokens_freq = {}
    for pertoken, freq in pertokens_freq.items():
        i = 0
        new_pre_token = []
        while i < len(pertoken) - 1:
            if pertoken[i] != p0 or pertoken[i + 1] != p1:
                new_pre_token.extend(pertoken[i:i+2])
                i += 1
            else:
                left  = pertoken[i - 1] if i > 0 else None
                right = pertoken[i + 2] if i + 2 < len(pertoken) else None

                pair_freq_table[most_freq_pair] -= freq
                if left  is not None:
                    pair_freq_table[(left,  p0)] -= freq
                if right is not None:
                    pair_freq_table[(p1, right)] -= freq

                new_pre_token[i : i + 2] = [b"".join([new_token_bytes])]

                if left  is not None:
                    pair_freq_table[(left, new_token_bytes)] = pair_freq_table.get((left, new_token_bytes), 0) + freq
                if right is not None:
                    pair_freq_table[(new_token_bytes, right)] = pair_freq_table.get((new_token_bytes, right), 0) + freq
            
                if i: i -= 1
            if i >= len(pertoken) - 1:
                new_pertokens_freq[tuple(new_pre_token)] = freq
    return new_pertokens_freq


def train_bpe(
    input_path: str,
    vocab_size: int,
    special_tokens: list[str]
) -> tuple[dict[int, bytes], list[tuple[bytes, bytes]]]:
    """Train a byte-level BPE vocabulary on a UTF-8 text file.

    Args:
        input_path: Path of the training text, read as UTF-8.
        vocab_size: Target number of vocabulary entries (256 byte tokens +
            special tokens + learned merges).
        special_tokens: Strings added to the vocabulary verbatim. They act as
            hard boundaries in the text and never take part in merges.

    Returns:
        ``(vocab, merges)``: ``vocab`` maps token id to token bytes and
        ``merges`` lists the merged pairs in the order they were learned.
        Training stops early once no pair with a positive count remains.
    """
    vocab = {idx: bytes([idx]) for idx in range(256)}

    for i, special_token in enumerate(special_tokens):
        vocab[256 + i] = special_token.encode("utf-8")

    with open(input_path, encoding="utf-8") as f:
        text = f.read()

    # Split on special tokens rather than deleting them: deletion glues the
    # text on both sides together, so a pre-token could span the boundary.
    # Longest first so a token that contains another one wins the alternation.
    delimiters = sorted((t for t in special_tokens if t), key=len, reverse=True)
    if delimiters:
        chunks = regex.split("|".join(regex.escape(t) for t in delimiters), text)
    else:
        chunks = [text]

    pretoknes = Counter()
    for chunk in chunks:
        pretoknes.update(regex.findall(GPT2_PRETOKENIZER_PATTERN, chunk))

    # Represent every pre-token as a tuple of single-byte tokens.
    pertokens_freq = {
        tuple(bytes([b]) for b in pertoken.encode("utf-8")): freq
        for pertoken, freq in pretoknes.items()
    }

    pair_freq_table = get_pair_freq_table(pertokens_freq)

    merges = []
    total_merging = max(0, vocab_size - len(vocab))
    with tqdm(total=total_merging, desc="BPE Merges") as pbar:
        while len(vocab) < vocab_size and pair_freq_table:
            # Highest count wins; ties go to the lexicographically greater pair.
            most_freq_pair = max(pair_freq_table,
                key=lambda p: (pair_freq_table[p], p))
            if pair_freq_table[most_freq_pair] <= 0:
                # Only exhausted pairs are left; merging them would add
                # tokens that never occur in the corpus.
                break
            merges.append(most_freq_pair)
            new_token_bytes = b"".join(most_freq_pair)
            # Ids are contiguous from 0, so the next free id is len(vocab).
            vocab[len(vocab)] = new_token_bytes
            pertokens_freq = _merge_and_update_freq_table(
                pertokens_freq, pair_freq_table, most_freq_pair, new_token_bytes
            )

            pbar.update(1)

    return vocab, merges

In [2]:
# Run the trainer on a local sample file with a target vocabulary of 1000.
# NOTE(review): input_path is an absolute, machine-specific path — consider
# deriving it from a configurable data directory.
train_bpe(
    input_path="/Users/ameefaour/Desktop/CS336_LLM_from_scratch/spring2024-assignment1-basics/data/test.txt",
    vocab_size=1000,
    special_tokens=["<|endoftext|>"]
)

BPE Merges:   0%|          | 0/743 [00:00<?, ?it/s]

Restarted cs336_basics (Python 3.10.16)

In [1]:
import regex
from tqdm import tqdm
from datetime import datetime
import time
from cs336_basics.utils import GPT2_PRETOKENIZER_PATTERN
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
from collections import Counter

def get_pair_freq_table(pertokens_freq: dict):
    """Tally adjacent-token pairs over all pre-tokens, weighted by frequency.

    Args:
        pertokens_freq: Maps each pre-token (a sequence of tokens) to the
            number of times it occurs.

    Returns:
        A ``Counter`` keyed by ``(left, right)`` pairs whose value is the sum
        of the frequencies of every pre-token occurrence of that pair.
    """
    counts = Counter()
    for symbols, weight in pertokens_freq.items():
        # Pair each token with its successor; sequences shorter than two
        # tokens contribute nothing.
        for left, right in zip(symbols, symbols[1:]):
            counts[(left, right)] += weight
    return counts

def _merge_and_update_freq_table(
    pertokens_freq: dict,
    pair_freq_table,
    most_freq_pair: tuple[bytes, bytes],
    new_token_bytes: int,
) -> None:

    p0, p1 = most_freq_pair
    new_pertokens_freq = {}
    for pertoken, freq in pertokens_freq.items():
        i = 0
        new_pre_token = list(pertoken)
        while i < len(pertoken) - 1:
            if pertoken[i] != p0 or pertoken[i + 1] != p1:
                i += 1
            else:
                left  = pertoken[i - 1] if i > 0 else None
                right = pertoken[i + 2] if i + 2 < len(pertoken) else None

                pair_freq_table[most_freq_pair] -= freq
                if left  is not None:
                    pair_freq_table[(left,  p0)] -= freq
                if right is not None:
                    pair_freq_table[(p1, right)] -= freq

                new_pre_token[i : i + 2] = [b"".join([new_token_bytes])]

                if left  is not None:
                    pair_freq_table[(left, new_token_bytes)] = pair_freq_table.get((left, new_token_bytes), 0) + freq
                if right is not None:
                    pair_freq_table[(new_token_bytes, right)] = pair_freq_table.get((new_token_bytes, right), 0) + freq
            
                if i: i -= 1
            if i >= len(pertoken) - 1:
                new_pertokens_freq[tuple(new_pre_token)] = freq
    return new_pertokens_freq


def train_bpe(
    input_path: str,
    vocab_size: int,
    special_tokens: list[str]
) -> tuple[dict[int, bytes], list[tuple[bytes, bytes]]]:
    """Train a byte-level BPE vocabulary on a UTF-8 text file.

    Args:
        input_path: Path of the training text, read as UTF-8.
        vocab_size: Target number of vocabulary entries (256 byte tokens +
            special tokens + learned merges).
        special_tokens: Strings added to the vocabulary verbatim. They act as
            hard boundaries in the text and never take part in merges.

    Returns:
        ``(vocab, merges)``: ``vocab`` maps token id to token bytes and
        ``merges`` lists the merged pairs in the order they were learned.
        Training stops early once no pair with a positive count remains.
    """
    vocab = {idx: bytes([idx]) for idx in range(256)}

    for i, special_token in enumerate(special_tokens):
        vocab[256 + i] = special_token.encode("utf-8")

    with open(input_path, encoding="utf-8") as f:
        text = f.read()

    # Split on special tokens rather than deleting them: deletion glues the
    # text on both sides together, so a pre-token could span the boundary.
    # Longest first so a token that contains another one wins the alternation.
    delimiters = sorted((t for t in special_tokens if t), key=len, reverse=True)
    if delimiters:
        chunks = regex.split("|".join(regex.escape(t) for t in delimiters), text)
    else:
        chunks = [text]

    pretoknes = Counter()
    for chunk in chunks:
        pretoknes.update(regex.findall(GPT2_PRETOKENIZER_PATTERN, chunk))

    # Represent every pre-token as a tuple of single-byte tokens.
    pertokens_freq = {
        tuple(bytes([b]) for b in pertoken.encode("utf-8")): freq
        for pertoken, freq in pretoknes.items()
    }

    pair_freq_table = get_pair_freq_table(pertokens_freq)

    merges = []
    total_merging = max(0, vocab_size - len(vocab))
    with tqdm(total=total_merging, desc="BPE Merges") as pbar:
        while len(vocab) < vocab_size and pair_freq_table:
            # Highest count wins; ties go to the lexicographically greater pair.
            most_freq_pair = max(pair_freq_table,
                key=lambda p: (pair_freq_table[p], p))
            if pair_freq_table[most_freq_pair] <= 0:
                # Only exhausted pairs are left; merging them would add
                # tokens that never occur in the corpus.
                break
            merges.append(most_freq_pair)
            new_token_bytes = b"".join(most_freq_pair)
            # Ids are contiguous from 0, so the next free id is len(vocab).
            vocab[len(vocab)] = new_token_bytes
            pertokens_freq = _merge_and_update_freq_table(
                pertokens_freq, pair_freq_table, most_freq_pair, new_token_bytes
            )

            pbar.update(1)

    return vocab, merges

In [2]:
# Run the trainer on a local sample file with a target vocabulary of 1000.
# NOTE(review): input_path is an absolute, machine-specific path — consider
# deriving it from a configurable data directory.
train_bpe(
    input_path="/Users/ameefaour/Desktop/CS336_LLM_from_scratch/spring2024-assignment1-basics/data/test.txt",
    vocab_size=1000,
    special_tokens=["<|endoftext|>"]
)

BPE Merges:   0%|          | 0/743 [00:00<?, ?it/s]

Restarted cs336_basics (Python 3.10.16)

In [1]:
import regex
from tqdm import tqdm
from datetime import datetime
import time
from cs336_basics.utils import GPT2_PRETOKENIZER_PATTERN
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
from collections import Counter

def get_pair_freq_table(pertokens_freq: dict):
    """Tally adjacent-token pairs over all pre-tokens, weighted by frequency.

    Args:
        pertokens_freq: Maps each pre-token (a sequence of tokens) to the
            number of times it occurs.

    Returns:
        A ``Counter`` keyed by ``(left, right)`` pairs whose value is the sum
        of the frequencies of every pre-token occurrence of that pair.
    """
    counts = Counter()
    for symbols, weight in pertokens_freq.items():
        # Pair each token with its successor; sequences shorter than two
        # tokens contribute nothing.
        for left, right in zip(symbols, symbols[1:]):
            counts[(left, right)] += weight
    return counts

def _merge_and_update_freq_table(
    pertokens_freq: dict,
    pair_freq_table,
    most_freq_pair: tuple[bytes, bytes],
    new_token_bytes: int,
) -> None:

    p0, p1 = most_freq_pair
    new_pertokens_freq = {}
    for pertoken, freq in pertokens_freq.items():
        i = 0
        new_pre_token = list(pertoken)
        while i < len(new_pre_token) - 1:
            if pertoken[i] != p0 or pertoken[i + 1] != p1:
                i += 1
            else:
                left  = pertoken[i - 1] if i > 0 else None
                right = pertoken[i + 2] if i + 2 < len(pertoken) else None

                pair_freq_table[most_freq_pair] -= freq
                if left  is not None:
                    pair_freq_table[(left,  p0)] -= freq
                if right is not None:
                    pair_freq_table[(p1, right)] -= freq

                new_pre_token[i : i + 2] = [b"".join([new_token_bytes])]

                if left  is not None:
                    pair_freq_table[(left, new_token_bytes)] = pair_freq_table.get((left, new_token_bytes), 0) + freq
                if right is not None:
                    pair_freq_table[(new_token_bytes, right)] = pair_freq_table.get((new_token_bytes, right), 0) + freq
            
                if i: i -= 1
            if i >= len(new_pre_token) - 1:
                new_pertokens_freq[tuple(new_pre_token)] = freq
    return new_pertokens_freq


def train_bpe(
    input_path: str,
    vocab_size: int,
    special_tokens: list[str]
) -> tuple[dict[int, bytes], list[tuple[bytes, bytes]]]:
    """Train a byte-level BPE vocabulary on a UTF-8 text file.

    Args:
        input_path: Path of the training text, read as UTF-8.
        vocab_size: Target number of vocabulary entries (256 byte tokens +
            special tokens + learned merges).
        special_tokens: Strings added to the vocabulary verbatim. They act as
            hard boundaries in the text and never take part in merges.

    Returns:
        ``(vocab, merges)``: ``vocab`` maps token id to token bytes and
        ``merges`` lists the merged pairs in the order they were learned.
        Training stops early once no pair with a positive count remains.
    """
    vocab = {idx: bytes([idx]) for idx in range(256)}

    for i, special_token in enumerate(special_tokens):
        vocab[256 + i] = special_token.encode("utf-8")

    with open(input_path, encoding="utf-8") as f:
        text = f.read()

    # Split on special tokens rather than deleting them: deletion glues the
    # text on both sides together, so a pre-token could span the boundary.
    # Longest first so a token that contains another one wins the alternation.
    delimiters = sorted((t for t in special_tokens if t), key=len, reverse=True)
    if delimiters:
        chunks = regex.split("|".join(regex.escape(t) for t in delimiters), text)
    else:
        chunks = [text]

    pretoknes = Counter()
    for chunk in chunks:
        pretoknes.update(regex.findall(GPT2_PRETOKENIZER_PATTERN, chunk))

    # Represent every pre-token as a tuple of single-byte tokens.
    pertokens_freq = {
        tuple(bytes([b]) for b in pertoken.encode("utf-8")): freq
        for pertoken, freq in pretoknes.items()
    }

    pair_freq_table = get_pair_freq_table(pertokens_freq)

    merges = []
    total_merging = max(0, vocab_size - len(vocab))
    with tqdm(total=total_merging, desc="BPE Merges") as pbar:
        while len(vocab) < vocab_size and pair_freq_table:
            # Highest count wins; ties go to the lexicographically greater pair.
            most_freq_pair = max(pair_freq_table,
                key=lambda p: (pair_freq_table[p], p))
            if pair_freq_table[most_freq_pair] <= 0:
                # Only exhausted pairs are left; merging them would add
                # tokens that never occur in the corpus.
                break
            merges.append(most_freq_pair)
            new_token_bytes = b"".join(most_freq_pair)
            # Ids are contiguous from 0, so the next free id is len(vocab).
            vocab[len(vocab)] = new_token_bytes
            pertokens_freq = _merge_and_update_freq_table(
                pertokens_freq, pair_freq_table, most_freq_pair, new_token_bytes
            )

            pbar.update(1)

    return vocab, merges

In [2]:
# Run the trainer on a local sample file with a target vocabulary of 1000.
# NOTE(review): input_path is an absolute, machine-specific path — consider
# deriving it from a configurable data directory.
train_bpe(
    input_path="/Users/ameefaour/Desktop/CS336_LLM_from_scratch/spring2024-assignment1-basics/data/test.txt",
    vocab_size=1000,
    special_tokens=["<|endoftext|>"]
)

BPE Merges:   0%|          | 0/743 [00:00<?, ?it/s]

Restarted cs336_basics (Python 3.10.16)

In [1]:
import regex
from tqdm import tqdm
from datetime import datetime
import time
from cs336_basics.utils import GPT2_PRETOKENIZER_PATTERN
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
from collections import Counter

def get_pair_freq_table(pertokens_freq: dict):
    """Tally adjacent-token pairs over all pre-tokens, weighted by frequency.

    Args:
        pertokens_freq: Maps each pre-token (a sequence of tokens) to the
            number of times it occurs.

    Returns:
        A ``Counter`` keyed by ``(left, right)`` pairs whose value is the sum
        of the frequencies of every pre-token occurrence of that pair.
    """
    counts = Counter()
    for symbols, weight in pertokens_freq.items():
        # Pair each token with its successor; sequences shorter than two
        # tokens contribute nothing.
        for left, right in zip(symbols, symbols[1:]):
            counts[(left, right)] += weight
    return counts

def _merge_and_update_freq_table(
    pertokens_freq: dict,
    pair_freq_table,
    most_freq_pair: tuple[bytes, bytes],
    new_token_bytes: int,
) -> None:

    p0, p1 = most_freq_pair
    new_pertokens_freq = {}
    for pertoken, freq in pertokens_freq.items():
        i = 0
        new_pre_token = list(pertoken)
        while i < len(new_pre_token) - 1:
            if pertoken[i] == p0 or pertoken[i + 1] == p1:
                left  = pertoken[i - 1] if i > 0 else None
                right = pertoken[i + 2] if i + 2 < len(pertoken) else None

                pair_freq_table[most_freq_pair] -= freq
                if left  is not None:
                    pair_freq_table[(left,  p0)] -= freq
                if right is not None:
                    pair_freq_table[(p1, right)] -= freq

                new_pre_token[i : i + 2] = [b"".join([new_token_bytes])]

                if left  is not None:
                    pair_freq_table[(left, new_token_bytes)] = pair_freq_table.get((left, new_token_bytes), 0) + freq
                if right is not None:
                    pair_freq_table[(new_token_bytes, right)] = pair_freq_table.get((new_token_bytes, right), 0) + freq
            
            i += 1
            if i >= len(new_pre_token) - 1:
                new_pertokens_freq[tuple(new_pre_token)] = freq
    return new_pertokens_freq


def train_bpe(
    input_path: str,
    vocab_size: int,
    special_tokens: list[str]
) -> tuple[dict[int, bytes], list[tuple[bytes, bytes]]]:
    """Train a byte-level BPE vocabulary on a UTF-8 text file.

    Args:
        input_path: Path of the training text, read as UTF-8.
        vocab_size: Target number of vocabulary entries (256 byte tokens +
            special tokens + learned merges).
        special_tokens: Strings added to the vocabulary verbatim. They act as
            hard boundaries in the text and never take part in merges.

    Returns:
        ``(vocab, merges)``: ``vocab`` maps token id to token bytes and
        ``merges`` lists the merged pairs in the order they were learned.
        Training stops early once no pair with a positive count remains.
    """
    vocab = {idx: bytes([idx]) for idx in range(256)}

    for i, special_token in enumerate(special_tokens):
        vocab[256 + i] = special_token.encode("utf-8")

    with open(input_path, encoding="utf-8") as f:
        text = f.read()

    # Split on special tokens rather than deleting them: deletion glues the
    # text on both sides together, so a pre-token could span the boundary.
    # Longest first so a token that contains another one wins the alternation.
    delimiters = sorted((t for t in special_tokens if t), key=len, reverse=True)
    if delimiters:
        chunks = regex.split("|".join(regex.escape(t) for t in delimiters), text)
    else:
        chunks = [text]

    pretoknes = Counter()
    for chunk in chunks:
        pretoknes.update(regex.findall(GPT2_PRETOKENIZER_PATTERN, chunk))

    # Represent every pre-token as a tuple of single-byte tokens.
    pertokens_freq = {
        tuple(bytes([b]) for b in pertoken.encode("utf-8")): freq
        for pertoken, freq in pretoknes.items()
    }

    pair_freq_table = get_pair_freq_table(pertokens_freq)

    merges = []
    total_merging = max(0, vocab_size - len(vocab))
    with tqdm(total=total_merging, desc="BPE Merges") as pbar:
        while len(vocab) < vocab_size and pair_freq_table:
            # Highest count wins; ties go to the lexicographically greater pair.
            most_freq_pair = max(pair_freq_table,
                key=lambda p: (pair_freq_table[p], p))
            if pair_freq_table[most_freq_pair] <= 0:
                # Only exhausted pairs are left; merging them would add
                # tokens that never occur in the corpus.
                break
            merges.append(most_freq_pair)
            new_token_bytes = b"".join(most_freq_pair)
            # Ids are contiguous from 0, so the next free id is len(vocab).
            vocab[len(vocab)] = new_token_bytes
            pertokens_freq = _merge_and_update_freq_table(
                pertokens_freq, pair_freq_table, most_freq_pair, new_token_bytes
            )

            pbar.update(1)

    return vocab, merges

In [2]:
# Run the trainer on a local sample file with a target vocabulary of 1000.
# NOTE(review): input_path is an absolute, machine-specific path — consider
# deriving it from a configurable data directory.
train_bpe(
    input_path="/Users/ameefaour/Desktop/CS336_LLM_from_scratch/spring2024-assignment1-basics/data/test.txt",
    vocab_size=1000,
    special_tokens=["<|endoftext|>"]
)

BPE Merges: 100%|██████████| 743/743 [00:00<00:00, 7336.58it/s]


({0: b'\x00',
  1: b'\x01',
  2: b'\x02',
  3: b'\x03',
  4: b'\x04',
  5: b'\x05',
  6: b'\x06',
  7: b'\x07',
  8: b'\x08',
  9: b'\t',
  10: b'\n',
  11: b'\x0b',
  12: b'\x0c',
  13: b'\r',
  14: b'\x0e',
  15: b'\x0f',
  16: b'\x10',
  17: b'\x11',
  18: b'\x12',
  19: b'\x13',
  20: b'\x14',
  21: b'\x15',
  22: b'\x16',
  23: b'\x17',
  24: b'\x18',
  25: b'\x19',
  26: b'\x1a',
  27: b'\x1b',
  28: b'\x1c',
  29: b'\x1d',
  30: b'\x1e',
  31: b'\x1f',
  32: b' ',
  33: b'!',
  34: b'"',
  35: b'#',
  36: b'$',
  37: b'%',
  38: b'&',
  39: b"'",
  40: b'(',
  41: b')',
  42: b'*',
  43: b'+',
  44: b',',
  45: b'-',
  46: b'.',
  47: b'/',
  48: b'0',
  49: b'1',
  50: b'2',
  51: b'3',
  52: b'4',
  53: b'5',
  54: b'6',
  55: b'7',
  56: b'8',
  57: b'9',
  58: b':',
  59: b';',
  60: b'<',
  61: b'=',
  62: b'>',
  63: b'?',
  64: b'@',
  65: b'A',
  66: b'B',
  67: b'C',
  68: b'D',
  69: b'E',
  70: b'F',
  71: b'G',
  72: b'H',
  73: b'I',
  74: b'J',
  75: b'K',
  76: b'

Restarted cs336_basics (Python 3.10.16)

In [1]:
import regex
from tqdm import tqdm
from datetime import datetime
import time
from cs336_basics.utils import GPT2_PRETOKENIZER_PATTERN
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
from collections import Counter

def get_pair_freq_table(pertokens_freq: dict):
    """Tally adjacent-token pairs over all pre-tokens, weighted by frequency.

    Args:
        pertokens_freq: Maps each pre-token (a sequence of tokens) to the
            number of times it occurs.

    Returns:
        A ``Counter`` keyed by ``(left, right)`` pairs whose value is the sum
        of the frequencies of every pre-token occurrence of that pair.
    """
    counts = Counter()
    for symbols, weight in pertokens_freq.items():
        # Pair each token with its successor; sequences shorter than two
        # tokens contribute nothing.
        for left, right in zip(symbols, symbols[1:]):
            counts[(left, right)] += weight
    return counts

def _merge_and_update_freq_table(
    pertokens_freq: dict,
    pair_freq_table,
    most_freq_pair: tuple[bytes, bytes],
    new_token_bytes: int,
) -> None:

    p0, p1 = most_freq_pair
    new_pertokens_freq = {}
    for pertoken, freq in pertokens_freq.items():
        i = 0
        new_pre_token = list(pertoken)
        while i < len(new_pre_token) - 1:
            if pertoken[i] == p0 or pertoken[i + 1] == p1:
                left  = pertoken[i - 1] if i > 0 else None
                right = pertoken[i + 2] if i + 2 < len(pertoken) else None

                pair_freq_table[most_freq_pair] -= freq
                if left  is not None:
                    pair_freq_table[(left,  p0)] -= freq
                if right is not None:
                    pair_freq_table[(p1, right)] -= freq

                new_pre_token[i : i + 2] = [b"".join([new_token_bytes])]

                if left  is not None:
                    pair_freq_table[(left, new_token_bytes)] = pair_freq_table.get((left, new_token_bytes), 0) + freq
                if right is not None:
                    pair_freq_table[(new_token_bytes, right)] = pair_freq_table.get((new_token_bytes, right), 0) + freq
            
            i += 1
            if i >= len(new_pre_token) - 1:
                new_pertokens_freq[tuple(new_pre_token)] = freq
    return new_pertokens_freq


def train_bpe(
    input_path: str,
    vocab_size: int,
    special_tokens: list[str]
) -> tuple[dict[int, bytes], list[tuple[bytes, bytes]]]:
    """Train a byte-level BPE vocabulary on a UTF-8 text file.

    Args:
        input_path: Path of the training text, read as UTF-8.
        vocab_size: Target number of vocabulary entries (256 byte tokens +
            special tokens + learned merges).
        special_tokens: Strings added to the vocabulary verbatim. They act as
            hard boundaries in the text and never take part in merges.

    Returns:
        ``(vocab, merges)``: ``vocab`` maps token id to token bytes and
        ``merges`` lists the merged pairs in the order they were learned.
        Training stops early once no pair with a positive count remains.
    """
    vocab = {idx: bytes([idx]) for idx in range(256)}

    for i, special_token in enumerate(special_tokens):
        vocab[256 + i] = special_token.encode("utf-8")

    with open(input_path, encoding="utf-8") as f:
        text = f.read()

    # Split on special tokens rather than deleting them: deletion glues the
    # text on both sides together, so a pre-token could span the boundary.
    # Longest first so a token that contains another one wins the alternation.
    delimiters = sorted((t for t in special_tokens if t), key=len, reverse=True)
    if delimiters:
        chunks = regex.split("|".join(regex.escape(t) for t in delimiters), text)
    else:
        chunks = [text]

    pretoknes = Counter()
    for chunk in chunks:
        pretoknes.update(regex.findall(GPT2_PRETOKENIZER_PATTERN, chunk))

    # Represent every pre-token as a tuple of single-byte tokens.
    pertokens_freq = {
        tuple(bytes([b]) for b in pertoken.encode("utf-8")): freq
        for pertoken, freq in pretoknes.items()
    }

    pair_freq_table = get_pair_freq_table(pertokens_freq)

    merges = []
    total_merging = max(0, vocab_size - len(vocab))
    with tqdm(total=total_merging, desc="BPE Merges") as pbar:
        while len(vocab) < vocab_size and pair_freq_table:
            # Highest count wins; ties go to the lexicographically greater pair.
            most_freq_pair = max(pair_freq_table,
                key=lambda p: (pair_freq_table[p], p))
            if pair_freq_table[most_freq_pair] <= 0:
                # Only exhausted pairs are left; merging them would add
                # tokens that never occur in the corpus.
                break
            merges.append(most_freq_pair)
            new_token_bytes = b"".join(most_freq_pair)
            # Ids are contiguous from 0, so the next free id is len(vocab).
            vocab[len(vocab)] = new_token_bytes
            pertokens_freq = _merge_and_update_freq_table(
                pertokens_freq, pair_freq_table, most_freq_pair, new_token_bytes
            )

            pbar.update(1)

    return vocab, merges

In [2]:
# Run the trainer on a local sample file with a target vocabulary of 1000.
# NOTE(review): input_path is an absolute, machine-specific path — consider
# deriving it from a configurable data directory.
train_bpe(
    input_path="/Users/ameefaour/Desktop/CS336_LLM_from_scratch/spring2024-assignment1-basics/data/test.txt",
    vocab_size=1000,
    special_tokens=["<|endoftext|>"]
)

BPE Merges:   0%|          | 0/743 [00:00<?, ?it/s]

Restarted cs336_basics (Python 3.10.16)

In [1]:
import regex
from tqdm import tqdm
from datetime import datetime
import time
from cs336_basics.utils import GPT2_PRETOKENIZER_PATTERN
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
from collections import Counter

def get_pair_freq_table(pertokens_freq: dict):
    """Tally adjacent-token pairs over all pre-tokens, weighted by frequency.

    Args:
        pertokens_freq: Maps each pre-token (a sequence of tokens) to the
            number of times it occurs.

    Returns:
        A ``Counter`` keyed by ``(left, right)`` pairs whose value is the sum
        of the frequencies of every pre-token occurrence of that pair.
    """
    counts = Counter()
    for symbols, weight in pertokens_freq.items():
        # Pair each token with its successor; sequences shorter than two
        # tokens contribute nothing.
        for left, right in zip(symbols, symbols[1:]):
            counts[(left, right)] += weight
    return counts

def _merge_and_update_freq_table(
    pertokens_freq: dict,
    pair_freq_table,
    most_freq_pair: tuple[bytes, bytes],
    new_token_bytes: int,
) -> None:

    p0, p1 = most_freq_pair
    new_pertokens_freq = {}
    for pertoken, freq in pertokens_freq.items():
        i = 0
        new_pre_token = list(pertoken)
        while i < len(new_pre_token) - 1:
            if pertoken[i] == p0 and pertoken[i + 1] == p1:
                left  = pertoken[i - 1] if i > 0 else None
                right = pertoken[i + 2] if i + 2 < len(pertoken) else None

                pair_freq_table[most_freq_pair] -= freq
                if left  is not None:
                    pair_freq_table[(left,  p0)] -= freq
                if right is not None:
                    pair_freq_table[(p1, right)] -= freq

                new_pre_token[i : i + 2] = [b"".join([new_token_bytes])]

                if left  is not None:
                    pair_freq_table[(left, new_token_bytes)] = pair_freq_table.get((left, new_token_bytes), 0) + freq
                if right is not None:
                    pair_freq_table[(new_token_bytes, right)] = pair_freq_table.get((new_token_bytes, right), 0) + freq
            
            i += 1
            if i >= len(new_pre_token) - 1:
                new_pertokens_freq[tuple(new_pre_token)] = freq
    return new_pertokens_freq


def train_bpe(
    input_path: str,
    vocab_size: int,
    special_tokens: list[str]
) -> tuple[dict[int, bytes], list[tuple[bytes, bytes]]]:
    """Train a byte-level BPE vocabulary on a UTF-8 text file.

    Args:
        input_path: Path of the training text, read as UTF-8.
        vocab_size: Target number of vocabulary entries (256 byte tokens +
            special tokens + learned merges).
        special_tokens: Strings added to the vocabulary verbatim. They act as
            hard boundaries in the text and never take part in merges.

    Returns:
        ``(vocab, merges)``: ``vocab`` maps token id to token bytes and
        ``merges`` lists the merged pairs in the order they were learned.
        Training stops early once no pair with a positive count remains.
    """
    vocab = {idx: bytes([idx]) for idx in range(256)}

    for i, special_token in enumerate(special_tokens):
        vocab[256 + i] = special_token.encode("utf-8")

    with open(input_path, encoding="utf-8") as f:
        text = f.read()

    # Split on special tokens rather than deleting them: deletion glues the
    # text on both sides together, so a pre-token could span the boundary.
    # Longest first so a token that contains another one wins the alternation.
    delimiters = sorted((t for t in special_tokens if t), key=len, reverse=True)
    if delimiters:
        chunks = regex.split("|".join(regex.escape(t) for t in delimiters), text)
    else:
        chunks = [text]

    pretoknes = Counter()
    for chunk in chunks:
        pretoknes.update(regex.findall(GPT2_PRETOKENIZER_PATTERN, chunk))

    # Represent every pre-token as a tuple of single-byte tokens.
    pertokens_freq = {
        tuple(bytes([b]) for b in pertoken.encode("utf-8")): freq
        for pertoken, freq in pretoknes.items()
    }

    pair_freq_table = get_pair_freq_table(pertokens_freq)

    merges = []
    total_merging = max(0, vocab_size - len(vocab))
    with tqdm(total=total_merging, desc="BPE Merges") as pbar:
        while len(vocab) < vocab_size and pair_freq_table:
            # Highest count wins; ties go to the lexicographically greater pair.
            most_freq_pair = max(pair_freq_table,
                key=lambda p: (pair_freq_table[p], p))
            if pair_freq_table[most_freq_pair] <= 0:
                # Only exhausted pairs are left; merging them would add
                # tokens that never occur in the corpus.
                break
            merges.append(most_freq_pair)
            new_token_bytes = b"".join(most_freq_pair)
            # Ids are contiguous from 0, so the next free id is len(vocab).
            vocab[len(vocab)] = new_token_bytes
            pertokens_freq = _merge_and_update_freq_table(
                pertokens_freq, pair_freq_table, most_freq_pair, new_token_bytes
            )

            pbar.update(1)

    return vocab, merges

In [2]:
# Run the trainer on a local sample file with a target vocabulary of 1000.
# NOTE(review): input_path is an absolute, machine-specific path — consider
# deriving it from a configurable data directory.
train_bpe(
    input_path="/Users/ameefaour/Desktop/CS336_LLM_from_scratch/spring2024-assignment1-basics/data/test.txt",
    vocab_size=1000,
    special_tokens=["<|endoftext|>"]
)

BPE Merges: 100%|██████████| 743/743 [03:28<00:00,  3.56it/s]  


({0: b'\x00',
  1: b'\x01',
  2: b'\x02',
  3: b'\x03',
  4: b'\x04',
  5: b'\x05',
  6: b'\x06',
  7: b'\x07',
  8: b'\x08',
  9: b'\t',
  10: b'\n',
  11: b'\x0b',
  12: b'\x0c',
  13: b'\r',
  14: b'\x0e',
  15: b'\x0f',
  16: b'\x10',
  17: b'\x11',
  18: b'\x12',
  19: b'\x13',
  20: b'\x14',
  21: b'\x15',
  22: b'\x16',
  23: b'\x17',
  24: b'\x18',
  25: b'\x19',
  26: b'\x1a',
  27: b'\x1b',
  28: b'\x1c',
  29: b'\x1d',
  30: b'\x1e',
  31: b'\x1f',
  32: b' ',
  33: b'!',
  34: b'"',
  35: b'#',
  36: b'$',
  37: b'%',
  38: b'&',
  39: b"'",
  40: b'(',
  41: b')',
  42: b'*',
  43: b'+',
  44: b',',
  45: b'-',
  46: b'.',
  47: b'/',
  48: b'0',
  49: b'1',
  50: b'2',
  51: b'3',
  52: b'4',
  53: b'5',
  54: b'6',
  55: b'7',
  56: b'8',
  57: b'9',
  58: b':',
  59: b';',
  60: b'<',
  61: b'=',
  62: b'>',
  63: b'?',
  64: b'@',
  65: b'A',
  66: b'B',
  67: b'C',
  68: b'D',
  69: b'E',
  70: b'F',
  71: b'G',
  72: b'H',
  73: b'I',
  74: b'J',
  75: b'K',
  76: b'