# GPT Tokenizer
This tokenizer follows the video created by [Andrej Karpathy](https://www.youtube.com/watch?v=zduSFxRajkE). <br> <br>
We use the Byte Pair Encoding (BPE) algorithm to turn text into tokens. We start by encoding the text into its UTF-8 byte representation, so every byte is an initial token. Then we find the pair of adjacent tokens that occurs most frequently and replace every occurrence of that pair with a newly minted token that was not used before. We keep repeating this until we are happy with the size of our vocabulary.

In [133]:
#Create sample text (ASCII plus a few emoji so that multi-byte characters are covered)
text = "Hello tokenizer 😄🤪🧐🙎🏿!"


#Encode to UTF-8 bytes; every byte is a number from 0 to 255
#Note that a single visible glyph can be represented by multiple bytes: each Unicode code point takes 1-4 bytes in UTF-8, and one glyph can consist of multiple code points
#For example 🙎🏿 is represented as two code points (a base person plus a skin-tone modifier), which is a total of 8 bytes
utfBytes = text.encode('utf-8')

#Map the bytes to integers; these are our starting tokens
tokens = list(utfBytes)


#len(text) counts code points while len(utfBytes) counts bytes, so the two lengths differ for non-ASCII text
print(f'Text: {text} length: {len(text)} \nBytes: {utfBytes} length: {len(utfBytes)}\nIntegers: {tokens} length: {len(tokens)}\n')

Text: Hello tokenizer 😄🤪🧐🙎🏿! length: 22 
Bytes: b'Hello tokenizer \xf0\x9f\x98\x84\xf0\x9f\xa4\xaa\xf0\x9f\xa7\x90\xf0\x9f\x99\x8e\xf0\x9f\x8f\xbf!' length: 37
Integers: [72, 101, 108, 108, 111, 32, 116, 111, 107, 101, 110, 105, 122, 101, 114, 32, 240, 159, 152, 132, 240, 159, 164, 170, 240, 159, 167, 144, 240, 159, 153, 142, 240, 159, 143, 191, 33] length: 37



In [130]:
tokens[0]

72

In [189]:
#Go through each pair of integers and count how many times they appear
def get_stats(ids):
    """Count how often each pair of adjacent tokens occurs in ``ids``.

    Args:
        ids: Sequence of token ids. It is sliced, so it must support
            slicing (e.g. a list).

    Returns:
        A dict mapping every ``(left, right)`` pair of neighbouring tokens
        to the number of times it appears. Keys are inserted in order of
        first occurrence; the counts are not sorted.
    """
    counts = {}
    #Zipping the sequence with itself shifted by one yields every consecutive pair
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        counts[key] = counts.get(key, 0) + 1
    return counts

In [201]:
#Get the raw stats
stats = get_stats(tokens)

#Print the stats
print(stats)

{(72, 101): 1, (101, 108): 1, (108, 108): 1, (108, 111): 1, (111, 32): 1, (32, 116): 1, (116, 111): 1, (111, 107): 1, (107, 101): 1, (101, 110): 1, (110, 105): 1, (105, 122): 1, (122, 101): 1, (101, 114): 1, (114, 32): 1, (32, 240): 1, (240, 159): 5, (159, 152): 1, (152, 132): 1, (132, 240): 1, (159, 164): 1, (164, 170): 1, (170, 240): 1, (159, 167): 1, (167, 144): 1, (144, 240): 1, (159, 153): 1, (153, 142): 1, (142, 240): 1, (159, 143): 1, (143, 191): 1, (191, 33): 1}


In [196]:
top_pair

((240, 159), 5)

In [202]:
top_pair

(240, 159)

In [204]:
#Pick the pair with the highest count. On ties max() returns the first maximal key it encounters,
#and dicts iterate in insertion order, so the pair that was seen first in the text wins
top_pair = max(stats, key=stats.get)
print(f'Top pair: {top_pair} appears {stats[top_pair]} times')

Top pair: (240, 159) appears 5 times


In [205]:
#Take a list of ids, and a tuple pair, then search and replace that pair with the new idx
def merge_tokens(tokens, pair, new_token):
    """Replace every occurrence of ``pair`` in ``tokens`` with ``new_token``.

    The sequence is scanned left to right and matches do not overlap:
    once a pair has been replaced, scanning resumes after it.

    Args:
        tokens: Sequence of token ids; it is not modified.
        pair: Tuple ``(left, right)`` of adjacent token ids to look for.
        new_token: Token id that is written in place of each match.

    Returns:
        A new list with the replacements applied.
    """
    merged = []
    last = len(tokens) - 1
    pos = 0
    while pos <= last:
        #A pair can only start at a position that still has a right neighbour
        is_match = pos < last and (tokens[pos], tokens[pos + 1]) == pair
        if not is_match:
            #No match here, keep the token as is
            merged.append(tokens[pos])
            pos += 1
            continue
        #Emit the new token once and skip both members of the pair
        merged.append(new_token)
        pos += 2
    return merged

In [206]:
merge_tokens(tokens, top_pair, -42)

[72,
 101,
 108,
 108,
 111,
 32,
 116,
 111,
 107,
 101,
 110,
 105,
 122,
 101,
 114,
 32,
 -42,
 152,
 132,
 -42,
 164,
 170,
 -42,
 167,
 144,
 -42,
 153,
 142,
 -42,
 143,
 191,
 33]

In [207]:
#Create our vocabulary: we start with the original 256 byte values, each one mapped to itself
vocab = {byte_value: byte_value for byte_value in range(256)}

In [157]:
vocab

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 11: 11,
 12: 12,
 13: 13,
 14: 14,
 15: 15,
 16: 16,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 22: 22,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: 29,
 30: 30,
 31: 31,
 32: 32,
 33: 33,
 34: 34,
 35: 35,
 36: 36,
 37: 37,
 38: 38,
 39: 39,
 40: 40,
 41: 41,
 42: 42,
 43: 43,
 44: 44,
 45: 45,
 46: 46,
 47: 47,
 48: 48,
 49: 49,
 50: 50,
 51: 51,
 52: 52,
 53: 53,
 54: 54,
 55: 55,
 56: 56,
 57: 57,
 58: 58,
 59: 59,
 60: 60,
 61: 61,
 62: 62,
 63: 63,
 64: 64,
 65: 65,
 66: 66,
 67: 67,
 68: 68,
 69: 69,
 70: 70,
 71: 71,
 72: 72,
 73: 73,
 74: 74,
 75: 75,
 76: 76,
 77: 77,
 78: 78,
 79: 79,
 80: 80,
 81: 81,
 82: 82,
 83: 83,
 84: 84,
 85: 85,
 86: 86,
 87: 87,
 88: 88,
 89: 89,
 90: 90,
 91: 91,
 92: 92,
 93: 93,
 94: 94,
 95: 95,
 96: 96,
 97: 97,
 98: 98,
 99: 99,
 100: 100,
 101: 101,
 102: 102,
 103: 103,
 104: 104,
 105: 105,
 106: 106,
 107: 107,
 108: 108,
 109: 109,
 110: 110,

In [208]:
#Number of new tokens to mint on top of the base vocabulary
num_merges = 10

#Maps each merged (left, right) pair to its new token id, in the order the merges were learned
merges = {}
idx = max(vocab) #Start from the last known token in our vocabulary
ids = list(tokens) #Copy the list of tokens so that we don't modify the original list
for _ in range(num_merges):
    stats = get_stats(ids)
    if not stats:
        #Fewer than two tokens are left, so there is no pair to merge.
        #Without this check max() would raise ValueError on the empty dict.
        break
    top_pair = max(stats, key=stats.get) #Get the most frequent pair (first seen wins on ties)

    #Mint the next unused token id and replace the pair everywhere
    idx += 1
    print(f'Merging: {top_pair} to the new token: {idx}')
    ids = merge_tokens(ids, top_pair, idx)

    #Save it to the merges dictionary
    merges[top_pair] = idx

    



Merging: (240, 159) to the new token: 256
Merging: (72, 101) to the new token: 257
Merging: (257, 108) to the new token: 258
Merging: (258, 108) to the new token: 259
Merging: (259, 111) to the new token: 260
Merging: (260, 32) to the new token: 261
Merging: (261, 116) to the new token: 262
Merging: (262, 111) to the new token: 263
Merging: (263, 107) to the new token: 264
Merging: (264, 101) to the new token: 265


In [285]:
merges

{(240, 159): 256,
 (72, 101): 257,
 (257, 108): 258,
 (258, 108): 259,
 (259, 111): 260,
 (260, 32): 261,
 (261, 116): 262,
 (262, 111): 263,
 (263, 107): 264,
 (264, 101): 265}

# Create the vocabulary
Create the mapping from each token id (an integer) to the sequence of UTF-8 bytes that the token represents.

In [286]:
#Start with the original bytes as tokens: a byte is 8 bits, which gives 256 possible values
vocab = {}
for byte_value in range(256):
    vocab[byte_value] = bytes([byte_value])
#Walk the merges in the order they were created, so both halves of a pair are already in the vocabulary
for pair, idx in merges.items():
    left, right = pair
    vocab[idx] = vocab[left] + vocab[right] #The new token is the concatenation of the two previously known tokens

In [313]:
vocab

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [314]:
#Compare the length of the original byte-level token list with the token list after merging.
#NOTE: the "compression ratio" printed here is new length / original length, so smaller means more compression.
#This is the inverse of the more common original/new convention.
print(f'Original tokens: {len(tokens)}, New tokens: {len(ids)}, compression ratio: {len(ids)/len(tokens):.4f}\n\n')

Original tokens: 37, New tokens: 23, compression ratio: 0.6216




In [315]:
def encode(text, merges):
    """Convert ``text`` into token ids by replaying the learned merges.

    Args:
        text: The string to tokenize.
        merges: Dict mapping ``(left, right)`` token pairs to the merged
            token id. It is iterated in insertion order, so the entries
            must be in the order the merges were created, from the first
            pair to the last.

    Returns:
        A list of token ids.
    """
    #Bytes --> list of integers, one per UTF-8 byte
    token_ids = [byte for byte in text.encode('utf-8')]

    #Apply one merge after the other, so later merges can build on the tokens produced by earlier ones
    for pair in merges:
        token_ids = merge_tokens(token_ids, pair, merges[pair])

    return token_ids
        
    

In [316]:
encoded = encode(text, merges)

In [310]:
def decode(ids, vocab):
    """Convert a sequence of token ids back into a string.

    Args:
        ids: Iterable of token ids.
        vocab: Dict mapping each token id to the bytes it stands for.

    Returns:
        The decoded text. Byte sequences that are not valid UTF-8 are
        replaced with the Unicode replacement character (U+FFFD).

    Raises:
        KeyError: If an id is not present in ``vocab``.
    """
    #Look up the bytes of every token and concatenate them into one byte string
    raw = b''.join(vocab[token_id] for token_id in ids)
    #Decode only after joining: a single character may be split across several tokens
    return raw.decode('utf-8', errors='replace')

In [311]:
decoded = decode(encoded, vocab)
decoded

'Hello tokenizer 😄🤪🧐🙎🏿!'

In [312]:
decode(encode('Hello tokenizer 😄🤪🧐!!"#Z¤%/()', merges), vocab)

'Hello tokenizer 😄🤪🧐!!"#Z¤%/()'