In [1]:
from cs336_basics import bpe_tokenizer 
from cs336_basics import Tokenizer
import pickle
from pathlib import Path
dataroot = Path('../data/')

In [None]:
# Train one byte-pair-encoding tokenizer per corpus and pickle the resulting
# (vocab, merges, special_tokens) tuple next to the training text.
special_tokens = ['<|endoftext|>']
root = '../data/'
training_jobs = [
    ('TinyStoriesV2-GPT4-', 10000),
    ('owt_', 32000),
]
for name, vocab_size in training_jobs:
    train_text_path = f'{root}{name}train.txt'
    bpe_pickle_path = f'{root}{name}bpe_data.pkl'
    vocab, merges = bpe_tokenizer(train_text_path, vocab_size, special_tokens)
    # Persist everything needed to rebuild the tokenizer later via Tokenizer(*tuple).
    with open(bpe_pickle_path, 'wb') as f:
        pickle.dump((vocab, merges, special_tokens), f)

In [7]:
def read_docs(file_path, num_docs, seperator):
    """Return the text of the first ``num_docs`` documents in a file.

    Documents are delimited by ``seperator``. The returned text keeps the
    separators that sit between the returned documents but stops right before
    the separator that terminates the last requested document.

    Args:
        file_path: Path of a UTF-8 text file.
        num_docs: Number of documents to return. Values <= 0 yield ``''``.
        seperator: Delimiter string marking the end of a document.

    Returns:
        The concatenated text. If the file holds fewer than ``num_docs``
        separators, the whole file content is returned.
    """
    if num_docs <= 0:
        return ''

    pieces = []
    remaining = num_docs
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            cursor = 0
            # A single line may contain several separators, so keep scanning
            # from the end of the previous hit instead of stopping at the first.
            while True:
                hit = line.find(seperator, cursor)
                if hit == -1:
                    pieces.append(line[cursor:])
                    break
                remaining -= 1
                if remaining <= 0:
                    # Last requested document: keep its text, drop the delimiter.
                    pieces.append(line[cursor:hit])
                    return ''.join(pieces)
                after_hit = hit + len(seperator)
                pieces.append(line[cursor:after_hit])
                cursor = after_hit
    return ''.join(pieces)

# Sample the first 10 documents of each corpus and compare how well each
# trained tokenizer compresses them.
tiny_story_docs = read_docs(dataroot / 'TinyStoriesV2-GPT4-train.txt', 10, '<|endoftext|>')
owt_docs = read_docs(dataroot / 'owt_train.txt', 10, '<|endoftext|>')

# NOTE: pickle.load can execute arbitrary code; only load pickles written by
# this notebook's training cell. Context managers close the handles promptly.
with open(dataroot / 'TinyStoriesV2-GPT4-bpe_data.pkl', 'rb') as f:
    tiny_story_bpe_info = pickle.load(f)
with open(dataroot / 'owt_bpe_data.pkl', 'rb') as f:
    owt_bpe_info = pickle.load(f)
tiny_story_tokenizer = Tokenizer(*tiny_story_bpe_info)
owt_tokenizer = Tokenizer(*owt_bpe_info)

tiny_story_tokens = tiny_story_tokenizer.encode(tiny_story_docs)
owt_tokens = owt_tokenizer.encode(owt_docs)
# The ratio below is characters per token (len() of a str counts code points);
# use len(text.encode('utf-8')) instead if bytes per token is wanted.
print('tiny story compression ratio: ', len(tiny_story_docs) / len(tiny_story_tokens))
print('owt compression ratio: ', len(owt_docs) / len(owt_tokens))

# Cross-domain check: encode the OpenWebText sample with the TinyStories tokenizer.
owt_tokens_with_tiny_story_tokenizer = tiny_story_tokenizer.encode(owt_docs)
print('owt with tiny story tokenizer compression ratio: ', len(owt_docs) / len(owt_tokens_with_tiny_story_tokenizer))

tiny story compression ratio:  4.151898734177215
owt compression ratio:  4.655557208748698
owt with tiny story tokenizer compression ratio:  3.166363084395871


In [2]:
import numpy as np
import os

def tokenize(tokenizer: "Tokenizer", text_path: str, token_path: str) -> None:
    """Encode a text file line by line and store the token ids as raw uint16.

    The output file is a flat, header-less array of native-endian ``uint16``
    values; it can be read back with ``np.fromfile`` or ``np.memmap`` using
    ``dtype=np.uint16``.

    Tokens are streamed straight to the output file, so no capacity has to be
    guessed up front. (Pre-sizing a buffer to ``file_size // 4`` fails for
    empty files and for any text that averages fewer than 4 bytes per token.)

    Args:
        tokenizer: Object exposing ``encode(str)`` returning a sequence of
            integer token ids.
        text_path: UTF-8 text file to encode.
        token_path: Destination file; overwritten if it already exists.

    Raises:
        ValueError: If a token id does not fit in ``uint16``.
    """
    dtype = np.uint16
    id_limit = int(np.iinfo(dtype).max)
    # Informational estimate only (roughly 4 bytes per token); not used for sizing.
    approximate_max_token_count = os.path.getsize(text_path) // 4
    print('writing to ', token_path)
    print('approximate_max_token_count: ', approximate_max_token_count)

    cur_index = 0
    with open(text_path, 'r', encoding='utf-8') as f, open(token_path, 'wb') as out:
        # Each line is encoded independently of its neighbours.
        for line in f:
            token = tokenizer.encode(line)
            if not token:
                continue
            lowest, highest = min(token), max(token)
            if lowest < 0 or highest > id_limit:
                # Casting would wrap silently (or fail obscurely) depending on
                # the NumPy version, corrupting the dataset.
                raise ValueError(
                    f'token id out of range for uint16: min={lowest}, max={highest}'
                )
            out.write(np.asarray(token, dtype=dtype).tobytes())
            cur_index += len(token)

    print('accual_token_count', cur_index)


# Encode the validation and training split of each corpus with the tokenizer
# that was trained on that corpus, writing one .dat file per split.
for dataset_name in ['TinyStoriesV2-GPT4-', 'owt_']:
    # NOTE: pickle.load can execute arbitrary code; only load pickles written
    # by this notebook. The context manager closes the handle right away.
    with open(dataroot / f'{dataset_name}bpe_data.pkl', 'rb') as bpe_file:
        bpe_info = pickle.load(bpe_file)
    tokenizer = Tokenizer(*bpe_info)
    for suffix in ['valid', 'train']:
        text_path = dataroot / f'{dataset_name}{suffix}.txt'
        token_path = dataroot / f'{dataset_name}{suffix}.dat'
        tokenize(tokenizer, text_path, token_path)


writing to  ..\data\TinyStoriesV2-GPT4-valid.dat
approximate_max_token_count:  5625650
accual_token_count 5461210
writing to  ..\data\TinyStoriesV2-GPT4-train.dat
approximate_max_token_count:  556938290
accual_token_count 540796778
writing to  ..\data\owt_valid.dat
approximate_max_token_count:  72499688
accual_token_count 66404546
writing to  ..\data\owt_train.dat
approximate_max_token_count:  2980127764
accual_token_count 2727257893
