In [None]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import cProfile, pstats
import pickle
import numpy as np
from tqdm import tqdm

from ece496b_basics.tokenizer import from_files, Tokenizer

def get_compression_ratio(text: str, indices: list[int]) -> float:
    """Return the bytes-per-token compression ratio of a tokenization.

    Args:
        text: The original string.
        indices: The token ids obtained by tokenizing `text`.

    Returns:
        The UTF-8 byte length of `text` divided by the number of tokens.

    Raises:
        ZeroDivisionError: If `indices` is empty.
    """
    utf8_size = len(text.encode("utf-8"))  # size of the raw text in bytes
    token_count = len(indices)             # size of the tokenized form
    return utf8_size / token_count

# Input/output directories, resolved to absolute paths from the current
# working directory.
DATA_PATH = Path("../data").resolve()
OUTPUT_PATH = Path("outputs").resolve()

# Vocab and merges files for each corpus, grouped per tokenizer.
tinystories_vocab_path = OUTPUT_PATH.joinpath("tinystories_vocab.pkl")
tinystories_merges_path = OUTPUT_PATH.joinpath("tinystories_merges.pkl")
owt_vocab_path = OUTPUT_PATH.joinpath("owt_vocab.pkl")
owt_merges_path = OUTPUT_PATH.joinpath("owt_merges.pkl")

ts_tokenizer = from_files(
    tinystories_vocab_path,
    tinystories_merges_path,
    special_tokens=["<|endoftext|>"],
)
# NOTE(review): unlike ts_tokenizer, no special_tokens are passed here —
# confirm this asymmetry is intentional.
owt_tokenizer = from_files(owt_vocab_path, owt_merges_path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# Create ~10 MB snippets of each training corpus.
SNIPPET_SIZE_HINT = 10 * 1024 * 1024  # 10485760, passed to readlines() as its size hint


def write_snippet(source_path, snippet_path, size_hint=SNIPPET_SIZE_HINT):
    """Copy the leading lines of `source_path` into `snippet_path`.

    Whole lines are read until their cumulative size exceeds `size_hint`, so
    the snippet never ends mid-line and may be slightly larger than the hint.
    Both files are opened explicitly as UTF-8 (the same encoding
    `get_compression_ratio` measures) instead of the platform's locale
    default, so the result does not depend on the machine running the cell.

    Args:
        source_path: Path of the text file to sample from.
        snippet_path: Path of the snippet file to create or overwrite.
        size_hint: Size hint forwarded to `readlines()`.
    """
    with open(source_path, "r", encoding="utf-8") as source:
        leading_lines = source.readlines(size_hint)
    with open(snippet_path, "w", encoding="utf-8") as snippet:
        snippet.writelines(leading_lines)


write_snippet(DATA_PATH / "TinyStoriesV2-GPT4-train.txt", DATA_PATH / "tinystories_snippet.txt")
write_snippet(DATA_PATH / "owt_train.txt", DATA_PATH / "owt_snippet.txt")

In [23]:
# Load both snippets into memory. Decode explicitly as UTF-8 rather than with
# the platform's locale default, so the text (and therefore the byte counts
# used for the compression ratios) is the same on every machine.
tinystories_snippet = (DATA_PATH / "tinystories_snippet.txt").read_text(encoding="utf-8")
owt_snippet = (DATA_PATH / "owt_snippet.txt").read_text(encoding="utf-8")

In [30]:
# Cross-evaluation: run each snippet through both tokenizers.
# Naming scheme: <corpus>_encoded_with<tokenizer>.
ts_encoded_withts, ts_encoded_withowt = (
    tokenizer.encode(tinystories_snippet)
    for tokenizer in (ts_tokenizer, owt_tokenizer)
)

owt_encoded_withts, owt_encoded_withowt = (
    tokenizer.encode(owt_snippet)
    for tokenizer in (ts_tokenizer, owt_tokenizer)
)

In [28]:
# Bytes-per-token ratio for every (corpus, tokenizer) pairing.
ratio_ts_with_ts = get_compression_ratio(tinystories_snippet, ts_encoded_withts)
ratio_ts_with_owt = get_compression_ratio(tinystories_snippet, ts_encoded_withowt)
ratio_owt_with_ts = get_compression_ratio(owt_snippet, owt_encoded_withts)
ratio_owt_with_owt = get_compression_ratio(owt_snippet, owt_encoded_withowt)

print(f"TinyStories encoded with TinyStories: {ratio_ts_with_ts}")
print(f"TinyStories encoded with owt: {ratio_ts_with_owt}")
print(f"owt encoded with TinyStories: {ratio_owt_with_ts}")
print(f"owt encoded with owt: {ratio_owt_with_owt}")

TinyStories encoded with TinyStories: 4.116450125906134
TinyStories encoded with owt: 3.968451185686118
owt encoded with TinyStories: 3.2057830352906818
owt encoded with owt: 4.385997984719181


In [None]:
# Tokenize the full TinyStories training file and save the ids as a uint16 array.
step_size = 100000  # number of lines joined and passed to the tokenizer per batch
uint16_max = np.iinfo(np.uint16).max

# Decode explicitly as UTF-8 instead of relying on the locale default.
with open(DATA_PATH / "TinyStoriesV2-GPT4-train.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Convert every batch to a compact uint16 array right away instead of growing
# one Python list with an int object per token.
token_chunks = []
for idx in tqdm(range(0, len(lines), step_size)):
    encoding = ts_tokenizer.encode("".join(lines[idx:idx+step_size]))
    chunk = np.asarray(encoding, dtype=np.int64)
    # Fail loudly rather than let an out-of-range id wrap around in the cast below.
    if chunk.size and (chunk.min() < 0 or chunk.max() > uint16_max):
        raise ValueError(
            f"token ids in batch starting at line {idx} do not fit in uint16 "
            f"(min={chunk.min()}, max={chunk.max()})"
        )
    token_chunks.append(chunk.astype(np.uint16))
del lines  # release the raw text before building the final array

if token_chunks:
    token_array = np.concatenate(token_chunks)
else:
    # Empty input file: keep the same empty uint16 result the plain
    # np.array([], dtype=np.uint16) construction would give.
    token_array = np.empty(0, dtype=np.uint16)
del token_chunks

with open(OUTPUT_PATH / "tinystories_encoded.npy", "wb") as file:
    np.save(file, token_array)

# Disabled: OpenWebText counterpart of the TinyStories encoding above.
# with open(DATA_PATH / "owt_train.txt", "r") as file:
#     token_ids = []
#     batch = ""
#     lines = file.readlines()
#     step_size = 100000
#     for idx in tqdm(range(0, len(lines), step_size)):
#         encoding = owt_tokenizer.encode("".join(lines[idx:idx+step_size]))
#         token_ids.extend(encoding)
#     token_array = np.array(token_ids, dtype=np.uint16)
#     del lines
#
# with open(OUTPUT_PATH / "owt_encoded.npy", "wb") as file:
#     np.save(file, token_array)