In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
# sentence does seem like a rough abstraction, and is common.
# we have things like nested quotes

In [2]:
# let's load tinystories for comparison
#
# note: `datasets` can list datasets but is deprecated
import huggingface_hub

# from https://huggingface.co/docs/huggingface_hub/en/guides/download#from-latest-version
import dataclasses
from typing import Callable
import pathlib


@dataclasses.dataclass(frozen=True)
class TrainAndVal[T]:
    """Helper for common pattern of transforming both train and val."""

    train: T
    val: T

    def apply[R](self, func: Callable[[T], R]) -> 'TrainAndVal[R]':
        return dataclasses.replace(self,
            train=func(self.train),
            val=func(self.val),
        )

def download_file_from_tinystories(filename: str) -> pathlib.Path:
    """Fetch one file from the `roneneldan/TinyStories` dataset repository.

    Progress is reported on stdout before and after the download.

    Args:
        filename: Name of the file inside the dataset repository.

    Returns:
        The location reported by `huggingface_hub.hf_hub_download`, as a
        `pathlib.Path`.
    """
    print(f"Downloading {filename}...")

    request = dict(
        repo_id='roneneldan/TinyStories',
        filename=filename,
        repo_type="dataset",
    )
    downloaded_to = huggingface_hub.hf_hub_download(**request)

    print(f"Downloaded {filename} to {downloaded_to}")
    return pathlib.Path(downloaded_to)

# original in paper
# train_filename, val_filename = 'TinyStories-train.txt', 'TinyStories-valid.txt'

# GPT-4 only, significantly larger but newer
filenames = TrainAndVal(
    train='TinyStoriesV2-GPT4-train.txt',
    val='TinyStoriesV2-GPT4-valid.txt',
)

# download both splits; each entry becomes the path of the fetched file
filepaths = filenames.apply(download_file_from_tinystories)

Downloading TinyStoriesV2-GPT4-train.txt...
Downloaded TinyStoriesV2-GPT4-train.txt to /Users/bronsonschoen/.cache/huggingface/hub/datasets--roneneldan--TinyStories/snapshots/f54c09fd23315a6f9c86f9dc80f725de7d8f9c64/TinyStoriesV2-GPT4-train.txt
Downloading TinyStoriesV2-GPT4-valid.txt...
Downloaded TinyStoriesV2-GPT4-valid.txt to /Users/bronsonschoen/.cache/huggingface/hub/datasets--roneneldan--TinyStories/snapshots/f54c09fd23315a6f9c86f9dc80f725de7d8f9c64/TinyStoriesV2-GPT4-valid.txt


# Sentencepiece

In [3]:
# Example notebook: https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb
#
# - all options: https://github.com/google/sentencepiece/blob/master/doc/options.md
#   - where is the proto that defines this that Karpathy was looking at?
#
# - actual protos with documentation:
#   - https://github.com/google/sentencepiece/blob/master/src/sentencepiece.proto
#   - https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
#
# - working example on tinystories here: https://github.com/karpathy/llama2.c/blob/master/tinystories.py

!pip install sentencepiece

import sentencepiece as spm



In [4]:
# write a toy.txt file with some random text
with open("toy.txt", "w", encoding="utf-8") as f:
  # adjacent string literals are joined at compile time, so the file still
  # receives one single line of text
  f.write(
    "SentencePiece is an unsupervised text tokenizer and detokenizer mainly for "
    "Neural Network-based text generation systems where the vocabulary size is "
    "predetermined prior to the neural model training. "
    "SentencePiece implements subword units (e.g., byte-pair-encoding (BPE) "
    "[Sennrich et al.]) and unigram language model [Kudo.]) with the extension "
    "of direct training from raw sentences. "
    "SentencePiece allows us to make a purely end-to-end system that does not "
    "depend on language-specific pre/postprocessing."
  )

In [13]:
# train a sentencepiece model on it
# the settings here are (best effort) those used for training Llama 2
import os

vocab_size = 400
model_prefix = f"tok{vocab_size}" # output filename prefix

# BPE is the active algorithm; `model_type="word"` is kept below as a
# commented-out alternative
options = dict(
  # input spec
  input="toy.txt",
  input_format="text",
  # output spec
  model_prefix=model_prefix,
  # algorithm spec
  # BPE alg
  model_type="bpe",
  # model_type="word",
  vocab_size=vocab_size,
  # normalization
  normalization_rule_name="identity", # ew, turn off normalization
  remove_extra_whitespaces=False,
  input_sentence_size=200000000, # max number of training sentences
  max_sentence_length=4192, # max number of bytes per sentence
  seed_sentencepiece_size=1000000,
  shuffle_input_sentence=True,
  # rare word treatment
  character_coverage=0.99995,
  byte_fallback=True,
  # merge rules
  split_digits=True,
  split_by_unicode_script=True,
  split_by_whitespace=True,
  split_by_number=True,
  max_sentencepiece_length=16,
  add_dummy_prefix=True,
  allow_whitespace_only_pieces=True,
  # special tokens
  unk_id=0, # the UNK token MUST exist
  bos_id=1, # the others are optional, set to -1 to turn off
  eos_id=2,
  pad_id=-1,
  # systems
  # use (half) system resources; os.cpu_count() may return None and a
  # single-core machine would halve to 0, so fall back to / clamp at 1 thread
  num_threads=max(1, (os.cpu_count() or 1) // 2),
)

spm.SentencePieceTrainer.train(**options)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: toy.txt
  input_format: text
  model_prefix: tok400
  model_type: BPE
  vocab_size: 400
  self_test_sample_size: 0
  character_coverage: 0.99995
  input_sentence_size: 200000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 4
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 1
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  diffe

In [14]:
# load the model written by the training cell
sp = spm.SentencePieceProcessor()
sp.load(model_prefix + '.model')

# one [piece, id] pair for every id known to the model
vocab = [
    [sp.id_to_piece(piece_id), piece_id]
    for piece_id in range(sp.get_piece_size())
]
vocab

[['<unk>', 0],
 ['<s>', 1],
 ['</s>', 2],
 ['<0x00>', 3],
 ['<0x01>', 4],
 ['<0x02>', 5],
 ['<0x03>', 6],
 ['<0x04>', 7],
 ['<0x05>', 8],
 ['<0x06>', 9],
 ['<0x07>', 10],
 ['<0x08>', 11],
 ['<0x09>', 12],
 ['<0x0A>', 13],
 ['<0x0B>', 14],
 ['<0x0C>', 15],
 ['<0x0D>', 16],
 ['<0x0E>', 17],
 ['<0x0F>', 18],
 ['<0x10>', 19],
 ['<0x11>', 20],
 ['<0x12>', 21],
 ['<0x13>', 22],
 ['<0x14>', 23],
 ['<0x15>', 24],
 ['<0x16>', 25],
 ['<0x17>', 26],
 ['<0x18>', 27],
 ['<0x19>', 28],
 ['<0x1A>', 29],
 ['<0x1B>', 30],
 ['<0x1C>', 31],
 ['<0x1D>', 32],
 ['<0x1E>', 33],
 ['<0x1F>', 34],
 ['<0x20>', 35],
 ['<0x21>', 36],
 ['<0x22>', 37],
 ['<0x23>', 38],
 ['<0x24>', 39],
 ['<0x25>', 40],
 ['<0x26>', 41],
 ['<0x27>', 42],
 ['<0x28>', 43],
 ['<0x29>', 44],
 ['<0x2A>', 45],
 ['<0x2B>', 46],
 ['<0x2C>', 47],
 ['<0x2D>', 48],
 ['<0x2E>', 49],
 ['<0x2F>', 50],
 ['<0x30>', 51],
 ['<0x31>', 52],
 ['<0x32>', 53],
 ['<0x33>', 54],
 ['<0x34>', 55],
 ['<0x35>', 56],
 ['<0x36>', 57],
 ['<0x37>', 58],
 ['<0x38>', 5

In [20]:
# useful functions for working with tinystories
import math
import pathlib
from typing import Iterable

class SpecialTokens:
    """Namespace for marker strings that appear in the dataset text."""

    # marker used as the delimiter between examples
    EOT: str = "<|endoftext|>"

def closest_power_of_two(n: int) -> int:
    """Return the power of two nearest to `n`.

    When `n` sits exactly halfway between two consecutive powers of two,
    the larger one is returned.
    """
    # Bracket n between two consecutive powers of two.
    exponent = math.floor(math.log2(n))
    below = 2 ** exponent
    above = 2 * below

    # Only a strictly smaller distance selects the lower power, so ties go up.
    if n - below < above - n:
        return below
    return above

def next_power_of_two(n: int) -> int:
    """Return the smallest power of two that is greater than or equal to `n`.

    An exact power of two is returned unchanged.

    Args:
        n: Positive integer.

    Returns:
        The smallest `2 ** k` with `2 ** k >= n`.

    Raises:
        ValueError: If `n` is less than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # Pure integer arithmetic: a float log2 loses precision for large ints
    # and could produce a result smaller than n.
    return 1 << (n - 1).bit_length()

def get_first_n_examples(input_text: str, n: int) -> str:
    """Return the leading part of `input_text` containing its first `n` examples.

    Examples are terminated by `SpecialTokens.EOT`.

    Args:
        input_text: Text made of examples separated by the delimiter.
        n: Number of examples to keep.

    Returns:
        - an empty string when `n` is zero or negative;
        - `input_text` unchanged when it holds fewer than `n` delimiters;
        - otherwise the text up to and including the `n`-th delimiter, with
          leading and trailing whitespace stripped.
    """
    # Asking for no examples yields no text (not a lone delimiter).
    if n <= 0:
        return ""

    delimiter = SpecialTokens.EOT

    # At most n splits are needed; avoids cutting up the whole text.
    pieces = input_text.split(delimiter, n)

    # Fewer than n delimiters were found: everything is requested.
    if len(pieces) <= n:
        return input_text

    kept = delimiter.join(pieces[:n]) + delimiter
    return kept.strip()

def head(filepath: pathlib.Path, n: int, encoding: str = "utf-8") -> Iterable[str]:
    """Yield up to the first `n` lines of a text file, like the `head` command.

    Args:
        filepath: File to read.
        n: Maximum number of lines to yield; nothing is yielded when `n <= 0`.
        encoding: Text encoding used to open the file. Set explicitly so the
            result does not depend on the platform's default encoding.

    Yields:
        Each line with its trailing newline character(s) removed.
    """
    with filepath.open(encoding=encoding) as f:
        # zip stops as soon as either the line budget or the file runs out,
        # and it checks the budget first, so no extra line is read
        for _, line in zip(range(n), f):
            yield line.rstrip('\n')

In [22]:
# grab an arbitrary small subset of TinyStories to test
num_lines = 10_000

def split_into_sentences(text: str) -> Iterable[str]:
    """Break `text` apart at every newline character.

    NOTE(review): despite the name, no sentence-boundary detection happens
    here; each returned item is one line of `text`.
    """
    newline = '\n'
    return text.split(newline)
# peek at the first few lines of the training split
list(head(filepaths.train, n=10))

['',
 'Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was walking through the store when he came across a very special vase. When Ben saw it he was amazed!  ',
 'He said, “Wow, that is a really amazing vase! Can I buy it?” ',
 'The shopkeeper smiled and said, “Of course you can. You can take it home and show all your friends how amazing it is!”',
 "So Ben took the vase home and he was so proud of it! He called his friends over and showed them the amazing vase. All his friends thought the vase was beautiful and couldn't believe how lucky Ben was. ",
 "And that's how Ben found an amazing vase in the store!",
 '<|endoftext|>',
 'Once upon a time, there was a reliable otter named Ollie. He lived in a river with his family. They all loved to play and swim together.',
 'One day, Ollie\'s mom said, "Ollie, hurry and get some fish for dinner!" Ollie swam

In [None]:
# let's try to use a sane one for `toy.txt`

vocab_size = 128
# output filename prefix, derived from the vocabulary size
model_prefix = "tok" + str(vocab_size)

# The maximum sentence length in bytes. Sentences longer than
# `max_sentence_length` are simply ignored, so we actually
# have to set it
#
# max_sentence_length

# need to actually split the dataset into one line per sentence, since that's
# what SentencePiece expects




In [None]:
spm.SentencePieceTrainer.train(
    input="toy.txt",
    # the option is named `input_format` (as in the earlier training cell);
    # `input_type` is not a trainer option
    input_format="text", # one-sentence-per-line text format (default)
    model_prefix=model_prefix,
    # pass the size that `model_prefix` was derived from; otherwise the 8k
    # default is used, which is presumably too large for a corpus as small
    # as toy.txt
    vocab_size=vocab_size,
)