In [11]:
from pathlib import Path
import re
from tqdm.notebook import tqdm

# Location of the local TinyStories training file, relative to the notebook's
# working directory. The previous setting had no leading ".." component:
#   Path("data/raw/TinyStories/TinyStories-train.txt")
DATA_PATH = Path("..", "data", "raw", "TinyStories", "TinyStories-train.txt")


# Story separator: the <|endoftext|> token (matched case-insensitively) together
# with any whitespace directly around it, so splitting also removes that padding.
DELIM_RE = re.compile(r"\s*<\|endoftext\|\>\s*", re.IGNORECASE)


In [5]:
# Stream the huge .txt file chunk by chunk (1024 * 1024 characters, roughly 1 MB, per read) instead of loading it all into memory

def stream_stories(path: Path, chunk_size: int = 1024 * 1024):
    """
    Lazily iterate over the stories stored in ``path``.

    The file is read ``chunk_size`` characters at a time and cut on the
    <|endoftext|> delimiter (``DELIM_RE``), so the whole file is never held
    in memory at once. Every story is yielded with surrounding whitespace
    stripped; segments that are empty after stripping are skipped.

    The file is opened as UTF-8 with ``errors="ignore"``, so bytes that
    cannot be decoded are dropped silently.
    """
    pending = ""  # text after the most recent delimiter, possibly an unfinished story
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        # read() returns "" at end of file, which is the sentinel that stops iter()
        for block in iter(lambda: handle.read(chunk_size), ""):
            # Every piece except the last is a finished story. The last piece
            # may be cut mid-story (or mid-delimiter), so it is carried over
            # and re-split together with the next block.
            *finished, pending = DELIM_RE.split(pending + block)
            for candidate in finished:
                text = candidate.strip()
                if text:
                    yield text
        # End of file: whatever is still pending is the final story, if any.
        leftover = pending.strip()
        if leftover:
            yield leftover


In [6]:
# Quick sanity check: print a preview of the first 3 stories

# zip() with range(3) ends the loop after three items, so the generator is
# only advanced three times rather than run over the whole file.
for i, story in zip(range(3), stream_stories(DATA_PATH)):
    print(f"--- Story {i} ---")
    # Show at most the first 400 characters; the "..." marker is appended
    # unconditionally, even when the story is shorter than that.
    print(story[:400], "...\n")


--- Story 0 ---
One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.
Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."
Toge ...

--- Story 1 ---
Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.
One day, Beep was driving in the park when he saw a big tree. The tree had many leaves that were falling. Beep liked how the leaves fall and wanted to play with them. Beep drove under the tree and watched the l ...

--- Story 2 ---
One day, a little fish named Fin was swimming near the shore. He saw a big crab and wanted to be friends. "Hi, I am Fin. Do you want to play

In [13]:
# Full pass over the file to count the stories. enumerate(start=1) leaves the
# running total in `count`; the initial 0 covers the case of no stories at all.
count = 0
for count, _ in enumerate(tqdm(stream_stories(DATA_PATH), desc="Counting stories"), start=1):
    pass
print(f"Total number of stories: {count:,}")


Counting stories: 0it [00:00, ?it/s]

Total number of stories: 2,119,489


In [None]:
# Counting <|endoftext|> markers in the file to check for integrity

def count_delims(path: Path, needle="<|endoftext|>", chunk_size: int = 1024 * 1024):
    """
    Count case-insensitive, non-overlapping occurrences of ``needle`` in a file.

    The file is streamed ``chunk_size`` characters at a time. The unmatched
    tail of each read is carried into the next one, so a marker that is split
    across two reads is still counted (counting every chunk in isolation
    silently misses those).

    Args:
        path: Text file to scan. It is decoded as UTF-8 with
            ``errors="ignore"``, so undecodable bytes are dropped.
        needle: Marker to look for. It is lower-cased before matching, just
            like the file content, so its casing does not matter.
        chunk_size: Number of characters to read per iteration (>= 1).

    Returns:
        The number of markers found.

    Raises:
        ValueError: If ``needle`` is empty or ``chunk_size`` is smaller than 1.
    """
    needle = needle.lower()  # the content is lower-cased too; compare like with like
    if not needle:
        raise ValueError("needle must be a non-empty string")
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # A marker cut by a read boundary has at most len(needle) - 1 characters
    # in the earlier read, so that is the most that ever needs to be kept.
    keep = len(needle) - 1
    total = 0
    carry = ""
    with path.open("r", encoding="utf-8", errors="ignore") as f:
        for chunk in iter(lambda: f.read(chunk_size), ""):
            window = carry + chunk.lower()
            pos = 0
            while True:
                hit = window.find(needle, pos)
                if hit == -1:
                    break
                total += 1
                pos = hit + len(needle)
            # Keep only text that could still start a marker, and never text
            # already consumed by a counted match (prevents double counting).
            carry = window[max(pos, len(window) - keep):]
    return total

# Run the raw marker count over the full training file and report it.
# stream_stories() skips empty segments, so this number need not match the
# story count one-to-one.
raw_delims = count_delims(DATA_PATH)
print(f"Raw <|endoftext|> markers: {raw_delims:,}")


Raw <|endoftext|> markers: 2,119,697
