# Data Prep notebook

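This notebook loads the text files stored under the `data/` folder (laid out as `data/<category>/<owner>/<name>.txt`), cleans them, and builds an inventory DataFrame with one row per file.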

In [5]:
from pathlib import Path
import re
import pandas as pd
from collections import defaultdict

In [6]:

# --- podcast-specific cleaner ---
# Precompiled patterns used by preprocess_text when the category is "podcasts".
# _TS_INLINE: a timestamp anywhere in a line; the leading "h:" / "hh:" group is optional.
# _TS_LINE:   the same timestamp shape, but it must be the only thing on the line
#             (surrounding whitespace allowed).
# _SPEAKER:   a label at the start of a line made of an uppercase letter followed by
#             at least two more uppercase letters / whitespace / "." / "'" / "-",
#             then ":" and at least one whitespace character. A label with nothing
#             after the colon is therefore not matched.
_TS_INLINE = re.compile(r'\b(?:\d{1,2}:)?\d{1,2}:\d{2}\b')   # hh:mm:ss or m:ss or mm:ss
_TS_LINE   = re.compile(r'^\s*(?:\d{1,2}:)?\d{1,2}:\d{2}\s*$') # timestamp-only lines
_SPEAKER   = re.compile(r'^\s*[A-Z][A-Z\s.\'-]{2,}:\s+')       # ALLCAPS NAME:
# Bracketed cue whose text starts with MUSIC, APPLAUSE, LAUGHTER or SFX
# (case-insensitive), together with any whitespace around the brackets.
_BRACKETED = re.compile(r'\s*\[(?:MUSIC|APPLAUSE|LAUGHTER|SFX)[^\]]*\]\s*', re.I)

In [7]:

def preprocess_text(category: str, text: str) -> str:
    """
    Clean raw text according to the category it was filed under.

    For every category, CRLF and lone CR line endings are converted to LF,
    runs of three or more newlines are collapsed to two, and the result is
    stripped of leading/trailing whitespace.

    For 'podcasts' (compared case-insensitively) each line is additionally
    scrubbed of transcript artefacts: timestamp-only lines are dropped;
    bracketed cues, inline timestamps and a leading ALLCAPS speaker label
    are removed; whitespace is collapsed; lines left empty are discarded.
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")

    if category.lower() == "podcasts":
        surviving = []
        for raw_line in unified.split("\n"):
            # e.g. "0:05" or "01:12:33" alone on a line
            if _TS_LINE.match(raw_line) is not None:
                continue
            # [MUSIC PLAYING], [APPLAUSE], ...
            without_cues = _BRACKETED.sub(" ", raw_line)
            # timestamps embedded in the sentence
            without_times = _TS_INLINE.sub(" ", without_cues)
            # "ANDREW HUBERMAN: ..." -> "..."
            without_label = _SPEAKER.sub("", without_times)
            # squeeze whatever whitespace the removals left behind
            compact = re.sub(r'\s+', ' ', without_label).strip()
            if compact:
                surviving.append(compact)
        unified = "\n".join(surviving)

    # light normalisation shared by all categories
    return re.sub(r'\n{3,}', "\n\n", unified).strip()


In [8]:

# --- simple loader that applies the prep step ---
def load_contents_by_structure(
    base_dir: str | Path = "data",
    exts = (".txt",),
    min_words: int = 30,
):
    """
    Load texts from: base_dir/category/owner/name.txt
    Returns: (contents_dict, inventory_df)
      - contents_dict keys: 'category::owner::name'
    """
    base = Path(base_dir)
    contents = {}
    records = []

    for p in base.rglob("*"):
        if not p.is_file() or p.suffix.lower() not in exts:
            continue

        try:
            rel = p.relative_to(base)
        except ValueError:
            continue

        parts = rel.parts
        if len(parts) < 3:
            continue

        category, owner = parts[0], parts[1]
        name = p.stem

        raw = p.read_text(encoding="utf-8", errors="ignore")
        text = preprocess_text(category, raw)
        words = len(text.split())
        kept = words >= min_words
        key = f"{category}::{owner}::{name}"

        if kept:
            contents[key] = text

        records.append({
            "key": key,
            "path": str(p),
            "category": category,
            "owner": owner,
            "name": name,
            "ext": p.suffix.lower(),
            "words": words,
            "kept": kept,
        })

    inv = pd.DataFrame.from_records(records).sort_values(["category", "owner", "name"])
    return contents, inv


In [9]:
# Run the loader over ./data and report how many files reached the word threshold.
contents, inv = load_contents_by_structure(
    base_dir="data",
    exts=(".txt",),
    min_words=30,
)
print(f"Loaded {inv['kept'].sum()} files (of {len(inv)})")

Loaded 3 files (of 3)


In [10]:
# Show the inventory: one row per scanned file, with its word count and whether it was kept.
inv

Unnamed: 0,key,path,category,owner,name,ext,words,kept
0,podcasts::huberman::how_to_build_immense_inner...,data/podcasts/huberman/how_to_build_immense_in...,podcasts,huberman,how_to_build_immense_inner_strength_David_Goggins,.txt,27446,True
2,songs::taylor_swift::shake_it_off,data/songs/taylor_swift/shake_it_off.txt,songs,taylor_swift,shake_it_off,.txt,560,True
1,songs::taylor_swift::the_fate_of_ophelia,data/songs/taylor_swift/the_fate_of_ophelia.txt,songs,taylor_swift,the_fate_of_ophelia,.txt,432,True


## Compare with the `data_loader.py` module

In [16]:
import importlib

import data_loader  # the module object is needed for reload() below

# Pick up edits made to data_loader.py without restarting the kernel, then
# bind load_data from the freshly reloaded module (a name imported before
# the reload would still point at the old function).
importlib.reload(data_loader)
from data_loader import load_data

In [17]:
# Run the module's loader with its default arguments; the cells below read
# `.contents` and `.inventory` from the returned object.
result = load_data()

In [18]:
# NOTE(review): this renders the full text of every loaded file (the podcast
# transcript alone is ~27k words per the inventory); consider showing a short
# slice per key instead.
result.contents

{'podcasts::huberman::how_to_build_immense_inner_strength_David_Goggins': 'David Goggins\nWelcome to the Huberman Lab podcast where we discuss science and science-based tools for everyday life.\nI\'m Andrew Huberman, and I\'m a professor\nof neurobiology and ophthalmology at Stanford School of Medicine. My guest today is David Goggins.\nDavid Goggins is a retired Navy SEAL who served in Iraq and Afghanistan. He\'s also a highly accomplished ultramarathon runner.\nFor those of you that don\'t know, ultramarathons are distances longer than 26 miles and, in David\'s case, often longer\nthan 200 miles. For his achievements in athletics, he has been inducted into the International Sports\nHall of Fame. He also held a Guinness World Record for the most pull-ups completed in 24 hours.\nI should mention that not only was David a decorated Navy SEAL, but he also graduated from Army Ranger School.\nDavid is also a highly successful writer, having authored two books, the first entitled "Can\'t Hu

In [19]:
# Inventory returned by data_loader.load_data, for side-by-side comparison with `inv` above.
result.inventory

Unnamed: 0,key,path,category,owner,name,ext,words,kept
0,podcasts::huberman::how_to_build_immense_inner...,data/podcasts/huberman/how_to_build_immense_in...,podcasts,huberman,how_to_build_immense_inner_strength_David_Goggins,.txt,27446,True
2,songs::taylor_swift::shake_it_off,data/songs/taylor_swift/shake_it_off.txt,songs,taylor_swift,shake_it_off,.txt,560,True
1,songs::taylor_swift::the_fate_of_ophelia,data/songs/taylor_swift/the_fate_of_ophelia.txt,songs,taylor_swift,the_fate_of_ophelia,.txt,432,True


In [20]:
# Summarise the module's result: how many texts were loaded, then the
# inventory rows flagged as kept.
print(f"Loaded {len(result.contents)} files")
kept_mask = result.inventory["kept"]
print(result.inventory.loc[kept_mask])

Loaded 3 files
                                                 key  ...  kept
0  podcasts::huberman::how_to_build_immense_inner...  ...  True
2                  songs::taylor_swift::shake_it_off  ...  True
1           songs::taylor_swift::the_fate_of_ophelia  ...  True

[3 rows x 8 columns]
