# EraEx: Filter & Prep (Automated + Parallel)

Filters music tracks using library-based automation and builds a `doc_text_music` text field for every track that is kept.
Runs on **Google Colab** (with Drive mount) or locally.

**Automation**:
- **Genres**: NLTK WordNet
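- **Filtering**: vectorized Polars matching on `genre` plus an episode-marker regex on `title` (`guessit` and `joblib` are imported, but the filter cell below does not call them)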

In [29]:
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from pathlib import Path
import os
import sys

# Resolve the project root: the mounted Drive folder on Colab, otherwise the
# local checkout (one level up when the notebook is launched from notebooks/).
try:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_DIR = Path('/content/drive/MyDrive/EraEx')
    print("✓ Running on Google Colab")
except ImportError:
    # google.colab is not importable, so this is a local run.
    PROJECT_DIR = Path.cwd()
    if PROJECT_DIR.name == 'notebooks':
        PROJECT_DIR = PROJECT_DIR.parent
    print(f"✓ Running Locally at {PROJECT_DIR}")

# Years handled by the processing loop (2012 through 2018 inclusive).
YEAR_RANGE = range(2012, 2019)

# Input and output locations, all rooted under data/processed.
PROCESSED_DIR = PROJECT_DIR.joinpath('data', 'processed')
MUSIC_DIR = PROCESSED_DIR.joinpath('music_tracks')
READY_DIR = PROCESSED_DIR.joinpath('music_ready')

# Create the output folders up front so later writes cannot fail on a missing path.
MUSIC_DIR.mkdir(parents=True, exist_ok=True)
READY_DIR.mkdir(parents=True, exist_ok=True)

print(f'Processed Dir: {PROCESSED_DIR}')

✓ Running Locally at c:\Users\Yabuku\Downloads\EraEx
Processed Dir: c:\Users\Yabuku\Downloads\EraEx\data\processed


In [3]:
import re
import gc
import polars as pl
import nltk
from nltk.corpus import wordnet as wn
from guessit import guessit
from joblib import Parallel, delayed
from tqdm.auto import tqdm

# Download the 'wordnet' and 'omw-1.4' NLTK resources before the genre lookup
# below, which reads WordNet through `wn`.
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

def get_automated_genres():
    """Collect lowercase genre names from WordNet's ``musical_style.n.01`` subtree.

    Walks every hyponym reachable from the ``musical_style.n.01`` synset and
    records each lemma name with underscores replaced by spaces and lowercased.

    Returns:
        set[str]: The collected names. If the WordNet lookup raises, the
        failure is printed and an empty set is returned (best-effort; the
        exception is not propagated).
    """
    genres = set()
    try:
        base = wn.synset('musical_style.n.01')
        for syn in base.closure(lambda s: s.hyponyms()):
            for lemma in syn.lemmas():
                genres.add(lemma.name().replace('_', ' ').lower())
    except Exception as exc:
        # Stay best-effort, but report the cause: an empty genre set means the
        # genre match in filter_music_tracks can never hit.
        print(f"  ! WordNet genre lookup failed ({type(exc).__name__}: {exc}); using an empty genre set.")
        return set()
    return genres

# Build the genre vocabulary once when this cell runs; filter_music_tracks
# checks lowercased genre values for membership in this set.
MUSIC_GENRES = get_automated_genres()
print(f"Loaded {len(MUSIC_GENRES)} genres from WordNet.")

Loaded 140 genres from WordNet.


In [4]:
# Number of rows converted to Python dicts per slice in build_doc_texts_chunked.
CHUNK_SIZE = 2_000_000

# Episode markers searched for in lowercased titles: "s<digits>e<digits>",
# "season<digits>", and "episode"/"ep" followed by an optional one of . : # -
# and then digits (whitespace allowed between the pieces).
EPISODE_PATTERN = r"\b(s\d+\s*e\d+|season\s*\d+|episode\s*[\.\:#\-]?\s*\d+|ep\s*[\.\:#\-]?\s*\d+)\b"

def filter_music_tracks(df: pl.DataFrame) -> tuple:
    """Keep rows with a known genre or a title that carries no episode marker.

    A row is dropped only when its lowercased ``genre`` is not in
    ``MUSIC_GENRES`` and its lowercased ``title`` matches ``EPISODE_PATTERN``.
    Null titles and genres are treated as empty strings.

    Args:
        df: Frame with ``title`` and ``genre`` columns.

    Returns:
        tuple: ``(kept, rejected)`` where ``kept`` holds the surviving rows
        and ``rejected`` is always an empty frame with the input schema.
    """
    print(f"  > Filtering {df.height:,} rows (vectorized)...")

    titles = df["title"].fill_null("").str.to_lowercase()
    genres = df["genre"].fill_null("").str.to_lowercase()

    in_vocab = genres.is_in(list(MUSIC_GENRES))
    has_episode = titles.str.contains(EPISODE_PATTERN)
    rejected = ~in_vocab & has_episode

    n_genre = in_vocab.sum()
    n_rejected = rejected.sum()
    n_passed = (~in_vocab & ~has_episode).sum()
    print(f"  > Genre match: {n_genre:,} | Episode rejected: {n_rejected:,} | Passed through: {n_passed:,}")

    # De Morgan form of "genre hit OR not an episode".
    kept = df.filter(~rejected)

    del titles, genres, in_vocab, has_episode, rejected
    gc.collect()

    return kept, pl.DataFrame(schema=df.schema)

In [5]:
# Character cap applied to the URL-stripped vibe and description text in build_doc_text.
MAX_DESC_LENGTH = 400
def normalize_text(text):
    """Return *text* as a string with whitespace runs collapsed and ends trimmed.

    Falsy input (``None``, ``''``, ``0``) yields an empty string.
    """
    raw = str(text) if text else ''
    collapsed = re.sub(r'\s+', ' ', raw)
    return collapsed.strip()
def remove_urls(text):
    """Return *text* as a string with ``http(s)://`` and ``www.`` URLs deleted.

    Only the URL characters are removed; surrounding whitespace is left as is.
    Falsy input (``None``, ``''``, ``0``) yields an empty string.
    """
    source = str(text) if text else ''
    return re.sub(r'https?://\S+|www\.\S+', '', source)
def normalize_tags(tags):
    """Lowercase, split, filter and dedupe a delimited tag string.

    Tags are split on runs of ``, ; | /`` or newline. Entries that are a single
    character or shorter after trimming, or that appear in the junk list, are
    dropped. The remaining tags keep first-seen order and are joined by spaces.

    Args:
        tags: Delimited tag string; a falsy value yields ``''``.

    Returns:
        str: Space-joined unique tags.
    """
    if not tags:
        return ''

    junk = {'soundcloud', 'source', 'iphone', 'android', 'recorder'}
    ordered = {}  # dict keys preserve first-seen order while deduplicating
    for piece in re.split(r'[,;|/\n]+', tags.lower()):
        tag = piece.strip()
        if len(tag) <= 1 or tag in junk:
            continue
        ordered.setdefault(tag, None)
    return ' '.join(ordered)

def build_doc_text(row):
    """Assemble the labelled, space-joined text document for one track.

    Sections are emitted in this order, each only when its field is truthy:
    TITLE, ARTIST, GENRE, TAGS, VIBE, DESC, YEAR.

    Args:
        row: Mapping of column name to value for one track. Missing keys and
            falsy values (including ``None``) are skipped.

    Returns:
        str: The sections joined by single spaces; ``''`` when nothing is set.
    """
    parts = []
    if row.get('title'):
        parts.append(f"TITLE: {normalize_text(row['title'])}")
    if row.get('artist'):
        parts.append(f"ARTIST: {normalize_text(row['artist'])}")
    if row.get('genre'):
        # The value is written twice (presumably to up-weight it — confirm).
        parts.append(f"GENRE: {row['genre']} {row['genre']}")

    # Skip null/empty tag fields so None is never formatted as the text "None",
    # and join with a comma — a delimiter normalize_tags splits on — so the last
    # tag of one field and the first tag of the next stay separate for dedupe
    # and junk filtering.
    tag_fields = [str(value) for value in (row.get('tags'), row.get('inferred_tags')) if value]
    if tag_fields:
        normalized = normalize_tags(','.join(tag_fields))
        if normalized:
            parts.append(f"TAGS: {normalized} {normalized}")

    if row.get('extracted_vibe_text'):
        parts.append(f"VIBE: {remove_urls(row['extracted_vibe_text'])[:MAX_DESC_LENGTH]}")
    if row.get('description'):
        parts.append(f"DESC: {remove_urls(row['description'])[:MAX_DESC_LENGTH]}")
    if row.get('year'):
        parts.append(f"YEAR: {row['year']}")
    return ' '.join(parts)

In [34]:
def build_doc_texts_chunked(df: pl.DataFrame) -> list:
    """Build one ``build_doc_text`` string per row, working slice by slice.

    The frame is walked in slices of at most ``CHUNK_SIZE`` rows so only one
    slice at a time is materialised as Python dicts.

    Args:
        df: Frame whose rows are turned into document texts.

    Returns:
        list: One string per row, in row order.
    """
    n_rows = df.height
    texts = []
    offsets = range(0, n_rows, CHUNK_SIZE)
    for offset in tqdm(offsets, desc="Building doc_text"):
        length = min(CHUNK_SIZE, n_rows - offset)
        records = df.slice(offset, length).to_dicts()
        texts += [build_doc_text(record) for record in records]
        # Drop the dict copies before the next slice is converted.
        del records
        gc.collect()
    return texts

def process_year(year):
    """Filter, dedupe and export one year's tracks with a ``doc_text_music`` column.

    Reads every ``*.parquet`` file under ``PROCESSED_DIR / 'year=<year>'``,
    keeps the rows selected by ``filter_music_tracks``, drops duplicate
    ``permalink_url`` values when that column exists, attaches the text from
    ``build_doc_texts_chunked`` and writes the result to
    ``READY_DIR / 'year=<year>' / 'data.parquet'``.

    Args:
        year: Year whose partition directory should be processed.

    Returns:
        Number of rows written, or ``None`` when the year directory is missing
        or contains no parquet files.
    """
    year_dir = PROCESSED_DIR / f'year={year}'
    if not year_dir.exists():
        return None
    print(f'\nProcessing {year}...')

    parquet_files = [str(f) for f in year_dir.glob('*.parquet')]
    if not parquet_files:
        return None

    try:
        df = pl.scan_parquet(parquet_files).collect()
    except Exception as exc:
        # Any failure of the combined scan lands here; name the cause so a
        # non-schema problem is not mistaken for a schema mismatch.
        print("  > Schema mismatch fallback (loading slow)...")
        print(f"    cause: {type(exc).__name__}")
        dfs = [pl.read_parquet(f) for f in parquet_files]
        df = pl.concat(dfs, how='diagonal')

    before = df.height
    music_df, _ = filter_music_tracks(df)
    del df
    gc.collect()
    print(f'  Filtered: {before:,} -> {music_df.height:,} music tracks')

    if 'permalink_url' in music_df.columns:
        music_df = music_df.unique(subset=['permalink_url'], keep='first')
        print(f'  After dedup: {music_df.height:,} rows')

    doc_texts = build_doc_texts_chunked(music_df)
    # Pin the dtype: an empty list would otherwise produce a non-string column,
    # giving this year's parquet a different schema from the others.
    music_df = music_df.with_columns([pl.Series('doc_text_music', doc_texts, dtype=pl.Utf8)])
    del doc_texts
    gc.collect()

    out_dir = READY_DIR / f'year={year}'
    out_dir.mkdir(parents=True, exist_ok=True)
    music_df.write_parquet(out_dir / 'data.parquet')
    print(f'  Saved: {out_dir / "data.parquet"} ({music_df.height:,} rows)')

    row_count = music_df.height
    del music_df
    gc.collect()
    return row_count

In [6]:
# Process each configured year in order; years that return None (or 0 rows)
# contribute nothing to the grand total.
total = sum(count for count in map(process_year, YEAR_RANGE) if count)
print(f'\nTotal Ready: {total:,}')

NameError: name 'process_year' is not defined