# EraEx: Filter & Prep (Automated + Parallel)

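Filters episode-style (non-music) uploads out of the track data using library-based automation, then builds a text document for each remaining track.
Runs on **Google Colab** (with Google Drive mounted) or locally.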

**Automation**:
- **Genres**: collected from NLTK WordNet
- **Filtering**: `guessit` title parsing (parallelized with `joblib` for speed)

In [None]:
# Install the notebook's third-party dependencies (IPython shell magic; run once per runtime).
!pip install polars zstandard nrclex nltk guessit joblib tqdm

In [None]:
from pathlib import Path
import os
import sys

# Pick the project root: mount Google Drive when the Colab package is
# importable, otherwise fall back to the parent of the current directory.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_DIR = Path('/content/drive/MyDrive/EraEx')
    print("✓ Running on Google Colab")
except ImportError:
    # google.colab could not be imported, so treat this as a local run.
    PROJECT_DIR = Path.cwd().parent
    print("✓ Running Locally")

# Data directories, all under <project>/data/processed.
PROCESSED_DIR = PROJECT_DIR / 'data' / 'processed'
MUSIC_DIR = PROCESSED_DIR / 'music_tracks'
READY_DIR = PROCESSED_DIR / 'music_ready'

# Create both directories up front (no error if they already exist).
# NOTE(review): MUSIC_DIR is created here but is not referenced by the code
# visible in this notebook — confirm it is still needed.
MUSIC_DIR.mkdir(parents=True, exist_ok=True)
READY_DIR.mkdir(parents=True, exist_ok=True)

# Years to process: 2012 through 2018 (range() excludes the end value 2019).
YEAR_RANGE = range(2012, 2019)

print(f'Project Dir: {PROJECT_DIR}')

In [None]:
import re
import polars as pl
import nltk
from nltk.corpus import wordnet as wn
from guessit import guessit
from joblib import Parallel, delayed
from tqdm.auto import tqdm

# Download the 'wordnet' and 'omw-1.4' NLTK corpora; quiet=True silences the
# downloader's progress output.
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

def get_automated_genres():
    """Collect lower-cased genre names from WordNet.

    Walks the transitive hyponyms of the ``musical_style.n.01`` synset and
    gathers every lemma name, with underscores replaced by spaces.

    Returns:
        A set of genre strings. The lookup is best-effort: if anything goes
        wrong (for example the corpus is not available) the failure is
        reported on stdout and an empty set is returned, so the caller can
        keep running.
    """
    try:
        base = wn.synset('musical_style.n.01')
        return {
            lemma.name().replace('_', ' ').lower()
            for syn in base.closure(lambda s: s.hyponyms())
            for lemma in syn.lemmas()
        }
    except Exception as exc:
        # Keep the empty-set fallback, but say why: a silent failure would
        # otherwise only show up as a genre count of 0.
        print(f"  ! WordNet genre lookup failed ({type(exc).__name__}: {exc}); "
              "continuing with no genres.")
        return set()

# Build the genre set once when this cell runs.
# NOTE: the count printed below is 0 when the WordNet lookup failed, because
# get_automated_genres() returns an empty set on any exception.
MUSIC_GENRES = get_automated_genres()
print(f"Loaded {len(MUSIC_GENRES)} genres from WordNet.")

In [None]:
def is_music_track(row) -> bool:
    """Return False when the row's title parses like an episode, else True.

    Args:
        row: Mapping for one track. Only the ``title`` key is read; a missing
            or null title is treated as empty.

    Returns:
        ``False`` if ``guessit`` reports the title's type as ``'episode'`` or
        finds a ``season`` / ``episode`` property in it; ``True`` otherwise,
        including for blank titles.
    """
    # Null values arrive as None; coerce them to '' so they are not parsed as
    # the literal string "none".
    title = str(row.get('title') or '').lower()
    if not title.strip():
        # Nothing to parse means no episode evidence; skip the slow guessit call.
        return True

    parsed = guessit(title)
    if parsed.get('type') == 'episode' or 'season' in parsed or 'episode' in parsed:
        return False

    # NOTE(review): the former WordNet genre check returned True on both of
    # its branches, so it never changed the verdict and was removed. If a
    # genre match is meant to rescue or reject tracks, add that rule here
    # explicitly.
    return True

def filter_music_tracks_parallel(df: pl.DataFrame) -> tuple:
    """Split *df* into music and non-music rows using ``is_music_track``.

    Every row is converted to a dict and judged by ``is_music_track`` through
    joblib, using all available cores.

    Args:
        df: Frame with one track per row.

    Returns:
        ``(music_df, other_df)``: the rows judged to be music and the rest.
        Both keep the schema and relative row order of *df*.
    """
    print("  > Starting parallel filtering (this takes time for guessit)...")
    rows = df.to_dicts()

    # n_jobs=-1 uses all cores; lower it if memory becomes a problem.
    verdicts = Parallel(n_jobs=-1, batch_size=1000)(
        delayed(is_music_track)(row) for row in tqdm(rows, desc="Filtering", unit="row")
    )

    # Select rows with a boolean mask rather than rebuilding two frames from
    # Python dicts: the columns never leave polars, so dtypes are preserved
    # as-is and the extra per-row copies are avoided. The explicit dtype keeps
    # the mask boolean even when the input frame is empty.
    keep = pl.Series('keep', verdicts, dtype=pl.Boolean)
    return df.filter(keep), df.filter(~keep)

In [None]:
# Maximum number of characters kept from each free-text field (vibe text and
# description) when build_doc_text() assembles a document.
MAX_DESC_LENGTH = 400
def normalize_text(text):
    """Collapse each whitespace run in *text* to one space and trim both ends.

    Falsy input (``None``, ``''``, ``0``) yields ``''``; any other value is
    converted with ``str()`` first.
    """
    raw = str(text) if text else ''
    collapsed = re.sub(r'\s+', ' ', raw)
    return collapsed.strip()
def remove_urls(text):
    """Delete ``http://`` / ``https://`` links and ``www.`` addresses from *text*.

    Each match runs up to the next whitespace character. Surrounding
    whitespace is left untouched. Falsy input yields ``''``; any other value
    is converted with ``str()`` first.
    """
    raw = str(text) if text else ''
    return re.sub(r'https?://\S+|www\.\S+', '', raw)
def normalize_tags(tags):
    """Turn a delimited tag string into a space-joined list of unique tags.

    The input is lower-cased and split on commas, semicolons, pipes, slashes
    and newlines. Tags of one character or less and known junk tags are
    dropped; the first occurrence of each remaining tag is kept, in order.
    Falsy input yields ``''``.
    """
    if not tags:
        return ''

    junk = {'soundcloud', 'source', 'iphone', 'android', 'recorder'}
    seen = {}  # dict used as an insertion-ordered set
    for piece in re.split(r'[,;|/\n]+', tags.lower()):
        tag = piece.strip()
        if len(tag) > 1 and tag not in junk:
            seen.setdefault(tag, None)
    return ' '.join(seen)

def build_doc_text(row):
    """Assemble the labelled text document for one track.

    Emits, in this order and only for fields with a truthy value:
    ``TITLE``, ``ARTIST``, ``GENRE``, ``TAGS``, ``VIBE``, ``DESC``, ``YEAR``.
    Genre and tags are written twice in their sections. Vibe text and
    description have URLs removed and are cut to ``MAX_DESC_LENGTH``
    characters.

    Args:
        row: Mapping for one track. Missing keys and null/empty values are
            skipped.

    Returns:
        The sections joined by single spaces, or ``''`` when no field is set.
    """
    parts = []
    if row.get('title'):
        parts.append(f"TITLE: {normalize_text(row['title'])}")
    if row.get('artist'):
        parts.append(f"ARTIST: {normalize_text(row['artist'])}")
    if row.get('genre'):
        parts.append(f"GENRE: {row['genre']} {row['genre']}")

    # Only use tag fields that actually hold a value: formatting a null
    # straight into the string would inject the literal word "None" as a tag.
    # Join the fields with a comma, a delimiter normalize_tags() splits on, so
    # the last tag of one field cannot fuse with the first tag of the other
    # and escape de-duplication and junk filtering.
    tag_sources = [str(row[key]) for key in ('tags', 'inferred_tags') if row.get(key)]
    if tag_sources:
        normalized = normalize_tags(', '.join(tag_sources))
        if normalized:
            parts.append(f"TAGS: {normalized} {normalized}")

    if row.get('extracted_vibe_text'):
        parts.append(f"VIBE: {remove_urls(row['extracted_vibe_text'])[:MAX_DESC_LENGTH]}")
    if row.get('description'):
        parts.append(f"DESC: {remove_urls(row['description'])[:MAX_DESC_LENGTH]}")
    if row.get('year'):
        parts.append(f"YEAR: {row['year']}")
    return ' '.join(parts)

In [None]:
def process_year(year):
    """Filter, de-duplicate and document one year's tracks, then write them out.

    Reads every ``*.parquet`` file in ``PROCESSED_DIR/year=<year>``, keeps the
    rows accepted by ``filter_music_tracks_parallel``, drops repeated
    ``permalink_url`` values (when that column exists), adds a
    ``doc_text_music`` column built by ``build_doc_text`` and writes the
    result to ``READY_DIR/year=<year>/data.parquet``.

    Args:
        year: Year used to build the ``year=<year>`` directory names.

    Returns:
        Number of rows written, or ``None`` when the input directory is
        missing or contains no parquet files.
    """
    year_dir = PROCESSED_DIR / f'year={year}'
    if not year_dir.exists():
        return None
    print(f'Processing {year}...')

    # Sort the paths so the load order does not depend on directory listing
    # order; this keeps the rows fed to de-duplication in a repeatable order.
    parquet_files = sorted(str(f) for f in year_dir.glob('*.parquet'))
    if not parquet_files:
        return None

    try:
        df = pl.scan_parquet(parquet_files).collect()
    except Exception as exc:
        # Best-effort fallback: read the files one by one and align their
        # columns. Report the real cause, since not every failure here is a
        # schema mismatch.
        print("  > Schema mismatch fallback (loading slow)...")
        print(f"    cause: {type(exc).__name__}: {exc}")
        dfs = [pl.read_parquet(f) for f in parquet_files]
        df = pl.concat(dfs, how='diagonal')

    before = df.height
    music_df, _ = filter_music_tracks_parallel(df)
    print(f'  Filtered: {before:,} -> {music_df.height:,} music tracks')

    if 'permalink_url' in music_df.columns:
        # NOTE(review): rows whose permalink_url is null are presumably treated
        # as duplicates of each other here — confirm that is intended.
        music_df = music_df.unique(subset=['permalink_url'], keep='first')

    # Text building is done serially; it is usually fast enough.
    doc_texts = [build_doc_text(row) for row in music_df.to_dicts()]
    # Explicit string dtype so the column has the same type even when the
    # year has no rows (an empty list carries no type information).
    music_df = music_df.with_columns(
        [pl.Series('doc_text_music', doc_texts, dtype=pl.Utf8)]
    )

    out_dir = READY_DIR / f'year={year}'
    out_dir.mkdir(parents=True, exist_ok=True)
    music_df.write_parquet(out_dir / 'data.parquet')
    print(f'  Saved: {out_dir / "data.parquet"} ({music_df.height:,} rows)')
    return music_df.height

In [None]:
# Process every configured year in order and add up the rows written; years
# that return None (or 0) contribute nothing to the total.
total = sum(count for count in map(process_year, YEAR_RANGE) if count)
print(f'\nTotal Ready: {total:,}')