# EraEx: Filter & Prep (Colab)

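This notebook filters the processed yearly track data down to music tracks (dropping blocked genres, DJ mixes, podcasts/episodes, interviews, and full-album uploads) and builds the `doc_text_music` field used for embedding.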

**Sources**:
- Wikipedia Music Genres: https://en.wikipedia.org/wiki/List_of_music_genres_and_styles
- Every Noise at Once: https://everynoise.com/
- SoundCloud Categories: https://developers.soundcloud.com/docs/api/reference

In [None]:
# Install the notebook's Python dependencies listed in requirements.txt.
%pip install -r requirements.txt

In [None]:
# Mount Google Drive at /content/drive; the project paths used below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from pathlib import Path
import re
import polars as pl

# Project root on the mounted drive.
PROJECT_DIR = Path('/content/drive/MyDrive/EraEx')

# Everything this notebook reads or writes sits under <project>/data/processed:
#   year=<YYYY>/               per-year input parquet files
#   music_ready/year=<YYYY>/   per-year output with the doc text column added
# MUSIC_DIR is created here but is not referenced again in the cells shown.
PROCESSED_DIR = PROJECT_DIR.joinpath('data', 'processed')
MUSIC_DIR = PROCESSED_DIR.joinpath('music_tracks')
READY_DIR = PROCESSED_DIR.joinpath('music_ready')

# Make sure both directories exist; exist_ok keeps re-runs of this cell harmless.
for _output_dir in (MUSIC_DIR, READY_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)

# Years to process: 2012 through 2018 (the stop value of range() is exclusive).
YEAR_RANGE = range(2012, 2019)

In [None]:
# Genres that mark a track as non-music. Compared against the lower-cased
# genre value, so every entry here must be lower case.
BLOCK_GENRES = {
    'sports',
    'spoken word',
    'spoken words',
}

# Known music genres, grouped by family for readability (a set has no order).
# NOTE(review): not referenced by the cells shown here - confirm where it is used.
MUSIC_GENRES = {
    # electronic / dance
    'electronic', 'house', 'deep house', 'progressive house', 'techno',
    'trance', 'dubstep', 'drum bass', 'dance edm', 'minimal', 'disco',
    # downtempo
    'ambient', 'chillout', 'lo-fi',
    # hip hop / r&b
    'hip hop rap', 'trap', 'r&b soul', 'funk',
    # pop / rock
    'pop', 'rock', 'metal', 'punk', 'indie', 'alternative',
    # roots / traditional
    'jazz', 'blues', 'classical', 'reggae', 'country',
    'folk singer songwriter', 'latin',
}

# Title regexes for non-track uploads, keyed by the reason for exclusion.
# They are written for lower-cased titles.
# NOTE(review): not referenced by the cells shown here - confirm where it is used.
EXCLUDE_PATTERNS = {
    # DJ mixes and mixtapes
    'mix': r'\b(dj\s*mix|mixtape|mix\s*tape|megamix|minimix|set\s*mix)\b',
    # "ep 12", "episode #3", "ep. 4"
    'ep_episode': r'\b(ep|episode)\s*[\.:#-]?\s*\d+',
    # "s01e02", "s 1 e 2"
    'season_episode': r'\bs\s*\d+\s*e\s*\d+',
    'podcast': r'\bpodcast\b',
    'interview': r'\binterview\b',
    'with_guest': r'\bwith\s+guest\b',
    'full_album': r'\bfull\s+album\b',
    'radio_show': r'\bradio\s*(show|program|episode)\b',
    'live_set': r'\blive\s*set\b',
}

In [None]:
def filter_music_tracks(df: pl.DataFrame) -> tuple:
    """Split *df* into music tracks and everything else.

    A row is treated as non-music when any of these holds (title and genre
    are lower-cased first, with nulls treated as empty strings):

    * its genre is one of BLOCK_GENRES;
    * its title looks like a DJ mix / live set and does not contain "remix";
    * its title mentions a podcast or a numbered episode;
    * its title mentions an interview or "with guest";
    * its title contains "full album".

    NOTE(review): the title patterns are hard-coded here and differ slightly
    from the module-level EXCLUDE_PATTERNS - confirm which is authoritative.

    Args:
        df: Frame with at least 'title' and 'genre' string columns.

    Returns:
        (music_df, other_df): the kept rows and the excluded rows, both with
        the original columns only.
    """
    scratch_cols = ['title_l', 'genre_l']
    lowered = df.with_columns([
        pl.col('title').fill_null('').str.to_lowercase().alias('title_l'),
        pl.col('genre').fill_null('').str.to_lowercase().alias('genre_l'),
    ])

    def title_matches(pattern):
        # Regex search against the lower-cased title.
        return pl.col('title_l').str.contains(pattern)

    is_blocked_genre = pl.col('genre_l').is_in(list(BLOCK_GENRES))
    is_dj_mix = title_matches(
        r'\b(dj\s*mix|mixtape|mix\s*tape|megamix|minimix|set\s*mix|live\s*set|continuous\s*mix)\b'
    )
    is_remix = title_matches(r'\bremix\b')
    is_podcast = title_matches(r'\b(podcast|episode\s*\d+|ep\s*\d+|s\d+\s*e\d+)\b')
    is_interview = title_matches(r'\b(interview|with\s+guest)\b')
    is_full_album = title_matches(r'\bfull\s+album\b')

    # A remix whose title also matches a mix keyword is still a single track,
    # so the remix check overrides the DJ-mix check (and only that check).
    is_non_music = (
        is_blocked_genre
        | (is_dj_mix & ~is_remix)
        | is_podcast
        | is_interview
        | is_full_album
    )

    music_df = lowered.filter(~is_non_music).drop(scratch_cols)
    other_df = lowered.filter(is_non_music).drop(scratch_cols)
    return music_df, other_df

In [None]:
# Maximum number of characters kept from each free-text field (vibe, description).
MAX_DESC_LENGTH = 400

# Compiled once at module level: the helpers below run once per track row.
_WHITESPACE_RE = re.compile(r'\s+')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_TAG_SPLIT_RE = re.compile(r'[,;|/\n]+')
_JUNK_TAGS = frozenset(
    {'soundcloud', 'source', 'iphone', 'android', '3rdparty', 'recorder'}
)


def normalize_text(text):
    """Collapse each whitespace run in *text* to one space and trim both ends.

    Returns '' when *text* is None.
    """
    if text is None:
        return ''
    return _WHITESPACE_RE.sub(' ', text).strip()


def remove_urls(text):
    """Delete http(s):// and www. URLs from *text*.

    Surrounding whitespace is left untouched, so a removed URL can leave a
    doubled space behind; pass the result through normalize_text() if that
    matters. Returns '' when *text* is None.
    """
    if text is None:
        return ''
    return _URL_RE.sub('', text)


def normalize_tags(tags):
    """Turn a delimited tag string into space-joined, de-duplicated tags.

    The input is lower-cased and split on commas, semicolons, pipes, slashes
    and newlines. Tags of one character or less and the known junk tags are
    dropped; the first occurrence of each remaining tag is kept, in order.
    Returns '' for None or an empty string.
    """
    if not tags:
        return ''
    stripped = (piece.strip() for piece in _TAG_SPLIT_RE.split(tags.lower()))
    kept = [tag for tag in stripped if len(tag) > 1 and tag not in _JUNK_TAGS]
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return ' '.join(dict.fromkeys(kept))


def build_doc_text(row):
    """Build the single-line, labelled document text for one track.

    Segments are emitted in a fixed order - TITLE, ARTIST, GENRE, TAGS, VIBE,
    DESC, YEAR - and joined with single spaces. A segment is skipped when its
    value is missing, falsy, or empty after cleaning, so no bare "LABEL: "
    fragments are produced.

    Args:
        row: Mapping for one track. Keys read (all optional): 'title',
            'artist', 'genre', 'tags', 'inferred_tags',
            'extracted_vibe_text', 'description', 'year'.

    Returns:
        The assembled text, or '' when no segment applies.
    """
    parts = []

    for label, key in (('TITLE', 'title'), ('ARTIST', 'artist')):
        value = normalize_text(row.get(key) or None)
        if value:
            parts.append(f"{label}: {value}")

    genre = row.get('genre')
    if genre:
        # Genre is written twice (as are the tags below), presumably to give
        # it more weight in the embedding - confirm with the embedding step.
        parts.append(f"GENRE: {genre} {genre}")

    # Join the two tag sources with a delimiter normalize_tags() splits on.
    # Joining them with a space would fuse the last tag of one source with the
    # first tag of the other, defeating de-duplication and junk filtering.
    tag_sources = (row.get('tags'), row.get('inferred_tags'))
    normalized = normalize_tags(','.join(str(src) for src in tag_sources if src))
    if normalized:
        parts.append(f"TAGS: {normalized} {normalized}")

    for label, key in (('VIBE', 'extracted_vibe_text'), ('DESC', 'description')):
        raw = row.get(key)
        if not raw:
            continue
        # Clean before truncating so the length budget is not spent on URLs
        # or on the whitespace their removal leaves behind.
        text = normalize_text(remove_urls(raw))[:MAX_DESC_LENGTH].rstrip()
        if text:
            parts.append(f"{label}: {text}")

    year = row.get('year')
    if year:
        parts.append(f"YEAR: {year}")

    return ' '.join(parts)

In [None]:
def process_year(year):
    """Filter, de-duplicate and annotate one year of tracks, then save it.

    Reads every ``*.parquet`` file in ``PROCESSED_DIR/year=<year>``, keeps the
    rows filter_music_tracks() classifies as music, drops rows with a repeated
    'permalink_url', adds a 'doc_text_music' column built by build_doc_text(),
    and writes the result to ``READY_DIR/year=<year>/data.parquet``.

    Args:
        year: Year whose partition directory should be processed.

    Returns:
        The number of rows written, or None when the year directory does not
        exist or contains no parquet files.
    """
    year_dir = PROCESSED_DIR / f'year={year}'
    if not year_dir.exists():
        return None

    print(f'Processing {year}...')

    # glob() yields files in a filesystem-dependent order. Sorting fixes the
    # concatenation order, and therefore which duplicate row counts as "first"
    # in the de-duplication below, so repeated runs give the same output.
    parquet_files = sorted(year_dir.glob('*.parquet'))
    if not parquet_files:
        return None

    df = pl.concat([pl.read_parquet(f) for f in parquet_files])
    before = df.height

    # The excluded rows are not needed here.
    music_df, _ = filter_music_tracks(df)
    print(f'  Filtered: {before:,} -> {music_df.height:,} music tracks')

    # maintain_order keeps the surviving rows in their input order so the
    # written file has a stable row order across runs.
    music_df = music_df.unique(
        subset=['permalink_url'], keep='first', maintain_order=True
    )
    print(f'  Dedup by permalink: {music_df.height:,}')

    doc_texts = [build_doc_text(row) for row in music_df.to_dicts()]
    music_df = music_df.with_columns([pl.Series('doc_text_music', doc_texts)])

    out_dir = READY_DIR / f'year={year}'
    out_dir.mkdir(parents=True, exist_ok=True)
    music_df.write_parquet(out_dir / 'data.parquet')

    print(f'  Saved: {out_dir / "data.parquet"}')
    return music_df.height

In [None]:
# Process every configured year and tally the rows written. process_year()
# returns None for a year it skipped, which counts as zero here.
total = 0
for year in YEAR_RANGE:
    total += process_year(year) or 0

print(f'\nTotal music tracks ready: {total:,}')