In [None]:
"""
Download Mandarin, Russian, and Spanish Accents from GMU Speech Accent Archive
"""

# ============================================================
# SETUP
# ============================================================

# Mount Google Drive so downloaded files persist beyond this Colab session.
from google.colab import drive
drive.mount('/content/drive')

# Directory on the mounted Drive that receives the audio files and CSVs.
OUTDIR = "/content/drive/MyDrive/Fall25/EE502/FinalProjects/Ch24/data"

import os
# Create the output directory up front; no error if it already exists.
os.makedirs(OUTDIR, exist_ok=True)
print("="*70)
print("DOWNLOADING ACCENT DATA: Mandarin, Russian, Spanish")
print("="*70)
print(f"Output directory: {OUTDIR}")
print("="*70 + "\n")

# Install the third-party packages imported below.
print("Installing dependencies...")
import sys
# IPython shell escape (notebook-only syntax): runs pip with the kernel's own
# interpreter so the packages are installed into the running environment.
!{sys.executable} -m pip install -q requests beautifulsoup4 tqdm
print("✓ Dependencies installed\n")

# ============================================================
# IMPORT LIBRARIES
# ============================================================

import csv
import pathlib
import random
import re
import time
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse, parse_qs

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# ============================================================
# CONFIGURATION
# ============================================================

# Root URL of the archive; relative links found on its pages are resolved
# against this.
BASE = "https://accent.gmu.edu/"

# Headers sent with every request; the User-Agent identifies this script as a
# student research downloader.
HEADERS = {
    "User-Agent": "Academic research downloader for Speech Accent Archive "
    + "(Student project - EE502 Final Project)",
}

# ============================================================
# HELPER FUNCTIONS
# ============================================================

def http_get(url: str, *, stream: bool = False, timeout: int = 30) -> requests.Response:
    """Issue a GET request for *url* using the shared ``HEADERS``.

    Args:
        url: Address to fetch.
        stream: Passed through to ``requests.get``; when true the body is
            not read eagerly.
        timeout: Passed through to ``requests.get`` as its timeout, in seconds.

    Returns:
        The response object, after its status has been checked.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` for error statuses.
    """
    response = requests.get(
        url,
        timeout=timeout,
        stream=stream,
        headers=HEADERS,
    )
    response.raise_for_status()
    return response

def polite_sleep(lo: float = 0.6, hi: float = 1.4) -> None:
    """Pause for a random duration so requests are spaced out.

    Args:
        lo: One bound of the delay, in seconds.
        hi: The other bound of the delay, in seconds.
    """
    delay = random.uniform(lo, hi)
    time.sleep(delay)

def list_all_language_pages() -> Dict[str, str]:
    """Scrape the archive's language index.

    Returns:
        A mapping from lower-cased language name to the absolute URL of that
        language's browse page. When two links share a name, the later one
        wins.
    """
    print("Fetching available languages from GMU website...")
    index_url = urljoin(BASE, "browse_language.php")
    page = BeautifulSoup(http_get(index_url).text, "html.parser")

    langs = {}
    for anchor in page.select("a[href*='browse_language.php?'][href*='language=']"):
        lang_url = urljoin(BASE, anchor.get("href", ""))
        lang_name = (anchor.text or "").strip()

        if not lang_name:
            # Link has no visible text: use the "language" query parameter.
            query = parse_qs(urlparse(lang_url).query)
            lang_name = query.get("language", [""])[0].strip()

        if not lang_name:
            continue
        langs[lang_name.lower()] = lang_url

    print(f"✓ Found {len(langs)} languages on the website\n")
    return langs

def extract_speaker_links(lang_url: str, limit: Optional[int] = None) -> List[str]:
    """Collect the speaker detail-page URLs listed on a language page.

    Args:
        lang_url: URL of the language's browse page.
        limit: When not ``None``, keep only the first ``limit`` links.

    Returns:
        Absolute URLs, in page order, of every anchor whose href contains
        ``function=detail``.
    """
    page = BeautifulSoup(http_get(lang_url).text, "html.parser")

    links = []
    for anchor in page.select("a[href*='function=detail']"):
        links.append(urljoin(BASE, anchor.get("href")))

    return links if limit is None else links[:limit]

def find_first_mp3_url(soup: BeautifulSoup) -> Optional[str]:
    """Locate the first MP3 reference in a parsed speaker page.

    Anchors are checked first, then ``<source>`` elements.

    Args:
        soup: Parsed HTML of the speaker page.

    Returns:
        The MP3 URL resolved against ``BASE``, or ``None`` when neither an
        anchor nor a ``<source>`` tag points at an ``.mp3``.
    """
    # First choice: an <a> whose (stripped) href ends in ".mp3".
    anchor_hrefs = (a["href"].strip() for a in soup.find_all("a", href=True))
    mp3_href = next((h for h in anchor_hrefs if h.lower().endswith(".mp3")), None)
    if mp3_href is not None:
        return urljoin(BASE, mp3_href)

    # Second choice: a <source> element whose src ends in ".mp3".
    source_tag = soup.find("source", src=re.compile(r"\.mp3$", re.I))
    if source_tag and source_tag.get("src"):
        return urljoin(BASE, source_tag["src"].strip())

    return None

# Case-insensitive regexes for the labelled metadata fields on a speaker page.
# Each is "<label> : <value>" and captures the value up to the end of its line.
FIELD_PATTERNS = {
    key: re.compile(label + r"\s*:\s*([^\n\r]+)", re.I)
    for key, label in (
        ("birth_place", r"birth place"),
        ("native_language", r"native language"),
        ("other_languages", r"other language\(s\)"),
        ("age_sex", r"age,\s*sex"),
        ("age_english_onset", r"age of english onset"),
        ("english_learning_method", r"english learning method"),
        ("years_in_english_country", r"years in english-speaking country"),
    )
}

def parse_speaker_page(spk_url: str) -> Dict[str, str]:
    """Fetch one speaker page and pull out its audio link and metadata.

    Args:
        spk_url: URL of the speaker's detail page.

    Returns:
        A dict with ``speaker_url``, ``audio_url`` (empty string when no MP3
        was found) and one entry per key of ``FIELD_PATTERNS`` (empty string
        when the field's pattern does not match the page text).
    """
    page = BeautifulSoup(http_get(spk_url).text, "html.parser")
    audio_url = find_first_mp3_url(page)
    # Join text nodes with newlines so each pattern's capture stops at the
    # end of its own line.
    page_text = page.get_text("\n", strip=True)

    meta = {"speaker_url": spk_url, "audio_url": audio_url or ""}
    for field, pattern in FIELD_PATTERNS.items():
        match = pattern.search(page_text)
        meta[field] = "" if match is None else match.group(1).strip()

    return meta

def safe_filename(name: str) -> str:
    """Make *name* usable as a file name.

    Every run of characters other than word characters, ``.``, ``-`` and
    space is collapsed into a single underscore, and the result is cut to at
    most 150 characters.
    """
    unsafe_run = re.compile(r"[^\w.\- ]+")
    cleaned = unsafe_run.sub("_", name)
    return cleaned[:150]

def download_file(url: str, dest_path: pathlib.Path) -> None:
    """Download *url* to *dest_path*, skipping files that already exist.

    The body is streamed into a sibling ``<name>.part`` file and renamed onto
    *dest_path* only after the whole transfer succeeded. An interrupted or
    failed download therefore never leaves a truncated file at *dest_path*
    that a later run would mistake for a finished download.

    Args:
        url: Address of the file to fetch.
        dest_path: Where the file should end up; parent directories are
            created as needed.

    Raises:
        Whatever ``http_get`` or the file operations raise; the partial file
        is removed before the exception propagates.
    """
    dest_path.parent.mkdir(parents=True, exist_ok=True)

    # Skip if already downloaded (a zero-byte file is treated as missing).
    if dest_path.exists() and dest_path.stat().st_size > 0:
        return

    part_path = dest_path.with_name(dest_path.name + ".part")
    try:
        with http_get(url, stream=True) as r, open(part_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1 << 14):
                if chunk:
                    f.write(chunk)
        # Only a fully written file is moved into place.
        part_path.replace(dest_path)
    finally:
        # After a successful rename the partial file no longer exists; after
        # a failure this removes the incomplete data.
        if part_path.exists():
            part_path.unlink()

def write_csv(csv_path: pathlib.Path, rows: List[Dict[str, str]]) -> None:
    """Dump *rows* to a UTF-8 CSV file at *csv_path*.

    The header is the alphabetically sorted union of all keys that occur in
    any row; a row lacking a column gets an empty cell there. Parent
    directories are created when missing.

    Args:
        csv_path: Destination file (overwritten if present).
        rows: Records to write, one CSV line each.
    """
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    fieldnames = sorted(set().union(*(row.keys() for row in rows)))

    with open(csv_path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

def count_speakers_for_accent(lang_url: str) -> int:
    """Count the speaker links listed on a language page.

    This is a best-effort figure for progress output: any ordinary error
    while fetching or parsing the page yields ``0`` instead of propagating.

    Args:
        lang_url: URL of the language's browse page.

    Returns:
        Number of speaker detail links found, or ``0`` on failure.
    """
    try:
        return len(extract_speaker_links(lang_url))
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and
        # SystemExit must still be able to stop the script.
        return 0

def download_accent(lang_name: str, lang_url: str, outdir: pathlib.Path,
                    limit_per_accent: Optional[int] = None) -> List[Dict[str, str]]:
    """Download the audio and metadata of every speaker of one accent.

    Files go into ``outdir/<safe accent name>/``; a per-accent CSV with one
    row per speaker is written into the same folder when at least one row
    was collected.

    Args:
        lang_name: Accent name, used for the folder, CSV name and the
            ``accent`` column.
        lang_url: URL of the accent's browse page.
        outdir: Root output directory.
        limit_per_accent: When not ``None``, process only the first that many
            speakers.

    Returns:
        One metadata dict per speaker. Rows for speakers whose processing
        raised carry an ``error`` entry and empty audio fields.
    """
    # Local import: only needed for the fallback file name below.
    import hashlib

    rows = []
    speakers = extract_speaker_links(lang_url, limit=limit_per_accent)

    print(f"\n{'='*70}")
    print(f"Downloading: {lang_name.upper()}")
    print(f"{'='*70}")
    print(f"Speakers to download: {len(speakers)}")

    accent_folder = outdir / safe_filename(lang_name)
    accent_folder.mkdir(parents=True, exist_ok=True)

    success_count = 0
    fail_count = 0

    for spk_url in tqdm(speakers, desc=f"{lang_name}", unit="speaker"):
        try:
            meta = parse_speaker_page(spk_url)
            audio_url = meta.get("audio_url", "")
            local_path = ""

            if audio_url:
                url_basename = os.path.basename(urlparse(audio_url).path)
                if not url_basename:
                    # The built-in hash() of a str is randomised per process
                    # and may be negative, so it would produce a different
                    # name on every run and defeat download_file's skip
                    # check. A content digest is stable across runs.
                    digest = hashlib.sha1(spk_url.encode("utf-8")).hexdigest()[:16]
                    url_basename = f"{digest}.mp3"
                fpath = accent_folder / safe_filename(url_basename)
                download_file(audio_url, fpath)
                local_path = str(fpath)
                success_count += 1
                polite_sleep(0.8, 1.6)  # Be polite to server
            else:
                # Page parsed but no MP3 link on it.
                fail_count += 1

            meta.update({"accent": lang_name, "local_audio_path": local_path})
            rows.append(meta)

        except Exception as e:
            # Keep going with the remaining speakers; record the failure so
            # it shows up in the CSV.
            fail_count += 1
            rows.append({
                "accent": lang_name,
                "speaker_url": spk_url,
                "audio_url": "",
                "local_audio_path": "",
                "error": str(e),
            })

    # Save individual accent CSV
    if rows:
        write_csv(accent_folder / f"{safe_filename(lang_name)}.csv", rows)

    print(f"\n✓ {lang_name.capitalize()}: {success_count} files downloaded, {fail_count} failed")

    return rows

# ============================================================
# MAIN DOWNLOAD FUNCTION
# ============================================================

def download_three_accents():
    """Download the Mandarin, Russian and Spanish accent sets.

    Looks up each target accent on the archive's language index (trying
    several name variants), downloads every speaker of each accent into
    ``OUTDIR`` and writes a combined ``metadata.csv`` there.

    Raises:
        RuntimeError: The language index yielded no languages.
        ValueError: At least one target accent could not be found.
    """

    outpath = pathlib.Path(OUTDIR)
    outpath.mkdir(parents=True, exist_ok=True)

    # Get all available languages
    all_langs = list_all_language_pages()

    if not all_langs:
        raise RuntimeError("Could not fetch languages from website. Check internet connection.")

    # Define the target accents.
    # Try multiple variations as website might use different names
    target_accents = {
        'mandarin': ['mandarin', 'chinese mandarin', 'mandarin chinese', 'chinese'],
        'russian': ['russian'],
        'spanish': ['spanish', 'castilian spanish']
    }
    # Derived from the table so the messages below cannot drift from it.
    expected = len(target_accents)

    print("="*70)
    print("SEARCHING FOR TARGET ACCENTS")
    print("="*70)

    # Find matching accents on website; the first variation that exists wins.
    found_accents = {}

    for accent_key, variations in target_accents.items():
        for variation in variations:
            if variation in all_langs:
                found_accents[accent_key] = (variation, all_langs[variation])
                count = count_speakers_for_accent(all_langs[variation])
                print(f"✓ Found {accent_key.capitalize()}: '{variation}' ({count} speakers)")
                polite_sleep()
                break
        else:
            # No variation matched.
            print(f"✗ Could not find {accent_key.capitalize()}")
            print(f"  Tried: {variations}")

    if len(found_accents) < expected:
        print("\n" + "="*70)
        print(f"ERROR: Could not find all {expected} accents!")
        print("="*70)
        print("\nAvailable languages on website (first 20):")
        for i, lang in enumerate(sorted(all_langs)[:20], 1):
            print(f"  {i:2d}. {lang}")
        print("  ...")
        print(f"\nTotal: {len(all_langs)} languages available")
        raise ValueError(f"Only found {len(found_accents)}/{expected} target accents")

    print(f"\n✓ All {expected} accents found! Proceeding with download...\n")

    # Download each accent. Every target is present here (checked above), and
    # found_accents preserves the order of target_accents.
    all_rows = []

    for lang_name, lang_url in found_accents.values():
        rows = download_accent(lang_name, lang_url, outpath, limit_per_accent=None)
        all_rows.extend(rows)

    # Write combined metadata CSV
    if all_rows:
        metadata_path = outpath / "metadata.csv"
        write_csv(metadata_path, all_rows)

        # A row exists for every speaker, including those whose download
        # failed; only rows with a local file count as downloaded samples.
        downloaded = sum(1 for row in all_rows if row.get("local_audio_path"))
        missing = len(all_rows) - downloaded

        print(f"\n{'='*70}")
        print("DOWNLOAD COMPLETE!")
        print(f"{'='*70}")
        print(f"\n✓ Total samples downloaded: {downloaded}")
        if missing:
            print(f"⚠ Speakers without audio: {missing}")
        print(f"✓ Combined metadata saved: {metadata_path}")
        print(f"✓ Individual accent folders created in: {outpath}")

        # Show summary
        print(f"\n{'='*70}")
        print("SUMMARY BY ACCENT")
        print(f"{'='*70}")

        from collections import Counter
        rows_by_accent = Counter(row['accent'] for row in all_rows if 'accent' in row)
        ok_by_accent = Counter(
            row['accent'] for row in all_rows
            if 'accent' in row and row.get("local_audio_path")
        )
        # Iterate over every accent that has rows so an accent with zero
        # successful downloads is still listed.
        for accent in sorted(rows_by_accent):
            print(f"  {accent.capitalize():15}: {ok_by_accent[accent]:4d} samples")

        print(f"\n{'='*70}")
        print("Next Step: Run the SVM benchmark code!")
        print(f"{'='*70}")
    else:
        print("\n✗ No data downloaded!")

# ============================================================
# RUN THE DOWNLOAD
# ============================================================

# Entry point: run the full download when executed as a script/notebook cell.
if __name__ == "__main__":
    try:
        download_three_accents()
    except KeyboardInterrupt:
        # Manual stop: report it without a traceback.
        print("\n\n⚠ Download interrupted by user")
    except Exception as e:
        # Print a short summary, then re-raise so the full traceback is shown.
        print(f"\n\n✗ Error: {e}")
        raise

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
DOWNLOADING ACCENT DATA: Mandarin, Russian, Spanish
Output directory: /content/drive/MyDrive/Fall25/EE502/FinalProjects/Ch24/data

Installing dependencies...
✓ Dependencies installed

Fetching available languages from GMU website...
✓ Found 392 languages on the website

SEARCHING FOR TARGET ACCENTS
✓ Found Mandarin: 'mandarin' (157 speakers)
✓ Found Russian: 'russian' (82 speakers)
✓ Found Spanish: 'spanish' (243 speakers)

✓ All 3 accents found! Proceeding with download...


Downloading: MANDARIN
Speakers to download: 157


mandarin: 100%|██████████| 157/157 [04:06<00:00,  1.57s/speaker]



✓ Mandarin: 157 files downloaded, 0 failed

Downloading: RUSSIAN
Speakers to download: 82


russian: 100%|██████████| 82/82 [02:12<00:00,  1.62s/speaker]



✓ Russian: 82 files downloaded, 0 failed

Downloading: SPANISH
Speakers to download: 243


spanish: 100%|██████████| 243/243 [06:26<00:00,  1.59s/speaker]


✓ Spanish: 243 files downloaded, 0 failed

DOWNLOAD COMPLETE!

✓ Total samples downloaded: 482
✓ Combined metadata saved: /content/drive/MyDrive/Fall25/EE502/FinalProjects/Ch24/data/metadata.csv
✓ Individual accent folders created in: /content/drive/MyDrive/Fall25/EE502/FinalProjects/Ch24/data

SUMMARY BY ACCENT
  Mandarin       :  157 samples
  Russian        :   82 samples
  Spanish        :  243 samples

Next Step: Run the SVM benchmark code!



