In [None]:
# collect_ud.py
"""
Collect N unique Urban Dictionary entries whose written_on date is within 2015 (inclusive).
Uses the unofficial UD random endpoint; may require many attempts because each call returns randomly chosen entries and only those dated 2015 are kept.
Outputs: ud_2015.csv
"""
import requests
import time
import csv
from datetime import datetime
from dateutil import parser as dateparser
from dateutil.tz import tzutc
from pathlib import Path

# -------- CONFIG --------
OUT = Path("ud_2015.csv")  # output CSV path (relative to the current working directory)
API_RANDOM = "https://api.urbandictionary.com/v0/random"
# NOTE(review): the mailto address below looks like a placeholder — confirm it
# is replaced with a real contact before running against the live API.
HEADERS = {"User-Agent": "Northeastern-SlangPhraser/0.1 (+mailto:you@your.university.edu)"}
RATE_SEC = 1.0          # polite delay (seconds) between requests
TARGET = 5000            # desired number of final entries (change as needed)
# NOTE(review): the logged run reached ~114 entries after 800 attempts; at that
# rate 20000 attempts yield fewer than 5000 entries, so this cap may end the
# run before TARGET is met — confirm that is intended.
MAX_ATTEMPTS = 20000    # safety cap on number of API calls/attempt loops
VERBOSE = True          # set False to reduce console output
# ------------------------

# 2015 inclusive bounds (timezone-aware; UTC)
START = datetime(2015, 1, 1, 0, 0, 0, tzinfo=tzutc())
# END carries the maximum microsecond value so that timestamps with fractional
# seconds inside the last second of the year (23:59:59.xxx) still satisfy
# dt <= END instead of being rejected as "after 2015".
END   = datetime(2015, 12, 31, 23, 59, 59, 999999, tzinfo=tzutc())

def parse_date(dstr):
    """Turn a date string into a timezone-aware UTC datetime, or None if that fails.

    Falsy input, a parser exception, or a parser result of None all yield None.
    A naive result is labelled as UTC without shifting the clock time; an aware
    result is converted to UTC.
    """
    if dstr:
        try:
            parsed = dateparser.parse(dstr)
        except Exception:
            # Best effort: any parser failure means "no usable date".
            parsed = None
        if parsed is not None:
            utc = tzutc()
            if parsed.tzinfo is None:
                return parsed.replace(tzinfo=utc)
            return parsed.astimezone(utc)
    return None

def normalize_text(s: str) -> str:
    """Return the canonical form of *s* used as a duplicate-detection key.

    Runs of whitespace (including newlines) collapse to single spaces, leading
    and trailing whitespace is dropped, and the text is lower-cased. Falsy
    input gives the empty string.
    """
    if s:
        # split() with no argument discards leading/trailing whitespace, so the
        # re-joined text needs no further trimming.
        words = s.split()
        return " ".join(words).lower()
    return ""

def fetch_random():
    """Query the UD random endpoint once and return the payload's 'list' value.

    Returns an empty list when the JSON body has no 'list' key. Exceptions from
    the request itself, from raise_for_status() on an HTTP error status, or
    from JSON decoding are not handled here.
    """
    response = requests.get(API_RANDOM, headers=HEADERS, timeout=15)
    response.raise_for_status()
    payload = response.json()
    return payload.get("list", [])

def _one_line(text):
    """Flatten *text* onto one line: CRLF, CR and LF each become a space; ends are trimmed."""
    flat = (text or "").replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
    return flat.strip()


def _write_csv(dated_records, target):
    """Sort (parsed_date, record) pairs newest first and write at most *target* rows to OUT."""
    # list.sort is stable even with reverse=True, so entries sharing a
    # timestamp keep the order in which they were collected.
    dated_records.sort(key=lambda pair: pair[0], reverse=True)
    cols = ["defid", "word", "definition", "example", "written_on"]
    rows = [record for _, record in dated_records[:target]]
    with OUT.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=cols)
        writer.writeheader()
        for row in rows:
            writer.writerow({k: row.get(k, "") for k in cols})
    print(f"Wrote {len(rows)} rows to {OUT.resolve()}")


def collect(target=TARGET, max_attempts=MAX_ATTEMPTS):
    """Poll the random endpoint until *target* unique in-range entries are gathered.

    An entry is kept when it has an id not seen before, its date parses and
    lies within [START, END], and its normalized definition text is non-empty
    and not a duplicate of an already kept entry. Kept entries are written to
    OUT as CSV, newest first.

    Args:
        target: number of entries to collect before stopping.
        max_attempts: upper bound on loop iterations (failed requests count).

    Returns:
        The output path OUT.

    Raises:
        Anything that escapes the polling loop (e.g. KeyboardInterrupt). The
        entries gathered so far are still written to OUT before it propagates.
    """
    seen_ids = set()
    seen_texts = set()
    collected = []  # (parsed date, record) pairs; the date is reused for the final sort
    attempts = 0

    print(f"Starting collection for selected year entries (target={target})...")
    try:
        while len(collected) < target and attempts < max_attempts:
            attempts += 1
            try:
                items = fetch_random()
            except Exception as e:
                # Best effort: any request/decoding failure just costs one attempt.
                if VERBOSE:
                    print(f"[attempt {attempts}] request error: {e} — backing off 5s")
                time.sleep(5)
                continue

            for d in items:
                defid = d.get("defid") or d.get("def_id") or d.get("id") or None
                if not defid:
                    # skip malformed entries
                    continue
                if defid in seen_ids:
                    continue
                # Accepted or rejected, this id never needs to be examined again.
                seen_ids.add(defid)

                raw_date = d.get("written_on") or d.get("date") or d.get("created_on") or ""
                dt = parse_date(raw_date)
                if dt is None or dt < START or dt > END:
                    # no usable date, or outside the selected year
                    continue

                # dedupe by normalized definition text (avoid near-exact duplicates)
                definition_text = d.get("definition") or ""
                norm_def = normalize_text(definition_text)
                if not norm_def or norm_def in seen_texts:
                    continue
                seen_texts.add(norm_def)

                record = {
                    "defid": defid,
                    "word": d.get("word", ""),
                    # CRLF line breaks are flattened too, so no stray "\r"
                    # characters end up inside the CSV fields.
                    "definition": _one_line(definition_text),
                    "example": _one_line(d.get("example")),
                    "written_on": raw_date,
                }
                collected.append((dt, record))

                if len(collected) >= target:
                    break

            # occasional status print
            if VERBOSE and attempts % 50 == 0:
                print(f"Attempt {attempts} — collected {len(collected)} valid selected year items so far")

            time.sleep(RATE_SEC)
    finally:
        # Runs on normal completion and on interruption alike, so a long
        # polling session never ends without its results on disk.
        if VERBOSE:
            print(f"Finished: attempts={attempts}, collected={len(collected)} (target={target})")
        _write_csv(collected, target)

    return OUT

# Script entry point: run one collection with the default TARGET and
# MAX_ATTEMPTS when executed directly; importing the module does not start it.
if __name__ == "__main__":
    collect()


Starting collection for selected year entries (target=5000)...
Attempt 50 — collected 5 valid selected year items so far
Attempt 100 — collected 9 valid selected year items so far
Attempt 150 — collected 15 valid selected year items so far
Attempt 200 — collected 21 valid selected year items so far
Attempt 250 — collected 25 valid selected year items so far
Attempt 300 — collected 32 valid selected year items so far
Attempt 350 — collected 40 valid selected year items so far
Attempt 400 — collected 45 valid selected year items so far
Attempt 450 — collected 50 valid selected year items so far
Attempt 500 — collected 59 valid selected year items so far
Attempt 550 — collected 68 valid selected year items so far
Attempt 600 — collected 78 valid selected year items so far
Attempt 650 — collected 88 valid selected year items so far
Attempt 700 — collected 102 valid selected year items so far
Attempt 750 — collected 107 valid selected year items so far
Attempt 800 — collected 114 valid sele