# ILGA Senate Members Scraper (Modern Page)

Este notebook descarga el listado de senadores del Estado de Illinois desde la página moderna
`https://www.ilga.gov/Senate/Members/List`, lo parsea con **BeautifulSoup + lxml**, y genera
un **DataFrame** y un **CSV** (`senado_ilga_moderno.csv`).

**Requisitos:** conexión a Internet desde tu entorno y las librerías `requests`, `beautifulsoup4`, `lxml`, `pandas`.


In [6]:
# (Opcional) Instalar dependencias desde el propio notebook
# Descomenta si lo necesitas en tu entorno actual:
# !pip install -q --upgrade pip
# !pip install -q requests beautifulsoup4 lxml pandas


In [2]:
import re, time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

LIST_URLS = [
    "https://www.ilga.gov/Senate/Members/List",
    "https://www.ilga.gov/Senate/List",
]
BASE = "https://www.ilga.gov"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
    "Referer": "https://www.ilga.gov/",
    "Connection": "keep-alive",
}

def fetch(url):
    s = requests.Session()
    s.headers.update(HEADERS)
    r = s.get(url, timeout=30)
    r.raise_for_status()
    return r

def find_profile_links(html, base=BASE):
    soup = BeautifulSoup(html, "lxml")
    # busca anchors a /Senate/Members/Details/... (insensible a mayúsculas)
    links = set()
    for a in soup.select('a[href]'):
        href = a.get("href") or ""
        if re.search(r"/Senate/Members/Details/\d+", href, flags=re.I):
            links.add(urljoin(base, href))
    return sorted(links)

def parse_profile(html):
    """Devuelve (name, district:int|None, party:str|''), usando varias heurísticas."""
    soup = BeautifulSoup(html, "lxml")

    # Nombre: probar h1, h2, title, aria-labels…
    name = ""
    cand = []
    cand += [h.get_text(strip=True) for h in soup.select("h1")]
    if not cand: cand += [h.get_text(strip=True) for h in soup.select("h2")]
    if not cand and soup.title: cand += [soup.title.get_text(strip=True)]
    if cand: name = cand[0]

    # Texto visible para regex
    text = soup.get_text(" ", strip=True)

    # Distrito (varias variantes)
    district = None
    for pat in [
        r"District\s*(\d+)",
        r"(\d+)\s+District",
        r"Senate\s+District\s*(\d+)",
        r"(\d+)\s+\(D\)|(\d+)\s+\(R\)|(\d+)\s+\(I\)"
    ]:
        m = re.search(pat, text, flags=re.I)
        if m:
            # toma el primer grupo no vacío
            for g in m.groups():
                if g and g.isdigit():
                    district = int(g)
                    break
        if district is not None:
            break

    # Partido (D/R/I o completo)
    party = ""
    # primero letra entre paréntesis
    m = re.search(r"\((D|R|I)\)", text)
    if m:
        party = m.group(1)
    else:
        # palabras completas
        if re.search(r"\bDemocrat(ic)?\b", text, flags=re.I):
            party = "D"
        elif re.search(r"\bRepublican\b", text, flags=re.I):
            party = "R"
        elif re.search(r"\bIndependent\b", text, flags=re.I):
            party = "I"

    return name, district, party

# === Flujo principal ===
try:
    # 1) Prueba con las URLs de lista y junta todos los perfiles
    all_profiles = []
    for url in LIST_URLS:
        try:
            r = fetch(url)
            profs = find_profile_links(r.text)
            if profs:
                all_profiles.extend(profs)
        except Exception as e:
            print(f"[WARN] No se pudo leer {url}: {e}")

    # de-duplicar
    all_profiles = sorted(set(all_profiles))
    print("Perfiles encontrados en la lista:", len(all_profiles))

    # Diagnóstico si no hay enlaces
    if not all_profiles:
        print("No se hallaron enlaces a perfiles. Guardando debug_list.html…")
        try:
            open("debug_list.html", "w", encoding="utf-8").write(r.text)
            print("Revisa debug_list.html para ver el HTML real (¿portal cautivo/bloqueo?).")
        except Exception:
            pass

    # 2) Visita cada perfil y extrae datos
    members = []
    for i, purl in enumerate(all_profiles, 1):
        try:
            pr = fetch(purl)
            name, district, party = parse_profile(pr.text)
            if name:  # al menos nombre
                members.append((name, district, party, purl))
        except Exception as e:
            print(f"[WARN] Perfil con error ({purl}): {e}")
        time.sleep(0.5)  # pausa cortita para ser amable

    print("Total miembros parseados:", len(members))

    # 3) Mostrar algunas filas
    for m in members[:60]:
        print(m)

    # 4) (Opcional) DataFrame/CSV
    try:
        import pandas as pd
        df = pd.DataFrame(members, columns=["Nombre", "Distrito", "Partido", "Perfil"])
        print("\nPrimeras 5 filas:")
        print(df.head())
        df.to_csv("senado_ilga_moderno.csv", index=False, encoding="utf-8")
        print("\nCSV generado: senado_ilga_moderno.csv")
    except ImportError:
        print("Pandas no está instalado; omitiendo CSV. Instala con: pip install pandas openpyxl")

except Exception as e:
    print("Error general:", e)


[WARN] No se pudo leer https://www.ilga.gov/Senate/List: 404 Client Error: Not Found for url: https://www.ilga.gov/Senate/List
Perfiles encontrados en la lista: 60
Total miembros parseados: 60
('Member', 8505, 'D', 'https://www.ilga.gov/Senate/Members/Details/3264')
('Member', 5413, 'R', 'https://www.ilga.gov/Senate/Members/Details/3265')
('Member', 8176, 'D', 'https://www.ilga.gov/Senate/Members/Details/3268')
('Member', 5966, 'D', 'https://www.ilga.gov/Senate/Members/Details/3269')
('Member', 422, 'D', 'https://www.ilga.gov/Senate/Members/Details/3270')
('Member', 8250, 'D', 'https://www.ilga.gov/Senate/Members/Details/3271')
('Member', 9573, 'D', 'https://www.ilga.gov/Senate/Members/Details/3276')
('Member', 3840, 'R', 'https://www.ilga.gov/Senate/Members/Details/3281')
('Member', 5145, 'D', 'https://www.ilga.gov/Senate/Members/Details/3291')
('Member', 8066, 'D', 'https://www.ilga.gov/Senate/Members/Details/3292')
('Member', 9595, 'D', 'https://www.ilga.gov/Senate/Members/Details/3

## Notas
- Si no se muestran miembros, guarda el HTML para depurar:
```python
import pathlib
html_path = pathlib.Path("debug_ilga.html")
html_path.write_text(requests.get("https://www.ilga.gov/Senate/Members/List", headers={"User-Agent":"Mozilla/5.0"}, timeout=20).text, encoding="utf-8")
html_path
```
- Revisa el archivo `debug_ilga.html` para confirmar la estructura del HTML en tu red.
