In [None]:
import re
import sys
import time
from pathlib import Path
from urllib.parse import urlparse

# Auto-install minimal deps if missing
def ensure(pkg, import_name=None):
    """Install ``pkg`` with pip when its module cannot be imported.

    Args:
        pkg: Distribution name handed to ``pip install``.
        import_name: Module name used for the import probe. When omitted, a
            small built-in alias table is consulted (``beautifulsoup4`` is
            imported as ``bs4``); otherwise ``pkg`` itself is probed.

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    import importlib
    import subprocess

    # Distributions whose import name differs from their pip name. Probing the
    # pip name would fail every time and re-run pip on each execution.
    known_aliases = {"beautifulsoup4": "bs4"}
    module_name = import_name or known_aliases.get(pkg, pkg)
    try:
        importlib.import_module(module_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        # Let the running interpreter see the freshly installed package.
        importlib.invalidate_caches()
# Pip distribution names that must be available before the remaining cells run.
_REQUIRED_PACKAGES = ("requests", "pandas", "ipywidgets", "tqdm", "beautifulsoup4")
for _p in _REQUIRED_PACKAGES:
    ensure(_p)

In [None]:
import ipywidgets as w
from IPython.display import display
# Smoke test: display a labelled button to check that ipywidgets output renders.
_probe_button = w.Button(description="Widgets OK")
display(_probe_button)

In [None]:
import requests
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, HTML, IFrame

def find_repo_root(start: Path) -> Path:
    """Return the closest directory at or above ``start`` that holds ``tab/app.js``.

    Args:
        start: Directory where the upward search begins.

    Raises:
        FileNotFoundError: If neither ``start`` nor any ancestor contains the file.
    """
    marker = Path("tab") / "app.js"
    candidates = (start, *start.parents)
    root = next((folder for folder in candidates if (folder / marker).exists()), None)
    if root is None:
        raise FileNotFoundError(f"Could not locate tab/app.js from {start}")
    return root

# Locate the app bundle relative to the notebook's working directory and read it.
REPO_ROOT = find_repo_root(Path.cwd())
APP_JS = REPO_ROOT.joinpath("tab", "app.js")
app_js = APP_JS.read_text(encoding="utf-8", errors="ignore")

# Narrow the search to the body of the `iframeLinks` array literal when it is
# declared; otherwise fall back to scanning the whole file.
m = re.search(r"const\s+iframeLinks\s*=\s*\[(.*?)\];", app_js, re.S)
if m:
    block = m.group(1)
else:
    block = app_js

# Collect every double-quoted http(s) URL found in that text.
urls = [u.strip() for u in re.findall(r'"(https?://[^"\s]+)"', block)]

if len(urls) == 0:
    raise RuntimeError("No URLs found. Check APP_JS path or the iframeLinks declaration.")

# Build dataframe
def extract_asset_id(u: str):
    """Return the digit run that follows the first ``/asset/`` in ``u``, or ``None``."""
    found = re.findall(r"/asset/(\d+)", u)
    if not found:
        return None
    return found[0]

def ends_with_embed(u: str):
    """Return True when ``u``, ignoring trailing slashes, ends in the ``/embed`` segment.

    The leading slash is part of the check so that look-alike endings such as
    ``/oembed`` or ``/noembed`` are not reported as embed URLs.
    """
    return u.rstrip("/").endswith("/embed")

# One record per extracted URL, in the order the URLs appear in app.js.
rows = [
    {
        "index": position,
        "url": link,
        "domain": urlparse(link).netloc,
        "asset_id": extract_asset_id(link),
        "ends_with_embed": ends_with_embed(link),
    }
    for position, link in enumerate(urls)
]
seen = {}  # not read anywhere in this cell
df = pd.DataFrame(rows)

# Every repeat of an already-seen URL (exact string match) counts as a duplicate.
df["duplicate"] = df["url"].duplicated()

# Heuristic flags
df["missing_asset_id"] = df["asset_id"].isna()
df["non_numeric_asset"] = False  # the asset_id regex only ever captures digits
# A link is suspicious if it is a duplicate, lacks an asset id, or is not an embed URL.
df["suspicious"] = df["duplicate"] | df["missing_asset_id"] | ~df["ends_with_embed"]

display(df.head(10))
print(f"Total URLs: {len(df)} | Duplicates: {df['duplicate'].sum()} | Suspicious: {df['suspicious'].sum()}")

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup

MAX_WORKERS = 30      # was 10; number of threads used for the link checks
TIMEOUT = 5           # was 10; passed as `timeout=` to every request

headers = {
    "User-Agent": "LinkAudit/1.0 (+https://github.com/your-repo)",
    "Accept": "*/*",
}

# Reusable session with retries for transient errors
session = requests.Session()
retries = Retry(total=2, backoff_factor=0.2, status_forcelist=[429, 500, 502, 503, 504])
# HTTPAdapter keeps at most 10 pooled connections per host by default. With
# MAX_WORKERS threads sharing this session, the surplus connections would be
# discarded after each request, so size the pool to the worker count.
session.mount("http://", HTTPAdapter(max_retries=retries, pool_maxsize=MAX_WORKERS))
session.mount("https://", HTTPAdapter(max_retries=retries, pool_maxsize=MAX_WORKERS))

def _extract_sci_from_desc(desc: str) -> str | None:
    # description example: "Macaulay Library ML92547181; Rock Pigeon (Feral Pigeon); Columba livia (Feral Pigeon)"
    parts = [p.strip() for p in desc.split(";") if p.strip()]
    if not parts:
        return None
    sci = parts[-1]
    # keep only the binomial "Genus species"
    sci = re.sub(r"\s*\(.*?\)\s*$", "", sci).strip()
    m = re.search(r"\b([A-Z][a-z]+)\s+([a-z]+)\b", sci)
    return f"{m.group(1)} {m.group(2)}" if m else None

def parse_species_name_from_html(html: str) -> str | None:
    """Extract a ``Genus species`` binomial from an embed page's HTML.

    Sources are tried in this order, and the first one that yields a binomial wins:
      1. the text of ``div.speciesName`` with every parenthetical removed,
      2. the text of the first ``.Species-sci`` element with a trailing
         parenthetical removed,
      3. the ``content`` of ``<meta name="description">`` or, failing that,
         ``<meta property="og:description">``, via ``_extract_sci_from_desc``.

    Returns ``None`` when none of them produces a match.
    """
    binomial = re.compile(r"\b([A-Z][a-z]+)\s+([a-z]+)\b")

    def first_binomial(text):
        # First "Capitalised lowercase" word pair in ``text``, or None.
        hit = binomial.search(text)
        return f"{hit.group(1)} {hit.group(2)}" if hit else None

    soup = BeautifulSoup(html, "html.parser")

    # 1) Rendered species block, e.g.
    #    <div class="speciesName"> ... <span class="Species-sci ...">Columba livia (Feral Pigeon)</span>
    species_block = soup.select_one("div.speciesName")
    if species_block:
        block_text = species_block.get_text(" ", strip=True)
        name = first_binomial(re.sub(r"\s*\(.*?\)\s*", " ", block_text))
        if name:
            return name

    # 2) The scientific-name element on its own.
    sci_el = soup.select_one(".Species-sci")
    if sci_el:
        sci_text = sci_el.get_text(" ", strip=True)
        sci_text = re.sub(r"\s*\(.*?\)\s*$", "", sci_text).strip()
        name = first_binomial(sci_text)
        if name:
            return name

    # 3) Meta description fallback, normalised by the description parser.
    meta = soup.find("meta", attrs={"name": "description"})
    if not meta:
        meta = soup.find("meta", attrs={"property": "og:description"})
    if meta and meta.get("content"):
        name = _extract_sci_from_desc(meta["content"])
        if name:
            return name

    return None

def check_one(u: str):
    """Probe one URL and, for Macaulay Library hosts, scrape the species name.

    A HEAD request is issued first. If it answers 405, 403 or 400, a streamed
    GET is made instead and its status is used. A link counts as OK when the
    final status is in the 200-399 range. For OK links whose final host is
    ``macaulaylibrary.org`` or a subdomain of it, the page is fetched and
    parsed for a scientific name.

    Args:
        u: URL to check.

    Returns:
        dict with keys ``status_code``, ``ok``, ``error``, ``final_url``,
        ``method`` ("HEAD" or "GET"), ``elapsed_s`` (rounded to 3 decimals) and
        ``species_name``. ``requests.RequestException`` is caught and reported
        through ``error``; other exceptions propagate.
    """
    started = time.perf_counter()  # monotonic, so wall-clock adjustments cannot skew it
    err = None
    code = None
    final_url = None
    method = "HEAD"
    ok = False
    species_name = None
    try:
        r = session.head(u, allow_redirects=True, timeout=TIMEOUT, headers=headers)
        code = r.status_code
        final_url = r.url
        if code in (405, 403, 400):
            # This status may only mean HEAD is refused; retry with GET.
            method = "GET"
            # stream=True skips the body download; `with` releases the
            # connection even if reading the attributes raises.
            with session.get(u, allow_redirects=True, timeout=TIMEOUT, headers=headers, stream=True) as r2:
                code = r2.status_code
                final_url = r2.url
        ok = 200 <= code < 400

        # If it's a Macaulay Library embed, GET the HTML and extract the sci name
        if ok:
            target = final_url or u
            # Compare the parsed hostname exactly (or as a subdomain). A plain
            # substring test would also accept e.g. "notmacaulaylibrary.org.evil.com".
            host = urlparse(target).hostname or ""
            if host == "macaulaylibrary.org" or host.endswith(".macaulaylibrary.org"):
                r3 = session.get(target, allow_redirects=True, timeout=TIMEOUT, headers=headers)
                if r3.ok:  # do not scrape an error page
                    species_name = parse_species_name_from_html(r3.text)
    except requests.RequestException as e:
        err = str(e)
    elapsed = time.perf_counter() - started
    return dict(
        status_code=code,
        ok=ok,
        error=err,
        final_url=final_url,
        method=method,
        elapsed_s=round(elapsed, 3),
        species_name=species_name,
    )

# Fan the checks out over a thread pool, keyed by each URL's position in df["url"].
results = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
    futs = {}
    for i, u in enumerate(df["url"]):
        futs[ex.submit(check_one, u)] = i
    for fut in tqdm(as_completed(futs), total=len(futs), desc="Checking links"):
        # fut.result() re-raises anything check_one did not catch itself.
        results.append((futs[fut], fut.result()))

# Merge results back: (dataframe column, key in the result dict, default if missing)
res_map = dict(results)
_RESULT_COLUMNS = (
    ("status_code", "status_code", None),
    ("ok", "ok", False),
    ("error", "error", None),
    ("final_url", "final_url", None),
    ("method_used", "method", None),
    ("elapsed_s", "elapsed_s", None),
    ("species_name", "species_name", None),
)
for _column, _key, _default in _RESULT_COLUMNS:
    df[_column] = df["index"].map(
        lambda i, _key=_key, _default=_default: res_map.get(i, {}).get(_key, _default)
    )

# Broken links first, then suspicious ones, then by ascending status code.
df_sorted = df.sort_values(by=["ok", "suspicious", "status_code"], ascending=[True, False, True])
display(df_sorted.head(20))
print(f"OK: {df['ok'].sum()} | Broken: {(~df['ok']).sum()} | With errors: {df['error'].notna().sum()}")

In [None]:
# Genus names accepted as belonging to the family Columbidae.
COLUMBIDAE_GENERA = {
    "Alectroenas", "Alopecoenas", "Pampusana", "Caloenas", "Chalcophaps", "Claravis",
    "Paraclaravis", "Columbina", "Columba", "Didunculus", "Drepanoptila", "Ducula",
    "Ectopistes", "Gallicolumba", "Geophaps", "Geopelia", "Geotrygon", "Zentrygon",
    "Goura", "Gymnophaps", "Hemiphaga", "Henicophaps", "Leptotila", "Leucosarcia",
    "Lopholaimus", "Macropygia", "Metriopelia", "Microgoura", "Nesoenas", "Ocyphaps",
    "Oena", "Otidiphaps", "Patagioenas", "Pezophaps", "Phapitreron", "Phaps",
    "Ptilinopus", "Reinwardtoena", "Raphus", "Spilopelia", "Streptopelia", "Starnoenas",
    "Treron", "Trugon", "Turtur", "Uropelia", "Zenaida", "Petrophassa",
}

# The genus is the leading capitalised token of species_name.
_species = df["species_name"]
df["genus"] = _species.str.extract(r"^([A-Z][a-z]+)\b", expand=False)
_in_family = df["genus"].isin(COLUMBIDAE_GENERA)
df["is_columbidae_genus"] = _in_family
df["family"] = _in_family.map({True: "Columbidae", False: None})
# A mismatch needs a parsed species name; rows without one are not flagged.
df["family_mismatch"] = _species.notna() & ~_in_family

# Broken links first, then suspicious ones, then by ascending status code.
df_sorted = df.sort_values(by=["ok", "suspicious", "status_code"], ascending=[True, False, True])
display(df_sorted.head(20))
print(f"OK: {df['ok'].sum()} | Broken: {(~df['ok']).sum()} | With errors: {df['error'].notna().sum()}")

In [None]:
# Write the sorted audit table two levels above app.js (the directory containing tab/).
_tab_dir = Path(APP_JS).parent
OUT_CSV = _tab_dir.parent / "link_audit.csv"
df_sorted.to_csv(OUT_CSV, index=False)
print(f"Saved: {OUT_CSV}")

In [None]:
# Index labels of every row; this is also the fallback view.
all_indices = df.index.tolist()

def subset_indices(mode: str):
    """Return the df index labels belonging to the view named ``mode``.

    "broken" selects rows whose ``ok`` is false (only when that column exists),
    "suspicious" and "duplicates" select rows flagged in the matching column,
    and anything else returns every index.
    """
    flag_columns = {"suspicious": "suspicious", "duplicates": "duplicate"}
    if mode == "broken" and "ok" in df.columns:
        mask = ~df["ok"]
    elif mode in flag_columns:
        mask = df[flag_columns[mode]]
    else:
        return all_indices
    return df.index[mask].tolist()

# Controls for the interactive viewer.
mode = widgets.ToggleButtons(
    options=["all", "broken", "suspicious", "duplicates"],
    description="View:",
)
show_iframe = widgets.Checkbox(value=True, description="Preview iframe")
i_slider = widgets.IntSlider(
    value=0,
    min=0,
    max=max(0, len(all_indices) - 1),
    description="Index",
)
prev_btn = widgets.Button(description="Prev")
next_btn = widgets.Button(description="Next")

# Output area the cards are rendered into, plus the index list of the active view.
out = widgets.Output()
state = {"indices": subset_indices("all")}

def sync_slider_range():
    """Fit the slider's upper bound to the active view and pull its value back in range."""
    highest = len(state["indices"]) - 1
    i_slider.max = highest if highest > 0 else 0
    if i_slider.value > i_slider.max:
        i_slider.value = i_slider.max

def render():
    """Redraw the output area for the row currently selected by the slider.

    Shows a placeholder when the active view is empty. Otherwise it prints the
    row's URL and a metadata line and, if the preview checkbox is ticked, a
    sandboxed iframe of the URL. All row-derived text is HTML-escaped before
    being interpolated, because request error messages commonly contain
    ``<...>`` object reprs that would otherwise be parsed as markup.
    """
    # Imported locally: a later cell binds the global name `html` to a string.
    from html import escape

    out.clear_output()
    if not state["indices"]:
        with out:
            display(HTML("<b>No items in this view.</b>"))
        return
    idx = state["indices"][i_slider.value]
    row = df.loc[idx]

    meta = [f"Index in list: {row['index']}"]
    if "ok" in df.columns:
        meta.append(f"Status: {'OK' if row['ok'] else 'BROKEN'} ({row['status_code']})")
        if pd.notna(row.get("error")):
            meta.append(f"Error: {row['error']}")
    meta.append(f"Duplicate: {row['duplicate']}")
    meta.append(f"Ends with /embed: {row['ends_with_embed']}")
    meta.append(f"Asset ID: {row['asset_id'] or '(none)'}")
    meta.append(f"Domain: {row['domain']}")

    safe_url = escape(str(row["url"]), quote=True)
    meta_html = " | ".join(escape(str(item)) for item in meta)
    summary_html = f"""
    <div style='font-family: sans-serif'>
        <div style='margin-bottom:8px'>
            <a href="{safe_url}" target="_blank">{safe_url}</a>
        </div>
        <div style='color:#444; margin-bottom:8px'>{meta_html}</div>
    </div>
    """
    with out:
        display(HTML(summary_html))
        if show_iframe.value:
            try:
                iframe_html = f"""
                <iframe
                    src="{safe_url}"
                    width="900"
                    height="540"
                    style="border:0"
                    loading="lazy"
                    allow="autoplay; fullscreen; clipboard-write"
                    sandbox="allow-scripts allow-same-origin allow-popups allow-forms">
                </iframe>
                """
                display(HTML(iframe_html))
            except Exception as e:
                display(HTML(f"<i>Preview failed: {escape(str(e))}</i>"))

def on_mode_change(change):
    """Observer for the view toggle: switch the active index list, then redraw."""
    state.update(indices=subset_indices(change["new"]))
    sync_slider_range()
    render()

def on_prev(_):
    """Click handler: move the slider one step back unless it is already at 0."""
    if i_slider.value <= 0:
        return
    i_slider.value = i_slider.value - 1

def on_next(_):
    """Click handler: move the slider one step forward unless it is at its maximum."""
    if i_slider.value >= i_slider.max:
        return
    i_slider.value = i_slider.value + 1

def _rerender(_change):
    """Observer that redraws the card, ignoring the change payload."""
    render()

# Hook the controls up to their handlers.
mode.observe(on_mode_change, names="value")
for _control in (show_iframe, i_slider):
    _control.observe(_rerender, names="value")
prev_btn.on_click(on_prev)
next_btn.on_click(on_next)

# Lay the controls out in a row above the output area, then draw the first card.
controls = widgets.HBox(children=[mode, show_iframe, prev_btn, next_btn, i_slider])
display(controls, out)

render()

In [None]:
import webbrowser
# Split the audit table into the categories shown in the gallery.
_has_status = 'ok' in df.columns
_has_family = 'family_mismatch' in df.columns
# Explicit boolean mask. Applying `~` to the plain `False` returned by
# df.get(..., False) is integer bitwise-NOT (-1), not a logical negation.
_mismatch_mask = df['family_mismatch'] if _has_family else pd.Series(False, index=df.index)

broken = df[~df['ok']] if _has_status else pd.DataFrame()
family_mismatch = df[_mismatch_mask] if _has_family else pd.DataFrame()
suspicious = df[df['suspicious'] & ~_mismatch_mask]  # suspicious but not family issues
duplicates = df[df['duplicate']]
ok_links = df[df['ok'] & ~_mismatch_mask] if _has_status else df

def make_card(row, show_status=True):
    """Build the HTML card (metadata line plus iframe) for one audited link.

    Args:
        row: Mapping-like record (a DataFrame row) with at least ``index``,
            ``url``, ``asset_id`` and ``duplicate``. ``ok``, ``status_code``,
            ``species_name``, ``family`` and ``family_mismatch`` are used when
            present.
        show_status: Include the "Status: ..." fragment when the row has ``ok``.

    Returns:
        The card markup as a string. The card's CSS class is ``broken`` when
        ``ok`` is false, ``mismatch`` when ``family_mismatch`` is true, else ``ok``.
    """
    # Imported locally: the module-level name `html` is rebound to the page string.
    from html import escape

    if not row.get('ok', True):
        status_class = "broken"
    elif row.get('family_mismatch', False):
        status_class = "mismatch"
    else:
        status_class = "ok"

    status_parts = []
    # Inspect the row itself rather than a global frame so the function is self-contained.
    if show_status and 'ok' in row:
        status_parts.append(f"Status: {'OK' if row.get('ok', False) else 'BROKEN'} ({row.get('status_code', '')})")
    if pd.notna(row.get('species_name')):
        status_parts.append(f"Species: {row['species_name']}")
    if pd.notna(row.get('family')):
        status_parts.append(f"Family: {row['family']}")

    # Escape everything taken from the data before it goes into attributes or text.
    status_text = escape(' | '.join(status_parts))
    url = escape(str(row['url']), quote=True)
    asset = escape(str(row['asset_id'] or 'none'))

    return f"""
    <div class="card {status_class}" data-index="{row['index']}" data-status="{status_class}">
      <div class="meta">
        <span class="index">#{row['index']}</span>
        <a href="{url}" target="_blank" class="url">{url}</a>
        <span class="info">Asset: {asset} | Dup: {row['duplicate']} | {status_text}</span>
      </div>
      <iframe src="{url}" width="100%" height="540" style="border:0"
              allow="autoplay; fullscreen; clipboard-write" loading="lazy" allowfullscreen></iframe>
    </div>
    """

def _render_section(anchor, heading, frame, show_status=True, collapsed=False):
    """Return the [heading, card-container] HTML fragments for one category."""
    cards = '\n'.join(make_card(r, show_status) for _, r in frame.iterrows())
    css_class = "section collapsed" if collapsed else "section"
    return [
        f"<h2 id='{anchor}'>{heading} ({len(frame)})</h2>",
        f'<div class="{css_class}">' + cards + '</div>',
    ]

# Heading icons are written as Unicode escapes so they survive any re-encoding
# of the notebook source: cross mark, no-entry sign, warning sign, repeat, check mark.
_OPTIONAL_SECTIONS = (
    ("broken", "\u274c Broken", broken, True),
    ("mismatch", "\U0001f6ab Not Columbidae", family_mismatch, True),
    ("suspicious", "\u26a0\ufe0f Suspicious", suspicious, True),
    ("duplicates", "\U0001f501 Duplicates", duplicates, False),
)

sections = []
for _anchor, _heading, _frame, _show_status in _OPTIONAL_SECTIONS:
    # Problem categories are only listed when they contain at least one link.
    if len(_frame) > 0:
        sections.extend(_render_section(_anchor, _heading, _frame, _show_status))

# The OK list is always present and starts collapsed.
sections.extend(_render_section("all", "\u2705 All OK", ok_links, collapsed=True))

# Full gallery page: sticky search/toggle bar, the category sections built above,
# and a small script for searching and collapsing. Literal braces are doubled
# because this is an f-string.
# The search handler clears its inline `display` override when the query is
# empty. Forcing 'block' there would permanently defeat the
# `.section.collapsed .card` rule once the user has typed anything.
html = f"""<!doctype html>
<html><head><meta charset="utf-8"><title>Link Audit - {len(df)} links</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 16px; background: #f5f5f5; }}
h2 {{ margin: 24px 0 12px; cursor: pointer; user-select: none; }}
h2:hover {{ text-decoration: underline; }}
.section {{ margin-bottom: 24px; }}
.section.collapsed .card {{ display: none; }}
.card {{ background: white; margin: 12px 0; padding: 12px; border: 1px solid #ddd; border-radius: 4px; }}
.card.broken {{ border-left: 4px solid #e74c3c; }}
.card.mismatch {{ border-left: 4px solid #f39c12; }}
.card.ok {{ border-left: 4px solid #2ecc71; }}
.meta {{ margin-bottom: 8px; font-size: 14px; }}
.index {{ font-weight: bold; color: #666; margin-right: 8px; }}
.url {{ color: #3498db; text-decoration: none; margin-right: 8px; }}
.url:hover {{ text-decoration: underline; }}
.info {{ color: #999; font-size: 13px; }}
.controls {{ position: sticky; top: 0; background: white; padding: 12px; border: 1px solid #ddd; 
             margin-bottom: 16px; z-index: 100; display: flex; gap: 12px; align-items: center; }}
button {{ padding: 8px 16px; cursor: pointer; border: 1px solid #ccc; background: white; border-radius: 4px; }}
button:hover {{ background: #f0f0f0; }}
input {{ padding: 8px; border: 1px solid #ccc; border-radius: 4px; flex: 1; max-width: 300px; }}
</style>
</head>
<body>
<div class="controls">
  <input type="text" id="search" placeholder="Search URLs, species, or asset IDs...">
  <button onclick="toggleAll()">Expand/Collapse All</button>
  <span>Total: {len(df)} | Broken: {len(broken)} | Non-Columbidae: {len(family_mismatch)} | Suspicious: {len(suspicious)}</span>
</div>
{''.join(sections)}
<script>
const search = document.getElementById('search');
search.addEventListener('input', e => {{
  const q = e.target.value.toLowerCase();
  document.querySelectorAll('.card').forEach(c => {{
    const text = c.textContent.toLowerCase();
    c.style.display = q === '' ? '' : (text.includes(q) ? 'block' : 'none');
  }});
}});

document.querySelectorAll('h2').forEach(h => {{
  h.addEventListener('click', () => {{
    h.nextElementSibling.classList.toggle('collapsed');
  }});
}});

function toggleAll() {{
  document.querySelectorAll('.section').forEach(s => s.classList.toggle('collapsed'));
}}
</script>
</body></html>"""

# Save the gallery in the current working directory and open it in the default browser.
out_file = Path.cwd().joinpath("_link_audit_categorized.html")
out_file.write_text(html, encoding="utf-8")
webbrowser.open(out_file.as_uri())

# Console summary of the category sizes.
print(f"Categorized gallery: {out_file}")
print(f"Broken: {len(broken)} | Non-Columbidae: {len(family_mismatch)} | Suspicious: {len(suspicious)} | Duplicates: {len(duplicates)} | OK: {len(ok_links)}")