<a href="https://colab.research.google.com/github/akhmtvaae/CurveSwitch/blob/master/YCfetchcompaniesfor8x.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
yc_8x_icp_funded_to_excel.py

Exports YC companies to Excel and prioritizes those that are good fits for 8x:
- consumer/performance-marketing heavy categories (fintech, consumer apps, DTC/ecom, beauty, fem health, creator tools)
- recent batch/hiring signals
- funding signal from "Latest News" on the YC company page (best-effort)

"""

from __future__ import annotations

import argparse
import asyncio
import random
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple

import httpx
import pandas as pd
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_exponential_jitter


# JSON listing of all companies; main() fetches it once and expects a list payload.
ALL_COMPANIES_URL = "https://yc-oss.github.io/api/companies/all.json"
# Per-company YC page that gets scraped; {slug} is filled from the company's "slug" field.
YC_COMPANY_URL_TEMPLATE = "https://www.ycombinator.com/companies/{slug}"

# User-Agent header attached to both HTTP clients created in this script.
USER_AGENT = (
    "Mozilla/5.0 (compatible; 8x-yc-sourcing/1.0; +https://www.ycombinator.com/companies)"
)

# -------- Heuristics --------
# Lowercase phrases that detect_funding_signal() looks for in lowercased text
# (flatten_row() feeds it the one-liner, long description and news titles).
FUNDING_KEYWORDS = [
    "raises", "raised", "funding", "series a", "series b", "series c",
    "seed", "round", "financing", "investment", "led by"
]

# ARR mentions; group 1 of each pattern is the amount (e.g. "$5M", "800k").
#   [0] amount before the word:  "$5M ARR", "1.2m arr"
#   [1] amount after the word:   "ARR: $5M", "arr - 800k"
# The first pattern starts with (?:\$|\b) rather than a bare \b: a word
# boundary can never sit directly in front of "$" when the preceding character
# is whitespace or the start of the string, so a bare \b forced the match to
# begin at the first digit and silently dropped the currency sign.
ARR_REGEXES = [
    re.compile(r"((?:\$|\b)\d+(?:\.\d+)?\s?(?:k|m|b))\s*arr\b", re.IGNORECASE),
    re.compile(r"\barr\s*[:\-]?\s*(\$?\d+(?:\.\d+)?\s?(?:k|m|b))\b", re.IGNORECASE),
]

# Currency amounts with a k/m/b magnitude suffix, e.g. "$12M", "€3.5m", "£800k".
MONEY_REGEX = re.compile(r"(\$|€|£)\s?\d+(?:\.\d+)?\s?(?:k|m|b)\b", re.IGNORECASE)

# ICP keyword buckets (tune freely)
# Maps bucket name -> lowercase keywords. pick_icp_bucket() counts how many of
# a bucket's keywords occur in the lowercased company text and keeps the
# highest-scoring bucket. On a tie the bucket listed first here wins, so the
# order of the entries matters.
ICP_BUCKETS = {
    "Fintech": [
        "fintech", "bank", "banking", "card", "credit", "debit", "payments", "wallet",
        "lending", "loan", "remittance", "insurance", "wealth", "invest", "brokerage"
    ],
    "Consumer App": [
        "app", "mobile", "subscription", "consumer", "productivity", "photo", "video", "ai",
        "coach", "planner"
    ],
    "Creator/Video Tools": [
        "creator", "video", "editing", "caption", "clip", "ugc", "content", "social", "tiktok", "instagram"
    ],
    "DTC/Ecom/Marketplace": [
        "ecommerce", "e-commerce", "shop", "shopping", "marketplace", "brand", "direct-to-consumer",
        "dtc", "store", "checkout"
    ],
    "Beauty/Skincare": [
        "beauty", "skincare", "skin", "derm", "acne", "routine", "cosmetics", "hair"
    ],
    "Women’s Health/Femtech": [
        "women", "female", "menopause", "cycle", "period", "fertility", "pregnancy",
        "postpartum", "pelvic", "hormone", "femtech"
    ],
    "Health/Wellness": [
        "health", "wellness", "care", "clinic", "therapy", "telehealth", "medical", "patient"
    ],
}

# Business-model / paid-social fit hints
# pre_points() awards one point per hint found in the lowercased company text,
# capped at 6 points in total.
PAID_SOCIAL_FIT_HINTS = [
    "subscription", "consumer", "app", "marketplace", "brand", "shop", "card",
    "credit", "bank", "beauty", "skincare", "women", "health"
]


def now_utc() -> datetime:
    """Return the current moment as a timezone-aware datetime in UTC."""
    current = datetime.now(tz=timezone.utc)
    return current


def normalize_text(s: str) -> str:
    """Collapse each run of whitespace in *s* to one space and trim both ends.

    Falsy input (``None`` or ``""``) yields an empty string.
    """
    raw = s if s else ""
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()


def safe_join(items: Any) -> str:
    """Render *items* as a single display string.

    A list is joined with ``"; "`` after dropping ``None`` entries, ``None``
    becomes an empty string, and any other value is passed through ``str()``.
    """
    if isinstance(items, list):
        present = [str(entry) for entry in items if entry is not None]
        return "; ".join(present)
    return "" if items is None else str(items)


def parse_batch_to_date(batch: str) -> Optional[datetime]:
    """Convert a YC batch label into an approximate, timezone-aware UTC date.

    Recognised labels (case-insensitive, surrounding whitespace ignored):
      * short codes ``W21``, ``S21``, ``F24`` and ``X25`` (``X`` is treated as
        spring); the two digits are read as a year in the 2000s
      * long names ``Winter 2021``, ``Spring 2025``, ``Summer 2021``, ``Fall 2024``

    Returns:
        The first day of the season's nominal month (winter=January,
        spring=April, summer=July, fall=October), or ``None`` when *batch* is
        empty or not in a recognised form.
    """
    if not batch:
        return None
    label = batch.strip()

    # Short codes are matched case-insensitively so they behave like the long
    # form below; "X" covers spring, which previously had no short code.
    short = re.fullmatch(r"([WSFX])(\d{2})", label, flags=re.IGNORECASE)
    if short:
        month = {"W": 1, "X": 4, "S": 7, "F": 10}[short.group(1).upper()]
        return datetime(2000 + int(short.group(2)), month, 1, tzinfo=timezone.utc)

    long_form = re.fullmatch(
        r"(Winter|Spring|Summer|Fall)\s+(\d{4})", label, flags=re.IGNORECASE
    )
    if long_form:
        month = {"winter": 1, "spring": 4, "summer": 7, "fall": 10}[long_form.group(1).lower()]
        return datetime(int(long_form.group(2)), month, 1, tzinfo=timezone.utc)

    return None


def months_since(dt: Optional[datetime]) -> Optional[float]:
    """Return how many months have passed between *dt* and now (UTC).

    The elapsed whole days are divided by 30.44 (average month length).
    ``None`` is returned when *dt* is missing.
    """
    if dt:
        elapsed = now_utc() - dt
        return elapsed.days / 30.44
    return None


def blob_for_company(c: Dict[str, Any]) -> str:
    """Build one lowercased, whitespace-normalised text blob for keyword matching.

    Concatenates the company's name, one-liner, long description, industry,
    sub-industry and tags. Missing keys and explicit ``None`` values are
    treated as empty text: ``c.get(key, "")`` alone still returns ``None``
    when the key is present with a null value, which made ``str.join`` raise
    ``TypeError``.
    """
    text_fields = ("name", "one_liner", "long_description", "industry", "subindustry")
    parts = [str(c.get(field) or "") for field in text_fields]
    parts.append(safe_join(c.get("tags")))
    return normalize_text(" ".join(parts)).lower()


def detect_funding_signal(*texts: str) -> bool:
    """Report whether any of *texts* mentions a funding-related keyword.

    Falsy texts are skipped; the rest are whitespace-normalised, lowercased
    and searched for each entry of ``FUNDING_KEYWORDS``. A keyword only counts
    when it begins at a word boundary, so "round" no longer fires on
    "around"/"background" and "raises" no longer fires on "praises". Suffixes
    are still allowed ("rounds", "investments", "pre-seed").
    """
    blob = " ".join(normalize_text(t).lower() for t in texts if t)
    if not blob:
        return False
    # re.escape keeps multi-word keywords such as "led by" literal.
    return any(re.search(r"\b" + re.escape(k), blob) for k in FUNDING_KEYWORDS)


def extract_arr_value(*texts: str) -> str:
    """Return the first ARR amount mentioned in *texts*, or ``""`` if none.

    Falsy texts are skipped. The patterns in ``ARR_REGEXES`` are tried in
    order and the stripped first capture group of the first hit is returned.
    """
    blob = " ".join(normalize_text(t) for t in texts if t)
    attempts = (rx.search(blob) for rx in ARR_REGEXES)
    first_hit = next((found for found in attempts if found), None)
    return first_hit.group(1).strip() if first_hit else ""


def extract_money_mentions(*texts: str) -> str:
    """List the distinct money amounts found in *texts*, joined with ``"; "``.

    Amounts are de-duplicated case-insensitively; the first spelling seen is
    the one kept, and first-seen order is preserved.
    """
    blob = " ".join(normalize_text(t) for t in texts if t)
    # Insertion-ordered dict: lowercased key -> first original spelling.
    first_spelling: Dict[str, str] = {}
    for found in MONEY_REGEX.finditer(blob):
        mention = found.group(0)
        first_spelling.setdefault(mention.lower().strip(), mention.strip())
    return "; ".join(first_spelling.values())


def pick_icp_bucket(c: Dict[str, Any]) -> Tuple[str, int]:
    """Pick the ICP bucket whose keywords best match the company text.

    Returns:
        ``(bucket_name, bucket_score)``. ``bucket_name`` is ``"Other"`` with a
        score of 0 when no keyword matches. On a tie the bucket listed first
        in ``ICP_BUCKETS`` wins. The score is the number of matching keywords,
        capped at 8 so a long description doesn't dominate.

    A keyword only counts when it starts at a word boundary in the text.
    Plain substring matching let short keywords fire inside unrelated words
    ("ai" in "paid"/"email", "app" in "happen"/"mapping"), which gave almost
    every company at least one hit. Prefix-style keywords still work
    ("derm" -> "dermatology", "invest" -> "investing").
    """
    text = blob_for_company(c)
    best_bucket, best_hits = "Other", 0
    for bucket, keywords in ICP_BUCKETS.items():
        hits = sum(1 for kw in keywords if re.search(r"\b" + re.escape(kw), text))
        # Strict ">" keeps the earliest bucket on ties.
        if hits > best_hits:
            best_bucket, best_hits = bucket, hits

    return best_bucket, min(best_hits, 8)


def icp_filter_ok(c: Dict[str, Any], min_bucket_hits: int) -> bool:
    """Return True when the company lands in a real ICP bucket with enough hits.

    A company whose best bucket is ``"Other"`` never passes; otherwise its
    bucket score must be at least *min_bucket_hits*.
    """
    bucket, hits = pick_icp_bucket(c)
    return bucket != "Other" and hits >= min_bucket_hits


def pre_points(c: Dict[str, Any]) -> Tuple[int, str]:
    """Score a company from the listing data alone (no page scrape needed).

    Returns:
        ``(points, breakdown)`` where *breakdown* is the list of awarded
        reasons joined with ``" | "``, in the order they were evaluated:
        status, stage, hiring, batch recency, ICP bucket, paid-social hints,
        team size.
    """
    total = 0
    notes: List[str] = []

    def award(points: int, label: str) -> None:
        # Record one scoring reason and add its points to the running total.
        nonlocal total
        total += points
        notes.append(label)

    if (c.get("status") or "").lower() == "active":
        award(10, "active(+10)")

    stage_bonus = {"early": (8, "early(+8)"), "growth": (6, "growth(+6)")}
    stage_hit = stage_bonus.get((c.get("stage") or "").lower())
    if stage_hit:
        award(*stage_hit)

    if c.get("isHiring") is True:
        award(5, "hiring(+5)")

    # Batch recency: only the first (tightest) tier that fits is awarded.
    age_months = months_since(parse_batch_to_date(c.get("batch") or ""))
    if age_months is not None:
        recency_tiers = (
            (6, 12, "batch<=6mo(+12)"),
            (12, 10, "batch<=12mo(+10)"),
            (24, 7, "batch<=24mo(+7)"),
            (48, 3, "batch<=48mo(+3)"),
        )
        for ceiling, bonus, label in recency_tiers:
            if age_months <= ceiling:
                award(bonus, label)
                break

    # ICP bucket fit, doubled to amplify relevance.
    bucket, bucket_hits = pick_icp_bucket(c)
    if bucket != "Other":
        bucket_bonus = 2 * bucket_hits
        award(bucket_bonus, f"{bucket}(+{bucket_bonus})")

    # Paid-social fit hints: one point each, at most 6.
    text = blob_for_company(c)
    hint_bonus = min(sum(1 for hint in PAID_SOCIAL_FIT_HINTS if hint in text), 6)
    if hint_bonus:
        award(hint_bonus, f"paid_fit_hints(+{hint_bonus})")

    # Team size sweet spot.
    headcount = c.get("team_size")
    if isinstance(headcount, int):
        if 15 <= headcount <= 250:
            award(4, "team15-250(+4)")
        elif headcount > 250:
            award(2, "team>250(+2)")

    return total, " | ".join(notes)


@retry(stop=stop_after_attempt(3), wait=wait_exponential_jitter(initial=0.5, max=6))
async def fetch_text(client: httpx.AsyncClient, url: str) -> str:
    """GET *url* with a 30-second timeout and return the response body as text.

    ``raise_for_status`` is called on the response before the body is read.
    The call is wrapped by the ``retry`` decorator above (up to 3 attempts
    with jittered exponential backoff).
    """
    response = await client.get(url, timeout=30)
    response.raise_for_status()
    body = response.text
    return body


@retry(stop=stop_after_attempt(3), wait=wait_exponential_jitter(initial=0.5, max=6))
async def fetch_json(client: httpx.AsyncClient, url: str) -> Any:
    """GET *url* with a 30-second timeout and return the decoded JSON payload.

    ``raise_for_status`` is called on the response before it is decoded.
    The call is wrapped by the ``retry`` decorator above (up to 3 attempts
    with jittered exponential backoff).
    """
    response = await client.get(url, timeout=30)
    response.raise_for_status()
    payload = response.json()
    return payload


def looks_like_name(s: str) -> bool:
    """Heuristically decide whether *s* reads like a person's full name.

    After whitespace normalisation the text must be 3-60 characters long and
    consist of at least two capitalised words made only of ASCII letters,
    apostrophes and hyphens.
    """
    candidate = normalize_text(s)
    if not 3 <= len(candidate) <= 60:
        return False
    name_pattern = r"^[A-Z][A-Za-z'\-]+(\s+[A-Z][A-Za-z'\-]+)+$"
    return re.match(name_pattern, candidate) is not None


def parse_company_page(html: str) -> Dict[str, Any]:
    """Best-effort scrape of a YC company page.

    Returns a dict with three keys:
      * ``"news"``: up to 6 ``{"title", "url"}`` dicts taken from links that
        follow the "Latest News" text, de-duplicated by lowercased title
      * ``"founders"``: ``{"name", "linkedin", "role"}`` dicts built from
        LinkedIn links that follow the "Founders" text, de-duplicated by
        (lowercased name, link)
      * ``"company_linkedin"``: the first ``linkedin.com/company`` link on the
        page, or ``""``
    """
    soup = BeautifulSoup(html, "html.parser")

    # --- Latest News: scan the links after the heading, keep distinct titles ---
    news_items: List[Dict[str, str]] = []
    news_heading = soup.find(string=re.compile(r"\bLatest News\b", re.IGNORECASE))
    if news_heading:
        seen_titles = set()
        for link in news_heading.parent.find_all_next("a", href=True, limit=60):
            title = normalize_text(link.get_text(" ", strip=True))
            href = link.get("href", "")
            if not (title and href):
                continue
            title_key = title.lower()
            if title_key in seen_titles:
                continue
            seen_titles.add(title_key)
            # News links are often external; they are kept as-is.
            news_items.append({"title": title, "url": href})
            if len(news_items) >= 6:
                break

    # --- Founders: LinkedIn links after the heading, paired with a nearby name ---
    founders: List[Dict[str, str]] = []
    founders_heading = soup.find(
        string=re.compile(r"\bActive Founders\b|\bFounders\b", re.IGNORECASE)
    )
    if founders_heading:
        section_end = re.compile(r"\bLatest News\b|\bCompany Launches\b", re.IGNORECASE)
        # Keyed by (lowercased name, link): first occurrence fixes the order,
        # the last occurrence supplies the stored record.
        by_identity = {}
        for link in founders_heading.parent.find_all_next("a", href=True, limit=220):
            href = link.get("href", "")
            link_text = normalize_text(link.get_text(" ", strip=True))
            if section_end.search(link_text):
                break
            if "linkedin.com" not in href:
                continue

            # Walk backwards through the preceding text nodes for a human name.
            preceding = (
                normalize_text(node)
                for node in link.find_all_previous(string=True, limit=35)
            )
            name = next((text for text in preceding if looks_like_name(text)), "")
            if name:
                by_identity[(name.lower(), href)] = {
                    "name": name,
                    "linkedin": href,
                    "role": "Founder",
                }
        founders = list(by_identity.values())

    # --- Company LinkedIn: first matching link anywhere on the page ---
    company_linkedin = next(
        (
            anchor["href"]
            for anchor in soup.find_all("a", href=True)
            if "linkedin.com/company" in anchor["href"]
        ),
        "",
    )

    return {"news": news_items, "founders": founders, "company_linkedin": company_linkedin}


def pick_point_of_contact(enriched: Dict[str, Any]) -> Tuple[str, str, str]:
    """Return ``(name, role, linkedin)`` of the first founder in *enriched*.

    The role falls back to ``"Founder"`` and the other fields to ``""`` when
    absent. Three empty strings are returned when there are no founders (or
    *enriched* itself is falsy).
    """
    founders = (enriched or {}).get("founders") or []
    if not founders:
        return "", "", ""
    lead = founders[0]
    return lead.get("name", ""), lead.get("role", "Founder"), lead.get("linkedin", "")


def post_points(
    c: Dict[str, Any],
    funding_signal: bool,
    bucket: str,
    bucket_hits: int,
) -> Tuple[int, str]:
    """Compute the final score: the pre-score plus funding and bucket bonuses.

    Adds 12 points when *funding_signal* is set, then 6 points for the
    highest-priority buckets or 4 for the second tier. *bucket_hits* is
    accepted but not used by this function.

    Returns:
        ``(points, breakdown)`` with the non-empty reasons joined by ``" | "``.
    """
    total, base_breakdown = pre_points(c)
    breakdown = [base_breakdown] if base_breakdown else []

    if funding_signal:
        total += 12
        breakdown.append("funding_signal(+12)")

    top_tier = {"Fintech", "DTC/Ecom/Marketplace", "Beauty/Skincare", "Women’s Health/Femtech"}
    second_tier = {"Creator/Video Tools", "Consumer App"}
    if bucket in top_tier:
        total += 6
        breakdown.append(f"{bucket}_priority(+6)")
    elif bucket in second_tier:
        total += 4
        breakdown.append(f"{bucket}_priority(+4)")

    return total, " | ".join(part for part in breakdown if part)


async def enrich_batch(
    companies: List[Dict[str, Any]],
    concurrency: int,
    sleep_min: float,
    sleep_max: float,
) -> List[Dict[str, Any]]:
    """Scrape each company's YC page and attach the result under ``"_enriched"``.

    Args:
        companies: company dicts; each one is mutated in place and returned.
        concurrency: maximum simultaneous page fetches. Values below 1 are
            raised to 1, because ``asyncio.Semaphore(0)`` would block every
            task forever and a negative value raises ``ValueError``.
        sleep_min: lower bound, in seconds, of the random pause before a fetch.
        sleep_max: upper bound, in seconds, of the random pause before a fetch.

    Returns:
        The same company dicts, in input order. Enrichment is best-effort: a
        company with no slug, or whose page cannot be fetched or parsed, gets
        an empty ``{"news": [], "founders": [], "company_linkedin": ""}``.
    """
    import logging  # local import keeps this block self-contained

    if not companies:
        # Nothing to do; avoid opening an HTTP client for no requests.
        return []

    log = logging.getLogger(__name__)
    sem = asyncio.Semaphore(max(1, concurrency))

    async with httpx.AsyncClient(headers={"User-Agent": USER_AGENT, "Accept-Language": "en"}) as client:

        async def enrich_one(c: Dict[str, Any]) -> Dict[str, Any]:
            parsed: Dict[str, Any] = {"news": [], "founders": [], "company_linkedin": ""}
            slug = c.get("slug") or ""
            if slug:
                url = YC_COMPANY_URL_TEMPLATE.format(slug=slug)
                async with sem:
                    await asyncio.sleep(random.uniform(sleep_min, sleep_max))
                    try:
                        html = await fetch_text(client, url)
                        parsed = parse_company_page(html)
                    except Exception as exc:
                        # Best-effort: one bad page must not abort the batch,
                        # but leave a trace instead of failing silently.
                        log.warning("Enrichment failed for %s: %s", url, exc)
            else:
                # An empty slug would resolve to the directory index page and
                # yield unrelated founders/news, so skip the request entirely.
                log.warning("Skipping enrichment for company without slug: %r", c.get("name"))
            c["_enriched"] = parsed
            return c

        return await asyncio.gather(*(enrich_one(c) for c in companies))


def flatten_row(c: Dict[str, Any]) -> Dict[str, Any]:
    """Turn one enriched company dict into a flat row for the spreadsheet.

    Combines raw listing fields, the scraped ``"_enriched"`` data (news,
    founders, company LinkedIn), the derived funding/ARR signals, the ICP
    bucket and the final score with its breakdown. The key order of the
    returned dict is the column order of the export.
    """
    enriched = c.get("_enriched", {}) or {}
    news = enriched.get("news") or []
    founders = enriched.get("founders") or []

    news_titles = "; ".join(item.get("title", "") for item in news if item.get("title"))
    news_urls = "; ".join(item.get("url", "") for item in news if item.get("url"))

    description = c.get("long_description", "")
    funding_signal = detect_funding_signal(c.get("one_liner", ""), description, news_titles)
    arr_val = extract_arr_value(description, news_titles)
    money_mentions = extract_money_mentions(description, news_titles)

    bucket, bucket_hits = pick_icp_bucket(c)
    points, points_breakdown = post_points(c, funding_signal, bucket, bucket_hits)

    poc_name, poc_role, poc_linkedin = pick_point_of_contact(enriched)

    tags_joined = safe_join(c.get("tags"))
    founder_names = "; ".join(f.get("name", "") for f in founders if f.get("name"))

    row: Dict[str, Any] = {
        # Raw listing fields
        "name": c.get("name"),
        "slug": c.get("slug"),
        "batch": c.get("batch"),
        "status": c.get("status"),
        "stage": c.get("stage"),
        "industry": c.get("industry"),
        "subindustry": c.get("subindustry"),
        "tags": tags_joined,
        "team_size": c.get("team_size"),
        "is_hiring": c.get("isHiring"),
        "locations": c.get("all_locations"),
        "website": c.get("website"),
        "yc_url": c.get("url"),
        # Derived signals
        "icp_bucket": bucket,
        "raised_funding_signal": funding_signal,
        "funding_money_mentions": money_mentions,
        "arr_mentioned": bool(arr_val),
        "arr_value": arr_val,
        # Scoring
        "vector": f"{bucket} | {c.get('industry','')} | {c.get('subindustry','')} | {tags_joined}",
        "points": points,
        "points_breakdown": points_breakdown,
        # Scraped people / links
        "company_linkedin": enriched.get("company_linkedin", ""),
        "founders": founder_names,
        "poc_name": poc_name,
        "poc_role": poc_role,
        "poc_linkedin": poc_linkedin,
        # Scraped news
        "latest_news_titles": news_titles,
        "latest_news_urls": news_urls,
    }
    return row


async def main() -> None:
    """Command-line entry point: fetch, pre-score, enrich, rank and export.

    Pipeline:
      1. Download the full company listing and apply the cheap filters
         (``--active-only``, ``--recent-batches-months``, ``--icp-only``).
      2. Pre-score every remaining company and scrape the YC pages of the
         best ``--prefetch`` of them.
      3. Flatten every enriched company, apply ``--funded-only``, sort by
         final points and only then keep the top ``--limit`` rows. Truncating
         before the sort (as this function used to) kept rows in pre-score
         order and threw away higher-scoring enriched companies.
      4. Write the result to the ``--out`` Excel file.

    Raises:
        RuntimeError: if the listing payload is not a list, or if no rows
            survive the filters.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--limit", type=int, default=1000)
    ap.add_argument("--out", type=str, default="yc_8x_1000.xlsx")
    ap.add_argument("--concurrency", type=int, default=5)
    ap.add_argument("--sleep-min", type=float, default=0.25)
    ap.add_argument("--sleep-max", type=float, default=0.75)

    ap.add_argument("--active-only", action="store_true", help="Only keep status=Active")
    ap.add_argument("--funded-only", action="store_true", help="Only keep companies with funding signal from YC page news")
    ap.add_argument("--prefetch", type=int, default=6000, help="How many top candidates to enrich before filtering down")
    ap.add_argument("--icp-only", action="store_true", help="Filter to 8x ICP buckets only")
    ap.add_argument("--min-icp-hits", type=int, default=1, help="Min keyword hits to be considered ICP")
    ap.add_argument("--recent-batches-months", type=int, default=60, help="Keep companies with batch within N months (best-effort)")
    args = ap.parse_args()

    if args.limit < 1:
        ap.error("--limit must be a positive integer")

    async with httpx.AsyncClient(headers={"User-Agent": USER_AGENT}) as client:
        all_companies = await fetch_json(client, ALL_COMPANIES_URL)

    if not isinstance(all_companies, list):
        raise RuntimeError("Unexpected payload from ALL_COMPANIES_URL")

    # 1) Filter (lightweight) + pre-score
    candidates: List[Tuple[int, Dict[str, Any]]] = []
    for c in all_companies:
        if not isinstance(c, dict):
            # Defensive: ignore malformed entries rather than crash on .get().
            continue

        if args.active_only and (c.get("status") or "").lower() != "active":
            continue

        # Companies whose batch cannot be parsed are kept (best-effort filter).
        age_months = months_since(parse_batch_to_date(c.get("batch") or ""))
        if age_months is not None and age_months > args.recent_batches_months:
            continue

        if args.icp_only and not icp_filter_ok(c, min_bucket_hits=args.min_icp_hits):
            continue

        score, _ = pre_points(c)
        candidates.append((score, c))

    # Highest pre-score first (list.sort is stable, so listing order breaks ties).
    candidates.sort(key=lambda pair: pair[0], reverse=True)

    # 2) Enrich top-N candidates with YC page scrape (founders + latest news)
    top_candidates = [c for _, c in candidates[: args.prefetch]]
    enriched = await enrich_batch(
        top_candidates,
        concurrency=args.concurrency,
        sleep_min=args.sleep_min,
        sleep_max=args.sleep_max,
    )

    # 3) Flatten + final filter
    rows = [flatten_row(c) for c in enriched]
    if args.funded_only:
        rows = [row for row in rows if row["raised_funding_signal"]]

    if not rows:
        raise RuntimeError("No rows produced. Try increasing --prefetch or disabling --funded-only.")

    # Rank by the final score first, then cut to --limit.
    df = pd.DataFrame(rows)
    df.sort_values(by=["points", "raised_funding_signal", "is_hiring"], ascending=[False, False, False], inplace=True)
    df = df.head(args.limit)

    # 4) Export
    with pd.ExcelWriter(args.out, engine="openpyxl") as w:
        df.to_excel(w, index=False, sheet_name="companies")

    print(f"Done. Exported {len(df)} rows to: {args.out}")


# Script entry point: run the async pipeline to completion when executed directly.
# NOTE(review): asyncio.run() raises RuntimeError if an event loop is already
# running (e.g. inside a notebook cell) — confirm how this file is launched.
if __name__ == "__main__":
    asyncio.run(main())
