In [3]:
# scrape_google_repos.py
import re, time, sqlite3, argparse
import requests
from bs4 import BeautifulSoup

time.sleep(1)  # ← 指定どおり1秒待機
ORG = "google"
BASE = f"https://github.com/google"


def parse_stars(s: str) -> int | None:
    s = s.strip().lower().replace(",", "")
    m = re.match(r"^([\d.]+)\s*k$", s)
    if m: return int(float(m.group(1)) * 1000)
    m = re.match(r"^([\d.]+)\s*m$", s)
    if m: return int(float(m.group(1)) * 1_000_000)
    return int(s) if s.isdigit() else None

def fetch_list_page(session: requests.Session, page: int) -> BeautifulSoup:
    params = {"page": page, "type": "sources", "sort": "pushed"}
    r = session.get(BASE, params=params, headers=HEADERS, timeout=20)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

def extract_rows(soup: BeautifulSoup) -> list[tuple[str, str | None, int | None]]:
    # まずスター数を /stargazers のリンクから拾う（ページ内テキストのみ参照）
    star_map: dict[str, int | None] = {}
    for a in soup.select('a[href*="/stargazers"]'):
        href = a.get("href", "")
        m = re.match(rf"^/{ORG}/([^/]+)/stargazers$", href)
        if m:
            repo = m.group(1)
            star_map[repo] = parse_stars(a.get_text(strip=True))

    rows = []
    for h3 in soup.select("h3"):
        a = h3.select_one(f'a[href^="/{ORG}/"]')
        if not a: 
            continue
        parts = a.get("href", "").strip("/").split("/")
        if len(parts) != 2 or parts[0] != ORG:
            continue
        repo = parts[1]

        # 言語はカード近傍で探索
        lang = None
        container = h3
        for _ in range(5):
            if container is None:
                break
            svg = container.select_one('svg[aria-label="Programming language"]')
            if svg:
                t = svg.find_next(string=True)
                if t and t.strip():
                    lang = t.strip()
                    break
            span = container.select_one('span[itemprop="programmingLanguage"]')
            if span:
                lang = span.get_text(strip=True) or None
                break
            container = container.parent

        stars = star_map.get(repo)
        rows.append((repo, lang, stars))
    return rows

def save_to_db(db_path: str, rows: list[tuple[str, str | None, int | None]]):
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.execute("""
        CREATE TABLE IF NOT EXISTS repos(
            name TEXT PRIMARY KEY,
            language TEXT,
            stars INTEGER,
            last_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    cur.executemany("""
        INSERT INTO repos(name, language, stars)
        VALUES (?, ?, ?)
        ON CONFLICT(name) DO UPDATE SET
          language=excluded.language,
          stars=excluded.stars,
          last_seen=CURRENT_TIMESTAMP
    """, rows)
    conn.commit()

    print("Top 20 by stars:")
    for name, lang, stars in cur.execute(
        "SELECT name, language, stars FROM repos "
        "ORDER BY (stars IS NULL), stars DESC LIMIT 20"
    ):
        print(f"{name:30s} | {lang or '-':12s} | {stars if stars is not None else '-'}")
    conn.close()

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--pages", type=int, default=2, help="取得する一覧ページ数（1ページ≒30件）")
    ap.add_argument("--db", type=str, default="google_repos.db", help="SQLite DBファイル名")
    args = ap.parse_args()

    session = requests.Session()
    all_rows = []
    for p in range(1, args.pages + 1):
        soup = fetch_list_page(session, p)
        all_rows.extend(extract_rows(soup))
        time.sleep(1)  # ← 指定どおり1秒待機

    save_to_db(args.db, all_rows)

if __name__ == "__main__":
    main()

usage: ipykernel_launcher.py [-h] [--pages PAGES] [--db DB]
ipykernel_launcher.py: error: unrecognized arguments: --f=/Users/atsuki/Library/Jupyter/runtime/kernel-v3c1926d2abebf1edb21631a3bd94d379c31241fd6.json


SystemExit: 2