In [6]:
# google_org_repo_scraper.py
# 要件: API不使用 / 1秒スリープ / SQLite保存 / SELECT表示 / 既存DBを安全にマイグレーション
import requests, time, sqlite3, re
from bs4 import BeautifulSoup

# ===== Settings =====
ORG  = "google"
BASE = "https://github.com"
LIST = f"{BASE}/orgs/{ORG}/repositories"   # organization repository list (paginated server-side via ?page=N)
DB   = "google_repos.db"
MAX_PAGES = None  # e.g. 3 stops after 3 pages; None keeps crawling until a page yields no repositories.

# Request headers sent with every page fetch.
HEADERS = {
    "User-Agent": "simple-scraper/1.0 (+your_email@example.com)",
    "Accept-Language": "ja,en;q=0.9",
}

# ===== ユーティリティ =====
def parse_stars(s):
    """Normalize a star-count label such as '1.2k', '3M' or '1,234' to an int.

    Args:
        s: Text of the star counter; may be None or empty.

    Returns:
        The (approximate) star count as an int. Returns 0 whenever the text
        cannot be interpreted as a number, including malformed values such
        as '1.2.3k'.
    """
    text = (s or "").strip().lower().replace(",", "")

    # Abbreviated form: a decimal number followed by a 'k' or 'm' suffix.
    m = re.match(r"^([\d.]+)\s*([km])$", text)
    if m:
        factor = 1_000 if m.group(2) == "k" else 1_000_000
        try:
            # round() instead of int() so binary float error cannot shave
            # one off the result.
            return round(float(m.group(1)) * factor)
        except ValueError:
            # The pattern also admits strings like '1.2.3' or '.', which
            # float() rejects; honour the "0 on failure" contract.
            return 0

    # isdecimal() only accepts characters that int() can actually convert
    # (isdigit() also accepts e.g. superscript digits, which int() rejects).
    return int(text) if text.isdecimal() else 0

def get_soup(url):
    """Download `url` and return the page parsed as a BeautifulSoup tree.

    The request is sent with the module-level HEADERS and a 30 second
    timeout. An HTTP error status raises via `raise_for_status()`. After
    each successful fetch the function pauses for one second as a polite
    delay before returning.
    """
    response = requests.get(url, timeout=30, headers=HEADERS)
    response.raise_for_status()

    # Required one-second pause between requests.
    time.sleep(1)

    parsed = BeautifulSoup(response.text, "html.parser")
    return parsed

def extract_repos(soup):
    """Extract (name, language, stars) tuples from one repository-list page.

    Args:
        soup: Parsed HTML of a list page (must support `select`).

    Returns:
        A list of `(name, language, stars)` tuples in document order, one per
        repository name. `language` is None when no language element is
        found; `stars` is 0 when no stargazers link is found.
    """
    rows = []
    # The selector below is deliberately broad, so the same repository can be
    # matched through several nested containers (e.g. a `div.Box-row` inside
    # an `li`). Track names already emitted so each repository is reported
    # once and the caller's processed count is not inflated.
    seen = set()

    # Cast a wide net over typical card containers (older and newer layouts).
    cards = soup.select("li[itemprop='owns'], div.Box-row, li")
    for card in cards:
        # Repository name: first link pointing at /<ORG>/<repo>.
        a = card.select_one(f'a[href^="/{ORG}/"]')
        if not a:
            continue
        href = a.get("href", "/").strip("/")
        parts = href.split("/")
        # Skip deeper links such as /<ORG>/<repo>/issues.
        if len(parts) != 2 or parts[0] != ORG:
            continue
        name = parts[1]
        if name in seen:
            continue
        seen.add(name)

        # Primary language: itemprop span first, otherwise the span that
        # directly follows the "Programming language" icon.
        lang = None
        span = card.select_one("span[itemprop='programmingLanguage']")
        if span and span.get_text(strip=True):
            lang = span.get_text(strip=True)
        else:
            near = card.select_one('svg[aria-label="Programming language"] + span')
            if near and near.get_text(strip=True):
                lang = near.get_text(strip=True)

        # Star count: visible text of the /stargazers link (the link itself
        # is never followed).
        stars = 0
        st_link = card.select_one('a[href$="/stargazers"]')
        if st_link:
            stars = parse_stars(st_link.get_text(strip=True))

        rows.append((name, lang, stars))
    return rows

# ===== DB =====
def init_db(conn):
    """Ensure the `repos` table exists and carries a `last_seen` column.

    The table is first created with the minimal column set (name, language,
    stars) if it is missing. The current schema is then inspected and
    `last_seen` is added only when absent, so a database created without
    that column is migrated in place. Changes are committed before returning.
    """
    create_sql = """
        CREATE TABLE IF NOT EXISTS repos(
          name TEXT PRIMARY KEY,
          language TEXT,
          stars INTEGER
        )
    """
    conn.execute(create_sql)

    # PRAGMA table_info yields one row per column; index 1 is the column name.
    existing_columns = [info[1] for info in conn.execute("PRAGMA table_info(repos)")]
    needs_last_seen = "last_seen" not in existing_columns
    if needs_last_seen:
        conn.execute("ALTER TABLE repos ADD COLUMN last_seen TIMESTAMP")

    conn.commit()

def save_rows(conn, rows):
    """Upsert scraped repositories into the `repos` table and commit.

    Args:
        conn: Open sqlite3 connection whose `repos` table has the columns
            name, language, stars and last_seen.
        rows: Iterable of `(name, language, stars)` tuples.

    A new name is inserted; an existing name has its language and stars
    overwritten. In both cases `last_seen` is set to the current timestamp,
    so freshly inserted repositories no longer keep a NULL `last_seen`.
    """
    cur = conn.cursor()
    cur.executemany("""
        INSERT INTO repos(name, language, stars, last_seen)
        VALUES (?,?,?,CURRENT_TIMESTAMP)
        ON CONFLICT(name) DO UPDATE SET
          language = excluded.language,
          stars    = excluded.stars,
          last_seen= CURRENT_TIMESTAMP
    """, rows)
    conn.commit()

def print_top(conn, limit=20):
    """Print the `limit` most-starred repositories as an aligned table.

    Rows are read from the `repos` table ordered by stars descending (rows
    with NULL stars last), ties broken by name ascending. A missing or empty
    language and a NULL star count are both rendered as '-'.
    """
    query = (
        "SELECT name, language, stars FROM repos "
        "ORDER BY (stars IS NULL), stars DESC, name ASC LIMIT ?"
    )
    print(f"\nTop {limit} by stars:")
    for record in conn.execute(query, (limit,)):
        repo, language, star_count = record
        language_cell = language if language else "-"
        star_cell = "-" if star_count is None else star_count
        print(f"{repo:30s} | {language_cell:12s} | {star_cell}")

# ===== メイン処理 =====
def main():
    """Crawl the organization's repository list page by page and store it.

    Opens the SQLite database, prepares the schema, then fetches list pages
    starting at page 1. Each page's repositories are upserted immediately.
    Crawling stops when a page yields no repositories, or once MAX_PAGES
    pages have been handled (when MAX_PAGES is not None). Finally the number
    of processed rows and the top 20 repositories by stars are printed. The
    connection is always closed, even on error.
    """
    conn = sqlite3.connect(DB)
    try:
        init_db(conn)

        processed = 0
        page_no = 1
        crawling = True
        while crawling:
            page_url = f"{LIST}?type=all&sort=updated&page={page_no}"
            found = extract_repos(get_soup(page_url))
            if found:
                save_rows(conn, found)
                processed += len(found)
                page_no += 1
                # Continue unless a page cap is set and has been exceeded.
                crawling = MAX_PAGES is None or page_no <= MAX_PAGES
            else:
                # An empty page means we ran past the last page.
                crawling = False

        print(f"Total repositories processed: {processed}")
        print_top(conn, limit=20)
    finally:
        conn.close()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Total repositories processed: 2811

Top 20 by stars:
material-design-icons          | -            | 53000
guava                          | -            | 51000
zx                             | -            | 45000
styleguide                     | -            | 39000
leveldb                        | -            | 38000
googletest                     | -            | 37000
comprehensive-rust             | -            | 32000
material-design-lite           | -            | 32000
python-fire                    | -            | 28000
flatbuffers                    | -            | 25000
gson                           | -            | 24000
ExoPlayer                      | -            | 22000
iosched                        | -            | 22000
eng-practices                  | -            | 20000
cadvisor                       | -            | 19000
filament                       | -            | 19000
fonts                          | -            | 19000
dagger                       