
# acquisition.ipynb — Data Acquisition (Web Scraping / API)

This notebook collects **global university ranking data** (name, rank, country, score, year) from a public,
multi-page ranking website (default: **CWUR**). It demonstrates:

- Robust HTTP requests with retry/backoff
- Polite scraping (User-Agent, small delay)
- Pagination across **at least 5 pages**
- Output as a single **CSV** and a **list of dicts** (saved to `.jsonl`)

> ⚠️ Run-time note: Ranking sites may change their page structure or rate-limit requests. If the default source fails,
  switch to a backup source (QS, THE, or CWTS Leiden) by editing `SOURCE` and the selectors in the `parse_page()` function.


In [None]:

# Standard imports
from __future__ import annotations
import time
import json
import math
import csv
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Iterable, Optional

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter, Retry

# Output locations: every artefact produced by this notebook lives under ./data_raw
OUT_DIR = Path("./data_raw")
RAW_CSV = OUT_DIR.joinpath("rankings_raw.csv")
RAW_JSONL = OUT_DIR.joinpath("rankings_raw.jsonl")
# Create the directory (and any missing parents) up front; a pre-existing directory is fine.
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Scrape configuration
@dataclass
class SourceConfig:
    """Settings describing one ranking source and how its pages are requested.

    The notebook keeps a single module-level instance (`SOURCE`) that the
    URL builder, the parser, the session factory and the scrape loop read.
    """

    name: str              # label for the source (e.g. "CWUR")
    base_url: str          # URL template; `{year}` is filled in with `year` via str.format
    pagination_param: str  # query-string key used for paging, e.g., 'start' or 'page'
    first_page: int        # value of the pagination parameter for the first page
    per_page: int          # amount added to the pagination parameter for each further page
    pages: int             # number of pages to request (scrape at least 5 pages)
    year: int              # ranking edition; substituted into base_url and stored on every row
    headers: Dict[str, str]  # HTTP headers applied to the shared requests session

# --- Default: CWUR (example) ---
# With these settings page_url() produces <base_url>?start=<offset> with offsets 0, 100, 200, ...
# NOTE(review): confirm that the site actually honours the `start` parameter; if it is ignored,
# every request returns the same table. Adjust if needed.
SOURCE = SourceConfig(
    name="CWUR",
    base_url="https://cwur.org/{year}.php",
    pagination_param="start",
    first_page=0,
    per_page=100,     # expected per-page rows; also the offset step between consecutive pages
    pages=6,          # 6 pages => ~600 rows (>= 500 requirement)
    year=2024,
    headers={
        # Browser-like User-Agent; make_session() installs these headers on the session.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/120.0 Safari/537.36"
    }
)


In [None]:

def make_session() -> requests.Session:
    """Build a `requests.Session` that retries transient failures.

    GET and HEAD requests are retried up to 5 times (backoff factor 0.5)
    when the server answers 429, 500, 502, 503 or 504. The headers from
    `SOURCE.headers` are installed as session-wide defaults.

    Returns:
        The configured session.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "HEAD"],
    )
    # One adapter instance serves both schemes so they share the same policy.
    shared_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    for scheme in ("http://", "https://"):
        http.mount(scheme, shared_adapter)
    http.headers.update(SOURCE.headers)
    return http


In [1]:

def page_url(page_index: int) -> str:
    """Return the URL of the page at zero-based position `page_index`.

    The year is substituted into `SOURCE.base_url`, and the pagination value
    is `SOURCE.first_page` plus `page_index` steps of `SOURCE.per_page`,
    e.g. https://cwur.org/2024.php?start=0, ...?start=100, ...?start=200.
    """
    endpoint = SOURCE.base_url.format(year=SOURCE.year)
    pagination_value = SOURCE.first_page + SOURCE.per_page * page_index
    return "{}?{}={}".format(endpoint, SOURCE.pagination_param, pagination_value)

def parse_page(html: str) -> List[Dict]:
    """Parse a single ranking page and return its rows as dicts.

    The first <table> in the document is used (or the first <tbody> when the
    markup has no <table> element). Columns are located from the header row
    when one is present, so extra columns between the country and the score
    do not shift the score; without a recognised header the positional
    layout [Rank, University, Country, Score] is assumed.

    Args:
        html: Raw HTML of one ranking page.

    Returns:
        One dict per data row with the keys "Year", "Rank", "University",
        "Country" and "Overall_Score". Cell values are the stripped cell
        text; empty or missing cells become None. Rows with fewer than
        three cells, header rows, and rows lacking a rank or a name are
        skipped. An empty list is returned when no table is found.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table")
    if table is None:
        # Fragments may carry the rows in a bare <tbody>.
        table = soup.find("tbody")
    if table is None:
        return []

    # Header captions (lower-cased) accepted for each field, in priority order.
    header_aliases = {
        "rank": ("world rank", "rank"),
        "name": ("university", "institution"),
        "country": ("country", "location"),
        "score": ("score", "overall score"),
    }
    # Positional fallback used until (or unless) a header row is seen.
    columns = {"rank": 0, "name": 1, "country": 2, "score": 3}

    def pick(cells: List[str], field: str) -> Optional[str]:
        """Return the text of `field`'s column, or None if empty/missing."""
        idx = columns[field]
        return (cells[idx] or None) if idx < len(cells) else None

    rows = []
    for tr in table.find_all("tr"):
        cells = [c.get_text(strip=True) for c in tr.find_all(["td", "th"])]
        if len(cells) < 3:
            continue

        lowered = [text.lower() for text in cells]
        is_header = (
            lowered[0] in header_aliases["rank"]
            or lowered[1] in header_aliases["name"]
        )
        if is_header:
            # Re-map the columns from the captions, then skip the header itself.
            for field, captions in header_aliases.items():
                for caption in captions:
                    if caption in lowered:
                        columns[field] = lowered.index(caption)
                        break
            continue

        rank = pick(cells, "rank")
        name = pick(cells, "name")
        if not name or not rank:
            continue

        rows.append({
            "Year": SOURCE.year,
            "Rank": rank,
            "University": name,
            "Country": pick(cells, "country"),
            "Overall_Score": pick(cells, "score"),
        })
    return rows


NameError: name 'List' is not defined

In [None]:

def scrape() -> List[Dict]:
    """Fetch every configured page and return the combined ranking rows.

    Requests `SOURCE.pages` pages in order, parses each with `parse_page()`
    and collects the rows. Rows that are exact duplicates of rows already
    collected are dropped, so a server that ignores the pagination parameter
    (and therefore returns the same table for every page) does not inflate
    the result.

    Returns:
        The de-duplicated rows in the order they were first seen.

    Raises:
        requests.HTTPError: If a page still returns an error status after
            the session's retries (via `raise_for_status()`).
    """
    all_rows: List[Dict] = []
    seen = set()
    # Using the session as a context manager closes it when the loop is done.
    with make_session() as session:
        for p in range(SOURCE.pages):
            url = page_url(p)
            print(f"Fetching page {p+1}/{SOURCE.pages}: {url}")
            resp = session.get(url, timeout=30)
            resp.raise_for_status()
            page_rows = parse_page(resp.text)
            print(f"  Found {len(page_rows)} rows.")

            fresh_rows = []
            for row in page_rows:
                # The whole row is the key, so only exact repeats are dropped.
                fingerprint = tuple(row.items())
                if fingerprint not in seen:
                    seen.add(fingerprint)
                    fresh_rows.append(row)
            duplicates = len(page_rows) - len(fresh_rows)
            if duplicates:
                print(f"  Skipped {duplicates} duplicate rows already collected.")
            all_rows.extend(fresh_rows)

            if p + 1 < SOURCE.pages:
                time.sleep(0.7)  # Polite delay between requests; none needed after the last
    print(f"Total rows scraped: {len(all_rows)}")
    return all_rows


In [None]:

def save_jsonl(rows: List[Dict], path: Path) -> None:
    """Write `rows` to `path` as JSON Lines.

    Each row becomes one JSON object on its own line. The file is UTF-8
    encoded and non-ASCII characters are written as-is rather than escaped.
    An existing file at `path` is overwritten.
    """
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rows
        )


def save_csv(rows: List[Dict], path: Path) -> None:
    """Write `rows` to `path` as a UTF-8 CSV file with a header line.

    The columns are the union of the keys of all rows, in first-seen order,
    so a row carrying a key that the first row lacks no longer makes the
    writer raise; keys missing from a row are written as empty fields.

    Args:
        rows: Records to write. When empty, nothing is written and any
            existing file at `path` is left untouched (there are no column
            names to emit).
        path: Destination file; overwritten if it exists.
    """
    if not rows:
        return
    # dict.fromkeys keeps insertion order, giving an ordered de-duplicated union.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))
    with path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


In [None]:

if __name__ == "__main__":
    # Run the full pipeline: scrape, then persist the rows in both formats.
    rows = scrape()
    save_jsonl(rows, RAW_JSONL)
    save_csv(rows, RAW_CSV)
    if rows:
        print(f"Saved: {RAW_CSV} and {RAW_JSONL}")
    else:
        # save_csv() writes nothing for an empty result, so do not claim it was saved.
        print(f"No rows scraped: {RAW_CSV} was not written and {RAW_JSONL} is empty.")
    # Also show a small preview
    for r in rows[:5]:
        print(r)
