# UFC Stats Scraper
This notebook adapts the original `ufcstats_scraper.py` script into executable cells. Run the cells from top to bottom to build the scraper and optionally execute the crawl.

## Environment Setup
Install the required dependencies in the active environment before running the scraper. You can either create the provided virtual environment (see `README_environment.md`) or run:
```bash
pip install requests requests-cache beautifulsoup4 tenacity pandas python-dateutil tqdm
```

In [70]:
import json
import os
import re
import time
from datetime import date

import pandas as pd
import requests
import requests_cache
from bs4 import BeautifulSoup as BS
from dateutil import parser as dateparser
from tenacity import retry, stop_after_attempt, wait_exponential
from tqdm import tqdm

# Root of the site being scraped; listing URLs are built from this prefix.
BASE = "http://ufcstats.com"
# Directory that receives every output file below; created when this cell runs.
DATA_DIR = "./ufc_out"
os.makedirs(DATA_DIR, exist_ok=True)

# One CSV per output table written by crawl().
EVENTS_CSV = os.path.join(DATA_DIR, "events.csv")
FIGHTS_CSV = os.path.join(DATA_DIR, "fights.csv")
FIGHTERS_CSV = os.path.join(DATA_DIR, "fighters.csv")
TOT_OVERALL_CSV = os.path.join(DATA_DIR, "fight_totals_overall.csv")
TOT_ROUND_CSV = os.path.join(DATA_DIR, "fight_totals_round.csv")
SIG_OVERALL_CSV = os.path.join(DATA_DIR, "fight_sig_overall.csv")
SIG_ROUND_CSV = os.path.join(DATA_DIR, "fight_sig_round.csv")
# URLs that could not be parsed, with the error text.
FAIL_CSV = os.path.join(DATA_DIR, "failures.csv")
# Resume checkpoint read by load_state() and written by save_state().
STATE_JSON = os.path.join(DATA_DIR, "state.json")

# Install a global requests cache backed by SQLite under DATA_DIR.
# expire_after is 7 days expressed in seconds.
requests_cache.install_cache(os.path.join(DATA_DIR, "http_cache"), backend="sqlite", expire_after=60 * 60 * 24 * 7)
# Headers sent with every request issued by _get().
HEADERS = {
    "User-Agent": "UFC research (mailto:you@example.com)",
    "Accept-Language": "en-US,en;q=0.9",
}


In [71]:
def normalize_url(u: str) -> str:
    """Return ``u`` with surrounding whitespace removed.

    Falsy inputs (``None``, ``""``) are handed back unchanged.
    """
    return u.strip() if u else u


@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=30), reraise=True)
def _get(url: str) -> requests.Response:
    """Fetch ``url`` with the shared HEADERS and a 30 second timeout.

    The URL is passed through normalize_url() first. Non-2xx responses raise
    via ``raise_for_status()``; the tenacity decorator retries failures (up to
    5 attempts, exponential wait) and re-raises the last error.
    """
    target = normalize_url(url)
    response = requests.get(target, headers=HEADERS, timeout=30)
    response.raise_for_status()
    return response


def soup(url: str) -> BS:
    """Download ``url`` through _get() and parse the body with ``html.parser``."""
    return BS(_get(url).text, "html.parser")


def clean(x: str) -> str:
    """Collapse every whitespace run to one space and trim the ends.

    ``None`` and other falsy inputs yield an empty string.
    """
    text = x if x else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


def parse_height_to_inches(text):
    """Convert a ``feet' inches`` height string to total inches.

    Returns ``None`` when no feet/inches pair is found (including ``None``
    or empty input).
    """
    found = re.search(r"(\d+)\s*'\s*(\d+)", text or "")
    if found is None:
        return None
    feet, inches = (int(part) for part in found.groups())
    return feet * 12 + inches


def parse_reach_to_inches(text):
    """Extract the integer that precedes a double-quote (inch mark).

    Returns ``None`` when the pattern is absent or the input is falsy.
    """
    found = re.search(r"(\d+)\s*\"", text or "")
    if found is None:
        return None
    return int(found.group(1))


def split_of(text):
    """Parse an ``"X of Y"`` string into ``(X, Y)`` integers.

    Returns ``(None, None)`` when the text does not contain that pattern.
    """
    found = re.search(r"(\d+)\s*of\s*(\d+)", text or "")
    if found is None:
        return (None, None)
    landed, attempted = found.groups()
    return (int(landed), int(attempted))


def parse_pct(text):
    """Return the number in front of a ``%`` sign as a float.

    Returns ``None`` when no percentage is present or the input is falsy.
    """
    found = re.search(r"(\d+(?:\.\d+)?)\s*%", text or "")
    if found is None:
        return None
    return float(found.group(1))


def parse_mmss(text):
    """Convert an ``M:SS`` duration string to a number of seconds.

    Args:
        text: string such as ``"4:32"``; ``None`` is accepted.

    Returns:
        int seconds, or ``None`` when no ``digits:digits`` pair is present
        (for example ``"--"`` or empty input).
    """
    m = re.search(r"(\d+):(\d+)", text or "")
    if m is None:
        # "0:00" matches the pattern above and yields 0, so no separate
        # zero fallback is needed here.
        return None
    return int(m.group(1)) * 60 + int(m.group(2))


def years_between(dob_iso, event_iso):
    """Return whole years elapsed from ``dob_iso`` to ``event_iso``.

    Both arguments are ISO ``YYYY-MM-DD`` strings. ``None`` is returned when
    either is missing or cannot be parsed.
    """
    if not (dob_iso and event_iso):
        return None
    try:
        born = date.fromisoformat(dob_iso)
        on_day = date.fromisoformat(event_iso)
    except Exception:
        return None
    # Subtract one year when the birthday has not yet occurred in the event year.
    birthday_pending = (on_day.month, on_day.day) < (born.month, born.day)
    return on_day.year - born.year - int(birthday_pending)


def load_state():
    """Load the crawl checkpoint stored at STATE_JSON.

    Returns:
        dict: the saved state, always containing a non-negative integer
        ``"event_idx"``. A fresh ``{"event_idx": 0}`` is returned when the
        file is missing, unreadable, not valid JSON, or not a JSON object.
    """
    import warnings  # local import: only used on the recovery paths below

    if not os.path.exists(STATE_JSON):
        return {"event_idx": 0}

    try:
        with open(STATE_JSON, "r") as f:
            state = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError is a ValueError; a truncated checkpoint should
        # restart the index instead of aborting the whole crawl.
        warnings.warn(f"Ignoring unreadable state file {STATE_JSON}: {exc}")
        return {"event_idx": 0}

    if not isinstance(state, dict):
        warnings.warn(f"Ignoring state file {STATE_JSON}: expected a JSON object")
        return {"event_idx": 0}

    idx = state.get("event_idx")
    if isinstance(idx, bool) or not isinstance(idx, int) or idx < 0:
        # Callers index state["event_idx"] directly and feed it to range().
        state["event_idx"] = 0
    return state


def save_state(state):
    """Persist ``state`` as JSON at STATE_JSON.

    The payload is written to a sibling temporary file and then moved over
    the target with ``os.replace``, so an interruption during the write does
    not leave a truncated checkpoint behind.

    Args:
        state: JSON-serialisable object (a dict in this notebook).
    """
    tmp_path = f"{STATE_JSON}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(state, f)
    os.replace(tmp_path, STATE_JSON)


def append_df(path, df):
    """Append ``df`` to the CSV at ``path``.

    Nothing is written when ``df`` is ``None`` or has no rows. The header row
    is emitted only when the file does not exist yet or exists with zero
    bytes, so later appends never repeat it and an empty placeholder file
    still ends up with column names.

    Args:
        path: destination CSV path.
        df: pandas DataFrame (or ``None``).
    """
    if df is None or not len(df):
        return
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    df.to_csv(path, mode="a", header=needs_header, index=False)


def existing_ids(path, col):
    """Return the set of non-null values of column ``col`` in the CSV at ``path``.

    The column is read as text (``dtype=str``) so identifiers made only of
    digits keep leading zeros and are not turned into numbers; otherwise they
    would never compare equal to the ids taken from URLs.

    Args:
        path: CSV file path.
        col: name of the id column.

    Returns:
        set[str]: ids found; an empty set when the file is missing, cannot be
        read or parsed, or lacks the column.
    """
    if not os.path.exists(path):
        return set()
    try:
        ids = pd.read_csv(path, usecols=[col], dtype={col: str})[col]
    except (OSError, ValueError):
        # pandas parser/empty-data errors and a missing column are ValueErrors.
        return set()
    return set(ids.dropna().astype(str).tolist())


In [72]:

# Maps a normalized (upper-cased, whitespace-collapsed) column header of a
# "totals" stats table to the short field key used by _parse_row_values().
TOTALS_COLS = {
    "KD": "kd",
    "SIG. STR.": "sig_str",
    "SIG. STR. %": "sig_str_pct",
    "TOTAL STR.": "total_str",
    "TD": "td",
    "TD %": "td_pct",
    "SUB. ATT": "sub_att",
    "REV.": "rev",
    "CTRL": "ctrl",
}

# Same mapping for the "significant strikes" table; parse_table_block() picks
# this map when any of HEAD/BODY/LEG/DISTANCE/CLINCH/GROUND is among the headers.
SIG_COLS = {
    "SIG. STR.": "sig_str",
    "SIG. STR. %": "sig_str_pct",
    "HEAD": "head",
    "BODY": "body",
    "LEG": "leg",
    "DISTANCE": "distance",
    "CLINCH": "clinch",
    "GROUND": "ground",
}


def _normalize_headers(ths):
    return [re.sub(r"\s+", " ", th.get_text(strip=True)).upper() for th in ths]


def _split_cell_texts(td):
    """Return the cleaned text of each per-fighter entry inside a table cell.

    Entries are the ``.b-fight-details__table-text`` descendants of ``td``.
    When there are none, the cell's own text is used as a single entry
    (or no entry at all if that text is empty).
    """
    entries = td.select(".b-fight-details__table-text")
    texts = [clean(node.get_text(" ", strip=True)) for node in entries]
    if len(texts) == 0:
        whole_cell = clean(td.get_text(" ", strip=True))
        if whole_cell:
            texts = [whole_cell]
    return [entry for entry in texts if entry is not None]


def _rows_from_tr(tds):
    """Transpose the cells of one table row into one list of values per fighter.

    Each cell is split with _split_cell_texts(); the number of output rows is
    the longest cell, and shorter cells are padded with ``""``. Returns ``[]``
    when every cell is empty.
    """
    cell_columns = [_split_cell_texts(td) for td in tds]
    row_count = max((len(column) for column in cell_columns), default=0)
    if row_count == 0:
        return []
    return [
        [column[pos] if pos < len(column) else "" for column in cell_columns]
        for pos in range(row_count)
    ]


def _parse_row_values(headers, values, colmap):
    """Turn one fighter's cell texts into a dict of typed stat fields.

    ``headers`` and ``values`` are paired positionally; headers missing from
    ``colmap`` are ignored. "X of Y" fields become ``<key>_landed`` and
    ``<key>_attempted``, percentages become floats, ``ctrl`` becomes
    ``ctrl_seconds``, and plain counters become ints (``None`` for ``"--"``,
    empty, or unparsable text).
    """
    landed_attempted_keys = {"sig_str", "total_str", "td", "head", "body", "leg", "distance", "clinch", "ground"}
    percent_keys = {"sig_str_pct", "td_pct"}
    counter_keys = {"kd", "sub_att", "rev"}

    parsed = {}
    for header, raw in zip(headers, values):
        field = colmap.get(header)
        if not field:
            continue
        cell = raw or ""
        if field in landed_attempted_keys:
            landed, attempted = split_of(cell)
            parsed[f"{field}_landed"] = landed
            parsed[f"{field}_attempted"] = attempted
            continue
        if field in percent_keys:
            parsed[field] = parse_pct(cell)
            continue
        if field == "ctrl":
            parsed["ctrl_seconds"] = parse_mmss(cell)
            continue
        if field in counter_keys:
            count = None
            if cell not in {"--", ""}:
                try:
                    count = int(cell)
                except Exception:
                    count = None
            parsed[field] = count
    return parsed


def parse_table_block(table, title_text, fight_id, event_id, event_date_iso, red_id, blue_id):
    """Parse one stats ``<table>`` into overall and per-round fighter rows.

    The first ``<thead>`` supplies the column headers, which also decide the
    table type: "significant" when a target/position column is present,
    otherwise "totals". Direct ``<thead>`` children containing "ROUND n"
    switch the current round for the ``<tbody>`` elements that follow; rows
    seen before any round marker are "overall" rows.

    Args:
        table: table element to parse.
        title_text: accepted for interface compatibility; not used here.
        fight_id, event_id, event_date_iso: copied onto every output row.
        red_id, blue_id: fighter ids assigned to the first and second value
            in each cell, in that order.

    Returns:
        ``(overall_rows, per_round_rows, table_tag)``; ``([], [], None)`` when
        the table has no header row.
    """
    thead = table.find("thead")
    header_row = thead.find("tr") if thead else None
    if not header_row:
        return [], [], None
    headers = _normalize_headers(header_row.find_all("th", recursive=False))

    sig_markers = {"HEAD", "BODY", "LEG", "DISTANCE", "CLINCH", "GROUND"}
    if sig_markers.intersection(headers):
        colmap, table_tag = SIG_COLS, "significant"
    else:
        colmap, table_tag = TOTALS_COLS, "totals"

    overall_rows, per_round_rows = [], []
    corner_ids = [red_id, blue_id]
    round_no = None

    for section in list(table.children):
        tag_name = getattr(section, "name", None)

        if tag_name == "thead":
            # A round marker changes the round; any other thead leaves it as is.
            heading = clean(section.get_text(" ", strip=True))
            found = re.search(r"ROUND\s*(\d+)", heading, re.IGNORECASE)
            if found:
                try:
                    round_no = int(found.group(1))
                except Exception:
                    round_no = None
            continue

        if tag_name != "tbody":
            continue

        for tr in section.find_all("tr", recursive=False):
            cells = tr.find_all("td", recursive=False)
            if not cells:
                continue
            fighter_rows = _rows_from_tr(cells)
            if not fighter_rows:
                continue

            is_overall = round_no is None
            level = "overall" if is_overall else "round"
            sink = overall_rows if is_overall else per_round_rows

            # zip stops at two entries, so extra values in a cell are ignored.
            for fighter_id, values in zip(corner_ids, fighter_rows):
                stats = _parse_row_values(headers, values, colmap)
                if not stats:
                    continue
                sink.append(
                    {
                        "fight_id": fight_id,
                        "event_id": event_id,
                        "event_date": event_date_iso,
                        "level": level,
                        "round": round_no,
                        "table": table_tag,
                        "fighter_id": fighter_id,
                        **stats,
                    }
                )

    return overall_rows, per_round_rows, table_tag


In [73]:
def list_completed_event_urls():
    """Return the sorted, de-duplicated event-detail URLs from the completed-events page."""
    listing = soup(f"{BASE}/statistics/events/completed?page=all")
    anchors = listing.select('tr.b-statistics__table-row a[href*="event-details"]')
    hrefs = (anchor.get("href") for anchor in anchors)
    unique_urls = {normalize_url(href) for href in hrefs if href and "event-details" in href}
    return sorted(unique_urls)


def parse_event(event_url):
    """Scrape one event page.

    Returns:
        ``(row, fight_urls)`` where ``row`` holds the event id (last URL
        segment), URL, name, ISO date (``None`` if absent or unparsable), raw
        date text and location, and ``fight_urls`` is the sorted set of
        fight-detail links found on the page.
    """
    page = soup(event_url)
    heading = page.select_one("h2.b-content__title")
    info_items = [clean(li.text) for li in page.select("li.b-list__box-list-item")]

    def field(prefix):
        # Value after the first ":" of the first item starting with ``prefix``.
        for item in info_items:
            if item.lower().startswith(prefix):
                return item.split(":", 1)[1].strip()
        return None

    date_txt = field("date:")
    location = field("location:")

    date_iso = None
    if date_txt:
        try:
            date_iso = dateparser.parse(date_txt).date().isoformat()
        except Exception:
            date_iso = None

    fight_links = set()
    for anchor in page.select('a[href*="/fight-details/"]'):
        href = anchor.get("href")
        if href:
            fight_links.add(normalize_url(href))

    row = {
        "event_id": event_url.rsplit("/", 1)[-1],
        "event_url": event_url,
        "name": clean(heading.text if heading else ""),
        "date": date_iso,
        "raw_date": date_txt,
        "location": location,
    }
    return row, sorted(fight_links)


def parse_fighter(fighter_url):
    """Scrape one fighter page into a bio dict.

    Returns a dict with ``fighter_id`` (last URL segment), ``name``,
    ``height_in``, ``reach_in``, ``stance`` and ``dob`` (ISO date). Fields
    that are missing or unparsable are ``None``.
    """
    page = soup(fighter_url)
    title = page.select_one("span.b-content__title-highlight")
    bio_items = [clean(li.text) for li in page.select("li.b-list__box-list-item")]

    height_in = None
    reach_in = None
    stance = None
    dob_iso = None
    for item in bio_items:
        label = item.upper()
        if label.startswith("HEIGHT:"):
            height_in = parse_height_to_inches(item.split(":", 1)[1])
            continue
        if label.startswith("REACH:"):
            reach_in = parse_reach_to_inches(item.split(":", 1)[1])
            continue
        if "STANCE" in label:
            stance = clean(item.split(":", 1)[1])
            continue
        if label.startswith("DOB:"):
            try:
                dob_iso = dateparser.parse(item.split(":", 1)[1]).date().isoformat()
            except Exception:
                # Unparsable date of birth: keep whatever value dob_iso already has.
                pass

    return {
        "fighter_id": fighter_url.rsplit("/", 1)[-1],
        "name": clean(title.text if title else ""),
        "height_in": height_in,
        "reach_in": reach_in,
        "stance": stance,
        "dob": dob_iso,
    }


def parse_fight(fight_url, event_id=None, event_date_iso=None):
    """Scrape one fight page.

    Args:
        fight_url: URL of the fight-details page; its last path segment is
            used as ``fight_id``.
        event_id: id of the owning event, copied onto every output row.
        event_date_iso: ISO date of the event, copied onto every output row.

    Returns:
        A 6-tuple ``(fight_row, sides, totals_overall, totals_rounds,
        sig_overall, sig_rounds)``. ``fight_row`` is a dict describing the
        bout; ``sides`` is a list of up to two per-fighter dicts (first is
        treated as red, second as blue); the remaining four are lists of stat
        rows produced by parse_table_block().
    """
    sp = soup(fight_url)
    fight_id = fight_url.rsplit("/", 1)[-1]

    persons = sp.select("div.b-fight-details__person")

    def side(div):
        """Extract id, profile URL, name and result status for one fighter block."""
        a = div.select_one("a.b-link.b-fight-details__person-link")
        name = clean(a.text if a else "")
        link = normalize_url(a.get("href")) if a else None
        fid = link.rsplit("/", 1)[-1] if link else None
        status_el = div.select_one("i.b-fight-details__person-status")
        status = clean(status_el.text if status_el else "")
        return {"fighter_id": fid, "fighter_url": link, "name": name, "status": status}

    # Only the first two person blocks are used: first = red, second = blue.
    sides = [side(p) for p in persons[:2]]
    red = sides[0] if len(sides) >= 1 else None
    blue = sides[1] if len(sides) >= 2 else None
    red_id = red["fighter_id"] if red else None
    blue_id = blue["fighter_id"] if blue else None


    # Pass 1: build label -> value from elements that carry an explicit label
    # child. The label is removed from the tree so the remaining text is the
    # value; note this mutates ``sp`` for everything that follows.
    meta_lookup = {}
    for block in sp.select('[class*="b-fight-details__text-item"]'):
        label_el = block.select_one('.b-fight-details__label')
        if not label_el:
            continue
        label_text = clean(label_el.get_text(" ", strip=True)).rstrip(':').lower()
        if not label_text:
            continue
        label_el.extract()
        value_text = clean(block.get_text(" ", strip=True))
        if not value_text:
            continue
        meta_lookup[label_text] = value_text

    # Pass 2: fallback for "label: value" text without a label element. Keys
    # found in pass 1 are never overwritten.
    # NOTE(review): nodes whose label was extracted above still match here, so
    # a value containing ":" (e.g. a clock time) can add a stray key — confirm.
    for selector in (
        "p.b-fight-details__text-item",
        "i.b-fight-details__text-item",
        "span.b-fight-details__text-item",
    ):
        for node in sp.select(selector):
            text = clean(node.get_text(" ", strip=True))
            if not text or ':' not in text:
                continue
            label, value = text.split(':', 1)
            label_key = label.strip().lower()
            if label_key in meta_lookup:
                continue
            meta_lookup[label_key] = value.strip()

    method = meta_lookup.get('method')
    referee = meta_lookup.get('referee')

    # The final round is the first integer found in the "round" value.
    end_round = None
    round_text = meta_lookup.get('round')
    if round_text:
        m_round = re.search(r"\d+", round_text)
        if m_round:
            try:
                end_round = int(m_round.group(0))
            except Exception:
                end_round = None

    # Kept as the raw text from the page (not converted to seconds).
    end_time = meta_lookup.get('time')

    # Parse every stats table and route its rows by the tag parse_table_block() returns.
    totals_overall, totals_rounds, sig_overall, sig_rounds = [], [], [], []
    seen_tables = set()
    for tbl in sp.select("section.b-fight-details__section table"):
        # Guard against handling the same element object twice.
        if id(tbl) in seen_tables:
            continue
        seen_tables.add(id(tbl))
        title_el = tbl.find_previous("h2")
        title_text = title_el.get_text(" ", strip=True) if title_el else ""
        ov, pr, table_tag = parse_table_block(
            tbl,
            title_text,
            fight_id,
            event_id,
            event_date_iso,
            red_id,
            blue_id,
        )
        if not ov and not pr:
            continue

        if table_tag == "totals":
            totals_overall.extend(ov)
            totals_rounds.extend(pr)
        elif table_tag == "significant":
            sig_overall.extend(ov)
            sig_rounds.extend(pr)
        # NOTE(review): the two title-based branches below look unreachable
        # with the parse_table_block() in this notebook, which returns a
        # None tag only together with empty row lists — confirm before removing.
        elif "TOTAL" in (title_text or "").upper():
            totals_overall.extend(ov)
            totals_rounds.extend(pr)
        elif "SIGNIFICANT" in (title_text or "").upper():
            sig_overall.extend(ov)
            sig_rounds.extend(pr)

    fight_row = {
        "fight_id": fight_id,
        "fight_url": fight_url,
        "event_id": event_id,
        "event_date": event_date_iso,
        "red_id": red_id,
        "red_name": red["name"] if red else None,
        "red_result": red["status"] if red else None,
        "blue_id": blue_id,
        "blue_name": blue["name"] if blue else None,
        "blue_result": blue["status"] if blue else None,
        "method": method,
        "referee": referee,
        "end_round": end_round,
        "end_time": end_time,
    }
    return fight_row, sides, totals_overall, totals_rounds, sig_overall, sig_rounds


def _flush_batches(batches):
    """Append each ``(csv_path, rows)`` pair to disk in order, then empty ``rows``."""
    for path, rows in batches:
        append_df(path, pd.DataFrame(rows))
        rows.clear()


def crawl(flush_every=20):
    """Scrape every completed event, its fights and fighters into the CSV outputs.

    Progress is checkpointed per event through load_state()/save_state(), and
    ids already present in the events, fights and fighters CSVs are skipped,
    so the function can be interrupted and called again to resume.

    All per-event batches are flushed together, and always flushed when the
    fight loop exits (including on KeyboardInterrupt). This keeps a fight from
    being recorded in the fights CSV, and therefore skipped on the next run,
    while its stats rows were still only in memory.

    NOTE(review): the checkpoint is an index into the sorted URL list from
    list_completed_event_urls(); if that list changes between runs the index
    may point at a different event — confirm this is acceptable.

    Args:
        flush_every: write the batches to disk once this many fights are
            buffered within an event. Values below 1 flush after every fight.
    """
    state = load_state()
    event_urls = list_completed_event_urls()

    done_events = existing_ids(EVENTS_CSV, "event_id")
    done_fights = existing_ids(FIGHTS_CSV, "fight_id")
    have_fighter = existing_ids(FIGHTERS_CSV, "fighter_id")

    # Tolerate a state dict without the key instead of raising KeyError.
    start_idx = state.get("event_idx", 0)
    print(f"Found {len(event_urls)} events. Resuming from event index {start_idx}.")

    for ei in tqdm(range(start_idx, len(event_urls)), desc="Events"):
        print(f"Processing event {ei} of {len(event_urls)}")
        eurl = event_urls[ei]
        try:
            ev_row, fight_urls = parse_event(eurl)
        except Exception as exc:
            # Record the failure and move the checkpoint past this event.
            append_df(
                FAIL_CSV,
                pd.DataFrame([
                    {"url": eurl, "type": "event", "error": str(exc)}
                ]),
            )
            state["event_idx"] = ei + 1
            save_state(state)
            continue

        if ev_row["event_id"] not in done_events:
            append_df(EVENTS_CSV, pd.DataFrame([ev_row]))
            done_events.add(ev_row["event_id"])

        fights_batch = []
        fighters_batch = []
        failures_batch = []
        tot_overall_batch, tot_round_batch = [], []
        sig_overall_batch, sig_round_batch = [], []

        # Stats and fighters are written before fights: if a flush is cut
        # short, the fight id is not yet on disk and the fight is re-scraped
        # on resume (possible duplicate stats rows rather than missing ones).
        batches = [
            (TOT_OVERALL_CSV, tot_overall_batch),
            (TOT_ROUND_CSV, tot_round_batch),
            (SIG_OVERALL_CSV, sig_overall_batch),
            (SIG_ROUND_CSV, sig_round_batch),
            (FIGHTERS_CSV, fighters_batch),
            (FIGHTS_CSV, fights_batch),
            (FAIL_CSV, failures_batch),
        ]

        try:
            for furl in tqdm(fight_urls, leave=False, desc=f"Fights@{ev_row['event_id']}"):
                fid = furl.rsplit("/", 1)[-1]
                if fid in done_fights:
                    continue
                try:
                    fight_row, sides, t_overall, t_rounds, s_overall, s_rounds = parse_fight(
                        furl,
                        event_id=ev_row["event_id"],
                        event_date_iso=ev_row["date"],
                    )
                    fights_batch.append(fight_row)
                    tot_overall_batch.extend(t_overall)
                    tot_round_batch.extend(t_rounds)
                    sig_overall_batch.extend(s_overall)
                    sig_round_batch.extend(s_rounds)

                    for s in sides:
                        if not s or not s.get("fighter_id") or s["fighter_id"] in have_fighter:
                            continue
                        try:
                            bio = parse_fighter(s["fighter_url"])
                            bio["age_on_event"] = years_between(bio.get("dob"), ev_row["date"])
                            fighters_batch.append(bio)
                            have_fighter.add(bio["fighter_id"])
                            time.sleep(0.1)
                        except Exception as fighter_exc:
                            # A failed bio does not fail the fight itself.
                            failures_batch.append(
                                {
                                    "url": s["fighter_url"],
                                    "type": "fighter",
                                    "error": str(fighter_exc),
                                }
                            )

                    done_fights.add(fid)

                    if len(fights_batch) >= flush_every:
                        _flush_batches(batches)

                    time.sleep(0.1)
                except Exception as fight_exc:
                    failures_batch.append(
                        {"url": furl, "type": "fight", "error": str(fight_exc)}
                    )
        finally:
            # Runs on normal completion and on interruption alike.
            _flush_batches(batches)

        state["event_idx"] = ei + 1
        save_state(state)
        time.sleep(2.0)


In [81]:
# Run the crawler. This is a long-running job; it resumes from the saved
# event index, so the cell can be interrupted and run again later.
crawl()

Found 747 events. Resuming from event index 5.


Events:   0%|          | 0/742 [00:00<?, ?it/s]

Processing event 5 of 747


Events:   0%|          | 1/742 [00:17<3:39:57, 17.81s/it]

Processing event 6 of 747


Events:   0%|          | 2/742 [00:29<2:57:08, 14.36s/it]

Processing event 7 of 747


Events:   0%|          | 3/742 [00:49<3:29:35, 17.02s/it]

Processing event 8 of 747


Events:   1%|          | 4/742 [01:07<3:30:51, 17.14s/it]

Processing event 9 of 747


Events:   1%|          | 5/742 [01:20<3:11:05, 15.56s/it]

Processing event 10 of 747


Events:   1%|          | 6/742 [01:37<3:17:05, 16.07s/it]

Processing event 11 of 747


Events:   1%|          | 7/742 [01:55<3:25:42, 16.79s/it]

Processing event 12 of 747


Events:   1%|          | 8/742 [02:10<3:19:43, 16.33s/it]

Processing event 13 of 747


Events:   1%|          | 9/742 [02:27<3:19:58, 16.37s/it]

Processing event 14 of 747


Events:   1%|          | 9/742 [02:34<3:30:19, 17.22s/it]


KeyboardInterrupt: 