In [16]:
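# World Athletics U18 Women's High Jump
# NOTE(review): the request URL in fetch_toplist_page targets the U20 list
# (path ".../women/u20", ageCategory=u20) — confirm the intended age category.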

In [8]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [9]:
!pip install lxml
import lxml
print(("lxml successfully installed"))

lxml successfully installed



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
def fetch_toplist_page(page_num, timeout=30):
    """Download one page of the women's high jump all-time toplist.

    Parameters
    ----------
    page_num : int
        Page number to request; sent as the single ``page`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 30). Without it a stalled connection would block forever.

    Returns
    -------
    str
        The response body (HTML) as text.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # NOTE(review): the notebook title and output file say U18, but this
    # endpoint and ageCategory are U20 — confirm which list is intended.
    base_url = (
        "https://worldathletics.org/records/all-time-toplists/"
        "jumps/high-jump/all/women/u20"
    )
    # The query used to contain a hard-coded "page=1" in addition to the
    # requested page, leaving it to the server which one wins. Send exactly
    # one "page" key so the requested page is unambiguous.
    query = {
        "regionType": "world",
        "page": page_num,
        "bestResultsOnly": "true",
        "firstDay": "1899-12-31",
        "lastDay": "2025-10-21",
        "maxResultsByCountry": "all",
        "eventId": "10229526",
        "ageCategory": "u20",
    }
    resp = requests.get(base_url, params=query, timeout=timeout)
    resp.raise_for_status()
    return resp.text

In [11]:
def parse_table_from_html(html):
    """Extract the toplist table from a page of HTML.

    Every ``<table>`` in ``html`` is parsed with ``pandas.read_html``. The
    first table whose columns include at least one of the expected toplist
    headers is returned. If no table matches, the table with the most rows
    is returned as a fallback.

    Parameters
    ----------
    html : str
        HTML document text.

    Returns
    -------
    pandas.DataFrame or None
        The selected table, or ``None`` when the document has no tables.
    """
    expected = {"Rank", "Mark", "WIND", "Competitor", "DOB", "Country", "Pos", "Venue", "Date", "Results Score"}
    try:
        # Passing a literal HTML string to read_html is deprecated; wrapping
        # it in StringIO is the supported form and silences the warning.
        dfs = pd.read_html(StringIO(html))
    except ValueError:
        # read_html raises ValueError when it finds no tables. Report that as
        # None so callers can treat it as "no more data".
        return None
    for df in dfs:
        if expected.intersection(df.columns):
            return df
    # Only fall back after *all* tables were checked (previously this return
    # sat inside the loop, so only the first table was ever examined).
    return max(dfs, key=lambda d: d.shape[0])

In [12]:
def scrape_all_pages(max_pages=40, delay=1.0):
    """Fetch consecutive toplist pages and stack them into one DataFrame.

    Pages ``1..max_pages`` are requested in order. Scraping stops early at
    the first page whose parsed table is ``None`` or has no rows.

    Parameters
    ----------
    max_pages : int, optional
        Upper bound on the number of pages to request (default 40).
    delay : float, optional
        Seconds to pause between consecutive requests (default 1.0).

    Returns
    -------
    pandas.DataFrame
        All page tables concatenated with a fresh index. Empty when no page
        yielded any rows.
    """
    all_dfs = []
    for page in range(1, max_pages + 1):
        if page > 1:
            # Pause only between requests; no need to wait after the last one.
            time.sleep(delay)
        print(f"Fetching page {page}...")
        html = fetch_toplist_page(page)
        df = parse_table_from_html(html)
        if df is None or df.shape[0] == 0:
            print("No more data or empty table on this page. Stopping.")
            break
        all_dfs.append(df)
    if not all_dfs:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # so callers can still call len() / to_csv() on the result.
        return pd.DataFrame()
    return pd.concat(all_dfs, ignore_index=True)

In [14]:
def main():
    """Scrape three toplist pages, report the row count, and save a CSV.

    The result is written to ``women_u18_high_jump.csv`` in the current
    working directory, without the DataFrame index.
    """
    toplist = scrape_all_pages(delay=1.0, max_pages=3)
    row_count = len(toplist)
    print("Scraped rows:", row_count)
    toplist.to_csv("women_u18_high_jump.csv", index=False)
    print("Saved CSV.")


# Run the scrape when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Fetching page 1...


  dfs = pd.read_html(html)


Fetching page 2...


  dfs = pd.read_html(html)


Fetching page 3...


  dfs = pd.read_html(html)


Scraped rows: 300
Saved CSV.
