# Tennis Abstract Web Scraper

This notebook scrapes recent match data from Tennis Abstract for the top 25 ATP and WTA players and saves it as CSV files that use the column layout of Jeff Sackmann's match datasets.

**Data Source**: [Tennis Abstract](https://www.tennisabstract.com/)  
**Base Data**: [Jeff Sackmann's tennis_atp/tennis_wta repos](https://github.com/JeffSackmann/tennis_atp)

## 1. Setup & Configuration

In [None]:
# Configuration Options
# =====================
# These constants are passed to run_scraper() in section 4.

# Which tour to scrape: "atp", "wta", or "both" (lowercase; compared exactly)
TOUR = "both"

# Minimum year to scrape (matches before this year are skipped)
MIN_YEAR = 2025

# Maximum number of players to scrape per tour (None = all 25)
# Use a small number (e.g., 3) for testing
MAX_PLAYERS = None

# Whether to merge scraped data with existing data
# (the merge target is top25/<tour>/<tour>_top25_matches.csv)
MERGE_WITH_EXISTING = True

# Rate limiting (seconds between requests)
RATE_LIMIT_SECONDS = 2.0

In [None]:
# Install dependencies if needed
import subprocess
import sys

def install_if_missing(package, import_name=None):
    """Make sure *package* can be imported, installing it with pip if not.

    ``import_name`` is the module name to probe when it differs from the
    pip distribution name; when omitted (or empty) ``package`` is used.
    """
    module_name = import_name if import_name else package
    try:
        __import__(module_name)
    except ImportError:
        print(f"Installing {package}...")
        # Use the running interpreter's pip so the package lands in this kernel.
        pip_command = [sys.executable, "-m", "pip", "install", package, "-q"]
        subprocess.check_call(pip_command)
        print(f"✓ {package} installed")
    else:
        print(f"✓ {package} already installed")

# Third-party packages imported by the cells below.
install_if_missing("playwright")
install_if_missing("pandas")

In [None]:
# Install Playwright browser (run once)
import os
# The check looks for one specific browser build directory under the user's
# cache folder; if that exact directory is absent the install command runs.
# NOTE(review): the "-1208" suffix and "~/.cache" location are presumably tied
# to one Playwright release on Linux — confirm for other versions/platforms.
if not os.path.exists(os.path.expanduser("~/.cache/ms-playwright/chromium_headless_shell-1208")):
    print("Installing Chromium browser for Playwright...")
    # IPython shell escape: only valid inside a notebook/IPython session.
    !playwright install chromium
else:
    print("✓ Chromium browser already installed")

In [None]:
# Imports
import csv
import re
import time
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from IPython.display import display, HTML, clear_output

import pandas as pd
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout

print("✓ All imports successful")

## 2. Player Lists

Top 25 players for each tour with their Tennis Abstract URL names.

In [None]:
# Top 25 ATP Players (player_id, url_name, full_name)
# - player_id is written to winner_id/loser_id of the scraped rows.
# - url_name is the "p=" query value of the player's Tennis Abstract page.
# - full_name is used for display and to decide which side of a result the
#   player is on (by last-name comparison).
# NOTE(review): the ids are presumably Sackmann player_ids — verify each one
# against atp_players.csv; they are not validated anywhere in this notebook.
ATP_TOP_25 = [
    (206173, "JannikSinner", "Jannik Sinner"),
    (100644, "AlexanderZverev", "Alexander Zverev"),
    (207989, "CarlosAlcaraz", "Carlos Alcaraz"),
    (126203, "TaylorFritz", "Taylor Fritz"),
    (106421, "DaniilMedvedev", "Daniil Medvedev"),
    (134770, "AlexDeMinaur", "Alex de Minaur"),
    (104925, "NovakDjokovic", "Novak Djokovic"),
    (126094, "CasperRuud", "Casper Ruud"),
    (200282, "AndreyRublev", "Andrey Rublev"),
    (105777, "GrigorDimitrov", "Grigor Dimitrov"),
    (126774, "TommyPaul", "Tommy Paul"),
    (126205, "FrancesTiafoe", "Frances Tiafoe"),
    (208029, "HolgerRune", "Holger Rune"),
    (200005, "LorenzoMusetti", "Lorenzo Musetti"),
    (207733, "UgoHumbert", "Ugo Humbert"),
    (128034, "JackDraper", "Jack Draper"),
    (207518, "ArthurFils", "Arthur Fils"),
    (126207, "SebastianKorda", "Sebastian Korda"),
    (111575, "KarenKhachanov", "Karen Khachanov"),
    (209950, "AlexMichelsen", "Alex Michelsen"),
    (210097, "GiovanniMpetshiPerricard", "Giovanni Mpetshi Perricard"),
    (200624, "AdrianMannarino", "Adrian Mannarino"),
    (126214, "BenShelton", "Ben Shelton"),
    (200615, "FelixAugerAliassime", "Felix Auger-Aliassime"),
    (207830, "TaroDaniel", "Taro Daniel"),
]

# Top 25 WTA Players (player_id, url_name, full_name)
# Same tuple layout as the ATP list: numeric id, Tennis Abstract URL name,
# display name.
# NOTE(review): the ids are presumably Sackmann player_ids — verify each one
# against wta_players.csv; they are not validated anywhere in this notebook.
WTA_TOP_25 = [
    (214544, "ArynaSabalenka", "Aryna Sabalenka"),
    (216347, "IgaSwiatek", "Iga Swiatek"),
    (221103, "CocoGauff", "Coco Gauff"),
    (211148, "JasminePaolini", "Jasmine Paolini"),
    (221012, "QinwenZheng", "Qinwen Zheng"),
    (214981, "JessicaPegula", "Jessica Pegula"),
    (202468, "ElenaRybakina", "Elena Rybakina"),
    (215613, "EmmaNavarro", "Emma Navarro"),
    (214082, "DariaKasatkina", "Daria Kasatkina"),
    (206252, "BeatrizHaddadMaia", "Beatriz Haddad Maia"),
    (203389, "DanielleCollins", "Danielle Collins"),
    (211651, "MirraAndreeva", "Mirra Andreeva"),
    (223670, "DianaShnaider", "Diana Shnaider"),
    (214939, "AnnaKalinskaya", "Anna Kalinskaya"),
    (211533, "DonnaVekic", "Donna Vekic"),
    (259799, "MadisonKeys", "Madison Keys"),
    (206242, "PaulaBadosa", "Paula Badosa"),
    (216146, "LinaTsurenko", "Lina Tsurenko"),
    (202499, "LindaNoskova", "Linda Noskova"),
    (201458, "MarieBouzkova", "Marie Bouzkova"),
    (201619, "KarolinaPliskova", "Karolina Pliskova"),
    (214096, "AnastasiaPavlyuchenkova", "Anastasia Pavlyuchenkova"),
    (202494, "ElinaSvitolina", "Elina Svitolina"),
    (211107, "VeronikaKudermetova", "Veronika Kudermetova"),
    (211684, "LeylaFernandez", "Leyla Fernandez"),
]

# Sanity check: both lists are expected to hold 25 entries.
print(f"ATP Players: {len(ATP_TOP_25)}")
print(f"WTA Players: {len(WTA_TOP_25)}")

## 3. Scraper Functions

In [None]:
# Data directory setup
# DATA_DIR is the root for both input (tennis_<tour>/<tour>_players.csv) and
# output; "." resolves against the notebook's current working directory.
DATA_DIR = Path(".")
# Scraped and merged CSVs are written under top25/atp and top25/wta.
OUTPUT_DIR = DATA_DIR / "top25"
# exist_ok=True makes re-running this cell harmless.
OUTPUT_DIR.mkdir(exist_ok=True)
(OUTPUT_DIR / "atp").mkdir(exist_ok=True)
(OUTPUT_DIR / "wta").mkdir(exist_ok=True)

def get_match_columns():
    """List, in order, the CSV header fields of a Sackmann-style match row.

    A new list is built on every call, so callers may mutate the result.
    """
    tournament = (
        "tourney_id", "tourney_name", "surface", "draw_size", "tourney_level",
        "tourney_date", "match_num",
    )
    winner = (
        "winner_id", "winner_seed", "winner_entry", "winner_name",
        "winner_hand", "winner_ht", "winner_ioc", "winner_age",
    )
    loser = (
        "loser_id", "loser_seed", "loser_entry", "loser_name",
        "loser_hand", "loser_ht", "loser_ioc", "loser_age",
    )
    outcome = ("score", "best_of", "round", "minutes")
    winner_stats = (
        "w_ace", "w_df", "w_svpt", "w_1stIn", "w_1stWon", "w_2ndWon",
        "w_SvGms", "w_bpSaved", "w_bpFaced",
    )
    loser_stats = (
        "l_ace", "l_df", "l_svpt", "l_1stIn", "l_1stWon", "l_2ndWon",
        "l_SvGms", "l_bpSaved", "l_bpFaced",
    )
    rankings = (
        "winner_rank", "winner_rank_points", "loser_rank", "loser_rank_points",
    )
    return [
        *tournament, *winner, *loser, *outcome,
        *winner_stats, *loser_stats, *rankings,
    ]

In [None]:
def parse_round(round_str):
    """Translate a Tennis Abstract round label into Sackmann's round code.

    Labels already in Sackmann form pass through (trimmed); "1R".."4R" are
    mapped to R128..R16. Anything unrecognised is returned exactly as given.
    """
    label = round_str.strip()

    already_sackmann = ("F", "SF", "QF", "R16", "R32", "R64", "R128", "RR", "BR")
    if label in already_sackmann:
        return label

    numbered_rounds = {"1R": "R128", "2R": "R64", "3R": "R32", "4R": "R16"}
    if label in numbered_rounds:
        return numbered_rounds[label]

    # Unknown label: hand back the caller's original (untrimmed) text.
    return round_str

def parse_surface(surface_str):
    """Map a free-form surface description onto a canonical surface name.

    The first of "hard", "clay", "grass", "carpet" found in the lower-cased
    text wins; text containing none of them is returned trimmed and
    title-cased.
    """
    normalized = surface_str.lower().strip()
    for keyword in ("hard", "clay", "grass", "carpet"):
        if keyword in normalized:
            return keyword.capitalize()
    return normalized.title()

def parse_date(date_str):
    """Parse a date string into ``(YYYYMMDD, year)``.

    Accepted layouts: ``13-Jul-2025``, ``2025-07-13``, ``13 Jul 2025``,
    ``Jul 13, 2025`` and ``2025/07/13``. Surrounding whitespace is ignored.
    Unicode dash variants (non-breaking hyphen, en dash, minus sign, ...)
    are treated as a plain ``-``; web pages often use them so that dates do
    not wrap, and without this step such dates would be rejected.

    Returns:
        A ``(str, int)`` tuple; ``("", 0)`` when no layout matches.
    """
    # Map typographic dashes to ASCII so the "%d-%b-%Y"/"%Y-%m-%d" layouts match.
    dash_variants = dict.fromkeys(
        map(ord, "\u2010\u2011\u2012\u2013\u2014\u2212"), "-"
    )
    cleaned = date_str.strip().translate(dash_variants)

    layouts = ("%d-%b-%Y", "%Y-%m-%d", "%d %b %Y", "%b %d, %Y", "%Y/%m/%d")
    for layout in layouts:
        try:
            parsed = datetime.strptime(cleaned, layout)
        except ValueError:
            continue
        return parsed.strftime("%Y%m%d"), parsed.year
    return "", 0

def parse_time_to_minutes(time_str):
    """Convert an ``H:MM`` duration such as '2:23' into total minutes.

    The result is returned as a string (e.g. ``"143"``). Input that is not
    exactly two colon-separated integers yields an empty string.
    """
    pieces = time_str.strip().split(':')
    if len(pieces) != 2:
        return ""
    try:
        hours = int(pieces[0])
        minutes = int(pieces[1])
    except ValueError:
        return ""
    return str(hours * 60 + minutes)

def parse_result_cell(result_text, player_name):
    """Split a "<winner> d. <loser>" result cell into its components.

    Args:
        result_text: Cell text, e.g. ``"(1)Sinner d. (2)Carlos Alcaraz [ESP]"``.
        player_name: Full name of the player whose page is being scraped.

    Returns:
        ``(player_won, winner_name, loser_name, winner_seed, loser_seed)``.
        Seeds are the digits of a ``(N)`` marker or ``""``. Names have every
        parenthesised marker (numeric seeds and entry tags such as ``(WC)``
        or ``(Q)``) and every ``[...]`` tag removed. ``player_won`` is True
        when the last word of ``player_name`` equals the last word of the
        winner's name, ignoring case. When the text does not contain exactly
        one ``" d. "`` separator the result is ``(False, "", "", "", "")``.
    """
    sides = result_text.split(" d. ")
    if len(sides) != 2:
        return False, "", "", "", ""

    def extract_seed(text):
        found = re.search(r'\((\d+)\)', text)
        return found.group(1) if found else ""

    def extract_name(text):
        # Drop "(...)" markers of any kind, not just numeric seeds, so entry
        # tags like "(WC)" do not end up inside the player name.
        text = re.sub(r'\([^)]*\)', '', text)
        text = re.sub(r'\[.*?\]', '', text)
        # Collapse the gaps left behind by the removed markers.
        return " ".join(text.split())

    winner_part, loser_part = sides[0].strip(), sides[1].strip()
    winner_seed = extract_seed(winner_part)
    loser_seed = extract_seed(loser_part)
    winner_name = extract_name(winner_part)
    loser_name = extract_name(loser_part)

    # Compare by last word only; either name may be empty, in which case the
    # player cannot be identified as the winner.
    player_words = player_name.split()
    winner_words = winner_name.split()
    player_won = (
        bool(player_words)
        and bool(winner_words)
        and player_words[-1].lower() == winner_words[-1].lower()
    )

    return player_won, winner_name, loser_name, winner_seed, loser_seed

def parse_bp_saved(bp_text):
    """Split a break-point cell like '5/8' into ``(saved, faced)`` strings.

    Both parts are whitespace-trimmed; only the first two slash-separated
    fields are used. Text without a slash gives ``("", "")``.
    """
    if '/' not in bp_text:
        return "", ""
    fields = bp_text.split('/')
    saved = fields[0].strip()
    faced = fields[1].strip()
    return saved, faced

print("✓ Parser functions defined")

In [None]:
def load_player_lookup(tour):
    """Build a name -> player-row lookup from ``tennis_<tour>/<tour>_players.csv``.

    Each CSV row is registered under two lower-cased keys: the
    "first last" full name and the last name alone. When several players
    share a key, the row that appears last in the file wins, so last-name
    lookups are ambiguous for common surnames.

    Args:
        tour: Tour code used to build the file path (e.g. ``"atp"``).

    Returns:
        A dict mapping lower-cased names to the ``csv.DictReader`` row.
        Empty when the players file does not exist.
    """
    players = {}
    player_file = DATA_DIR / f"tennis_{tour}" / f"{tour}_players.csv"

    if not player_file.exists():
        return players

    # utf-8-sig reads plain UTF-8 and also strips a leading BOM, which would
    # otherwise corrupt the first header name. newline='' is what the csv
    # module expects for its input files.
    with open(player_file, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            # Fields can be None for short rows; treat those as blank.
            first = (row.get('name_first') or '').strip()
            last = (row.get('name_last') or '').strip()

            full_name = f"{first} {last}".lower().strip()
            # Never register an empty key: it would match every opponent
            # whose name could not be extracted.
            if full_name:
                players[full_name] = row
            if last:
                players[last.lower()] = row

    return players

print("✓ Player lookup function defined")

In [None]:
def _row_to_match(cell_texts, player_id, full_name, player_lookup, min_year, columns):
    """Convert the cell texts of one results row into a match dict.

    Returns None when the row is older than ``min_year`` (or its date is
    unparseable) or when its result cell is not a "<winner> d. <loser>" text.
    """
    tourney_date, year = parse_date(cell_texts[0])
    if year < min_year:
        return None

    player_won, winner_name, loser_name, winner_seed, loser_seed = parse_result_cell(
        cell_texts[6], full_name
    )
    # A result that could not be parsed carries no names at all. Recording it
    # would produce a bogus loss for the player with a blank winner, so skip.
    if not (winner_name or loser_name):
        return None

    match = dict.fromkeys(columns, "")
    match["tourney_date"] = tourney_date
    match["tourney_name"] = cell_texts[1]
    match["surface"] = parse_surface(cell_texts[2])
    match["round"] = parse_round(cell_texts[3])
    match["score"] = cell_texts[7]

    # Synthetic tournament id: year plus the alphanumeric part of the name.
    clean_name = re.sub(r'[^a-zA-Z0-9]', '', match["tourney_name"])
    match["tourney_id"] = f"{year}-{clean_name[:20]}"

    player_rank = cell_texts[4]
    opponent_rank = cell_texts[5]

    if player_won:
        match["winner_id"] = str(player_id)
        match["winner_name"] = full_name
        match["winner_seed"] = winner_seed
        match["winner_rank"] = player_rank
        match["loser_name"] = loser_name or winner_name
        match["loser_seed"] = loser_seed
        match["loser_rank"] = opponent_rank
    else:
        match["loser_id"] = str(player_id)
        match["loser_name"] = full_name
        match["loser_seed"] = loser_seed
        match["loser_rank"] = player_rank
        match["winner_name"] = winner_name or loser_name
        match["winner_seed"] = winner_seed
        match["winner_rank"] = opponent_rank

    # Look up the opponent's id: full name first, then last name only.
    opponent_name = match["loser_name"] if player_won else match["winner_name"]
    opponent_info = player_lookup.get(opponent_name.lower(), {})
    if not opponent_info and opponent_name:
        opponent_info = player_lookup.get(opponent_name.split()[-1].lower(), {})

    if opponent_info:
        opponent_key = "loser_id" if player_won else "winner_id"
        match[opponent_key] = opponent_info.get('player_id', '')

    # Duration and break-point stats are only read from rows wide enough to
    # have a 16th cell; the BP figures belong to the scraped player.
    if len(cell_texts) > 15:
        match["minutes"] = parse_time_to_minutes(cell_texts[15])
        bp_saved, bp_faced = parse_bp_saved(cell_texts[14])
        side = "w" if player_won else "l"
        match[f"{side}_bpSaved"] = bp_saved
        match[f"{side}_bpFaced"] = bp_faced

    return match


def scrape_player_matches(page, player_id, url_name, full_name, tour, player_lookup, min_year):
    """Scrape recent matches for a single player.

    Loads the player's Tennis Abstract page, reads the ``#recent-results``
    table and converts each data row (cells read by fixed position: date,
    tournament, surface, round, player rank, opponent rank, result, score)
    into a dict keyed by ``get_match_columns()``.

    Args:
        page: Playwright page used for navigation and DOM queries.
        player_id: Id stored as winner_id/loser_id for this player.
        url_name: Value of the ``p=`` query parameter for the player page.
        full_name: Player's display name.
        tour: ``"wta"`` selects the ``wplayer.cgi`` endpoint; any other
            value selects ``player.cgi``.
        player_lookup: Name -> row mapping used to resolve opponent ids.
        min_year: Rows dated before this year are skipped.

    Returns:
        ``(matches, error)`` where ``error`` is None on success, or a short
        message ("No table found", "Timeout", or the exception text). Matches
        collected before a failure are still returned.
    """
    matches = []
    prefix = "w" if tour == "wta" else ""
    url = f"https://www.tennisabstract.com/cgi-bin/{prefix}player.cgi?p={url_name}"
    # Column list is the same for every row; build it once.
    columns = get_match_columns()

    try:
        page.goto(url, timeout=60000, wait_until="domcontentloaded")
        # Fixed pause before querying the table.
        # NOTE(review): presumably gives page scripts time to render it — confirm.
        page.wait_for_timeout(3000)

        table = page.query_selector("#recent-results")
        if not table:
            return matches, "No table found"

        # The first <tr> is treated as the header row.
        for row in table.query_selector_all("tr")[1:]:
            cells = row.query_selector_all("td")
            if len(cells) < 8:
                continue

            try:
                cell_texts = [cell.inner_text().strip() for cell in cells]
                match = _row_to_match(
                    cell_texts, player_id, full_name, player_lookup, min_year, columns
                )
            except Exception:
                # Best effort: one malformed row must not abort the player.
                continue

            if match is not None:
                matches.append(match)

        return matches, None

    except PlaywrightTimeout:
        return matches, "Timeout"
    except Exception as e:
        return matches, str(e)

print("✓ Scraper function defined")

In [None]:
def deduplicate_matches(matches):
    """Drop repeated matches, keeping the first occurrence of each.

    Two records are the same match when they agree on tournament date,
    tournament name, round and the (order-independent) pair of player names.
    Records where all of those fields are empty are discarded.
    """
    fingerprints = set()
    kept = []

    for record in matches:
        # Sort the names so winner/loser order does not matter.
        first, second = sorted(
            (record.get('winner_name', ''), record.get('loser_name', ''))
        )
        fingerprint = (
            record.get('tourney_date', ''),
            record.get('tourney_name', ''),
            record.get('round', ''),
            first,
            second,
        )

        if fingerprint in fingerprints or not any(fingerprint):
            continue

        fingerprints.add(fingerprint)
        kept.append(record)

    return kept

def merge_with_existing(new_matches, existing_file):
    """Merge freshly scraped matches into the rows stored in *existing_file*.

    A match is identified by tournament date, tournament name, round and the
    order-independent pair of player names. New matches whose identity is
    already present in the file are dropped; the rest are appended and the
    combined list is sorted by ``tourney_date`` (as text).

    Args:
        new_matches: List of match dicts to merge in.
        existing_file: ``Path`` of the CSV holding previously saved matches.

    Returns:
        ``(all_matches, added_count)``. When *existing_file* does not exist
        this is ``(new_matches, len(new_matches))`` so callers can always
        unpack two values.
    """
    if not existing_file.exists():
        return new_matches, len(new_matches)

    def match_key(match):
        names = sorted([str(match.get('winner_name', '')), str(match.get('loser_name', ''))])
        return (
            str(match.get('tourney_date', '')),
            str(match.get('tourney_name', '')),
            str(match.get('round', '')),
            names[0],
            names[1],
        )

    # Read every field as text and keep blanks as "". With pandas' default
    # type inference ids/seeds come back as floats ("1.0") and blanks as NaN,
    # which then get written out as "nan" and no longer compare equal to the
    # string values produced by the scraper.
    existing = pd.read_csv(existing_file, dtype=str, keep_default_na=False)
    existing_records = existing.to_dict('records')

    existing_keys = {match_key(match) for match in existing_records}
    truly_new = [match for match in new_matches if match_key(match) not in existing_keys]

    all_matches = existing_records + truly_new
    all_matches.sort(key=lambda x: str(x.get('tourney_date', '')))

    return all_matches, len(truly_new)

def save_matches(matches, output_file):
    """Write *matches* to *output_file* as CSV in ``get_match_columns()`` order.

    Only the known columns are written; keys missing from a match become
    empty fields and extra keys are ignored. Nothing is written (and any
    existing file is left untouched) when *matches* is empty.

    Args:
        matches: List of match dicts.
        output_file: Destination path; overwritten if it exists.
    """
    if not matches:
        return

    columns = get_match_columns()
    # Write UTF-8 explicitly: player names contain non-ASCII characters and
    # the files are read back with pandas, which assumes UTF-8. Relying on
    # the platform default encoding breaks that round trip on some systems.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(
            f, fieldnames=columns, restval='', extrasaction='ignore'
        )
        writer.writeheader()
        writer.writerows(matches)

print("✓ Utility functions defined")

## 4. Run the Scraper

Run this cell to start scraping with the settings from section 1.

In [None]:
def run_scraper(tour, min_year, max_players, merge_with_existing_flag, rate_limit):
    """Scrape the configured tour(s), save the results and optionally merge them.

    For each tour a headless Chromium session visits every selected player's
    page, the collected matches are de-duplicated and written to
    ``top25/<tour>/<tour>_top25_matches_scraped.csv``. When merging is
    enabled and ``top25/<tour>/<tour>_top25_matches.csv`` already exists, the
    new matches are merged into that file as well.

    Args:
        tour: ``"atp"``, ``"wta"`` or ``"both"``.
        min_year: Matches before this year are skipped.
        max_players: Number of players to scrape per tour; a falsy value
            (``None`` or ``0``) means the whole list.
        merge_with_existing_flag: Whether to merge into the existing file.
        rate_limit: Seconds to sleep after each player request.

    Returns:
        ``{"atp": [...], "wta": [...]}`` with the de-duplicated scraped
        matches per tour (empty list for a tour that was not scraped).

    Raises:
        ValueError: If *tour* is not one of the accepted values. Previously
            an unknown value was silently scraped with the WTA player list.
    """
    known_tours = ("atp", "wta")
    if tour == "both":
        tours_to_scrape = list(known_tours)
    elif tour in known_tours:
        tours_to_scrape = [tour]
    else:
        raise ValueError(f"tour must be 'atp', 'wta' or 'both', got {tour!r}")

    results = {"atp": [], "wta": []}

    for current_tour in tours_to_scrape:
        players = ATP_TOP_25 if current_tour == "atp" else WTA_TOP_25
        players_to_scrape = players[:max_players] if max_players else players
        player_lookup = load_player_lookup(current_tour)

        print(f"\n{'='*60}")
        print(f"Scraping {current_tour.upper()} - {len(players_to_scrape)} players")
        print(f"Looking for matches from {min_year} onwards")
        print(f"{'='*60}\n")

        all_matches = []

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                )
                page = context.new_page()

                for i, (player_id, url_name, full_name) in enumerate(players_to_scrape, start=1):
                    print(f"[{i}/{len(players_to_scrape)}] {full_name}...", end=" ")

                    matches, error = scrape_player_matches(
                        page, player_id, url_name, full_name,
                        current_tour, player_lookup, min_year
                    )

                    if error:
                        print(f"⚠️ {error}")
                    else:
                        print(f"✓ {len(matches)} matches")

                    # Matches gathered before an error are still kept.
                    all_matches.extend(matches)
                    time.sleep(rate_limit)
            finally:
                # Close the browser even if a player scrape raised.
                browser.close()

        # Deduplicate (the same match appears on both players' pages)
        all_matches = deduplicate_matches(all_matches)
        print(f"\n→ Total unique matches: {len(all_matches)}")

        # Save scraped data; save_matches writes nothing for an empty list,
        # so only claim a file was saved when there was something to write.
        scraped_file = OUTPUT_DIR / current_tour / f"{current_tour}_top25_matches_scraped.csv"
        if all_matches:
            save_matches(all_matches, scraped_file)
            print(f"→ Saved to: {scraped_file}")
        else:
            print("→ No matches to save")

        # Merge with existing (only when the merged file is already present)
        if merge_with_existing_flag and all_matches:
            existing_file = OUTPUT_DIR / current_tour / f"{current_tour}_top25_matches.csv"
            if existing_file.exists():
                merged, new_count = merge_with_existing(all_matches, existing_file)
                save_matches(merged, existing_file)
                print(f"→ Merged {new_count} new matches into {existing_file}")
                print(f"→ Total matches in merged file: {len(merged)}")

        results[current_tour] = all_matches

    return results

# Run the scraper
# Uses the constants from the configuration cell; the return value maps each
# tour ("atp"/"wta") to its list of scraped match dicts.
scraped_data = run_scraper(
    tour=TOUR,
    min_year=MIN_YEAR,
    max_players=MAX_PLAYERS,
    merge_with_existing_flag=MERGE_WITH_EXISTING,
    rate_limit=RATE_LIMIT_SECONDS
)

print("\n" + "="*60)
print("✓ Scraping Complete!")
print("="*60)

## 5. View Results

In [None]:
# Load and display scraped data
# Reads each tour's *_scraped.csv back from disk (if present) and previews
# the first 10 rows with the most relevant columns.
for tour in ["atp", "wta"]:
    scraped_file = OUTPUT_DIR / tour / f"{tour}_top25_matches_scraped.csv"
    if scraped_file.exists():
        df = pd.read_csv(scraped_file)
        print(f"\n{tour.upper()} Scraped Matches: {len(df)}")
        print(f"Columns: {len(df.columns)}")
        
        if len(df) > 0:
            display(df[["tourney_date", "tourney_name", "surface", "round", 
                       "winner_name", "loser_name", "score"]].head(10))

In [None]:
# Summary statistics
# Summarises the merged per-tour file (<tour>_top25_matches.csv), not the
# freshly scraped one; tours without a merged file are skipped.
for tour in ["atp", "wta"]:
    merged_file = OUTPUT_DIR / tour / f"{tour}_top25_matches.csv"
    if merged_file.exists():
        df = pd.read_csv(merged_file)
        # Dates are stored as YYYYMMDD; values that do not parse become NaT.
        df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d', errors='coerce')
        
        print(f"\n{'='*40}")
        print(f"{tour.upper()} Full Dataset Statistics")
        print(f"{'='*40}")
        print(f"Total matches: {len(df):,}")
        # NOTE(review): if the file is empty or no date parses, min()/max() are
        # NaT — confirm that calling .date() on them is acceptable here.
        print(f"Date range: {df['tourney_date'].min().date()} to {df['tourney_date'].max().date()}")
        print(f"\nMatches by year (recent):")
        # Last five calendar years present in the data.
        print(df.groupby(df['tourney_date'].dt.year).size().tail(5))
        print(f"\nTop tournaments:")
        print(df['tourney_name'].value_counts().head(10))

## 6. Custom Single Player Scrape

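Scrape a single player by name. The name is matched as a case-insensitive substring against the Top 25 lists defined in section 2, so only players in those lists can be scraped.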

In [None]:
def scrape_single_player(player_name, tour="atp", min_year=2025):
    """Scrape one Top 25 player's matches and return them as a DataFrame.

    ``player_name`` is matched as a case-insensitive substring of the full
    names in the tour's Top 25 list (``"atp"`` selects the ATP list, any
    other value the WTA list); the first hit is used. Returns ``None``, after
    printing a message, when no player matches or the scrape reports an
    error.
    """
    roster = ATP_TOP_25 if tour == "atp" else WTA_TOP_25

    # First roster entry whose full name contains the requested text.
    entry = next(
        (candidate for candidate in roster
         if player_name.lower() in candidate[2].lower()),
        None,
    )
    if not entry:
        print(f"Player '{player_name}' not found in {tour.upper()} Top 25")
        return None

    player_id, url_name, full_name = entry
    player_lookup = load_player_lookup(tour)

    print(f"Scraping {full_name}...")

    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True)
        context = browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        matches, error = scrape_player_matches(
            context.new_page(), player_id, url_name, full_name,
            tour, player_lookup, min_year
        )
        browser.close()

    if error:
        print(f"Error: {error}")
        return None

    frame = pd.DataFrame(matches)
    print(f"Found {len(frame)} matches from {min_year}+")
    return frame

# Example: Scrape Sinner's recent matches
# sinner_matches = scrape_single_player("Sinner", tour="atp", min_year=2025)
# display(sinner_matches)

In [None]:
# Uncomment and run to scrape a specific player:

# sinner_matches = scrape_single_player("Sinner", tour="atp", min_year=2025)
# if sinner_matches is not None:
#     display(sinner_matches[["tourney_date", "tourney_name", "round", "winner_name", "loser_name", "score"]])