# ESPN Women's Basketball Data Scraper

This notebook demonstrates how to use the ESPN Women's Basketball Data Scraper to collect and analyze data.


## Setup

First, let's make sure all dependencies are installed:


In [1]:
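# Uncomment to install the dependencies into this kernel's environment
# (%pip targets the running kernel, unlike !pip):
# %pip install -r requirements.txt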

## Import Libraries


In [1]:
import os
import json
import asyncio
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

# Import our scraper modules
from espn_data.utils import make_request, save_json, load_json, TEAMS_URL
from espn_data.scraper import get_all_teams, get_team_schedule, get_game_data, scrape_all_data
from espn_data.processor import process_teams_data, process_schedules, process_game_data

## Example 1: Get All Teams

Let's start by fetching all women's college basketball teams from ESPN.


In [2]:
# Retrieve every women's college basketball team known to ESPN
teams = get_all_teams()

# Preview the first ten teams, restricted to the identifying columns
pd.DataFrame(teams[:10]).loc[:, ['id', 'abbreviation', 'displayName', 'nickname', 'location']]

2025-03-09 20:09:30,119 - espn_data - INFO - Fetching all women's college basketball teams
2025-03-09 20:09:30,120 - espn_data - INFO - Fetching teams page 1 with limit 500
2025-03-09 20:09:31,905 - espn_data - INFO - Retrieved 360 teams on page 1
2025-03-09 20:09:31,906 - espn_data - INFO - Reached last page of results
2025-03-09 20:09:31,906 - espn_data - INFO - Retrieved 360 teams in total
2025-03-09 20:09:31,954 - espn_data - INFO - Data saved to /Users/aburkard/fun/espn-data/espn_data/data/teams.json


Unnamed: 0,id,abbreviation,displayName,nickname,location
0,2000,ACU,Abilene Christian Wildcats,Abilene Chrstn,Abilene Christian
1,2005,AFA,Air Force Falcons,Air Force,Air Force
2,2006,AKR,Akron Zips,Akron,Akron
3,2010,AAMU,Alabama A&M Bulldogs,Alabama A&M,Alabama A&M
4,333,ALA,Alabama Crimson Tide,Alabama,Alabama
5,2011,ALST,Alabama State Lady Hornets,Alabama St,Alabama State
6,2016,ALCN,Alcorn State Lady Braves,Alcorn St,Alcorn State
7,44,AMER,American University Eagles,American,American University
8,2026,APP,App State Mountaineers,App State,App State
9,9,ASU,Arizona State Sun Devils,Arizona St,Arizona State


## Example 2: Get Team Schedule

Now, let's get the schedule for a specific team across several recent seasons (2022–2024).


In [3]:
# UConn Huskies are ESPN team 41; request three seasons of games
team_id = "41"
seasons = [2022, 2023, 2024]

schedule = get_team_schedule(team_id, seasons)

# Flatten each game into a single row of the fields we want to inspect
schedule_rows = [
    {
        'game_id': game['id'],
        'date': game['date'],
        'season': game['season'],
        'name': game['name'],
        'short_name': game['shortName'],
        # Games without the flag are treated as not played at a neutral site
        'neutral_site': game.get('neutralSite', False),
    }
    for game in schedule
]
schedule_df = pd.DataFrame(schedule_rows)

# Show the first ten games
schedule_df.head(10)

2025-03-09 20:09:31,965 - espn_data - INFO - Fetching schedule for team 41 across 3 seasons
2025-03-09 20:09:31,965 - espn_data - INFO - Fetching 2022 season for team 41
2025-03-09 20:09:33,597 - espn_data - INFO - Found 41 games for team 41 in season 2022
2025-03-09 20:09:34,101 - espn_data - INFO - Fetching 2023 season for team 41
2025-03-09 20:09:35,568 - espn_data - INFO - Found 38 games for team 41 in season 2023
2025-03-09 20:09:36,074 - espn_data - INFO - Fetching 2024 season for team 41
2025-03-09 20:09:37,574 - espn_data - INFO - Found 39 games for team 41 in season 2024
2025-03-09 20:09:38,172 - espn_data - INFO - Data saved to /Users/aburkard/fun/espn-data/espn_data/data/schedules/41.json


Unnamed: 0,game_id,date,season,name,short_name,neutral_site
0,401373084,2021-11-14T18:00Z,2022,Arkansas Razorbacks at UConn Huskies,ARK @ CONN,False
1,401373085,2021-11-20T17:00Z,2022,Minnesota Golden Gophers at UConn Huskies,MINN VS CONN,False
2,401381678,2021-11-21T17:00Z,2022,South Florida Bulls at UConn Huskies,USF VS CONN,False
3,401381679,2021-11-22T17:00Z,2022,South Carolina Gamecocks at UConn Huskies,SC VS CONN,False
4,401378370,2021-12-04T00:00Z,2022,UConn Huskies at Seton Hall Pirates,CONN @ HALL,False
5,401369448,2021-12-05T17:00Z,2022,Notre Dame Fighting Irish at UConn Huskies,ND @ CONN,False
6,401369457,2021-12-10T00:00Z,2022,UConn Huskies at Georgia Tech Yellow Jackets,CONN @ GT,False
7,401373086,2021-12-11T18:00Z,2022,UCLA Bruins at UConn Huskies,UCLA VS CONN,False
8,401369480,2021-12-19T20:53Z,2022,Louisville Cardinals at UConn Huskies,LOU VS CONN,False
9,401378373,2021-12-30T00:00Z,2022,Marquette Golden Eagles at UConn Huskies,MARQ @ CONN,False


## Example 3: Get Game Data

Let's get detailed data for a specific game, including box score and play-by-play information.


In [4]:
# Use the first game of the schedule as the example
game_id = schedule[0]['id']
print(f"Getting data for game {game_id}: {schedule[0]['name']}")

# Download the full summary for that game
game_data = get_game_data(game_id)

# Nothing below is reported unless the summary carries a header
if 'header' in game_data:
    header = game_data['header']

    # The date is stored on the first entry of the competitions array
    competitions = header.get('competitions')
    game_date = competitions[0].get('date') if competitions else None
    print(f"Game date: {game_date}")

    # Venue details live under gameInfo -> venue (address is optional)
    has_game_info = 'gameInfo' in game_data
    venue_name = venue_city = venue_state = None
    if has_game_info and 'venue' in game_data['gameInfo']:
        venue = game_data['gameInfo']['venue']
        venue_name = venue.get('fullName')
        if 'address' in venue:
            address = venue['address']
            venue_city = address.get('city')
            venue_state = address.get('state')

    if not venue_name:
        print("Venue: Not available")
    else:
        # City/state are appended only when both are known
        location = ""
        if venue_city and venue_state:
            location = f"{venue_city}, {venue_state}"
        print(f"Venue: {venue_name} {location}")

    # Attendance is optional
    if has_game_info and 'attendance' in game_data['gameInfo']:
        attendance = game_data['gameInfo']['attendance']
        print(f"Attendance: {attendance}")

    # List both competitors with score, home/away and winner marker
    if competitions:
        competition = competitions[0]
        if 'competitors' in competition:
            print("\nCompetitors:")
            for team in competition['competitors']:
                team_info = team.get('team', {})
                team_name = team_info.get('displayName', 'Unknown')
                score = team.get('score', 'N/A')
                home_away = team.get('homeAway', 'N/A').upper()
                if team.get('winner', False):
                    winner = "(WINNER)"
                else:
                    winner = ""
                print(f"  {team_name} ({home_away}): {score} {winner}")


2025-03-09 20:09:38,182 - espn_data - INFO - Fetching data for game 401373084
2025-03-09 20:09:38,183 - espn_data - INFO - Using URL: https://site.web.api.espn.com/apis/site/v2/sports/basketball/womens-college-basketball/summary?event=401373084


Getting data for game 401373084: Arkansas Razorbacks at UConn Huskies


2025-03-09 20:09:40,171 - espn_data - INFO - Data saved to /Users/aburkard/fun/espn-data/espn_data/data/games/401373084.json


Game date: 2021-11-14T18:00Z
Venue: XL Center Hartford, CT
Attendance: 9359

Competitors:
  UConn Huskies (HOME): 95 (WINNER)
  Arkansas Razorbacks (AWAY): 80 


## Example 4: Process Game Data

Let's process the game data into structured formats. The processor has been improved to correctly extract team information from the game data, ensuring that player and team statistics are properly processed.


In [5]:
# Turn the downloaded game summary into structured records
processed_data = process_game_data(game_id)

# Summarise the play-by-play data when the processor returned any
if 'play_by_play' in processed_data and processed_data['play_by_play']:
    pbp_df = pd.DataFrame(processed_data['play_by_play'])
    print(f"Total plays: {len(pbp_df)}")

    # Keep only the plays flagged as scoring plays
    scoring_plays = pbp_df[pbp_df['scoring_play'].eq(True)].copy()
    print(f"\nScoring plays: {len(scoring_plays)}")

    # Jupyter only auto-renders the last *top-level* expression of a cell.
    # This expression is nested inside the `if`, so it must be displayed
    # explicitly or the sample table never appears.
    display(scoring_plays[['period', 'clock', 'text', 'score_home', 'score_away']].head(10))

2025-03-09 20:09:40,317 - espn_data - INFO - Successfully processed and saved data for game 401373084


Total plays: 366

Scoring plays: 99


## Example 5: Player Statistics

Let's look at player statistics from the game.


In [6]:
# Show a per-team box score for the processed game
if 'player_stats' in processed_data and processed_data['player_stats']:
    player_stats_df = pd.DataFrame(processed_data['player_stats'])

    # The group key is the team *name*. It is bound to `team_name` so the
    # notebook-level `team_id` set in the schedule example is not overwritten.
    for team_name, team_group in player_stats_df.groupby('team_name'):
        print(f"\n{team_name} Player Statistics:")

        # Keep only players who actually appeared in the game (not DNP)
        players = team_group[team_group['did_not_play'].eq(False)]

        # Identity columns are always shown; box-score columns only when the
        # processor produced them, in a fixed PTS/REB/AST/MIN order
        stat_columns = [col for col in ('PTS', 'REB', 'AST', 'MIN') if col in players.columns]
        key_stats = players[['player_name', 'position', 'starter'] + stat_columns].copy()

        display(key_stats)


Arkansas Razorbacks Player Statistics:


Unnamed: 0,player_name,position,starter,PTS,REB,AST,MIN
0,Erynn Barnum,Forward,True,16.0,8.0,2.0,27.0
1,Marquesha Davis,Guard,True,5.0,2.0,1.0,18.0
2,Sasha Goforth,Guard,True,17.0,5.0,1.0,31.0
3,Makayla Daniels,Guard,True,5.0,6.0,3.0,27.0
4,Amber Ramirez,Guard,True,20.0,2.0,2.0,29.0
5,Emrie Ellis,Forward,False,6.0,4.0,0.0,13.0
6,Jersey Wolfenbarger,Forward,False,0.0,4.0,0.0,16.0
7,Samara Spencer,Guard,False,8.0,0.0,2.0,17.0
8,Rylee Langerman,Guard,False,0.0,0.0,0.0,6.0
9,Elauna Eaton,Guard,False,3.0,0.0,0.0,16.0



UConn Huskies Player Statistics:


Unnamed: 0,player_name,position,starter,PTS,REB,AST,MIN
14,Aaliyah Edwards,Forward,True,9.0,4.0,3.0,17.0
15,Olivia Nelson-Ododa,Forward,True,9.0,8.0,2.0,19.0
16,Paige Bueckers,Guard,True,34.0,6.0,4.0,40.0
17,Christyn Williams,Guard,True,18.0,4.0,3.0,36.0
18,Evina Westbrook,Guard,True,15.0,8.0,5.0,38.0
19,Dorka Juhasz,Forward,False,3.0,3.0,1.0,13.0
20,Azzi Fudd,Guard,False,7.0,3.0,0.0,20.0
21,Caroline Ducharme,Guard,False,0.0,0.0,0.0,4.0
22,Nika Muhl,Guard,False,0.0,1.0,2.0,13.0


## Example 6: Running the Full Scraper

Here's how to run the full scraping process for a specific set of seasons.
Note: This will take a long time to run, so the example is limited to a single season and the call is left commented out.


In [8]:
# Define a function to run the scraper asynchronously
async def run_scraper_demo(seasons=None, concurrency=5, delay=0.5):
    """Run the full scraper for a small set of seasons.

    All arguments are forwarded unchanged to ``scrape_all_data``.

    Args:
        seasons: Seasons to scrape. Defaults to ``[2023]`` (a single season,
            to keep the example quick).
        concurrency: Passed as ``concurrency`` to ``scrape_all_data``.
        delay: Passed as ``delay`` to ``scrape_all_data``.
    """
    if seasons is None:
        # Fresh list on every call; avoids a shared mutable default argument
        seasons = [2023]
    await scrape_all_data(seasons=seasons, concurrency=concurrency, delay=delay)


# Uncomment to run the full scraper
# await run_scraper_demo()

## Example 7: Loading Processed Data

After running the full scraper and processor, you can easily load and analyze the processed data.


In [None]:
# Location of the processed CSV files (relative to the current working directory)
data_dir = Path("espn_data/data/processed/csv")

# Assigned unconditionally: the conference analysis cell further down reads
# `teams_file` even when the directory is missing, which would otherwise
# raise a NameError on a fresh checkout.
teams_file = data_dir / "teams.csv"

# Check if processed data exists
if data_dir.exists():
    # List available data files (sorted so the listing is deterministic)
    data_files = sorted(data_dir.glob("*.csv"))
    print("Available data files:")
    for file in data_files:
        print(f"  - {file.name}")

    # Example: Load and display team data
    if teams_file.exists():
        teams_df = pd.read_csv(teams_file)
        print(f"\nLoaded {len(teams_df)} teams\n")
        display(teams_df.head())
else:
    print("No processed data available yet. Run the full scraper and processor first.")

## Example 8: Analyzing Conference Performance

Let's analyze team performance by conference.


In [None]:
# Input files for this analysis. `teams_file` is derived here as well, because
# the loading cell only assigns it when the processed-data directory exists.
game_teams_file = data_dir / "game_teams.csv"
teams_file = data_dir / "teams.csv"
schedules_file = data_dir / "schedules.csv"  # not read in this cell

if game_teams_file.exists() and teams_file.exists():
    # Load data
    game_teams_df = pd.read_csv(game_teams_file)
    teams_df = pd.read_csv(teams_file)

    # Attach each team's conference to its per-game rows
    merged_df = game_teams_df.merge(teams_df[['team_id', 'conference_name']], on='team_id')

    # Games and wins per conference
    conference_stats = (
        merged_df.groupby('conference_name')
        .agg(total_games=('game_id', 'count'), wins=('winner', 'sum'))
        .reset_index()
    )

    # Win percentage, best conference first
    conference_stats['win_pct'] = conference_stats['wins'] / conference_stats['total_games']
    conference_stats = conference_stats.sort_values('win_pct', ascending=False)

    print("Conference Performance:")
    display(conference_stats)

    # Visualize with a bar chart. matplotlib is treated as optional, and only
    # the import itself is guarded so genuine plotting errors are not hidden.
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        print("Matplotlib not available for visualization. Install with 'pip install matplotlib'")
    else:
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.bar(conference_stats['conference_name'], conference_stats['win_pct'])
        ax.set_title('Win Percentage by Conference')
        ax.set_xlabel('Conference')
        ax.set_ylabel('Win Percentage')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()
        plt.show()
else:
    print("Game data not available yet. Run the full scraper and processor first.")

## Example 9: Player Statistical Leaders

Let's identify statistical leaders across all games.


In [None]:
# Load player stats data if available
player_stats_file = data_dir / "player_stats.csv"

# Minimum number of games a player needs to be ranked
min_games_played = 5

if player_stats_file.exists():
    # Load data
    player_stats_df = pd.read_csv(player_stats_file)

    # Find scoring leaders
    if 'PTS' in player_stats_df.columns:
        # The per-game player stats shown earlier in this notebook include rows
        # for players who did not play. Assuming the CSV keeps the same
        # `did_not_play` column (TODO confirm against the processor output),
        # drop those rows so they do not inflate games_played and deflate PPG.
        played_df = player_stats_df
        if 'did_not_play' in player_stats_df.columns:
            played_df = player_stats_df[player_stats_df['did_not_play'] != True]

        # One row per player/team with games played and total points
        scoring_leaders = (
            played_df.groupby(['player_id', 'player_name', 'team_name'])
            .agg(games_played=('game_id', 'nunique'), total_points=('PTS', 'sum'))
            .reset_index()
        )

        # Calculate points per game
        scoring_leaders['ppg'] = scoring_leaders['total_points'] / scoring_leaders['games_played']

        # Keep players meeting the minimum, highest scorers first
        scoring_leaders = scoring_leaders[scoring_leaders['games_played'] >= min_games_played]
        scoring_leaders = scoring_leaders.sort_values('ppg', ascending=False)

        print(f"Scoring Leaders (min. {min_games_played} games):")
        display(scoring_leaders[['player_name', 'team_name', 'games_played', 'total_points', 'ppg']].head(20))

        # Create similar analyses for rebounds, assists, etc.
else:
    print("Player stats data not available yet. Run the full scraper and processor first.")

## Conclusion

This notebook demonstrates the basic functionality of the ESPN Women's Basketball Data Scraper. You can use these examples as a starting point for your own analyses and visualizations.
