# Data Collecting

This notebook is designed to scrape football team statistics from the FBref website for the 2023-2024 season. 

---

In [2]:
# import libraries
import requests
import os
import pandas as pd
import time
import random
from pathlib import Path

### scrape for clubs stats

In [12]:
# Base URL and configurations
base_url = "https://fbref.com/en/comps/9/{season}/{season}-Premier-League-Stats"
# seasons = ["2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023", "2023-2024"]
seasons = ["2023-2024"]

# table_ids = [
#     "stats_squads_standard_for", "stats_squads_keeper_for",
#     "stats_squads_shooting_for", "stats_squads_misc_for",
#     "stats_squads_passing_for", "stats_squads_passing_types_for",
#     "stats_squads_defense_for"
# ]

In [4]:
output_dir = Path("../data/premier_league_by_table")
output_dir.mkdir(parents=True, exist_ok=True)

In [5]:
for season in seasons:
    season_url = base_url.format(season=season)

    print(f"Scraping data for season {season}: {season_url}")
    
    try:
        # Fetch the webpage and parse tables
        response = requests.get(season_url)
        response.raise_for_status()
        tables = pd.read_html(response.text)

        # Save each table for the season
        for i, table in enumerate(tables):
            output_file = output_dir / f"club_stats_{season}_table_{i}.csv"
            table.to_csv(output_file, index=False)
            print(f"Saved table {i} for season {season} to {output_file}")

    except Exception as e:
        print(f"Error scraping data for season {season}: {e}")
    
    # Add a delay to avoid overloading the server
    time.sleep(5)

Scraping data for season 2023-2024: https://fbref.com/en/comps/9/2023-2024/2023-2024-Premier-League-Stats


  tables = pd.read_html(response.text)


Saved table 0 for season 2023-2024 to ../data/premier_league_by_table/club_stats_2023-2024_table_0.csv
Saved table 1 for season 2023-2024 to ../data/premier_league_by_table/club_stats_2023-2024_table_1.csv
Saved table 2 for season 2023-2024 to ../data/premier_league_by_table/club_stats_2023-2024_table_2.csv
Saved table 3 for season 2023-2024 to ../data/premier_league_by_table/club_stats_2023-2024_table_3.csv
Saved table 4 for season 2023-2024 to ../data/premier_league_by_table/club_stats_2023-2024_table_4.csv
Saved table 5 for season 2023-2024 to ../data/premier_league_by_table/club_stats_2023-2024_table_5.csv
Saved table 6 for season 2023-2024 to ../data/premier_league_by_table/club_stats_2023-2024_table_6.csv
Saved table 7 for season 2023-2024 to ../data/premier_league_by_table/club_stats_2023-2024_table_7.csv
Saved table 8 for season 2023-2024 to ../data/premier_league_by_table/club_stats_2023-2024_table_8.csv
Saved table 9 for season 2023-2024 to ../data/premier_league_by_table/clu

### Scrape for players stats

In [31]:
teams = [
    {"id": "18bb7c10", "name": "Arsenal"},
    {"id": "8602292d", "name": "Aston-Villa"},
    {"id": "4ba7cbea", "name": "Bournemouth"},
    {"id": "d07537b9", "name": "Brighton-Hove-Albion"},
    {"id": "cd051869", "name": "Burnley"},
    {"id": "cff3d9bb", "name": "Chelsea"},
    {"id": "47c64c55", "name": "Crystal-Palace"},
    {"id": "d3fd31cc", "name": "Everton"},
    {"id": "fd962109", "name": "Fulham"},
    {"id": "822bd0ba", "name": "Liverpool"},
    {"id": "b8fd03ef", "name": "Manchester-City"},
    {"id": "19538871", "name": "Manchester-United"},
    {"id": "b2b47a98", "name": "Newcastle-United"},
    {"id": "1df6b87e", "name": "Sheffield-United"},
    {"id": "361ca564", "name": "Tottenham-Hotspur"},
    {"id": "7c21e445", "name": "West-Ham-United"},
    {"id": "8cec06e1", "name": "Wolverhampton-Wanderers"},
    {"id": "5bfb9659", "name": "Nott'ham-Forest"},
    {"id": "f5922ca5", "name": "Brentford"},
    {"id": "e297cd13", "name": "Luton-Town"}
]

seasons = ["2023-2024"]

In [29]:
base_url = "https://fbref.com/en/squads/{team_id}/{year}/{team_name}-Stats"

In [30]:
for team in teams:
    team_id = team["id"]
    team_name = team["name"]

    for season_slug in seasons:
        # Format the URL
        season_url = base_url.format(team_id=team_id, year=season_slug, team_name=team_name)

        print(f"Scraping data for {team_name} ({season_slug}): {season_url}")

        try:
            # Request page content
            response = requests.get(season_url)
            response.raise_for_status()

            # Use pandas to read all tables on the page
            tables = pd.read_html(response.text)

            for i, table in enumerate(tables):
                # Add metadata columns
                table["Season"] = season_slug
                table["Team"] = team_name

                # Save each table to a separate CSV file
                output_file = output_dir / f"{team_name}_table_{i}_stats_{season_slug}.csv"
                table.to_csv(output_file, index=False)

                print(f"Saved table {i} for {team_name} ({season_slug}) to {output_file}")

        except Exception as e:
            print(f"Error scraping data for {team_name} ({season_slug}): {e}")

        # Add a delay to avoid overloading the server
        time.sleep(5)

Scraping data for Crystal-Palace (2023-2024): https://fbref.com/en/squads/47c64c55/2023-2024/Crystal-Palace-Stats


  tables = pd.read_html(response.text)


Saved table 0 for Crystal-Palace (2023-2024) to ../data/player_stats_by_team/Crystal-Palace_table_0_stats_2023-2024.csv
Saved table 1 for Crystal-Palace (2023-2024) to ../data/player_stats_by_team/Crystal-Palace_table_1_stats_2023-2024.csv
Saved table 2 for Crystal-Palace (2023-2024) to ../data/player_stats_by_team/Crystal-Palace_table_2_stats_2023-2024.csv
Saved table 3 for Crystal-Palace (2023-2024) to ../data/player_stats_by_team/Crystal-Palace_table_3_stats_2023-2024.csv
Saved table 4 for Crystal-Palace (2023-2024) to ../data/player_stats_by_team/Crystal-Palace_table_4_stats_2023-2024.csv
Saved table 5 for Crystal-Palace (2023-2024) to ../data/player_stats_by_team/Crystal-Palace_table_5_stats_2023-2024.csv
Saved table 6 for Crystal-Palace (2023-2024) to ../data/player_stats_by_team/Crystal-Palace_table_6_stats_2023-2024.csv
Saved table 7 for Crystal-Palace (2023-2024) to ../data/player_stats_by_team/Crystal-Palace_table_7_stats_2023-2024.csv
Saved table 8 for Crystal-Palace (2023-2