In [2]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup






# Scrape Ligue 1 data from FBref over several seasons.
def scrape_ligue1_data(nb_saisons=6):
    """
    Scrape Ligue 1 data for a given number of seasons.

    For each season page, every squad link found in the stats tables is
    visited once: the first table matching "Scores" is read from the squad
    page, tagged with the team name, and merged on "Date" with each detailed
    stats frame. The next page to fetch is taken from the ``button2 prev``
    link of the current page; scraping stops early when that link is absent.

    Relies on ``rate_limit``, ``get_stats_urls`` and ``get_detailed_stats``,
    which are defined outside this section.

    :param nb_saisons: Number of season pages to scrape, starting from the
        current competition page.
    :return: DataFrame concatenating the per-team frames of every scraped
        season; an empty DataFrame when nothing was scraped.
    :raises requests.HTTPError: If a page request returns an error status.
    """
    url_base_ligue1 = "https://fbref.com/en/comps/13/Ligue-1-Stats"
    headers = {'User-Agent': 'Mozilla/5.0'}
    all_seasons_data = []

    for _ in range(nb_saisons):
        rate_limit()
        response = requests.get(url_base_ligue1, headers=headers)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        squad_links = [
            f"https://fbref.com{a.get('href')}"
            for a in soup.select("table.stats_table a")
            # Default to '' so anchors without an href are skipped, not a TypeError.
            if "squads" in a.get('href', '')
        ]
        # The selector spans every stats table on the page, so the same squad
        # can be linked several times; keep the first occurrence of each URL.
        teams_urls = list(dict.fromkeys(squad_links))

        # Resolve the link to the next page to scrape before the team loop,
        # but only act on it afterwards so the current season is never lost.
        prev_button = soup.find('a', class_='button2 prev')
        prev_href = prev_button.get('href') if prev_button is not None else None

        for team_url in teams_urls:
            rate_limit()
            team_response = requests.get(team_url, headers=headers)
            team_response.raise_for_status()
            # Wrap the markup in StringIO: passing literal HTML to read_html
            # is deprecated in recent pandas releases.
            team_data = pd.read_html(StringIO(team_response.text), match="Scores")[0]
            team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
            team_data["Team"] = team_name
            stats_urls = get_stats_urls(team_response)

            # Sort the de-duplicated URLs so the merge order (and therefore
            # the resulting column order) is the same on every run.
            for stats_url in sorted(set(stats_urls)):
                rate_limit()
                detailed_stats = get_detailed_stats(stats_url, headers)
                team_data = team_data.merge(detailed_stats, on="Date")

            all_seasons_data.append(team_data)

        if not prev_href:
            # No further page to walk back to: keep what has been collected.
            break
        url_base_ligue1 = f"https://fbref.com{prev_href}"

    if not all_seasons_data:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(all_seasons_data, ignore_index=True)


# Scrape the latest Ligue 1 data from FBref.
def scrape_latest_ligue1_data():
    """
    Scrape the latest available data for every Ligue 1 team.

    Every squad link found in the stats tables of the current competition
    page is visited once: the first table matching "Scores" is read from the
    squad page, tagged with the team name, and merged on "Date" with each
    detailed stats frame.

    Relies on ``rate_limit``, ``get_stats_urls`` and ``get_detailed_stats``,
    which are defined outside this section.

    :return: DataFrame concatenating the per-team frames; an empty DataFrame
        when no squad link was found.
    :raises requests.HTTPError: If a page request returns an error status.
    """
    url_ligue1 = "https://fbref.com/en/comps/13/Ligue-1-Stats"
    headers = {'User-Agent': 'Mozilla/5.0'}
    latest_data = []

    response = requests.get(url_ligue1, headers=headers)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    squad_links = [
        f"https://fbref.com{a.get('href')}"
        for a in soup.select("table.stats_table a")
        # Default to '' so anchors without an href are skipped, not a TypeError.
        if "squads" in a.get('href', '')
    ]
    # The selector spans every stats table on the page, so the same squad can
    # be linked several times; keep the first occurrence of each URL.
    teams_urls = list(dict.fromkeys(squad_links))

    for team_url in teams_urls:
        rate_limit()
        team_response = requests.get(team_url, headers=headers)
        team_response.raise_for_status()
        # Wrap the markup in StringIO: passing literal HTML to read_html is
        # deprecated in recent pandas releases.
        team_data = pd.read_html(StringIO(team_response.text), match="Scores")[0]
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        team_data["Team"] = team_name
        stats_urls = get_stats_urls(team_response)

        # Sort the de-duplicated URLs so the merge order (and therefore the
        # resulting column order) is the same on every run.
        for stats_url in sorted(set(stats_urls)):
            rate_limit()
            detailed_stats = get_detailed_stats(stats_url, headers)
            team_data = team_data.merge(detailed_stats, on="Date")

        latest_data.append(team_data)

    if not latest_data:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(latest_data, ignore_index=True)


In [3]:
# Download and parse the Ligue 1 competition page. `soup` and `headers` are
# reused by the cells below.
url_base_ligue1 = "https://fbref.com/en/comps/13/Ligue-1-Stats"
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url_base_ligue1, headers=headers)
# Stop here on an HTTP error: parsing an error page would only surface later
# as a confusing failure when the stats table is looked up.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
# Build the absolute URL of every squad page linked from the first stats
# table: collect each anchor's href, keep the non-empty ones that point to a
# "squads" page, and prefix them with the site root.
url_equipes = [
    f"https://fbref.com{href}"
    for href in (
        lien.get("href")
        for lien in soup.select("table.stats_table")[0].find_all("a")
    )
    if href and "squads" in href
]

In [5]:
# Compact variant producing the same list as the preceding cell: squad links
# from the first stats table, made absolute.
url_equipes = [
    f"https://fbref.com{lien.get('href')}"
    for lien in soup.select("table.stats_table")[0].find_all("a")
    if "squads" in lien.get("href", "")
]


In [6]:
# Display the squad page URLs built in the previous cell.
url_equipes

['https://fbref.com/en/squads/e2d8892c/Paris-Saint-Germain-Stats',
 'https://fbref.com/en/squads/132ebc33/Nice-Stats',
 'https://fbref.com/en/squads/fd6114db/Monaco-Stats',
 'https://fbref.com/en/squads/fb08dbb3/Brest-Stats',
 'https://fbref.com/en/squads/cb188c0c/Lille-Stats',
 'https://fbref.com/en/squads/5725cc7b/Marseille-Stats',
 'https://fbref.com/en/squads/fd4e0f7d/Lens-Stats',
 'https://fbref.com/en/squads/7fdd64e0/Reims-Stats',
 'https://fbref.com/en/squads/c0d3eab4/Strasbourg-Stats',
 'https://fbref.com/en/squads/b3072e00/Rennes-Stats',
 'https://fbref.com/en/squads/281b0e73/Montpellier-Stats',
 'https://fbref.com/en/squads/5c2737db/Le-Havre-Stats',
 'https://fbref.com/en/squads/d7a486cd/Nantes-Stats',
 'https://fbref.com/en/squads/d53c0b06/Lyon-Stats',
 'https://fbref.com/en/squads/f83960ae/Metz-Stats',
 'https://fbref.com/en/squads/3f8c4b5f/Toulouse-Stats',
 'https://fbref.com/en/squads/d2c87802/Lorient-Stats',
 'https://fbref.com/en/squads/d9676424/Clermont-Foot-Stats']

In [10]:
# Fetch the page of the first team in `url_equipes` and collect its
# all-competitions match-log links for the selected stat categories.
team_response = requests.get(url_equipes[0], headers=headers)
# Stop on an HTTP error: an error page contains no match-log links, so the
# set below would silently come out empty.
team_response.raise_for_status()

soup_team = BeautifulSoup(team_response.text, 'html.parser')

# A set is used so that a link appearing several times on the page is kept once.
url_stats = {
    f"https://fbref.com{a.get('href')}"
    for a in soup_team.find_all("a")
    # Default to '' so anchors without an href are simply filtered out.
    if "matchlogs/all_comps" in a.get('href', '') and
       any(substring in a.get('href', '') for substring in ["passing/", "shooting", "possession/", "defense/", "keeper"])
}


In [11]:
# Display the match-log URLs selected for the first team.
url_stats

{'https://fbref.com/en/squads/e2d8892c/2023-2024/matchlogs/all_comps/defense/Paris-Saint-Germain-Match-Logs-All-Competitions',
 'https://fbref.com/en/squads/e2d8892c/2023-2024/matchlogs/all_comps/keeper/Paris-Saint-Germain-Match-Logs-All-Competitions',
 'https://fbref.com/en/squads/e2d8892c/2023-2024/matchlogs/all_comps/passing/Paris-Saint-Germain-Match-Logs-All-Competitions',
 'https://fbref.com/en/squads/e2d8892c/2023-2024/matchlogs/all_comps/possession/Paris-Saint-Germain-Match-Logs-All-Competitions',
 'https://fbref.com/en/squads/e2d8892c/2023-2024/matchlogs/all_comps/shooting/Paris-Saint-Germain-Match-Logs-All-Competitions'}