# Collecte des données footballistiques

### Configuration de Selenium et Lancement du Site Web

In [51]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Page to scrape: the 2024-2025 Premier League overview on fbref.com.
website = "https://fbref.com/en/comps/9/2024-2025/2024-2025-Premier-League-Stats"

# Launch Chrome through the chromedriver executable found one directory
# above the notebook's working directory.
path = "../chromedriver-win32/chromedriver.exe"
service = Service(executable_path=path)
driver = webdriver.Chrome(service=service)

# Navigate the new browser session to the league page.
driver.get(url=website)



### Extraction des Équipes et de leurs Liens depuis la Page Web

In [52]:
# Build team_data as a list of (team name, squad page URL) tuples taken from
# the anchors inside the table cells marked data-stat="team". Only the first
# anchor seen for each non-empty name is kept.
team_elements = driver.find_elements(By.CSS_SELECTOR, 'td[data-stat="team"] > a')
team_data = []
seen = set()

for anchor in team_elements:
    label = anchor.text.strip()
    link = anchor.get_attribute('href')

    # Skip anchors without visible text and names that were already recorded.
    if not label or label in seen:
        continue

    seen.add(label)
    team_data.append((label, link))

print(team_data)

[('Liverpool', 'https://fbref.com/en/squads/822bd0ba/2024-2025/Liverpool-Stats'), ('Arsenal', 'https://fbref.com/en/squads/18bb7c10/2024-2025/Arsenal-Stats'), ('Manchester City', 'https://fbref.com/en/squads/b8fd03ef/2024-2025/Manchester-City-Stats'), ('Chelsea', 'https://fbref.com/en/squads/cff3d9bb/2024-2025/Chelsea-Stats'), ('Newcastle Utd', 'https://fbref.com/en/squads/b2b47a98/2024-2025/Newcastle-United-Stats'), ('Aston Villa', 'https://fbref.com/en/squads/8602292d/2024-2025/Aston-Villa-Stats'), ("Nott'ham Forest", 'https://fbref.com/en/squads/e4a775cb/2024-2025/Nottingham-Forest-Stats'), ('Brighton', 'https://fbref.com/en/squads/d07537b9/2024-2025/Brighton-and-Hove-Albion-Stats'), ('Bournemouth', 'https://fbref.com/en/squads/4ba7cbea/2024-2025/Bournemouth-Stats'), ('Brentford', 'https://fbref.com/en/squads/cd051869/2024-2025/Brentford-Stats'), ('Fulham', 'https://fbref.com/en/squads/fd962109/2024-2025/Fulham-Stats'), ('Crystal Palace', 'https://fbref.com/en/squads/47c64c55/2024-2

In [53]:
# Show one (name, url) tuple per line so the list is easier to read.
for team_entry in team_data:
    print(team_entry)

('Liverpool', 'https://fbref.com/en/squads/822bd0ba/2024-2025/Liverpool-Stats')
('Arsenal', 'https://fbref.com/en/squads/18bb7c10/2024-2025/Arsenal-Stats')
('Manchester City', 'https://fbref.com/en/squads/b8fd03ef/2024-2025/Manchester-City-Stats')
('Chelsea', 'https://fbref.com/en/squads/cff3d9bb/2024-2025/Chelsea-Stats')
('Newcastle Utd', 'https://fbref.com/en/squads/b2b47a98/2024-2025/Newcastle-United-Stats')
('Aston Villa', 'https://fbref.com/en/squads/8602292d/2024-2025/Aston-Villa-Stats')
("Nott'ham Forest", 'https://fbref.com/en/squads/e4a775cb/2024-2025/Nottingham-Forest-Stats')
('Brighton', 'https://fbref.com/en/squads/d07537b9/2024-2025/Brighton-and-Hove-Albion-Stats')
('Bournemouth', 'https://fbref.com/en/squads/4ba7cbea/2024-2025/Bournemouth-Stats')
('Brentford', 'https://fbref.com/en/squads/cd051869/2024-2025/Brentford-Stats')
('Fulham', 'https://fbref.com/en/squads/fd962109/2024-2025/Fulham-Stats')
('Crystal Palace', 'https://fbref.com/en/squads/47c64c55/2024-2025/Crystal-

### Chargement des Pages Équipes et Gestion des Timeouts

In [54]:
from selenium.common.exceptions import TimeoutException

# Open every squad page in turn and wait up to 5 seconds for an element
# matching "table.stats_table" to be present; report the teams that time out.
stats_table_present = EC.presence_of_element_located(
    (By.CSS_SELECTOR, "table.stats_table")
)

for team, href in team_data:
    print(f"Loading {team}...")
    driver.get(href)
    try:
        WebDriverWait(driver, 5).until(stats_table_present)
    except TimeoutException:
        print(f"Timeout loading {team}, skipping...")

Loading Liverpool...
Loading Arsenal...
Loading Manchester City...
Loading Chelsea...
Loading Newcastle Utd...
Loading Aston Villa...
Loading Nott'ham Forest...
Loading Brighton...
Loading Bournemouth...
Loading Brentford...
Loading Fulham...
Loading Crystal Palace...
Loading Everton...
Loading West Ham...
Loading Manchester Utd...
Loading Wolves...
Loading Tottenham...
Loading Leicester City...
Loading Ipswich Town...
Loading Southampton...


### Extraction des Données des Joueurs de Toutes les Équipes et Création du DataFrame

In [70]:
import pandas as pd
from selenium.common.exceptions import TimeoutException
# Scrape the table with id "stats_standard_9" from every squad page and stack
# the per-team results into a single DataFrame named players_All.
#
# For each team, the first 17 header labels of the second header row become
# the column names; each body row contributes its <th> text followed by the
# text of its first 16 <td> cells. A "Team" column is prepended to each frame.

# NOTE(review): these two lists are never filled in this cell; they are kept
# only in case another cell refers to them — confirm and delete if unused.
all_headers = []
all_teams_players = []

all_teams_data = []

for team, href in team_data:
    print(f"Loading {team}...")
    driver.get(href)

    # Only the wait can time out, so it is the only statement guarded here.
    # until() returns the located element, which avoids a second DOM lookup.
    try:
        table = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "stats_standard_9"))
        )
    except TimeoutException:
        print(f"Timeout loading {team}, skipping...")
        continue

    column_headers = table.find_elements(By.CSS_SELECTOR, "thead tr:nth-child(2) th")
    column_headers_text = [h.text for h in column_headers][:17]

    player_data = []
    for row in table.find_elements(By.CSS_SELECTOR, "tbody tr"):
        player_name = row.find_element(By.TAG_NAME, "th").text
        cells = row.find_elements(By.TAG_NAME, "td")
        # 1 name + 16 cells = 17 values, matching the 17 header labels above.
        player_data.append([player_name] + [cell.text for cell in cells[:16]])

    print("good")

    team_df = pd.DataFrame(player_data, columns=column_headers_text)
    team_df.insert(0, "Team", team)
    all_teams_data.append(team_df)

# pd.concat raises ValueError on an empty list, which would happen if every
# team page timed out; fall back to an empty frame with an explicit message.
if all_teams_data:
    players_All = pd.concat(all_teams_data, ignore_index=True)
else:
    print("No team could be scraped; players_All is empty.")
    players_All = pd.DataFrame()
print(players_All)

Loading Liverpool...
good
Loading Arsenal...
good
Loading Manchester City...
good
Loading Chelsea...
good
Loading Newcastle Utd...
good
Loading Aston Villa...
good
Loading Nott'ham Forest...
good
Loading Brighton...
good
Loading Bournemouth...
good
Loading Brentford...
good
Loading Fulham...
good
Loading Crystal Palace...
good
Loading Everton...
good
Loading West Ham...
good
Loading Manchester Utd...
good
Loading Wolves...
good
Loading Tottenham...
good
Loading Leicester City...
good
Loading Ipswich Town...
good
Loading Southampton...
good
            Team                 Player   Nation Pos Age  MP Starts    Min  \
0      Liverpool          Mohamed Salah   eg EGY  FW  32  38     38  3,371   
1      Liverpool        Virgil van Dijk   nl NED  DF  33  37     37  3,330   
2      Liverpool       Ryan Gravenberch   nl NED  MF  22  37     37  3,160   
3      Liverpool    Alexis Mac Allister   ar ARG  MF  25  35     30  2,599   
4      Liverpool        Ibrahima Konaté   fr FRA  DF  25  31    

In [74]:
# Display the first 100 rows of players_All.
# NOTE(review): the saved output of this cell has an "Unnamed: 0" column and
# minutes without thousands separators, unlike the frame printed by the
# scraping cell; players_All was presumably reassigned in between (e.g. read
# back from a CSV) — confirm the notebook still runs top to bottom.
players_All.head(100)

Unnamed: 0,Team,Player,Nation,Pos,Age,MP,Starts,Min,90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG
0,Liverpool,Mohamed Salah,eg EGY,FW,32,38,38,3371,37.5,29,18,47,20,9,9,1,0,25.2
1,Liverpool,Virgil van Dijk,nl NED,DF,33,37,37,3330,37.0,3,1,4,3,0,0,5,0,2.2
2,Liverpool,Ryan Gravenberch,nl NED,MF,22,37,37,3160,35.1,0,4,4,0,0,0,6,1,1.1
3,Liverpool,Alexis Mac Allister,ar ARG,MF,25,35,30,2599,28.9,5,5,10,5,0,0,6,0,2.8
4,Liverpool,Ibrahima Konaté,fr FRA,DF,25,31,30,2560,28.4,1,2,3,1,0,0,5,0,1.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Manchester City,Abdukodir Khusanov,uz UZB,DF,20,6,6,503,5.6,0,0,0,0,0,0,1,0,0.0
96,Manchester City,James Mcatee,eng ENG,"MF,FW",21,15,3,349,3.9,3,0,3,3,0,0,1,0,2.8
97,Manchester City,Jahmai Simpson-Pusey,eng ENG,DF,18,2,1,96,1.1,0,0,0,0,0,0,1,0,0.0
98,Manchester City,Rodri,es ESP,MF,28,3,1,73,0.8,0,0,0,0,0,0,0,0,0.0


### Extraction des Données des Matchs de Toutes les Équipes et Création du DataFrame

In [67]:
# Scrape the table with id "matchlogs_for" from every squad page and stack the
# per-team results into a single DataFrame named matches_All.
#
# For each team, all header labels except the last two become the column
# names; each body row contributes its <th> text followed by the text of all
# its <td> cells except the last two. A "team" column is prepended.
all_teams_data2 = []

for team, href in team_data:
    print(f"Loading matches of {team}...")
    driver.get(href)

    # Only the wait can time out, so it is the only statement guarded here.
    # until() returns the located element, which avoids a second DOM lookup.
    try:
        table = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "matchlogs_for"))
        )
    except TimeoutException:
        print(f"Timeout loading {team}, skipping...")
        continue

    column_headers = table.find_elements(By.CSS_SELECTOR, "thead tr th")
    # Drop the two trailing columns (presumably link/notes columns — confirm).
    column_headers_text = [h.text for h in column_headers][:-2]

    matches_data = []
    for row in table.find_elements(By.CSS_SELECTOR, "tbody tr"):
        match_date = row.find_element(By.TAG_NAME, "th").text
        cells = row.find_elements(By.TAG_NAME, "td")
        # Same two trailing cells are dropped so rows line up with the headers.
        matches_data.append([match_date] + [cell.text for cell in cells[:-2]])

    print(column_headers_text)

    match_df = pd.DataFrame(matches_data, columns=column_headers_text)
    match_df.insert(0, "team", team)
    all_teams_data2.append(match_df)

# pd.concat raises ValueError on an empty list, which would happen if every
# team page timed out; fall back to an empty frame with an explicit message.
if all_teams_data2:
    matches_All = pd.concat(all_teams_data2, ignore_index=True)
else:
    print("No match log could be scraped; matches_All is empty.")
    matches_All = pd.DataFrame()



Loading matches of Liverpool...
['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA', 'Opponent', 'xG', 'xGA', 'Poss', 'Attendance', 'Captain', 'Formation', 'Opp Formation', 'Referee']
Loading matches of Arsenal...
['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA', 'Opponent', 'xG', 'xGA', 'Poss', 'Attendance', 'Captain', 'Formation', 'Opp Formation', 'Referee']
Loading matches of Manchester City...
['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA', 'Opponent', 'xG', 'xGA', 'Poss', 'Attendance', 'Captain', 'Formation', 'Opp Formation', 'Referee']
Loading matches of Chelsea...
['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA', 'Opponent', 'xG', 'xGA', 'Poss', 'Attendance', 'Captain', 'Formation', 'Opp Formation', 'Referee']
Loading matches of Newcastle Utd...
['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA', 'Opponent', 'xG', 'xGA', 'Poss', 'Attendance', 'Captain', 'Formation', 'Opp Fo