In [1]:
import datetime
import re
import time
from pathlib import Path

import ace_tools_open as tools
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.firefox import GeckoDriverManager

# Browser configuration shared by every scraping function in this notebook.
# NOTE(review): "--disable-gpu" and "--no-sandbox" look like Chromium-style
# switches — confirm that Firefox actually honours them.
options = Options()
for browser_flag in ("--headless", "--disable-gpu", "--no-sandbox"):
    options.add_argument(browser_flag)

# The value returned by GeckoDriverManager().install() is handed to the
# Selenium Firefox service that every webdriver.Firefox(...) call reuses.
service = Service(GeckoDriverManager().install())

# Récupération des liens des championnats les plus importants

In [2]:
def get_link_leagues():
    """Collect the tournament URLs exposed by the WhoScored live-scores page.

    A fresh Firefox session (built from the module-level ``service`` and
    ``options``) opens the live-scores page, clicks the element whose id is
    ``Premier-Tournois-btn`` and then reads every tournament navigation anchor
    from the rendered HTML.

    Returns:
        list[str]: Absolute tournament URLs in page order. Any exception raised
        while scraping is printed rather than propagated, in which case the
        list holds whatever was collected before the failure (usually nothing).
    """
    base_url = "https://fr.whoscored.com"
    url = base_url + "/livescores"
    link_css_class = "TournamentNavButton-module_clickableArea__ZFnBl"

    link_leagues = []

    driver = webdriver.Firefox(service=service, options=options)

    try:
        driver.get(url)

        time.sleep(2)

        prev_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "Premier-Tournois-btn"))
        )

        # The click is dispatched through JavaScript instead of
        # WebElement.click() — presumably so an overlay cannot intercept it.
        driver.execute_script("arguments[0].click();", prev_button)

        # Do not snapshot the DOM before the tournament links exist: the click
        # returns immediately while the page may still be rendering them.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a." + link_css_class))
        )

        soup = BeautifulSoup(driver.page_source, "html.parser")

        for anchor in soup.find_all("a", class_=link_css_class):
            href = anchor.get("href")
            # An anchor without href would make the concatenation raise and
            # abort the whole collection, so it is simply skipped.
            if href:
                link_leagues.append(base_url + href)

    except Exception as e:
        print("Erreur :", e)

    finally:
        # Always release the browser process, even after a failure.
        driver.quit()

    return link_leagues


# Run the scraper once and echo the resulting list of tournament URLs as the
# cell output.
link_leagues = get_link_leagues()
link_leagues

['https://fr.whoscored.com/regions/74/tournaments/22/france-ligue-1',
 'https://fr.whoscored.com/regions/252/tournaments/2/angleterre-premier-league',
 'https://fr.whoscored.com/regions/206/tournaments/4/espagne-laliga',
 'https://fr.whoscored.com/regions/81/tournaments/3/allemagne-bundesliga',
 'https://fr.whoscored.com/regions/108/tournaments/5/italie-serie-a',
 'https://fr.whoscored.com/regions/250/tournaments/12/europe-champions-league',
 'https://fr.whoscored.com/regions/250/tournaments/30/europe-europa-league',
 'https://fr.whoscored.com/regions/177/tournaments/21/portugal-liga-portugal',
 'https://fr.whoscored.com/regions/155/tournaments/13/pays-bas-eredivisie',
 'https://fr.whoscored.com/regions/252/tournaments/7/angleterre-championship',
 'https://fr.whoscored.com/regions/233/tournaments/85/usa-major-league-soccer',
 'https://fr.whoscored.com/regions/31/tournaments/95/brésil-brasileirão',
 'https://fr.whoscored.com/regions/225/tournaments/17/turquie-super-lig',
 'https://fr.wh

# Récupération des liens des championnats pour les saisons précédentes

In [3]:
def get_link_historical_leagues(link_leagues):
    """Build a table with one row per season page of each tournament URL.

    For every URL a fresh Firefox session opens the page, clicks the element
    with id ``seasons`` and the ``<option>`` entries of ``<select id="seasons">``
    are read from the rendered HTML. A URL whose page cannot be loaded, or that
    has no season selector, is reported on stdout and skipped, so one failing
    league neither aborts the run nor reuses the previous league's HTML.

    Args:
        link_leagues: Iterable of tournament URLs whose last path segment looks
            like ``<country>-<league-name>`` (e.g. ``france-ligue-1``).

    Returns:
        pandas.DataFrame: Columns ``annee`` (season label), ``pays`` (first
        dash-separated token of the URL slug), ``league`` (rest of the slug,
        dashes turned into spaces) and ``lien`` (absolute season URL). The
        index restarts at 0 for each league, as produced by concatenating the
        per-league frames without ``ignore_index``.
    """
    columns = ["annee", "pays", "league", "lien"]
    # Shapes accepted as a ready-made season label: "2024" or "2024/2025".
    season_label = re.compile(r"\d{4}(?:/\d{4})?")
    frames = []

    # Fallback labelling: assume the first option is the current season and
    # that a season starts after August.
    # NOTE(review): this guess is off for calendar-year leagues and during
    # August; it is only used when an option's own text is not a season label.
    today = datetime.date.today()
    first_year = today.year if today.month > 8 else today.year - 1

    for url in tqdm(link_leagues):

        soup = None
        driver = webdriver.Firefox(service=service, options=options)

        try:
            driver.get(url)

            time.sleep(2)

            prev_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, "seasons"))
            )

            driver.execute_script("arguments[0].click();", prev_button)
            soup = BeautifulSoup(driver.page_source, "html.parser")

        except Exception as e:
            print("Erreur :", e)

        finally:
            driver.quit()

        if soup is None:
            # The failure was already reported above; there is nothing to parse.
            continue

        season_select = soup.find("select", id="seasons")
        if season_select is None:
            print("Erreur :", "liste des saisons introuvable pour " + url)
            continue

        slug = url.split("/")[-1]
        # NOTE(review): only the first token is taken as the country, so a slug
        # such as "pays-bas-eredivisie" yields pays="pays", league="bas eredivisie".
        pays = slug.split("-")[0]
        league = (
            slug.replace(pays, "")
            .replace(str(first_year), "")
            .replace(str(first_year + 1), "")
            .replace("-", " ")
            .strip()
        )

        links = []
        for offset, option in enumerate(season_select.find_all("option")):
            value = option.get("value")
            if not value:
                # No target URL for this entry; skip it instead of crashing on
                # the string concatenation below.
                continue

            # Assumes the option's visible text is the season label whenever it
            # has that shape — TODO confirm against the live page.
            text = option.get_text(strip=True)
            if season_label.fullmatch(text):
                annee = text
            else:
                start = first_year - offset
                annee = str(start) + "/" + str(start + 1)

            links.append([annee, pays, league, "https://fr.whoscored.com" + value])

        if links:
            frames.append(pd.DataFrame(links, columns=columns))

    if not frames:
        return pd.DataFrame(columns=columns)

    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(frames)


# DataFrame.to_csv does not create missing directories, so make sure the
# export folder exists before writing into it.
output_dir = Path("urls")
output_dir.mkdir(parents=True, exist_ok=True)

# The [:1] slice limits this run to the first league of the list.
data_link_leagues = get_link_historical_leagues(link_leagues[:1])
data_link_leagues.to_csv(output_dir / "data_link_leagues.csv")

100%|██████████| 1/1 [00:11<00:00, 11.29s/it]


In [4]:
# Show the season table with ace_tools_open's dataframe viewer (empty title).
# NOTE(review): ace_tools_open is already imported as `tools` in the first
# cell, so this re-import is redundant.
import ace_tools_open as tools
tools.display_dataframe_to_user("", data_link_leagues)




annee,pays,league,lien
Loading ITables v2.2.5 from the internet... (need help?),,,


# Récupération de tous les matchs passés pour une saison donnée

In [None]:
def get_link_match(data_link_leagues):
    """Collect match-page URLs for every season row of ``data_link_leagues``.

    For each row a fresh Firefox session opens the season URL, then repeatedly
    parses the page and clicks the element with id ``dayChangeBtn-prev``. The
    walk for a season stops as soon as a page contributes no link that has not
    been seen before (across all seasons processed so far). Exceptions raised
    while scraping a season are printed and the loop moves on to the next row;
    links gathered before the failure are kept.

    Args:
        data_link_leagues: DataFrame with the columns ``annee``, ``pays``,
            ``league`` and ``lien`` (season page URL).

    Returns:
        pandas.DataFrame: One row per distinct match URL, with the columns
        ``annee``, ``pays``, ``league`` and ``lien`` (absolute match URL).
    """
    base_url = "https://fr.whoscored.com"

    # A set gives constant-time duplicate checks; the list keeps insertion order.
    seen_links = set()
    match_rows = []

    for i in tqdm(range(len(data_link_leagues))):

        # Read the row before launching Firefox so that a missing column cannot
        # leave an orphaned browser process behind.
        annee = data_link_leagues["annee"].iloc[i]
        pays = data_link_leagues["pays"].iloc[i]
        league = data_link_leagues["league"].iloc[i]
        url = data_link_leagues["lien"].iloc[i]

        driver = webdriver.Firefox(service=service, options=options)

        try:
            driver.get(url)

            time.sleep(2)

            while True:
                # Give the page a moment to settle after the previous click.
                time.sleep(1)

                soup = BeautifulSoup(driver.page_source, "html.parser")

                found_new_link = False
                for cell in soup.find_all("div", class_="Match-module_right_oddsOn__o-ux-"):
                    anchor = cell.find("a")
                    href = anchor.get("href") if anchor is not None else None
                    if not href:
                        # No usable link in this cell; skip it rather than
                        # letting the concatenation abort the whole season.
                        continue

                    link = base_url + href
                    if link in seen_links:
                        continue

                    seen_links.add(link)
                    match_rows.append([annee, pays, league, link])
                    found_new_link = True

                if not found_new_link:
                    # Nothing new on this page: treat it as the end of the season.
                    break

                prev_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.ID, "dayChangeBtn-prev"))
                )

                driver.execute_script("arguments[0].click();", prev_button)

        except Exception as e:
            print("Erreur :", e)

        finally:
            driver.quit()

    return pd.DataFrame(match_rows, columns=["annee", "pays", "league", "lien"])

100%|██████████| 27/27 [15:52<00:00, 35.28s/it]


In [6]:
import ace_tools_open as tools

# No other visible cell assigns `all_links_df`, so build it here from the
# season table; otherwise this cell raises NameError on a fresh kernel.
# This opens one browser session per season row and can take many minutes.
all_links_df = get_link_match(data_link_leagues)
tools.display_dataframe_to_user("", all_links_df)




annee,pays,league,lien
Loading ITables v2.2.5 from the internet... (need help?),,,


In [7]:
# Row count of the match-link table displayed in the previous cell.
len(all_links_df)

5730