In [594]:
import pandas as pd
import json
from mplsoccer import Pitch, Sbopen, VerticalPitch
import os
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import time

In [223]:
def fetch_https_data(season : str, league_id : str, league_name : str) : 
    """
    This Python function takes in three parameters: season, league_id, and league_name, which are all of type str. 
    The purpose of this function is to retrieve data from a specified URL on the fbref website using HTTPS protocol. 
    The URL is constructed using the league_id, season, and league_name parameters provided. 
    Once the URL is created, the function uses the requests library to send an HTTP GET request to the URL and retrieve the response data.
    """
    url = f"https://fbref.com/en/comps/{league_id}/{season}/schedule/{season}-{league_name}-Scores-and-Fixtures#sched_{season}_{league_id}_1"
    #url = "https://fbref.com/en/comps/13/2021-2022/schedule/2021-2022-Ligue-1-Scores-and-Fixtures#sched_2021-2022_13_1"
    print(url)
    with requests.Session() as s :
        response = s.get(url, headers= headers , cookies=s.cookies, timeout = 10)
        print("Request page Response:", response)
    return response

In [581]:
def fetch_game_detail_data(url, debug = False) :
    """
    Fetches detailed game data from a given URL using the requests library and returns the response.

    Args:
    - url (str): The URL of the page to fetch game data from.
    - debug (bool, optional): Whether to print debug information. Defaults to False.

    Returns:
    - response (Response): The response object obtained by making a GET request to the URL.

    Raises:
    - Exception: If the response status code is 429 (Too Many Requests).

    Example usage:
    response = fetch_game_detail_data("https://example.com/game_details", debug=True)
    """
    
    with requests.Session() as s :
        response = s.get(url, headers= headers , cookies=s.cookies, timeout = 10)
        if debug :
            print(url)
            print("Request page Response:", response)
        
        if response.status_code == 429 :
            raise Exception(f"Too many requests, you got timeout. Please wait 1 hour")
    return response

In [702]:
def parse_table_schedule(table) :
    """
    Parses a given HTML table containing soccer match schedules and returns a dictionary of match data.

    Args:
    - table (Tag): The HTML table object containing the match data.

    Returns:
    - data_table (dict): A dictionary containing the parsed match data, with each key being a row index and the corresponding value being a dictionary of match data.

    Example usage:
    soup = BeautifulSoup(html_doc, 'html.parser')
    table = soup.find('table', {'id': 'match-schedule'})
    data_table = parse_table_schedule(table)
    """
    
    data_table = {}
    rows = table.findAll('tr')
    for i,row in tqdm(enumerate(rows), total = len(rows)) :
        try : # Handle empty row 
            data = {}
            data['gameweek'] = row.find(['td','th'], {'data-stat': 'gameweek'}).text.strip()
            data['dayofweek'] = row.find('td', {'data-stat': 'dayofweek'}).text.strip()
            data['date'] = row.find('td', {'data-stat': 'date'}).text.strip()
            data['start_time'] = row.find('td', {'data-stat': 'start_time'}).text.strip()
            data['home_team'] = row.find('td', {'data-stat': 'home_team'}).text.strip()
            data['home_xg'] = row.find('td', {'data-stat': 'home_xg'}).text.strip() if not TypeError else None
            data['score'] = row.find('td', {'data-stat': 'score'}).text.strip()
            data['away_xg'] = row.find('td', {'data-stat': 'away_xg'}).text.strip() if not TypeError else None
            data['away_team'] = row.find('td', {'data-stat': 'away_team'}).text.strip()
            data['attendance'] = row.find('td', {'data-stat': 'attendance'}).text.strip() if not TypeError else None
            data['venue'] = row.find('td', {'data-stat': 'venue'}).text.strip()
            data['game_detail'] = f"https://fbref.com{row.find('td', {'data-stat': 'score'}).a['href']}"
            data_table[i] = data
        except TypeError :
            continue
    return data_table 

In [703]:
season_list = ['2015-2016','2016-2017','2017-2018','2018-2019','2019-2020','2020-2021','2021-2022']
league_table = {
    'Ligue-1' : 13,
}

In [704]:
user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.0"
headers = {
    'User-Agent': user_agent,
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept': '*/*',
    'Connection': 'keep-alive'}

## All season, single league 

In [705]:
league_name = 'Ligue-1'
league_id = league_table[league_name]
df_list = []
for season in season_list :
    print(season)
    r = fetch_https_data(season, league_id, league_name) 
    soup = BeautifulSoup(r.text, 'lxml') # parse html
    table_row = soup.find('table').find('tbody') # focus on the table
    data_parsed = parse_table_schedule(table_row)
    df = pd.DataFrame.from_dict(data_parsed, orient = 'index')
    df['season'] = season
    df['league'] = league_name
    df_list.append(df)

2015-2016
https://fbref.com/en/comps/13/2015-2016/schedule/2015-2016-Ligue-1-Scores-and-Fixtures#sched_2015-2016_13_1
Request page Response: <Response [200]>


100%|██████████████████████████████████████████████████████████████████████████████| 422/422 [00:00<00:00, 2098.80it/s]


2016-2017
https://fbref.com/en/comps/13/2016-2017/schedule/2016-2017-Ligue-1-Scores-and-Fixtures#sched_2016-2017_13_1
Request page Response: <Response [200]>


100%|██████████████████████████████████████████████████████████████████████████████| 426/426 [00:00<00:00, 1125.88it/s]


2017-2018
https://fbref.com/en/comps/13/2017-2018/schedule/2017-2018-Ligue-1-Scores-and-Fixtures#sched_2017-2018_13_1
Request page Response: <Response [200]>


100%|██████████████████████████████████████████████████████████████████████████████| 424/424 [00:00<00:00, 1661.36it/s]


2018-2019
https://fbref.com/en/comps/13/2018-2019/schedule/2018-2019-Ligue-1-Scores-and-Fixtures#sched_2018-2019_13_1
Request page Response: <Response [200]>


100%|██████████████████████████████████████████████████████████████████████████████| 432/432 [00:00<00:00, 1628.35it/s]


2019-2020
https://fbref.com/en/comps/13/2019-2020/schedule/2019-2020-Ligue-1-Scores-and-Fixtures#sched_2019-2020_13_1
Request page Response: <Response [200]>


100%|██████████████████████████████████████████████████████████████████████████████| 421/421 [00:00<00:00, 1628.42it/s]


2020-2021
https://fbref.com/en/comps/13/2020-2021/schedule/2020-2021-Ligue-1-Scores-and-Fixtures#sched_2020-2021_13_1
Request page Response: <Response [200]>


100%|██████████████████████████████████████████████████████████████████████████████| 427/427 [00:00<00:00, 1881.92it/s]


2021-2022
https://fbref.com/en/comps/13/2021-2022/schedule/2021-2022-Ligue-1-Scores-and-Fixtures#sched_2021-2022_13_1
Request page Response: <Response [200]>


100%|██████████████████████████████████████████████████████████████████████████████| 425/425 [00:00<00:00, 1499.80it/s]


---

In [263]:
df_merge = pd.concat(df_list) # concatenate all season into a unique dataframe

## Gather Game detail

In [592]:
def parse_team_stats(teams_stats_table, debug = False) :
    """
    Parses a given HTML table containing soccer team stats and returns a dictionary of selected stats.

    Args:
    - teams_stats_table (Tag): The HTML table object containing the team stats data.
    - debug (bool): A flag indicating whether or not to print debug information during execution.

    Returns:
    - stats (dict): A dictionary containing the parsed team stats data, with each key being a selected statistic and the corresponding value being a tuple containing the home and away team values.

    Example usage:
    soup = BeautifulSoup(html_doc, 'html.parser')
    table = soup.find('table', {'id': 'team-stats'})
    stats = parse_team_stats(table)
    """
    stats_header = ["Possession", "Shots on Target", "Saves"]
    stats = {}
    teams_stats_table
    tr = teams_stats_table.findAll('tr')
    tr = tr[1::] # first value is irrelevant
    # header and value got distinct tr. So we link them in a tuple 
    tr_header = tr[::2] # every 2 tr we got the name of the value
    tr_values = tr[1::2] # same 
    
    tr = [(h,v) for h,v in zip(tr_header,tr_values)]
    i = 0
    for _tr in tr : 
        try :
            #print(_tr)
            _tr_header, _tr_value  = _tr 
            td = _tr_value.find_all('td') # contain one stat type for 2 teams
            
            value = []
            for j,_td in enumerate(td) : # loop throw team value
                val = _td.find('div').find('div')
                value.append(val)
            if debug :
                print(value)
                print('-'*15)
            header = _tr_header.text.strip()
            if header in stats_header : # we only want specific stats
                stats[f'{header} home'] = value[0]
                stats[f'{header} away'] = value[1]
                i += 1
        except IndexError: # handle error for td with only the stats name
            continue
    return stats
    

In [571]:
def get_team_detail(url) :
    """
    Fetches detailed game statistics data from a given URL and returns them as a dictionary.

    Args:
    - url (str): The URL from which to fetch the data.

    Returns:
    - stats (dict): A dictionary containing the parsed game statistics data for the specified URL.

    Example usage:
    url = "https://fbref.com/en/matches/5e7d1b5c/Sheffield-United-Manchester-City-March-17-2020-Premier-League"
    stats = get_team_detail(url)
    """
    r_detail = fetch_game_detail_data(url)
    soup_detail = BeautifulSoup(r_detail.text, 'lxml') # parse html
    div = soup_detail.find("div", {"id" :"team_stats"})
    stats = parse_team_stats(div)
    
    return stats 

r = fetch_game_detail_data(_url)

soup_detail = BeautifulSoup(r.text, 'lxml') # parse html

div = soup_detail.find("div", {"id" :"team_stats"})
stats = parse_team_stats(div)

stats

In [672]:
stats

{}

In [673]:
url_list = df_merge[df_merge["possession_home"].isnull()]["game_detail"].values.tolist() # only games without values
url_list = df_merge["game_detail"].values.tolist() # only games without values
game_detail = {}
for _url in tqdm(url_list) :
    stats = get_team_detail(_url)
    #game_detail[_url] = stats
    if stats : # if dictionnary is null, no values to be added
        df_merge = new_features_game_detail(df_merge, _url, stats)
    time.sleep(2.5) #timeout
    break

  0%|                                                                                         | 0/2670 [00:04<?, ?it/s]


In [640]:
def new_features_game_detail(df, url, stats_dict) :
    
    df.loc[df.game_detail == url,"possession_home"] = str(stats_dict["Possession home"])
    df.loc[df.game_detail == url,"possession_away"] = str(stats_dict["Possession away"])
    df.loc[df.game_detail == url,"shot_on_target_home"] = str(stats_dict["Shots on Target home"])
    df.loc[df.game_detail == url,"shot_on_target_away"] = str(stats_dict["Shots on Target away"])
    df.loc[df.game_detail == url,"saves_home"] = str(stats_dict["Saves home"])
    df.loc[df.game_detail == url,"saves_away"] = str(stats_dict["Saves away"])
    
    return df 

## Save to local file

In [711]:
df_merge.to_csv(f"{league_name}-2015-2022.csv", index = False, encoding = 'utf-8')

In [643]:
#df_merge[['possession_home', 'possession_away', 'shot_on_target_home', 'shot_on_target_away', 'saves_home', 'saves_away']] = None