# LIGA HISTORICAL DATA ANALYSIS

## Web scraping to get La Liga data

In [165]:
import os
import requests
from bs4 import BeautifulSoup
import time
import re
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np

## Creating variables : 
SEASONS = list(range(2020, 2023))#Select the NBA season(s) to scrape
STANDINGS_DIR = r'C:\Users\aureb\OneDrive - Sport-Data\Documents\COURS\DATABIRD\Football_PROJECT\data\standings' #Final destination for the seasons .html files
GAMES_DIR = r'C:\Users\aureb\OneDrive - Sport-Data\Documents\COURS\DATABIRD\Football_PROJECT\data\games' # Final destination for the games .html files

def scrape_games_html(season):
    url = f"https://fbref.com/en/comps/12/{season}-{season+1}/schedule/La-Liga-Scores-and-Fixtures"# Formatted string with the variable seaso
    # Send a request to the website
    response = requests.get(url)
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract information based on the HTML structure
        # Modify this part based on the structure of the website
        links = soup.find_all("a", href=lambda href: href and href.startswith("/en/matches/") and href.endswith("La-Liga") )
        game_pages = [f"https://fbref.com/{l['href']}" for l in links]

        # Create a folder to save HTML files if it doesn't exist
        folder_path = GAMES_DIR
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)

        # Iterate through the game pages and save HTML content
        for page_url in tqdm(game_pages):
            # Define a regular expression pattern to capture the desired portion
            pattern = r"/matches/([^/]+/.+)$"
            # Use re.search to find the match
            match = re.search(pattern, page_url)
            match = match.group(1)
            # Replace '/' with '-'
            match = match.replace('/', '-')
            save_path = os.path.join(folder_path, f'{match}.html')
            if os.path.exists(save_path):
                    continue
            response = requests.get(page_url)
            if response.status_code == 200:
                # Save the HTML content in a file
                with open(os.path.join(folder_path, f'{match}.html'), 'w', encoding='utf-8') as file:
                    file.write(response.text)
                print(f"Saved HTML for {page_url}")
                time.sleep(5)
            else:
                print(f"Failed to retrieve HTML for {page_url}")
    else:
        print(f"Failed to retrieve main page. Status code: {response.status_code}")

In [164]:
#Executing the function to scrape all the games :
for season in SEASONS:
     scrape_games_html(season)

  0%|          | 0/760 [00:00<?, ?it/s]

  0%|          | 0/760 [00:00<?, ?it/s]

  0%|          | 0/760 [00:00<?, ?it/s]

## Create a DataFrame with pandas

### Function 1 :  

In [162]:
# Create a function to "clean" the html code of the game page before extracting the tables we need
def parse_html(html_file):
    with open(html_file, 'r', encoding='utf-8') as f:  # Specify the encoding here
        html = f.read()
        soup = BeautifulSoup(html, 'html.parser')
        [s.decompose() for s in soup.select("tr.over_header")]  # Remove the "basic Box score Stats" header
        [s.decompose() for s in soup.select("tr.thead")]  # Remove reserves row
        return soup

### Function 2 : Get the basic stats from each html file saved into the local folder

In [163]:
def get_basic_stats(html_file):
    soup = parse_html(html_file)#Executing the code with the function created
    # Extract the score and xG :
    score_list = []
    xG_list = []
    for item in soup.find_all('div', {'class': 'scores'}):
        xG = item.text.strip()[-3:]
        score = item.text.strip()[:-3]
        score_list.append(score)
        xG_list.append(xG)
    # Extract the Team stats percentage : 
    team_stats_div = soup.find('div', {'id': 'team_stats'})
    team_home =team_stats_div.find('span',{ 'class' : 'teamandlogo', 'style' : 'padding-right: 10px'}).text.strip()
    team_away = team_stats_div.find('span',{ 'class' : 'teamandlogo', 'style' : 'padding-left: 10px'}).text.strip()
    #Exctract the date: 
    date_span = soup.find('span', {'class': 'venuetime'})
    date = date_span['data-venue-date']
    basic_stats_values = [strong_tag.text for strong_tag in team_stats_div.find_all('strong')]
    cleaned_stats_values = [percentage.strip() for percentage in basic_stats_values]
    possession = cleaned_stats_values[:2] or [np.nan, np.nan]
    passing_accuracy = cleaned_stats_values[2:4] or [np.nan, np.nan]
    shots_on_target = cleaned_stats_values[4:6] or [np.nan, np.nan]
    saves = cleaned_stats_values[6:8] or [np.nan, np.nan]
    # Extract the number of yellow and red cards :
    yellow_cards_list = []
    red_cards_list = []
    for x in range(0,2):
        card = soup.find_all('div', {'class': 'cards'})
        yellow_cards = card[x].select('div.cards span.yellow_card')
        red_cards = card[x].select('div.cards span.red_card, div.cards span.yellow_red_card')
        num_yellow_cards = len(yellow_cards)
        num_red_cards = len(red_cards)
        yellow_cards_list.append(num_yellow_cards)
        red_cards_list.append(num_red_cards)
        # Use regular expression to extract the alphanumeric string between the last '\\' and the first '-'
        match = re.search(r'\\([a-zA-Z0-9]+)-', html_file)
        if match:
            game_id = match.group(1)
        else:
            print("Pattern not found in the file path.")
    # Create a DataFrame
    data = {
        'Date' : date,
        'game_id' : game_id,
        'Team': [team_home, team_away],
        'Goals' : score_list,
        'xG' : xG_list,
        'Possession': possession,
        'Passing Accuracy': passing_accuracy,
        'Shots on Target': shots_on_target,
        'Saves': saves,
        'Yellow_cards' :yellow_cards_list,
        'Red_cards' :red_cards_list, 
    }

    df = pd.DataFrame(data)
    return df

### Function 3 : Get the full team stats from each html file saved into the local folder

In [36]:
def get_extra_stats(html_file):
    soup = parse_html(html_file)#Executing the code with the function created
    #Create empty lists to stock the data we will crap 
    home_stats_extra_list = []
    away_stats_extra_list = []
    column_headers = ['Fouls', 'Corners', 'Crosses', 'Touches','Tackles' ,'Interceptions' ,
              'Aerials_Won','Clearances','Offsides','Goal Kicks','Throw Ins','Long Balls' ] #headers for the final db
    
    extra_stats = soup.find('div', {'id': 'team_stats_extra'})
    for item in extra_stats.find_all('div'):
        current_teams = [team.strip() for team in item.text.strip().split(' ') if team]
        value = item.text.strip()
        numeric_values = [int(val) for val in re.findall(r'\d+', value)]
        if len(numeric_values) == 8:
            home_stats = [numeric_values[0] , numeric_values[2], numeric_values[4], numeric_values[6]]
            away_stats = [numeric_values[1] , numeric_values[3], numeric_values[5], numeric_values[7]]
            home_stats_extra_list.append(home_stats[:4])
            away_stats_extra_list.append(away_stats[:4])

    # Flatten the list of lists
    flattened_home_stats = [item for sublist in home_stats_extra_list for item in sublist]
    flattened_away_stats = [item for sublist in away_stats_extra_list for item in sublist]


    # Create DataFrame
    data = [flattened_home_stats, flattened_away_stats]
    df = pd.DataFrame(data, columns=column_headers)
    return df

After created the functions, we can execute the functions into a loop going into all html files we scrapped before

In [67]:
GAMES_DIR = r'C:\Users\aureb\OneDrive - Sport-Data\Documents\COURS\DATABIRD\Football_PROJECT\data\games' # Final destination for the games .html files
games_html = os.listdir(GAMES_DIR)
games_html = [os.path.join(GAMES_DIR, f) for f in games_html if f.endswith(".html")]
# Initialize an empty DataFrame
all_games_df = pd.DataFrame()
# Assuming games_html is a list containing HTML content for each game 
for game in tqdm(games_html) :
    basic_stats = get_basic_stats(game) 
    extra_stats = get_extra_stats(game)  
    # Concatenate basic_stats and extra_stats along the columns axis
    result_df = pd.concat([basic_stats, extra_stats], axis=1)
     # Concatenate the current game's DataFrame to the main DataFrame
    all_games_df = pd.concat([all_games_df, resu lt_df], ignore_index=True)

  0%|          | 0/1330 [00:00<?, ?it/s]

Once we have the database, we can do a little bit of cleaning 

In [74]:
# Cleaning the DATE : string to date format
all_games_df['Date'] = pd.to_datetime(all_games_df['Date']).dt.date

#Adding a new Season column that we can separate the data per season if we want to
# Define a function to calculate the year based on the date
def calculate_year(row):
    if row['Date'].month >= 8:
        return row['Date'].year + 1
    else:
        return row['Date'].year
#Apply the calculate_year function to the new column to get the Season : 
all_games_df["Season"] = all_games_df.apply(calculate_year, axis=1)

#Cleaning the columns were the figures come with the "%"
# Function to convert percentage columns to float
def convert_percentage_to_float(column_name, dataframe):
    dataframe[column_name] = pd.to_numeric(dataframe[column_name], errors='coerce')
    return dataframe
# Apply the function to all percentage columns
percentage_columns = ['Goals','xG','Possession', 'Passing Accuracy', 'Shots on Target', 'Saves']
for column in percentage_columns:
    convert_percentage_to_float(column,all_games_df )

# # Data analysis using plot

### Possession and Passing Accuracy

In [133]:
import plotly.express as px
# Group by 'Team' and calculate the mean values
team_summary = all_games_df.groupby('Team')[['Possession', 'Passing Accuracy']].mean().reset_index()
# Create a scatter plot
fig = px.scatter(
    team_summary, 
    x='Possession', 
    y='Passing Accuracy', 
    color='Team', 
    text='Team',  # Specify the 'Team' column for text annotations
    title='Average Possession and Passing Accuracy by Team',
    labels={'Possession': 'Average Possession(%)', 'Passing Accuracy': 'Average Passing Accuracy(%)'},
    height=600, width=1000
)

# Update layout
fig.update_layout(
    showlegend=False,
)# Hide legend to avoid duplicate team names

## Goals and Expected goals (xG)

In [161]:
import plotly.express as px

# Group by 'Team' and calculate the mean values
team_summary = all_games_df.groupby('Team')[['Goals', 'xG']].mean().reset_index()

# Create a DataFrame with separate columns for 'Goals' and 'xG'
team_summary_long = team_summary.melt(id_vars='Team', value_vars=['Goals', 'xG'], var_name='Metric', value_name='Average')

# Sort the DataFrame by 'Average' in descending order
team_summary = team_summary.sort_values(by='Goals', ascending= False)

# Create a horizontal bar plot
fig = px.bar(team_summary_long_sorted, x='Average', y='Team', color='Metric',
             labels={'Average': 'Average', 'Metric': 'Metric'},
             title='Average Goals and xG per game by Team',
             orientation='h',  # Horizontal orientation
             category_orders={'Team': team_summary['Team'].tolist()},
             height=800, width=800)  # Adjust dimensions as needed

# Add average values as text annotations
fig.update_traces(texttemplate='%{x:.2f}', textposition='inside')


# Show the plot
fig.show()

## Indiscipline

In [143]:
import plotly.express as px

# Calculate the total indiscipline score for each team
all_games_df['Total_Indiscipline'] = all_games_df['Fouls'] + all_games_df['Yellow_cards'] + all_games_df['Red_cards']

# Group by 'Team' and calculate the mean values
indiscipline_summary = all_games_df.groupby('Team')[['Fouls', 'Yellow_cards', 'Red_cards', 'Total_Indiscipline']].mean().reset_index()

# Sort the teams based on the total indiscipline score in descending order
indiscipline_summary_sorted = indiscipline_summary.sort_values(by='Total_Indiscipline', ascending=False)

# Create a horizontal bar plot with different colors for yellow and red cards
fig = px.bar(indiscipline_summary_sorted, x=['Fouls', 'Yellow_cards', 'Red_cards'], y='Team',
             color_discrete_map={'Fouls':'black','Yellow_cards': 'yellow', 'Red_cards': 'red'},
             labels={'value': 'Average', 'variable': 'Indiscipline'},
             title='Average Indiscipline (Fouls, Yellow Cards, Red Cards) by Team (per game)',
             orientation='h',  # Horizontal orientation
             height=800, width=800)  # Adjust dimensions as needed
# Add average values as text annotations
fig.update_traces(texttemplate='%{x:.2f}', textposition='inside')

# Show the plot
fig.show()

## Defensive stats

In [160]:
import plotly.express as px

# Define the stats columns
stats_columns = ['Tackles', 'Interceptions']

# Calculate the median for each stat
median_stats = all_games_df.groupby('Team')[stats_columns].median().reset_index()

# Sort the teams based on the median of Tackles in descending order
median_tackles_sorted = median_stats.sort_values(by='Tackles', ascending=False)

# Sort the teams based on the median of Tackles in descending order
median_interceptions_sorted = median_stats.sort_values(by='Interceptions', ascending=False)

# Create individual box plots for each stat
fig_tackles = px.box(all_games_df, x='Team', y='Tackles', title='Distribution of Tackles (order by median)',
                     category_orders={'Team': median_tackles_sorted['Team'].tolist()})

fig_interceptions = px.box(all_games_df, x='Team', y='Interceptions', title='Distribution of Interceptions (order by median)',
                           category_orders={'Team': median_interceptions_sorted['Team'].tolist()})

# Show the plots
fig_tackles.show()
fig_interceptions.show()