All data downloaded from https://www.football-data.co.uk/spainm.php

In [10]:
import numpy as np
import pandas as pd
import re

In [11]:
league_name = 'primera'
# Season labels of the form '1999_2000', one per season from 1999/2000 to 2022/2023
league_years = [f"{year}_{year+1}" for year in range(1999, 2023)]

# Match-result columns to keep.
# NOTE: the data dictionary lists some fields under two alternative names
# ("FTHG and HG", "FTAG and AG", "FTR and Res"). Those headings are not column
# names themselves, so each alternative is listed as a separate entry; whichever
# one a given file actually contains will be kept.
match_result_cols = [
    'Div', # League Division
    'Date', # Match Date (dd/mm/yy)
    'Time', # Time of match kick off
    'HomeTeam', # Home Team
    'AwayTeam', # Away Team
    'FTHG', # Full Time Home Team Goals
    'HG', # Full Time Home Team Goals (alternative column name)
    'FTAG', # Full Time Away Team Goals
    'AG', # Full Time Away Team Goals (alternative column name)
    'FTR', # Full Time Result (H=Home Win, D=Draw, A=Away Win)
    'Res', # Full Time Result (alternative column name)
    'HTHG', # Half Time Home Team Goals
    'HTAG', # Half Time Away Team Goals
    'HTR', # Half Time Result (H=Home Win, D=Draw, A=Away Win)
]

# Match-statistics columns to keep
match_stats_cols = [
    'Attendance', # Crowd Attendance
    'Referee', # Match Referee
    'HS', # Home Team Shots
    'AS', # Away Team Shots
    'HST', # Home Team Shots on Target
    'AST', # Away Team Shots on Target
    'HHW', # Home Team Hit Woodwork
    'AHW', # Away Team Hit Woodwork
    'HC', # Home Team Corners
    'AC', # Away Team Corners
    'HF', # Home Team Fouls Committed
    'AF', # Away Team Fouls Committed
    'HFKC', # Home Team Free Kicks Conceded
    'AFKC', # Away Team Free Kicks Conceded
    'HO', # Home Team Offsides
    'AO', # Away Team Offsides
    'HY', # Home Team Yellow Cards
    'AY', # Away Team Yellow Cards
    'HR', # Home Team Red Cards
    'AR', # Away Team Red Cards
    'HBP', # Home Team Bookings Points (10 = yellow, 25 = red)
    'ABP', # Away Team Bookings Points (10 = yellow, 25 = red)
]

Reading all the data from the files into separate dataframes, which will later be consolidated into a single one.

In [12]:
# Per-season dataframes, keyed by season label (e.g. '1999_2000')
df_list = dict()

# Every column worth keeping; anything else (the betting odds) is discarded
wanted_cols = match_result_cols + match_stats_cols

# One CSV file per season of the league
for league_year in league_years:
    dataset_file = f"./initial_ds/{league_name}_{league_year}.csv"
    season_raw = pd.read_csv(dataset_file)
    # Keep the wanted columns in the order they appear in the file
    kept_cols = [name for name in season_raw.columns if name in wanted_cols]
    # Drop fully-empty rows first, then fully-empty columns
    cur_df = (
        season_raw[kept_cols]
        .dropna(axis='index', how="all")
        .dropna(axis="columns", how='all')
    )
    df_list[league_year] = cur_df

print(f"{len(df_list)} dataframes were read.")

df_list.keys()

24 dataframes were read.


dict_keys(['1999_2000', '2000_2001', '2001_2002', '2002_2003', '2003_2004', '2004_2005', '2005_2006', '2006_2007', '2007_2008', '2008_2009', '2009_2010', '2010_2011', '2011_2012', '2012_2013', '2013_2014', '2014_2015', '2015_2016', '2016_2017', '2017_2018', '2018_2019', '2019_2020', '2020_2021', '2021_2022', '2022_2023'])

All columns available in one year are also available in the next:

In [13]:
# Walk over each pair of consecutive seasons and report every column that is
# present in the earlier season but missing from the later one
for cur_league, next_league in zip(league_years, league_years[1:]):
    later_cols = df_list[next_league].columns
    unmatched_cols = [col for col in df_list[cur_league].columns if col not in later_cols]
    if unmatched_cols:
        print(f"columns in {cur_league} and unavailable in {next_league}:\n{unmatched_cols}\n")

So the dataframes can be merged into a single dataframe whose columns are those of the most recent season:

In [15]:
# Stack the per-season dataframes row-wise, in the order they were read
season_frames = list(df_list.values())
df = pd.concat(season_frames, axis=0)

In [17]:
# Write the merged matches to disk, leaving out the row index
df.to_csv(path_or_buf="./initial_ds/matches.csv", index=False)