All data was downloaded from https://www.football-data.co.uk/spainm.php

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Division whose season files are processed by this notebook
league_name = 'primera'
# Season labels of the form "<start>_<start+1>", for seasons starting in 2017 through 2022
first_start_year, last_start_year = 2017, 2022
league_years = [
    f"{start}_{start + 1}"
    for start in range(first_start_year, last_start_year + 1)
]

# Reading the dataset files
Reading all the data from the files into separate dataframes, which will later be consolidated into a single one.

In [4]:
# Maps each season label to the dataframe read from that season's file
df_list = {}

# One CSV file per season of the selected league division
for league_year in league_years:
    season_df = (
        pd.read_csv(f"./initial_ds/{league_name}_{league_year}.csv")
        # Discard rows in which every value is Null
        .dropna(axis='index', how='all')
        # Discard columns in which every value is Null
        .dropna(axis='columns', how='all')
    )
    # Tag every row with its season; the new column goes in second position
    season_df.insert(1, 'season', league_year, allow_duplicates=True)
    df_list[league_year] = season_df

# Merging the dataframes

Getting the accumulated list of all the columns available across all dataframes:

In [5]:
# Map every column name to the list of seasons whose dataframe contains it
all_cols = {}
for season_key, season_df in df_list.items():
    for column in season_df.columns:
        all_cols.setdefault(column, []).append(season_key)

# Reorder so that the columns present in the most seasons come first
by_season_count = sorted(
    all_cols.items(),
    key=lambda entry: len(entry[1]),
    reverse=True,
)
all_cols = dict(by_season_count)

Merging all the separate dataframes into a single one

In [6]:
# Merge every season in a single concat call instead of growing the frame
# inside a loop: repeated concat re-copies all accumulated rows on each
# iteration, and seeding it with an empty frame relies on pandas' deprecated
# handling of empty entries when determining the result dtypes.
season_frames = list(df_list.values())
if season_frames:
    # Row labels are kept as they are in each season's frame (no ignore_index);
    # reindex puts the columns in the order given by all_cols.
    df = pd.concat(season_frames, axis=0).reindex(columns=list(all_cols))
else:
    # Nothing was loaded: fall back to an empty frame with the known columns
    df = pd.DataFrame(columns=list(all_cols))

# Cleaning up

Getting the list of the columns that are present in all of the recent seasons:

In [7]:
# Columns that appear in the dataframe of every season listed in league_years
main_cols = [
    column
    for column, seasons in all_cols.items()
    if all(year in seasons for year in league_years)
]

Dropping all other columns:

In [8]:
# Keep only the columns shared by every season, then remove the division
# column, which carries no information (every row is primera)
df = df[main_cols].drop(columns='Div')

## Renaming the columns

In [9]:

# Source column code -> descriptive snake_case name
column_names = {
    # Match identification
    'Date': 'date',
    'HomeTeam': 'home',
    'AwayTeam': 'away',
    # Full-time and half-time scores and results
    'FTHG': 'home_fulltime_goals',
    'FTAG': 'away_fulltime_goals',
    'FTR': 'fulltime_result',
    'HTHG': 'home_halftime_goals',
    'HTAG': 'away_halftime_goals',
    'HTR': 'halftime_result',
    # In-game statistics
    'HS': 'home_total_shots',
    'AS': 'away_total_shots',
    'HST': 'home_shots_on_target',
    'AST': 'away_shots_on_target',
    'HF': 'home_fouls_committed',
    'AF': 'away_fouls_committed',
    'HC': 'home_corners',
    'AC': 'away_corners',
    'HY': 'home_yellow_cards',
    'AY': 'away_yellow_cards',
    'HR': 'home_red_cards',
    'AR': 'away_red_cards',
    # Bookmaker odds (home win / draw / away win)
    'B365H': 'home_win_bet365_odds',
    'B365D': 'draw_bet365_odds',
    'B365A': 'away_win_bet365_odds',
    'BWH': 'home_win_betandwin_odds',
    'BWD': 'draw_betandwin_odds',
    'BWA': 'away_win_betandwin_odds',
    'IWH': 'home_win_interwetten_odds',
    'IWD': 'draw_interwetten_odds',
    'IWA': 'away_win_interwetten_odds',
    'PSH': 'home_win_pinnaclesports_odds',
    'PSD': 'draw_pinnaclesports_odds',
    'PSA': 'away_win_pinnaclesports_odds',
    'WHH': 'home_win_williamhill_odds',
    'WHD': 'draw_williamhill_odds',
    'WHA': 'away_win_williamhill_odds',
    'VCH': 'home_win_vcbet_odds',
    'VCD': 'draw_vcbet_odds',
    'VCA': 'away_win_vcbet_odds',
    # Closing odds
    'PSCH': 'home_win_pinnaclesports_closing_odds',
    'PSCD': 'draw_pinnaclesports_closing_odds',
    'PSCA': 'away_win_pinnaclesports_closing_odds',
}

# Labels missing from the mapping are left untouched by rename
df = df.rename(columns=column_names)

## Dropping unnecessary columns
These columns refer to in-game stats and are also retrieved from other sources

In [10]:
# In-game statistics columns to discard
in_game_stat_cols = [
    'home_total_shots',
    'away_total_shots',
    'home_shots_on_target',
    'away_shots_on_target',
    'home_fouls_committed',
    'away_fouls_committed',
    'home_corners',
    'away_corners',
    'home_yellow_cards',
    'away_yellow_cards',
    'home_red_cards',
    'away_red_cards',
]
df = df.drop(columns=in_game_stat_cols)

## Correcting the dates
The dates are currently of type string (object). They need to be converted into DateTime.<br>
However, before doing that, some values must be tweaked to conform to conventional date formats.<br>
For instance:
<ul>
    <li>'%d/%m/17' must be converted to '%d/%m/2017'</li>
    <li>'%d/%m/18' must be converted to '%d/%m/2018'</li>
    <li>'%d/%m/19' must be converted to '%d/%m/2019'</li>
    <li>...</li>
</ul>

In [11]:
# RegEx corrections: a trailing two-digit year ("/17" ... "/23") is expanded
# to its four-digit form ("/2017" ... "/2023")
replacements = {f"/{yy:02d}$": f"/{2000 + yy}" for yy in range(17, 24)}
# Apply the corrections to the date strings
normalized_dates = df['date'].replace(replacements, regex=True)
# Parse the normalized day/month/year strings into DateTime64 values
df['date'] = pd.to_datetime(normalized_dates, format='%d/%m/%Y')

# Saving the cleaned up dataset into file

In [12]:
# Write the cleaned-up matches table to disk, leaving out the index column
output_file = "./processed_ds/matches.csv"
df.to_csv(output_file, index=False)