All data downloaded from https://www.football-data.co.uk/spainm.php

In [106]:
import numpy as np
import pandas as pd

In [107]:
# League whose season files are processed (forms the file-name prefix)
league_name = 'primera'
# Season labels of the form "<start>_<start+1>", from 2011_2012 up to 2022_2023
league_years = ["{}_{}".format(start, start + 1) for start in range(2011, 2023)]

# Reading the dataset files
Reading all the data from the files into separate dataframes, which will later be consolidated into a single one.

In [108]:
# Per-season dataframes, keyed by season label
df_list = dict()

# Load one CSV file per season of the selected league
for league_year in league_years:
    season_path = f"./initial_ds/{league_name}_{league_year}.csv"
    season_df = (
        pd.read_csv(season_path)
        # Discard rows, then columns, that hold nothing but Null values
        .dropna(axis="index", how="all")
        .dropna(axis="columns", how="all")
    )
    # Tag every row with its season, placed as the second column
    season_df.insert(1, 'season', league_year, True)
    df_list[league_year] = season_df

# Merging the dataframes

Getting the accumulated list of all the columns available across all dataframes, together with the seasons in which each column appears:

In [109]:
# Maps each column name to the list of seasons whose dataframe contains it
all_cols = dict()
for season_label, season_df in df_list.items():
    for column in season_df.columns:
        # Start an empty season list the first time a column is seen
        all_cols.setdefault(column, []).append(season_label)

# Reorder so that columns present in the most seasons come first
all_cols = dict(
    sorted(all_cols.items(), key=lambda entry: len(entry[1]), reverse=True)
)

Merging all the separate dataframes into a single one

In [110]:
# Concatenate every season in a single call. Growing the frame inside a loop
# re-copies all previously accumulated rows on each pass, and seeding it with
# an empty frame relies on deprecated pandas behaviour (empty entries being
# ignored when the result dtypes are determined).
season_frames = list(df_list.values())
if season_frames:
    # reindex restores the column order defined by all_cols
    # (columns available in the most seasons first)
    df = pd.concat(season_frames, axis=0).reindex(columns=list(all_cols))
else:
    # pd.concat raises on an empty list; fall back to an empty frame
    df = pd.DataFrame(columns=list(all_cols))

# Cleaning up

Getting the list of the columns that are present in every one of the loaded seasons:

In [111]:
# Keep only the columns whose season list covers every season in league_years
required_seasons = set(league_years)
main_cols = [
    column
    for column, seasons in all_cols.items()
    if required_seasons.issubset(seasons)
]

Dropping all other columns:

In [112]:
# Restrict the dataframe to the columns listed in main_cols, in that order
df = df.loc[:, main_cols]

# Renaming the columns

In [113]:
# Maps the abbreviated column headers to descriptive snake_case names
column_names = {
    # Match identification
    'Div': 'division',
    'Date': 'date',
    'HomeTeam': 'home',
    'AwayTeam': 'away',
    # Full-time and half-time scores and results
    'FTHG': 'fulltime_home_goals',
    'FTAG': 'fulltime_away_goals',
    'FTR': 'fulltime_result',
    'HTHG': 'halftime_home_goals',
    'HTAG': 'halftime_away_goals',
    'HTR': 'halftime_result',
    # Match statistics
    'HS': 'home_total_shots',
    'AS': 'away_total_shots',
    'HST': 'home_shots_on_target',
    'AST': 'away_shots_on_target',
    'HF': 'home_fouls_committed',
    'AF': 'away_fouls_committed',
    'HC': 'home_corners',
    'AC': 'away_corners',
    'HY': 'home_yellow_cards',
    'AY': 'away_yellow_cards',
    'HR': 'home_red_cards',
    'AR': 'away_red_cards',
    # Bookmaker odds (home win / draw / away win)
    'B365H': 'bet365_home_win_odds',
    'B365D': 'bet365_draw_odds',
    'B365A': 'bet365_away_win_odds',
    'BWH': 'betandwin_home_win_odds',
    'BWD': 'betandwin_draw_odds',
    'BWA': 'betandwin_away_win_odds',
    'IWH': 'interwetten_home_win_odds',
    'IWD': 'interwetten_draw_odds',
    'IWA': 'interwetten_away_win_odds',
    'WHH': 'williamhill_home_win_odds',
    'WHD': 'williamhill_draw_odds',
    'WHA': 'williamhill_away_win_odds',
    'VCH': 'vcbet_home_win_odds',
    'VCD': 'vcbet_draw_odds',
    'VCA': 'vcbet_away_win_odds',
}

# Labels absent from the mapping are left unchanged by rename
df = df.rename(columns=column_names)

# Saving the consolidated dataset into file

In [114]:
# Write the consolidated dataframe to CSV, leaving out the row index
output_file = "./initial_ds/matches.csv"
df.to_csv(output_file, index=False)