In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the four pre-processed CSV sources in one pass; each of them has a
# 'date' column that is parsed into datetimes on load.
df_stats, df_matches, df_ratings, df_xg = (
    pd.read_csv(csv_path, parse_dates=['date'])
    for csv_path in (
        "./match_stats/processed_ds/match_stats.csv",
        "./matches/processed_ds/matches.csv",
        "./ratings/dataset/ratings.csv",
        "./xg/processed_ds/xg.csv",
    )
)

# Making the team names consistent across the datasets

Defining a function to match similar team names together

In [3]:
# The following function receives two lists of words and maps the similar ones
# together; it is used to clean up string features by pinpointing words that
# are 'almost' identical.
from difflib import SequenceMatcher

def similar(ref_words, words):
    """Map each entry of ``words`` to its closest counterpart in ``ref_words``.

    Matching is greedy and runs in passes over a descending similarity
    threshold (1.0, 0.9, ..., 0.1), so exact and near-exact spellings are
    paired before looser ones. Within a pass, every still-unmatched word is
    paired with the remaining reference word that has the highest
    ``SequenceMatcher`` ratio, provided that ratio reaches the threshold.
    Each reference word is used at most once. Missing values (``pd.isna``)
    are never matched.

    Args:
        ref_words: list of reference spellings. Left unmodified.
        words: list of spellings to map onto the references. Left unmodified.

    Returns:
        dict mapping every matched word to its reference word. When entries
        are left over, the key ``'UNMATCHED_TEAMS'`` holds the list of words
        without a partner and ``'UNMATCHED_REF_TEAMS'`` the list of unused
        reference words.
    """
    result = dict()

    # Work on copies: entries are removed as they get matched, and doing that
    # on the caller's lists while looping over them skips elements.
    unmatched_words = list(words)
    unmatched_refs = list(ref_words)

    # (word, ref_word) -> ratio; every pair is scored once and reused across
    # the threshold passes.
    scores = dict()

    def score(word, ref_word):
        pair = (word, ref_word)
        if pair not in scores:
            scores[pair] = SequenceMatcher(None, word, ref_word).ratio()
        return scores[pair]

    for similarity_ratio in np.arange(1, 0, -.1):
        # Loop over a snapshot so removals below cannot disturb the iteration.
        for word in list(unmatched_words):
            # A duplicate of an already-matched word stays unmatched.
            if pd.isna(word) or word in result:
                continue

            candidates = [ref for ref in unmatched_refs if not pd.isna(ref)]
            if not candidates:
                break

            # max() returns the first candidate on ties, so list order decides.
            best_ref = max(candidates, key=lambda ref: score(word, ref))
            if similarity_ratio <= score(word, best_ref):
                result[word] = best_ref
                unmatched_refs.remove(best_ref)
                unmatched_words.remove(word)

    if unmatched_words:
        result['UNMATCHED_TEAMS'] = unmatched_words
    if unmatched_refs:
        result['UNMATCHED_REF_TEAMS'] = unmatched_refs

    return result

Taking the list of team names in the 'stats' dataset as the reference

In [4]:
# Count how often every team appears (as home or away side) in the stats
# dataset; these spellings are the reference for the other datasets.
# The index and the counts are named explicitly because the default labels
# produced by value_counts().reset_index() differ between pandas versions
# (pandas >= 2.0 yields 'value'/'count'), so renaming 'index' and 'value'
# afterwards would put the team names under 'match_count'.
refrence_team_names = (
    pd.melt(frame=df_stats, id_vars='date', value_vars=['home', 'away'])['value']
    .value_counts()
    .rename_axis('team_name')
    .reset_index(name='match_count'))

refrence_team_names

Unnamed: 0,team_name,match_count
0,Valencia,190
1,Villarreal,190
2,Getafe,190
3,Athletic Bilbao,190
4,Alaves,190
5,Real Sociedad,190
6,Levante,190
7,Barcelona,190
8,Atletico Madrid,190
9,Celta Vigo,189


## Matches dataset

In [5]:
# Count how often every team appears (as home or away side) in the matches
# dataset. The index and the counts are named explicitly because the default
# labels produced by value_counts().reset_index() differ between pandas
# versions (pandas >= 2.0 yields 'value'/'count'), so renaming 'index' and
# 'value' afterwards would put the team names under 'match_count'.
df_matches_team_names = (
    pd.melt(frame=df_matches, id_vars='date', value_vars=['home', 'away'])['value']
    .value_counts()
    .rename_axis('team_name')
    .reset_index(name='match_count'))

df_matches_team_names

Unnamed: 0,team_name,match_count
0,Real Madrid,190
1,Valencia,190
2,Celta,190
3,Getafe,190
4,Sevilla,190
5,Ath Bilbao,190
6,Barcelona,190
7,Levante,190
8,Villarreal,190
9,Betis,190


In [6]:
# Pair every spelling used in the matches dataset with its reference spelling.
df_matches_team_similarities = similar(
    refrence_team_names['team_name'].tolist(),
    df_matches_team_names['team_name'].tolist())

# Show the mapping so it can be checked by eye before it is applied.
for source_name, reference_name in df_matches_team_similarities.items():
    print(f"{source_name} => {reference_name}\n")

Real Madrid => Real Madrid

Getafe => Getafe

Barcelona => Barcelona

Villarreal => Villarreal

Alaves => Alaves

Eibar => Eibar

Osasuna => Osasuna

Leganes => Leganes

Girona => Girona

Elche => Elche

Malaga => Malaga

Valencia => Valencia

Sevilla => Sevilla

Levante => Levante

Espanol => Espanyol

Cadiz => Cadiz

Ath Bilbao => Athletic Bilbao

Granada => Granada

Huesca => SD Huesca

Las Palmas => Las Palmas

Sociedad => Real Sociedad

Valladolid => Real Valladolid

Mallorca => Real Mallorca

Celta => Celta Vigo

Ath Madrid => Atletico Madrid

La Coruna => Deportivo La Coruna

Betis => Real Betis

Vallecano => Rayo Vallecano



In [7]:
# Rewrite both team columns of the matches dataset to the reference spellings.
df_matches[['home', 'away']] = (
    df_matches[['home', 'away']].replace(df_matches_team_similarities))

## Ratings dataset

In [8]:
# Count how often every team appears (as home or away side) in the ratings
# dataset. The index and the counts are named explicitly because the default
# labels produced by value_counts().reset_index() differ between pandas
# versions (pandas >= 2.0 yields 'value'/'count'), so renaming 'index' and
# 'value' afterwards would put the team names under 'match_count'.
df_ratings_team_names = (
    pd.melt(frame=df_ratings, id_vars='date', value_vars=['home', 'away'])['value']
    .value_counts()
    .rename_axis('team_name')
    .reset_index(name='match_count'))

df_ratings_team_names

Unnamed: 0,team_name,match_count
0,FC Barcelona,190
1,Levante UD,190
2,Real Sociedad,190
3,CD Alavés,190
4,Atlético Madrid,190
5,Getafe CF,190
6,Real Madrid,190
7,Real Betis,190
8,Celta Vigo,190
9,Sevilla FC,190


In [9]:
# Pair every spelling used in the ratings dataset with its reference spelling.
df_ratings_team_similarities = similar(
    refrence_team_names['team_name'].tolist(),
    df_ratings_team_names['team_name'].tolist())

# Show the mapping so it can be checked by eye before it is applied.
for source_name, reference_name in df_ratings_team_similarities.items():
    print(f"{source_name} => {reference_name}\n")

Real Sociedad => Real Sociedad

Real Madrid => Real Madrid

Celta Vigo => Celta Vigo

Athletic Bilbao => Athletic Bilbao

Real Valladolid => Real Valladolid

SD Huesca => SD Huesca

Atlético Madrid => Atletico Madrid

Real Betis => Real Betis

Rayo Vallecano => Rayo Vallecano

Deportivo La Coruña => Deportivo La Coruna

FC Barcelona => Barcelona

Getafe CF => Getafe

Villarreal CF => Villarreal

Granada CF => Granada

CA Osasuna => Osasuna

Girona FC => Girona

UD Las Palmas => Las Palmas

Levante UD => Levante

Sevilla FC => Sevilla

SD Eibar => Eibar

RCD Mallorca => Real Mallorca

Elche CF => Elche

CD Alavés => Alaves

Espanyol Barcelona => Espanyol

Cádiz CF => Cadiz

Valencia CF => Valencia

Málaga CF => Malaga

CD Leganés => Leganes



In [10]:
# Rewrite both team columns of the ratings dataset to the reference spellings.
df_ratings[['home', 'away']] = (
    df_ratings[['home', 'away']].replace(df_ratings_team_similarities))

## XG dataset

In [11]:
# Count how often every team appears (as home or away side) in the xG
# dataset. The index and the counts are named explicitly because the default
# labels produced by value_counts().reset_index() differ between pandas
# versions (pandas >= 2.0 yields 'value'/'count'), so renaming 'index' and
# 'value' afterwards would put the team names under 'match_count'.
df_xg_team_names = (
    pd.melt(frame=df_xg, id_vars='date', value_vars=['home', 'away'])['value']
    .value_counts()
    .rename_axis('team_name')
    .reset_index(name='match_count'))

df_xg_team_names

Unnamed: 0,team_name,match_count
0,Real Madrid,190
1,Valencia,190
2,Celta Vigo,190
3,Getafe,190
4,Sevilla,190
5,Athletic Club,190
6,Barcelona,190
7,Levante,190
8,Villarreal,190
9,Real Sociedad,190


In [12]:
# Pair every spelling used in the xG dataset with its reference spelling.
df_xg_team_similarities = similar(
    refrence_team_names['team_name'].tolist(),
    df_xg_team_names['team_name'].tolist())

# Show the mapping so it can be checked by eye before it is applied.
for source_name, reference_name in df_xg_team_similarities.items():
    print(f"{source_name} => {reference_name}\n")

Real Madrid => Real Madrid

Celta Vigo => Celta Vigo

Sevilla => Sevilla

Barcelona => Barcelona

Villarreal => Villarreal

Eibar => Eibar

Granada => Granada

Rayo Vallecano => Rayo Vallecano

Girona => Girona

Elche => Elche

Valencia => Valencia

Levante => Levante

Atlético Madrid => Atletico Madrid

Osasuna => Osasuna

Las Palmas => Las Palmas

Getafe => Getafe

Real Sociedad => Real Sociedad

Alavés => Alaves

Valladolid => Real Valladolid

Huesca => SD Huesca

Cádiz => Cadiz

Athletic Club => Athletic Bilbao

Espanyol => Espanyol

Mallorca => Real Mallorca

Betis => Real Betis

Málaga => Malaga

Leganés => Leganes

La Coruña => Deportivo La Coruna



In [13]:
# Rewrite both team columns of the xG dataset to the reference spellings.
df_xg[['home', 'away']] = (
    df_xg[['home', 'away']].replace(df_xg_team_similarities))

# Making venue names consistent

In [14]:
# Distinct venue spellings in sorted order, before any clean-up.
pd.unique(df_stats['venue'].sort_values())

array(['Alfredo Di Stefano.', 'Anoeta', 'Balaidos', 'Balaidos.',
       'Benito Villamarin', 'Benito Villamarin.', 'Butarque', 'Butarque.',
       'Camp Nou', 'Camp Nou.', 'Campo de Futbol de Vallecas',
       'Ciutat de Valencia', 'Ciutat de Valencia.',
       'Coliseum Alfonso Perez', 'Coliseum Alfonso Perez.',
       'Cornella - El Prat', 'Cornella - El Prat.', 'El Alcoraz',
       'El Alcoraz.', 'El Sadar', 'El Sadar.', 'Estadio Camilo Cano.',
       'Estadio Los Carmenes', 'Estadio Los Carmenes.',
       'Estadio Municipal de Ipura', 'Estadio Municipal de Ipura.',
       'Estadio de la Ceramica', 'Estadio de la Ceramica.',
       'Gran Canaria', 'Iberostar Estadi', 'Iberostar Estadi.',
       'Jose Zorrilla', 'Jose Zorrilla.', 'La Rosaleda',
       'Martinez Valero', 'Martinez Valero.', 'Mendizorroza',
       'Mendizorroza.', 'Mestalla', 'Mestalla.', 'Municipal de Montilivi',
       'Ramon Sanchez Pizjuan', 'Ramon Sanchez Pizjuan.',
       'Ramon de Carranza', 'Ramon de Carranza.'

In [15]:
# Removing dot (.) from the end of the venue names.
# The vectorised string method leaves missing values and empty strings
# untouched, whereas indexing x[-1] inside apply() raises on both.
# \Z anchors at the very end of the string, so only a final dot is dropped.
df_stats['venue'] = df_stats['venue'].str.replace(r'\.\Z', '', regex=True)

In [16]:
# Distinct venue spellings in sorted order, to check the result of the clean-up.
pd.unique(df_stats['venue'].sort_values())

array(['Alfredo Di Stefano', 'Anoeta', 'Balaidos', 'Benito Villamarin',
       'Butarque', 'Camp Nou', 'Campo de Futbol de Vallecas',
       'Ciutat de Valencia', 'Coliseum Alfonso Perez',
       'Cornella - El Prat', 'El Alcoraz', 'El Sadar',
       'Estadio Camilo Cano', 'Estadio Los Carmenes',
       'Estadio Municipal de Ipura', 'Estadio de la Ceramica',
       'Gran Canaria', 'Iberostar Estadi', 'Jose Zorrilla', 'La Rosaleda',
       'Martinez Valero', 'Mendizorroza', 'Mestalla',
       'Municipal de Montilivi', 'Ramon Sanchez Pizjuan',
       'Ramon de Carranza', 'Reale Seguros Stadium', 'Riazor',
       'San Mames', 'Santiago Bernabeu', 'Spotify Camp Nou',
       'Wanda Metropolitano'], dtype=object)

# Joining the datasets
The join is an inner join on the `date`, `home`, and `away` columns, so only matches present in all four datasets are kept.

In [17]:
from functools import reduce

# Columns that identify a match in every dataset.
join_keys = ['date', 'home', 'away']

# Fold the four frames into one, inner-joining each onto the running result.
joined_df = reduce(
    lambda merged, frame: merged.merge(frame, on=join_keys, how='inner'),
    [df_matches, df_ratings, df_stats, df_xg])

# Report the input and output shapes so dropped rows are visible at a glance.
print(
    f"matches dataset's shape: {df_matches.shape}\n"
    f"ratings dataset's shape: {df_ratings.shape}\n"
    f"stats dataset's shape: {df_stats.shape}\n"
    f"xg dataset's shape: {df_xg.shape}\n"
    f"final resulting dataset's shape: {joined_df.shape}\n\n"
    f"resulting columns:\n{'-'*30}\n{joined_df.columns}"
)

matches dataset's shape: (1900, 31)
ratings dataset's shape: (1900, 9)
stats dataset's shape: (1894, 37)
xg dataset's shape: (1900, 5)
final resulting dataset's shape: (1894, 73)

resulting columns:
------------------------------
Index(['season', 'date', 'home', 'away', 'fulltime_home_goals',
       'fulltime_away_goals', 'fulltime_result', 'halftime_home_goals',
       'halftime_away_goals', 'halftime_result', 'bet365_home_win_odds',
       'bet365_draw_odds', 'bet365_away_win_odds', 'betandwin_home_win_odds',
       'betandwin_draw_odds', 'betandwin_away_win_odds',
       'interwetten_home_win_odds', 'interwetten_draw_odds',
       'interwetten_away_win_odds', 'pinnaclesports_home_win_odds',
       'pinnaclesports_draw_odds', 'pinnaclesports_away_win_odds',
       'williamhill_home_win_odds', 'williamhill_draw_odds',
       'williamhill_away_win_odds', 'vcbet_home_win_odds', 'vcbet_draw_odds',
       'vcbet_away_win_odds', 'pinnaclesports_closing_home_win_odds',
       'pinnaclesport

# Saving the joined dataset to a file

In [18]:
# Persist the joined table without writing the positional index as a column.
joined_df.to_csv(path_or_buf="./dataset.csv", index=False)