In [19]:
import pandas as pd
import numpy as np

In [20]:
def _read_dated_csv(csv_path):
    """Read one CSV file, parsing its 'date' column as datetimes."""
    return pd.read_csv(csv_path, parse_dates=['date'])


# The four source tables are all loaded the same way.
df_stats = _read_dated_csv("./match_stats/processed_ds/match_stats.csv")
df_matches = _read_dated_csv("./matches/processed_ds/matches.csv")
df_ratings = _read_dated_csv("./ratings/dataset/ratings.csv")
df_xg = _read_dated_csv("./xg/processed_ds/xg.csv")

# Making the team names consistent across the datasets

Defining a function to match similar team names together

In [21]:
# The following function receives two lists of words and maps each word of the
# second list to a similar word of the first (reference) list; it is used to
# clean up string features by pinpointing the 'almost' identical words.
from difflib import SequenceMatcher

def similar(ref_words, words):
    """Map each entry of ``words`` to its closest counterpart in ``ref_words``.

    Matching is greedy and tiered. The similarity threshold is lowered from
    1.0 to 0.1 in steps of 0.1; at every tier each still-unmatched word claims
    the most similar still-unclaimed reference word, provided that similarity
    reaches the tier's threshold. Exact matches are therefore settled first
    and progressively looser matches afterwards. Every reference word is
    handed out at most once.

    Similarity is ``difflib.SequenceMatcher(None, word, ref_word).ratio()``.

    Parameters
    ----------
    ref_words : list
        Reference spellings. Missing values (``pd.isna``) are ignored.
    words : list
        Spellings to be mapped onto the reference. Missing values and
        repeated entries are ignored.

    Returns
    -------
    dict
        ``{word: ref_word}`` for every matched word. If words are left over,
        they are listed under the key ``'UNMATCHED_TEAMS'``; left-over
        reference words are listed under ``'UNMATCHED_REF_TEAMS'``.

    Notes
    -----
    Neither input list is modified.
    """
    # Work on private copies: removing items from a list while iterating over
    # it skips elements, and the caller's lists must stay untouched.
    free_refs = [ref_word for ref_word in ref_words if not pd.isna(ref_word)]
    pending = [word for word in dict.fromkeys(words) if not pd.isna(word)]

    # Each pair's ratio is needed once per tier, so compute it only once.
    ratios = {}

    def ratio(word, ref_word):
        pair = (word, ref_word)
        if pair not in ratios:
            ratios[pair] = SequenceMatcher(None, word, ref_word).ratio()
        return ratios[pair]

    result = dict()

    # Integer tiers avoid the accumulated float error of stepping by -0.1
    # (which yields thresholds such as 0.7000000000000001).
    for tier in range(10, 0, -1):
        threshold = tier / 10

        # Iterate over a snapshot because matched words leave `pending`.
        for word in list(pending):
            if not free_refs:
                break

            # max() keeps the first of equally similar candidates.
            best_ref = max(free_refs, key=lambda ref_word: ratio(word, ref_word))
            if ratio(word, best_ref) >= threshold:
                result[word] = best_ref
                free_refs.remove(best_ref)
                pending.remove(word)

    if pending:
        result['UNMATCHED_TEAMS'] = pending
    if free_refs:
        result['UNMATCHED_REF_TEAMS'] = free_refs

    return result

Taking the list of team names in the 'stats' dataset as the reference

In [22]:
# Count how often each team appears as either the home or the away side in
# the stats dataset.
refrence_team_names = (
    pd.melt(frame=df_stats, id_vars='date', value_vars=['home', 'away'])['value']
    .value_counts()
    # Name both columns explicitly instead of renaming reset_index() defaults:
    # those defaults are ('index', 'value') on pandas 1.x but
    # ('value', 'count') on pandas 2.x, which would mislabel the columns.
    .rename_axis('team_name')
    .reset_index(name='match_count')
)

refrence_team_names

Unnamed: 0,team_name,match_count
0,Real Sociedad,214
1,Atletico Madrid,214
2,Getafe,214
3,Villarreal,213
4,Athletic Bilbao,213
5,Valencia,213
6,Barcelona,213
7,Sevilla,212
8,Real Madrid,212
9,Celta Vigo,211


## Matches dataset

In [23]:
# Count how often each team appears as either the home or the away side in
# the matches dataset.
df_matches_team_names = (
    pd.melt(frame=df_matches, id_vars='date', value_vars=['home', 'away'])['value']
    .value_counts()
    # Name both columns explicitly instead of renaming reset_index() defaults:
    # those defaults are ('index', 'value') on pandas 1.x but
    # ('value', 'count') on pandas 2.x, which would mislabel the columns.
    .rename_axis('team_name')
    .reset_index(name='match_count')
)

df_matches_team_names

Unnamed: 0,team_name,match_count
0,Real Madrid,213
1,Valencia,213
2,Celta,213
3,Getafe,213
4,Sevilla,213
5,Ath Bilbao,213
6,Barcelona,213
7,Villarreal,213
8,Betis,213
9,Sociedad,213


In [24]:
# Pair every team label of the matches dataset with a reference label.
stats_labels = refrence_team_names['team_name'].tolist()
matches_labels = df_matches_team_names['team_name'].tolist()
df_matches_team_similarities = similar(stats_labels, matches_labels)

# Print the resulting mapping, one pair per paragraph, for a visual check.
for key in df_matches_team_similarities:
    value = df_matches_team_similarities[key]
    print(f"{key} => {value}\n")

Real Madrid => Real Madrid

Getafe => Getafe

Barcelona => Barcelona

Levante => Levante

Eibar => Eibar

Granada => Granada

Girona => Girona

Elche => Elche

Las Palmas => Las Palmas

Almeria => Almeria

Valencia => Valencia

Sevilla => Sevilla

Villarreal => Villarreal

Alaves => Alaves

Osasuna => Osasuna

Leganes => Leganes

Cadiz => Cadiz

Malaga => Malaga

Ath Bilbao => Athletic Bilbao

Espanol => Espanyol

Huesca => SD Huesca

Sociedad => Real Sociedad

Valladolid => Real Valladolid

Mallorca => Real Mallorca

Celta => Celta Vigo

Ath Madrid => Atletico Madrid

La Coruna => Deportivo La Coruna

Betis => Real Betis

Vallecano => Rayo Vallecano



In [25]:
# similar() may add 'UNMATCHED_TEAMS' / 'UNMATCHED_REF_TEAMS' report entries
# whose values are lists; drop them so only real name -> name pairs are handed
# to replace() as substitutions.
matches_name_map = {
    old_name: new_name
    for old_name, new_name in df_matches_team_similarities.items()
    if old_name not in ('UNMATCHED_TEAMS', 'UNMATCHED_REF_TEAMS')
}

for side in ('home', 'away'):
    df_matches[side] = df_matches[side].replace(matches_name_map)

## Ratings dataset

In [26]:
# Count how often each team appears as either the home or the away side in
# the ratings dataset.
df_ratings_team_names = (
    pd.melt(frame=df_ratings, id_vars='date', value_vars=['home', 'away'])['value']
    .value_counts()
    # Name both columns explicitly instead of renaming reset_index() defaults:
    # those defaults are ('index', 'value') on pandas 1.x but
    # ('value', 'count') on pandas 2.x, which would mislabel the columns.
    .rename_axis('team_name')
    .reset_index(name='match_count')
)

df_ratings_team_names

Unnamed: 0,team_name,match_count
0,FC Barcelona,215
1,Real Madrid,215
2,Real Sociedad,215
3,Atlético Madrid,215
4,Getafe CF,215
5,Real Betis,215
6,Athletic Bilbao,215
7,Sevilla FC,215
8,Valencia CF,215
9,Villarreal CF,215


In [27]:
# Pair every team label of the ratings dataset with a reference label.
stats_labels = refrence_team_names['team_name'].tolist()
ratings_labels = df_ratings_team_names['team_name'].tolist()
df_ratings_team_similarities = similar(stats_labels, ratings_labels)

# Print the resulting mapping, one pair per paragraph, for a visual check.
for key in df_ratings_team_similarities:
    value = df_ratings_team_similarities[key]
    print(f"{key} => {value}\n")

Real Madrid => Real Madrid

Real Betis => Real Betis

Celta Vigo => Celta Vigo

Real Valladolid => Real Valladolid

Rayo Vallecano => Rayo Vallecano

SD Huesca => SD Huesca

Real Sociedad => Real Sociedad

Athletic Bilbao => Athletic Bilbao

Deportivo La Coruña => Deportivo La Coruna

FC Barcelona => Barcelona

Getafe CF => Getafe

Valencia CF => Valencia

Levante UD => Levante

CA Osasuna => Osasuna

Granada CF => Granada

RCD Mallorca => Real Mallorca

UD Las Palmas => Las Palmas

Atlético Madrid => Atletico Madrid

Villarreal CF => Villarreal

SD Eibar => Eibar

Girona FC => Girona

UD Almería => Almeria

Sevilla FC => Sevilla

Espanyol Barcelona => Espanyol

Elche CF => Elche

Málaga CF => Malaga

CD Alavés => Alaves

Cádiz CF => Cadiz

CD Leganés => Leganes



In [28]:
# similar() may add 'UNMATCHED_TEAMS' / 'UNMATCHED_REF_TEAMS' report entries
# whose values are lists; drop them so only real name -> name pairs are handed
# to replace() as substitutions.
ratings_name_map = {
    old_name: new_name
    for old_name, new_name in df_ratings_team_similarities.items()
    if old_name not in ('UNMATCHED_TEAMS', 'UNMATCHED_REF_TEAMS')
}

for side in ('home', 'away'):
    df_ratings[side] = df_ratings[side].replace(ratings_name_map)

## XG dataset

In [29]:
# Count how often each team appears as either the home or the away side in
# the xG dataset.
df_xg_team_names = (
    pd.melt(frame=df_xg, id_vars='date', value_vars=['home', 'away'])['value']
    .value_counts()
    # Name both columns explicitly instead of renaming reset_index() defaults:
    # those defaults are ('index', 'value') on pandas 1.x but
    # ('value', 'count') on pandas 2.x, which would mislabel the columns.
    .rename_axis('team_name')
    .reset_index(name='match_count')
)

df_xg_team_names

Unnamed: 0,team_name,match_count
0,Real Sociedad,228
1,Celta Vigo,228
2,Atlético Madrid,228
3,Sevilla,228
4,Athletic Club,228
5,Barcelona,228
6,Villarreal,228
7,Real Madrid,228
8,Getafe,228
9,Valencia,228


In [30]:
# Pair every team label of the xG dataset with a reference label.
stats_labels = refrence_team_names['team_name'].tolist()
xg_labels = df_xg_team_names['team_name'].tolist()
df_xg_team_similarities = similar(stats_labels, xg_labels)

# Print the resulting mapping, one pair per paragraph, for a visual check.
for key in df_xg_team_similarities:
    value = df_xg_team_similarities[key]
    print(f"{key} => {value}\n")

Real Sociedad => Real Sociedad

Sevilla => Sevilla

Barcelona => Barcelona

Real Madrid => Real Madrid

Valencia => Valencia

Espanyol => Espanyol

Eibar => Eibar

Granada => Granada

Elche => Elche

Rayo Vallecano => Rayo Vallecano

Las Palmas => Las Palmas

Celta Vigo => Celta Vigo

Villarreal => Villarreal

Levante => Levante

Osasuna => Osasuna

Girona => Girona

Atlético Madrid => Atletico Madrid

Getafe => Getafe

Alavés => Alaves

Cádiz => Cadiz

Leganés => Leganes

Málaga => Malaga

Almería => Almeria

Athletic Club => Athletic Bilbao

Valladolid => Real Valladolid

Huesca => SD Huesca

Betis => Real Betis

Mallorca => Real Mallorca

La Coruña => Deportivo La Coruna



In [31]:
# similar() may add 'UNMATCHED_TEAMS' / 'UNMATCHED_REF_TEAMS' report entries
# whose values are lists; drop them so only real name -> name pairs are handed
# to replace() as substitutions.
xg_name_map = {
    old_name: new_name
    for old_name, new_name in df_xg_team_similarities.items()
    if old_name not in ('UNMATCHED_TEAMS', 'UNMATCHED_REF_TEAMS')
}

for side in ('home', 'away'):
    df_xg[side] = df_xg[side].replace(xg_name_map)

# Making venue names consistent

In [32]:
# List the distinct venue labels in sorted order to spot near-duplicates.
venues_in_order = df_stats['venue'].sort_values()
venues_in_order.unique()

array(['Alfredo Di Stefano.', 'Anoeta', 'Balaidos', 'Balaidos.',
       'Benito Villamarin', 'Benito Villamarin.', 'Butarque', 'Butarque.',
       'Camp Nou', 'Camp Nou.', 'Campo de Futbol de Vallecas',
       'Ciutat de Valencia', 'Ciutat de Valencia.',
       'Coliseum Alfonso Perez', 'Coliseum Alfonso Perez.',
       'Cornella - El Prat', 'Cornella - El Prat.', 'El Alcoraz',
       'El Alcoraz.', 'El Sadar', 'El Sadar.', 'Estadio Camilo Cano.',
       'Estadio Los Carmenes', 'Estadio Los Carmenes.',
       'Estadio Municipal de Ipura', 'Estadio Municipal de Ipura.',
       'Estadio de la Ceramica', 'Estadio de la Ceramica.',
       'Gran Canaria', 'Iberostar Estadi', 'Iberostar Estadi.',
       'Jose Zorrilla', 'Jose Zorrilla.', 'Juegos del Mediterraneo',
       'La Rosaleda', 'Martinez Valero', 'Martinez Valero.',
       'Mendizorroza', 'Mendizorroza.', 'Mestalla', 'Mestalla.',
       'Municipal de Montilivi', 'Ramon Sanchez Pizjuan',
       'Ramon Sanchez Pizjuan.', 'Ramon de Carr

In [33]:
# Removing dot (.) from the end of the venue names.
# The vectorised string method leaves missing values and empty strings as they
# are, whereas indexing x[-1] on them would raise. The pattern is anchored
# with \Z so that only a dot that is the very last character is removed.
df_stats['venue'] = df_stats['venue'].str.replace(r'\.\Z', '', regex=True)

In [34]:
# List the distinct venue labels again, in sorted order, to check the cleanup.
cleaned_venues_in_order = df_stats['venue'].sort_values()
cleaned_venues_in_order.unique()

array(['Alfredo Di Stefano', 'Anoeta', 'Balaidos', 'Benito Villamarin',
       'Butarque', 'Camp Nou', 'Campo de Futbol de Vallecas',
       'Ciutat de Valencia', 'Coliseum Alfonso Perez',
       'Cornella - El Prat', 'El Alcoraz', 'El Sadar',
       'Estadio Camilo Cano', 'Estadio Los Carmenes',
       'Estadio Municipal de Ipura', 'Estadio de la Ceramica',
       'Gran Canaria', 'Iberostar Estadi', 'Jose Zorrilla',
       'Juegos del Mediterraneo', 'La Rosaleda', 'Martinez Valero',
       'Mendizorroza', 'Mestalla', 'Municipal de Montilivi',
       'Ramon Sanchez Pizjuan', 'Ramon de Carranza',
       'Reale Seguros Stadium', 'Riazor', 'San Mames',
       'Santiago Bernabeu', 'Spotify Camp Nou', 'Wanda Metropolitano'],
      dtype=object)

# Joining the datasets
The join will be done on the `date`, `home`, and `away` columns.

In [35]:
from functools import reduce

def _merge_on_match_keys(left, right):
    """Inner-join two frames on the date, home and away columns."""
    return pd.merge(left, right, on=['date', 'home', 'away'], how='inner')


# Fold the four datasets into a single frame, merging left to right.
joined_df = reduce(
    _merge_on_match_keys,
    [df_matches, df_ratings, df_stats, df_xg])

# Report every input shape next to the shape and columns of the joined frame.
print(
    f"matches dataset's shape: {df_matches.shape}\n"
    f"ratings dataset's shape: {df_ratings.shape}\n"
    f"stats dataset's shape: {df_stats.shape}\n"
    f"xg dataset's shape: {df_xg.shape}\n"
    f"final resulting dataset's shape: {joined_df.shape}\n\n"
    f"resulting columns:\n{'-'*30}\n{joined_df.columns}"
)

matches dataset's shape: (2130, 31)
ratings dataset's shape: (2149, 9)
stats dataset's shape: (2127, 37)
xg dataset's shape: (2280, 5)
final resulting dataset's shape: (2122, 73)

resulting columns:
------------------------------
Index(['season', 'date', 'home', 'away', 'home_fulltime_goals',
       'away_fulltime_goals', 'fulltime_result', 'home_halftime_goals',
       'away_halftime_goals', 'halftime_result', 'home_win_bet365_odds',
       'draw_bet365_odds', 'away_win_bet365_odds', 'home_win_betandwin_odds',
       'draw_betandwin_odds', 'away_win_betandwin_odds',
       'home_win_interwetten_odds', 'draw_interwetten_odds',
       'away_win_interwetten_odds', 'home_win_pinnaclesports_odds',
       'draw_pinnaclesports_odds', 'away_win_pinnaclesports_odds',
       'home_win_williamhill_odds', 'draw_williamhill_odds',
       'away_win_williamhill_odds', 'home_win_vcbet_odds', 'draw_vcbet_odds',
       'away_win_vcbet_odds', 'home_win_pinnaclesports_closing_odds',
       'draw_pinnacle

# Saving the joined dataset to a file

In [36]:
# Write the joined frame to disk; index=False keeps the row index out of the CSV.
joined_csv_path = "./matches_initial.csv"
joined_df.to_csv(joined_csv_path, index=False)