In [2]:
import pandas as pd

In [3]:
# Load the raw match records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./initial_ds/matches.csv')

# Data cleanup

## Correcting the dates
The dates are currently stored as strings (pandas dtype <code>object</code>) and need to be converted to <code>datetime64</code>.<br>
However, before doing that, values written with a two-digit year must be expanded to a four-digit year so that every date follows the same format.<br>
For instance:
<ul>
    <li>'%d/%m/99' must be converted to '%d/%m/1999'</li>
    <li>'%d/%m/00' must be converted to '%d/%m/2000'</li>
    <li>'%d/%m/01' must be converted to '%d/%m/2001'</li>
    <li>...</li>
</ul>

In [4]:
# Regex rules that expand a trailing two-digit year into a four-digit year so
# the whole column can be parsed with a single '%d/%m/%Y' format.
# The century follows the POSIX / ISO C convention for '%y':
#   '/69'..'/99' -> '/1969'..'/1999'   and   '/00'..'/68' -> '/2000'..'/2068'.
# Every value handled by the earlier hard-coded table ('/99', '/00'..'/23')
# maps to the same result; other two-digit years no longer fall through
# unexpanded and break the datetime parsing below.
# Four-digit years never match: each pattern needs exactly two digits between
# the last '/' and the end of the string.
replacements = {
    r"/(69|[7-9]\d)$": r"/19\1",
    r"/([0-5]\d|6[0-8])$": r"/20\1",
}
# Fixing the inconsistencies in the date column (two passes over the column,
# one per century rule; \1 re-inserts the captured two-digit year)
df['Date'] = df['Date'].replace(replacements, regex=True)
# Converting the date column from object to datetime64
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

## Making team names consistent

In [5]:
# Display the DataFrame's column labels.
df.columns

Index(['Div', 'Date', 'HomeTeam', 'AwayTeam', 'HTHG', 'HTAG', 'HTR', 'HS',
       'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR',
       'Time'],
      dtype='object')

In [11]:
# Count how many rows each HomeTeam value has within the 'primera' division;
# the row mask and the column are selected in a single .loc lookup.
df.loc[df['Div'] == 'primera', 'HomeTeam'].value_counts()

Ath Bilbao     449
Valencia       449
Barcelona      448
Real Madrid    448
Espanol        430
Sevilla        430
Ath Madrid     410
Villarreal     410
Sociedad       391
Betis          372
Osasuna        353
Celta          335
Getafe         335
La Coruna      323
Malaga         323
Mallorca       315
Levante        266
Valladolid     258
Santander      228
Zaragoza       228
Vallecano      220
Alaves         209
Granada        171
Eibar          133
Almeria        126
Sp Gijon       114
Las Palmas      95
Elche           88
Recreativo      76
Leganes         76
Numancia        76
Cadiz           69
Girona          50
Murcia          38
Oviedo          38
Albacete        38
Tenerife        38
Huesca          38
Hercules        19
Xerez           19
Gimnastic       19
Cordoba         19
Name: HomeTeam, dtype: int64