In [28]:
from pathlib import Path

import pandas as pd

In [29]:
# Read the raw match records from the initial dataset folder into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./initial_ds/matches.csv')

# Data cleanup

## Correcting the dates
The dates are currently stored as strings (dtype <code>object</code>) and need to be converted to <code>datetime64</code>.<br>
Before doing that, the two-digit years must be expanded to four digits so that every value matches a single date format (<code>%d/%m/%Y</code>).<br>
For instance:
<ul>
    <li>'%d/%m/99' must be converted to '%d/%m/1999'</li>
    <li>'%d/%m/00' must be converted to '%d/%m/2000'</li>
    <li>'%d/%m/01' must be converted to '%d/%m/2001'</li>
    <li>...</li>
</ul>

In [30]:
# Build the regex -> replacement table for two-digit year suffixes.
# Keys are anchored to the end of the string ("/99$", "/00$", ..., "/23$");
# values are the matching four-digit suffixes ("/1999", "/2000", ..., "/2023").
replacements = {
    f"/{year % 100:02d}$": f"/{year}"
    for year in range(1999, 2024)
}
# Expand the year suffixes in the date strings.
df['Date'] = df['Date'].replace(to_replace=replacements, regex=True)
# Parse the now-uniform day/month/year strings into datetime64 values.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

## Dropping redundant columns

In [31]:
# Show the column labels of the frame.
df.columns

Index(['Div', 'Date', 'HomeTeam', 'AwayTeam', 'HTHG', 'HTAG', 'HTR', 'HS',
       'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR',
       'Time'],
      dtype='object')

Since all the matches belong to the same league, the 'Div' (division) column is unnecessary.

In [32]:
# Remove the 'Div' column. `errors='ignore'` keeps this cell re-runnable:
# on a second run the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns='Div', errors='ignore')

# Saving the cleaned-up dataset to a file

In [33]:
# Write the cleaned frame to disk without the index column.
# The target folder is created first so the export also succeeds when
# './processed_ds' does not exist yet (to_csv does not create directories).
output_path = Path("./processed_ds/matches.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)