In [42]:
import pandas as pd

In [43]:
# Load the raw ratings data into `df`.
df = pd.read_csv(filepath_or_buffer='./initial_ds/ratings.csv')

# Data clean up

## Removing data for irrelevant leagues

In [44]:
# List every distinct competition label present in the raw data.
df.loc[:, 'division'].unique()

array(['Segunda Division ', 'Copa del Rey ', 'Champions League ',
       'Primera Division ', 'Europa League ', 'Champions League q. ',
       'Supercopa ', 'Intertoto Cup ', 'Supercup ', 'Europa League q. ',
       'Primera Division rel. ', 'Europa Conf. League ',
       'Europa Conf. League q. '], dtype=object)

In [45]:
# Keep only the rows whose label is exactly 'Primera Division '
# (the trailing space is part of the value as stored in the file).
df = df.loc[df['division'] == 'Primera Division ']

## Processing the match dates

The `date` column must be converted to the pandas `datetime64` dtype.

In [46]:
# Parse the `date` column into pandas datetimes.
df['date'] = df['date'].pipe(pd.to_datetime)

## Dropping redundant data

The division column only contains the value 'Primera Division ', so it can be dropped.

In [47]:
# Remove the `division` column.
df = df.drop(columns='division')

The two unused columns must be dropped.

In [48]:
# Remove the two placeholder columns.
df = df.drop(columns=['unused_1', 'unused_2'])

The probability columns are based merely on the difference between the two teams' ratings; hence they are not very accurate and may be dropped.

In [49]:
# Remove the home-win / draw / away-win probability columns.
df = df.drop(columns=['prob_h', 'prob_d', 'prob_a'])

Some matches are set to be held later and have not happened yet; those are to be removed from the dataset.

In [50]:
# Keep only matches that have already been played: the date must not be later
# than the run-time value of pd.to_datetime('today'), and a result must be
# recorded.
#
# `pd.read_csv` parses empty fields as NaN by default, and `NaN != ''` is True,
# so the empty-string comparison alone lets rows with a missing result through.
# `notna()` excludes those rows; the `!= ''` check is kept for literal empty
# strings.
df = df[
    (df['date'] <= pd.to_datetime('today')) &
    df['result'].notna() &
    (df['result'] != '')]

The result of the match is already available in other datasets and is redundant, so it must be dropped as well.

In [51]:
# Remove the `result` column.
df = df.drop(columns='result')

## Converting the ratings to integers

The rating columns must all be converted to the `Int64` dtype.

In [52]:
# Pre-match rating, rating change and post-match rating for each side.
rating_cols = [
    'home_pre_rating',
    'home_rating_delta',
    'home_post_rating',
    'away_pre_rating',
    'away_rating_delta',
    'away_post_rating',
]

# Cast all rating columns in one call using a column -> dtype mapping instead
# of assigning column by column. 'Int64' (capital I) is pandas' nullable
# integer dtype, so missing ratings are kept as <NA>.
df = df.astype(dict.fromkeys(rating_cols, 'Int64'))

In [53]:
# Saving the cleaned-up dataset

In [54]:
# Write the cleaned frame to disk, omitting the row index.
df.to_csv(path_or_buf="./processed_ds/ratings.csv", index=False)

## Extracting the team names
These team names will be the reference for making the team names consistent across all datasets.

In [55]:
# Stack the `home` and `away` columns into one `value` column, then count how
# often each team name occurs across both.
df.melt(id_vars=['date'], value_vars=['home', 'away'])['value'].value_counts()

Valencia CF            897
FC Barcelona           897
Real Madrid            897
Athletic Bilbao        897
Sevilla FC             859
Espanyol Barcelona     859
Villarreal CF          821
Atlético Madrid        821
Real Sociedad          783
Real Betis             745
CA Osasuna             707
Getafe CF              669
Celta Vigo             669
Málaga CF              646
Deportivo La Coruña    646
RCD Mallorca           631
Levante UD             532
Real Valladolid        517
Racing Santander       456
Real Zaragoza          456
Rayo Vallecano         441
CD Alavés              418
Granada CF             342
SD Eibar               266
UD Almería             251
Sporting Gijón         228
UD Las Palmas          190
Elche CF               175
Recreativo Huelva      152
CD Numancia            152
CD Leganés             152
Cádiz CF               137
Girona FC               99
Real Murcia             76
Albacete                76
CD Tenerife             76
Real Oviedo             76
S