All data was downloaded from http://www.elofootball.com/.

In [1]:
# Starting years of the seasons to process: 2017 through 2022 inclusive.
league_years = [*range(2017, 2023)]

# Scraping the data

In [2]:
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Open a new Google Chrome window. ChromeDriverManager().install() provides
# the chromedriver path that the Service object is built from.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)

def extract_games(year):
    """Scrape one season of matches from the Spanish country page.

    Loads the elofootball.com country page (countryiso=ESP) for the season
    ``year``-``year+1`` in the module-level Chrome ``driver`` and parses the
    matches table into a dataframe. The page lists several competitions, not
    only the top division, so callers are expected to filter on ``division``.

    Parameters
    ----------
    year : int
        Starting year of the season, e.g. 2017 for the 2017-2018 season.

    Returns
    -------
    pandas.DataFrame
        One row per table row, with every value kept as the raw inner text of
        its cell. The ``date`` column is forward-filled, because the page only
        prints the date on the first match of each day.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the ``tabcontainer`` element does not appear within 20 seconds.
    """
    # The URL is shaped in the form of
    # http://www.elofootball.com/country.php?countryiso=ESP&season=1999-2000
    url = f"http://www.elofootball.com/country.php?countryiso=ESP&season={year}-{year+1}"

    # Retrieving the URL in the Chrome window
    driver.get(url)

    # When the page loads, the table is empty and is later filled by AJAX
    # calls. So, we wait until an element with the id "tabcontainer" is
    # present before extracting the table; its presence is taken as the signal
    # that the table has been filled. Only the waiting matters here, so the
    # returned element is not kept.
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.ID, 'tabcontainer'))
    )
    soup = BeautifulSoup(driver.page_source, 'lxml')

    # Retrieving the specific table amongst all: the matches live in the
    # fifth <tbody> of the page.
    tbody = soup.find_all('tbody')[4]

    # Within the table, only the first match on a given date has the date
    # column filled; the following matches on the same date leave it empty.
    # So, when a new date is seen it is stored, and as long as the date column
    # is empty the stored date is reused.
    last_date = ''
    all_matches = []
    # Going through all the tr elements (table rows)
    for tr in tbody.find_all('tr'):
        # Inner text of each td element (table data) in the row
        match = [td.text for td in tr.find_all('td')]
        # Rows without any td cell (e.g. header-only rows) carry no match
        # data and would otherwise fail on match[0] below.
        if not match:
            continue
        if match[0] == '':
            match[0] = last_date
        else:
            last_date = match[0]
        all_matches.append(match)

    return pd.DataFrame(all_matches, columns=[
        'date',
        'division',
        'unused_1',
        'home',
        'home_pre_rating',
        'home_rating_delta',
        'home_post_rating',
        'result',
        'prob_h',
        'prob_d',
        'prob_a',
        'away',
        'away_pre_rating',
        'away_rating_delta',
        'away_post_rating',
        'unused_2'])

In [3]:
# Retrieving the matches of every season and merging them into a single
# dataframe with a fresh 0..n-1 index.
# The browser is closed afterwards, even if a season fails to load, so that no
# Chrome/chromedriver process is left running. Re-run the cell that creates
# `driver` before scraping again.
try:
    df = pd.concat(
        [extract_games(year) for year in league_years],
        ignore_index=True)
finally:
    driver.quit()

# Data clean up

## Removing data for irrelevant leagues

In [4]:
# List the distinct competition names found in the scraped data
df.loc[:, 'division'].unique()

array(['Primera Division rel. ', 'Segunda Division ', 'Champions League ',
       'Primera Division ', 'Europa League ', 'Copa del Rey ',
       'Europa League q. ', 'Champions League q. ', 'Supercopa ',
       'Supercup ', 'Europa Conf. League ', 'Europa Conf. League q. '],
      dtype=object)

In [5]:
# Keep only the top-division league matches. The label carries a trailing
# space, exactly as scraped. `.copy()` makes the filtered result an
# independent frame, so later column assignments do not trigger
# SettingWithCopyWarning.
df = df[df['division'] == 'Primera Division '].copy()

## Processing the match dates

The `date` column holds text and must be converted to the pandas `datetime64` dtype.

In [8]:
# Parse the scraped date strings. `assign` builds a new frame instead of
# writing a column into the filtered frame in place, which avoids
# SettingWithCopyWarning.
df = df.assign(date=pd.to_datetime(df['date'], format='%Y/%m/%d'))

## Dropping redundant data

The division column only contains the value 'Primera Division ', so it can be dropped.

In [10]:
# Every remaining row has the same division, so the column adds nothing
df = df.drop(columns='division')

The two unused columns must be dropped.

In [11]:
# Discard the two placeholder columns
df = df.drop(columns=['unused_1', 'unused_2'])

The probability columns are based merely on the difference between the two teams' ratings; hence they are not very accurate and may be dropped.

In [12]:
# Discard the home/draw/away probability columns
df = df.drop(columns=['prob_h', 'prob_d', 'prob_a'])

The match result is available in other datasets, so it can be dropped here.

In [13]:
# Discard the match result column
df = df.drop(columns='result')

# Saving the cleaned up dataset

In [14]:
# Write the cleaned ratings without the index column. The target directory is
# created first, because to_csv fails if it does not exist.
output_path = Path("./dataset/ratings.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)