All data was extracted from http://www.elofootball.com/

In [1]:
# Starting year of every season to scrape: 1999 (season 1999-2000) up to and
# including 2022 (season 2022-2023).
league_years = [*range(1999, 2023)]

In [2]:
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Open a new Google Chrome window. ChromeDriverManager().install() supplies the
# ChromeDriver location that the Selenium Service is started with.
_chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(_chromedriver_path))

def extract_games(year):
    """Scrape one season of Spanish league matches from elofootball.com.

    The season page is opened in the module-level Selenium ``driver``; once the
    page content is available, the matches table is parsed into a dataframe.

    Parameters
    ----------
    year : int
        Starting year of the season, e.g. ``1999`` for the 1999-2000 season.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``date``, ``division``,
        ``unused_1``, ``home``, ``home_pre_rating``, ``home_rating_delta``,
        ``home_post_rating``, ``result``, ``prob_h``, ``prob_d``, ``prob_a``,
        ``away``, ``away_pre_rating``, ``away_rating_delta``,
        ``away_post_rating`` and ``unused_2``. Every value is the raw cell
        text (a string) as found on the page.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the "tabcontainer" element does not show up within 20 seconds.
    IndexError
        If the page holds fewer ``<tbody>`` elements than expected, so the
        matches table cannot be located.
    """
    # The URL has the form
    # http://www.elofootball.com/country.php?countryiso=ESP&season=1999-2000
    url = f"http://www.elofootball.com/country.php?countryiso=ESP&season={year}-{year+1}"

    columns = [
        'date',
        'division',
        'unused_1',
        'home',
        'home_pre_rating',
        'home_rating_delta',
        'home_post_rating',
        'result',
        'prob_h',
        'prob_d',
        'prob_a',
        'away',
        'away_pre_rating',
        'away_rating_delta',
        'away_post_rating',
        'unused_2',
    ]
    # Position of the matches table among all <tbody> elements of the page.
    matches_tbody_index = 4

    # Retrieving the URL in the Chrome window
    driver.get(url)

    # When the page loads, the table is empty and is later filled by AJAX calls.
    # So we wait until an element with the id "tabcontainer" is present on the
    # page before extracting the table; its presence is taken as the signal
    # that the table has been loaded and filled.
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.ID, "tabcontainer"))
    )
    soup = BeautifulSoup(driver.page_source, 'lxml')

    # Retrieving the specific table amongst all
    table_bodies = soup.find_all('tbody')
    if len(table_bodies) <= matches_tbody_index:
        # Same exception type as a plain out-of-range lookup, but with enough
        # context to diagnose the failure.
        raise IndexError(
            f"Expected at least {matches_tbody_index + 1} <tbody> elements at "
            f"{url}, found {len(table_bodies)}; the page layout may have "
            "changed or the table may not have finished loading."
        )
    tbody = table_bodies[matches_tbody_index]

    # Within the table, only the first match on a given date has the date column
    # filled; the following matches on the same date leave it empty. So each new
    # date is remembered and reused for as long as the date column stays empty.
    last_date = ''
    all_matches = []
    # Going through all the tr elements (table rows)
    for tr in tbody.find_all('tr'):
        # Inner text of every td element (table data) of the row
        match = [td.text for td in tr.find_all('td')]
        if not match:
            # A row without any <td> cell carries no match data.
            continue
        if match[0].strip():
            last_date = match[0]
        else:
            # Empty (or whitespace-only) date cell: same date as the row above.
            match[0] = last_date
        all_matches.append(match)

    return pd.DataFrame(all_matches, columns=columns)

In [3]:
# Retrieving the matches of every season and merging them into a single
# dataframe. ignore_index=True gives the merged frame one continuous index;
# without it each season's rows keep their own 0-based index, and a bare
# df.reset_index(drop=True) only displays a re-indexed copy without changing df.
df = pd.concat(
    [extract_games(year) for year in league_years],
    ignore_index=True,
)

df

Unnamed: 0,date,division,unused_1,home,home_pre_rating,home_rating_delta,home_post_rating,result,prob_h,prob_d,prob_a,away,away_pre_rating,away_rating_delta,away_post_rating,unused_2
0,2000/06/04,Segunda Division,,Atlético Madrid B,1705,0,1705,0 - 0,34,29,37,CP Mérida,1817,0,1817,
1,2000/06/04,Segunda Division,,CD Logroñés,1683,+11,1694,1 - 0,31,29,40,Villarreal CF,1832,-11,1821,
2,2000/06/04,Segunda Division,,UD Las Palmas,1812,+5,1817,2 - 1,59,23,18,Levante UD,1700,-5,1695,
3,2000/06/04,Segunda Division,,UD Salamanca,1741,+7,1748,2 - 1,47,28,26,SD Eibar,1738,-7,1731,
4,2000/06/04,Segunda Division,,CA Osasuna,1803,+4,1807,2 - 1,61,22,16,Recreativo Huelva,1673,-4,1669,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23593,2022/08/13,Segunda Division,,UD Las Palmas,1778,-4,1774,0 - 0,52,26,22,Real Zaragoza,1727,+4,1731,
23594,2022/08/13,Segunda Division,,CD Mirandés,1709,-3,1706,1 - 1,47,28,25,Sporting Gijón,1703,+3,1706,
23595,2022/08/12,Primera Division,,CA Osasuna,1921,+12,1933,2 - 1,28,28,44,Sevilla FC,2103,-12,2091,
23596,2022/08/12,Segunda Division,,Levante UD,1913,-6,1907,0 - 0,65,21,15,SD Huesca,1755,+6,1761,


Scraping data from the web is resource-intensive and time-consuming, so the gathered data is saved to a file before any processing. That way, should anything go wrong during processing, there is no need to re-scrape the data.

In [4]:
# Make sure the target directory exists first, so the save cannot fail on a
# missing folder after the long scrape.
output_path = Path('./initial_ds/ratings.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)