All data was downloaded from https://www.skysports.com/

In [1]:
# Season slugs "2011-12" through "2022-23": four-digit start year, a dash,
# then the two-digit end year
league_seasons = [
    f"20{start_year}-{start_year + 1}"
    for start_year in range(11, 23)
]

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Open a new Google Chrome window; the chromedriver executable handed to the
# Service is whatever ChromeDriverManager().install() returns
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install())
)

def extract_games(league_season):
    """Scrape the results of one La Liga season from Sky Sports.

    Uses the module-level Selenium ``driver`` to load the season's results
    page and parses both the matches shown initially and the ones kept in
    the "show more" script.

    Args:
        league_season: Season slug as used in the results URL,
            e.g. ``'2011-12'``.

    Returns:
        pandas.DataFrame with one row per match and the columns ``date``
        (text of the day heading, e.g. "Sunday 13th May"), ``home``,
        ``away`` and ``link`` (href of the match's own page).

    Raises:
        selenium.common.exceptions.TimeoutException: if the results widget
            has not appeared on the page within 20 seconds.
    """
    # The URL has the form https://www.skysports.com/la-liga-results/2011-12
    url = f"https://www.skysports.com/la-liga-results/{league_season}"

    # Retrieving the URL in the Chrome window
    driver.get(url)

    # When the page loads, the table is empty and is later filled by AJAX
    # calls. So we wait until the element with the id "widgetLite-10" is
    # present before extracting the table; its presence is taken as the sign
    # that the table has been filled.
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.ID, 'widgetLite-10'))
    )

    # Matches that are displayed on the page straight away
    visible_elements = BeautifulSoup(driver.page_source, 'lxml')
    all_match_divs = list(
        visible_elements.find_all('div', attrs={"class": 'fixres__item'})
    )

    # The remaining matches are stored as markup inside the script that backs
    # the "show more" button. find_elements returns an empty list instead of
    # raising when a page has no such script, in which case only the visible
    # matches are used.
    load_more_scripts = driver.find_elements(
        By.CSS_SELECTOR, 'script[data-role="load-more-content"]'
    )
    if load_more_scripts:
        hidden_elements = BeautifulSoup(
            load_more_scripts[0].get_attribute('innerHTML'), 'lxml'
        )
        all_match_divs.extend(
            hidden_elements.find_all('div', attrs={"class": 'fixres__item'})
        )

    # Going through all the matches and extracting each one's info
    last_date = ''
    all_matches = []
    for match_div in all_match_divs:
        # The link to the match's specific webpage
        stats_link = match_div.find('a').get('href')

        # The names of the two teams (home first, then away)
        team_names = [
            team.text
            for team in match_div.find_all('span', attrs={"class": 'swap-text__target'})
        ]

        # A match that directly follows an 'h4' heading takes its date from
        # that heading; otherwise it shares the date of the previous match.
        # find_previous_sibling(True) returns the nearest preceding *tag*
        # (skipping whitespace text nodes) or None when there is none.
        previous_tag = match_div.find_previous_sibling(True)
        if previous_tag is not None and previous_tag.name == 'h4':
            last_date = previous_tag.text

        # Appending the data of the current match to the list of all matches
        all_matches.append([
            last_date,
            team_names[0],
            team_names[1],
            stats_link,
        ])

    return pd.DataFrame(all_matches, columns=['date', 'home', 'away', 'link'])

# Scrape the 2011-12 season
rows = extract_games(league_season='2011-12')

In [3]:
# Show the scraped 2011-12 matches as this cell's output
rows

Unnamed: 0,date,home,away,link
0,Sunday 13th May,Espanyol,Sevilla,https://www.skysports.com/football/espanyol-vs...
1,Sunday 13th May,Getafe,Real Zaragoza,https://www.skysports.com/football/getafe-vs-r...
2,Sunday 13th May,Levante,Athletic Bilbao,https://www.skysports.com/football/levante-vs-...
3,Sunday 13th May,Malaga,Sporting Gijon,https://www.skysports.com/football/malaga-vs-s...
4,Sunday 13th May,Racing Santander,Osasuna,https://www.skysports.com/football/racing-sant...
...,...,...,...,...
375,Sunday 28th August,Real Zaragoza,Real Madrid,https://www.skysports.com/football/real-zarago...
376,Sunday 28th August,Sevilla,Malaga,https://www.skysports.com/football/sevilla-vs-...
377,Saturday 27th August,Sporting Gijon,Real Sociedad,https://www.skysports.com/football/sporting-gi...
378,Saturday 27th August,Valencia,Racing Santander,https://www.skysports.com/football/valencia-vs...
