All data was downloaded from https://www.skysports.com/.

In [35]:
import pandas as pd

In [36]:
# Load the table of match-stat page links; its 'stats_link' column is what the
# scraping cell further down iterates over.
links_df = pd.read_csv("./initial_ds/match_stat_links.csv")

In [37]:
# Columns describing the match itself.
MATCH_INFO_COLUMNS = ['stats_link', 'match_date', 'venue', 'attendance']

# Per-team statistics. Each name expands to a home_/away_ column pair below.
# The scraping cell assigns values to columns positionally, so this order
# must stay in sync with the order in which it emits the scraped values.
TEAM_STAT_NAMES = [
    'possession',
    'total_shots',
    'shots_on_target',
    'shots_off_target',
    'blocked',
    'passing_percent',
    'clear_cut_chances',
    'corners',
    'offsides',
    'tackles_percent',
    'aerial_duels',
    'saves',
    'fouls_committed',
    'fouls_won',
    'yellow_cards',
    'red_cards',
]

STATS_COLUMNS = MATCH_INFO_COLUMNS + [
    f'{side}_{stat}' for stat in TEAM_STAT_NAMES for side in ('home', 'away')
]

try:
    # Resume from the stats collected by previous runs, if any.
    stats_df = pd.read_csv("./initial_ds/match_stats.csv")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or completely empty file means "start from scratch".
    # Any other failure (malformed CSV, permission error, interrupt, ...) is
    # allowed to propagate so that existing data is never silently replaced
    # by an empty frame and later written back over the file.
    stats_df = pd.DataFrame(columns=STATS_COLUMNS)

print(f"Currently holding {stats_df.shape[0]} instances.")

Currently holding 2069 instances.


In [38]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import re

# Getting the list of links to scrape: every link in links_df that is not
# already present in stats_df
stats_links = links_df[~links_df['stats_link'].isin(stats_df['stats_link'])]['stats_link']

# Pre-bind the names used by the except/finally clauses so that a failure
# while starting Chrome is reported as itself rather than as a NameError.
driver = None
stats_link = None

try:
    # Initiating a new Google Chrome window
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    # Going through the list of links and scraping them one by one
    for stats_link in stats_links:
        print(f"Adding the link:\n{stats_link}\n")
        # Retrieving the URL in the Chrome window
        driver.get(stats_link)
        stats = BeautifulSoup(driver.page_source, 'lxml')

        # Extracting the date; fail with a readable message instead of an
        # opaque AttributeError when the page has no <time> element
        time_element = stats.find('time')
        if time_element is None:
            raise RuntimeError(f"No <time> element found on page: {stats_link}")
        match_date = time_element.text

        # Extracting the match's venue (None when the element is absent)
        venue_element = stats.find('span', attrs={"class": 'sdc-site-match-header__detail-venue'})
        venue = venue_element and venue_element.text

        # Extracting the venue's attendance: keep only the digits of the node
        # that follows the attendance label (None when the label is absent)
        attendance_element = stats.find('span', attrs={"class": 'sdc-site-match-header__detail-attendance-label'})
        attendance_label = attendance_element and attendance_element.next_sibling
        attendance = attendance_label and ''.join(re.findall(r'\d', attendance_label.get_text()))

        # Extracting all other stats
        other_stats = [s.text for s in stats.find_all('span', attrs={'class': 'sdc-site-match-stats__stats-label'})]

        # Combining all the single stats into a list, padding with None so the
        # row has one value per dataframe column when the page lists fewer
        # stats than expected
        values = [
            stats_link,
            match_date,
            venue,
            attendance,
        ]
        missing_count = len(stats_df.columns) - len(values) - len(other_stats)
        values = values + other_stats + [None] * missing_count

        # Saving the list of stats into the dataframe
        print(f"The extracted values:\n{values}\n(Total of {len(values)})\n")
        stats_df.loc[len(stats_df)] = values
except Exception:
    print(f"Error occurred while scraping the link:\n{stats_link}\n")
    # Bare raise re-raises the active exception with its original traceback
    raise
finally:
    try:
        # Saving the newly added stats into file, even after a failure, so
        # the rows scraped so far are not lost
        stats_df.to_csv("./initial_ds/match_stats.csv", index=False)
    finally:
        # Always close the browser so no Chrome/chromedriver process is leaked
        if driver is not None:
            driver.quit()

Adding the link:
https://www.skysports.com/football/real-betis-vs-girona/stats/469571

The extracted values:
['https://www.skysports.com/football/real-betis-vs-girona/stats/469571', '5:30pm, Sunday 18th September 2022.', 'Benito Villamarin', '52229', '44', '56', '9', '9', '3', '3', '3', '5', '3', '1', '84.8', '87.2', '0', '2', '2', '4', '2', '5', '42.1', '40.9', '50', '50', '2', '1', '9', '8', '8', '9', '0', '3', '0', '0']
(Total of 36)

Adding the link:
https://www.skysports.com/football/real-sociedad-vs-espanyol/stats/469572

The extracted values:
['https://www.skysports.com/football/real-sociedad-vs-espanyol/stats/469572', '5:30pm, Sunday 18th September 2022.', 'Reale Seguros Stadium', '31511', '62.9', '37.1', '14', '14', '6', '4', '5', '7', '3', '3', '81.2', '67.7', '4', '1', '4', '2', '2', '3', '64.3', '55.6', '49', '51', '3', '4', '18', '8', '8', '17', '3', '0', '0', '0']
(Total of 36)

Adding the link:
https://www.skysports.com/football/atletico-madrid-vs-real-madrid/stats/46956