All data was downloaded from Sky Sports (https://www.skysports.com/).

In [29]:
import pandas as pd

In [30]:
# Load the table of match-stat page links; the scraping cell below reads its
# 'stats_link' column to decide which pages still need to be fetched.
links_df = pd.read_csv(filepath_or_buffer="./initial_ds/match_stat_links.csv")

In [31]:
# Resume from the stats saved by a previous run when they exist; otherwise
# start from an empty frame with the full column layout (4 match-level fields
# followed by 32 home/away stat fields).
try:
    stats_df = pd.read_csv("./initial_ds/match_stats.csv")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or completely empty cache file means "nothing scraped
    # yet". Any other failure (e.g. a corrupted file) is allowed to propagate
    # so that existing data is not silently replaced by an empty frame and
    # then overwritten when the scraping cell saves its results.
    stats_df = pd.DataFrame(columns=[
        'stats_link',
        'match_date',
        'venue',
        'attendance',
        'home_possession',
        'away_possession',
        'home_total_shots',
        'away_total_shots',
        'home_shots_on_target',
        'away_shots_on_target',
        'home_shots_off_target',
        'away_shots_off_target',
        'home_blocked',
        'away_blocked',
        'home_passing_percent',
        'away_passing_percent',
        'home_clear_cut_chances',
        'away_clear_cut_chances',
        'home_corners',
        'away_corners',
        'home_offsides',
        'away_offsides',
        'home_tackles_percent',
        'away_tackles_percent',
        'home_aerial_duels',
        'away_aerial_duels',
        'home_saves',
        'away_saves',
        'home_fouls_committed',
        'away_fouls_committed',
        'home_fouls_won',
        'away_fouls_won',
        'home_yellow_cards',
        'away_yellow_cards',
        'home_red_cards',
        'away_red_cards',
    ])

print(f"Currently holding {stats_df.shape[0]} instances.")

Currently holding 1706 instances.


In [32]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import re

# Getting the list of links to scrape: every link in links_df that does not
# already have a row in stats_df
stats_links = links_df[~links_df['stats_link'].isin(stats_df['stats_link'])]['stats_link']
remaining_links = total_links = stats_links.shape[0]

# Bound before the try block so the except/finally clauses can always refer to
# them, even when the failure happens before the browser starts or before the
# first link is visited
driver = None
stats_link = None

try:
    # Initiating a new Google Chrome window
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    # Going through the list of links and scraping them one by one
    for stats_link in stats_links:
        print(f"Adding the link:\n{stats_link}\n...")
        # Retrieving the URL in the Chrome window
        driver.get(stats_link)
        stats = BeautifulSoup(driver.page_source, 'lxml')

        # Extracting the date; a page without a <time> element raises here and
        # stops the run at this link
        match_date = stats.find('time').text

        # Extracting the match's venue (None when the element is missing)
        venue_element = stats.find('span', attrs={"class": 'sdc-site-match-header__detail-venue'})
        venue = venue_element and venue_element.text

        # Extracting the venue's attendance: keep only the digits of the text
        # that follows the attendance label
        attendance_element = stats.find('span', attrs={"class": 'sdc-site-match-header__detail-attendance-label'})
        attendance_label = attendance_element and attendance_element.next_sibling
        attendance = attendance_label and ''.join(re.findall(r'\d', attendance_label.get_text()))

        # Extracting all other stats, padding with None up to the 32 stat
        # columns that follow the four match-level columns in stats_df
        other_stats = [s.text for s in stats.find_all('span', attrs={'class': 'sdc-site-match-stats__stats-label'})]
        other_stats.extend([None] * (32 - len(other_stats)))

        # Combining all the single stats into a list
        values = [
            stats_link,
            match_date,
            venue,
            attendance,
        ] + other_stats

        # Saving the list of stats into the dataframe
        print(f"The extracted values:\n{values}\n(Total of {len(values)})\n")
        stats_df.loc[len(stats_df)] = values
        remaining_links -= 1
        print(f"Total of {total_links - remaining_links} links scraped, {remaining_links} links remaining.\n")
except Exception:
    print(f"Error occurred while scraping the link:\n{stats_link}\n")
    # Bare raise re-raises the active exception with its original traceback
    raise
finally:
    try:
        # Saving the newly added stats into file, even after a failure, so the
        # rows scraped so far are kept and skipped on the next run
        stats_df.to_csv("./initial_ds/match_stats.csv", index=False)
    finally:
        # Closing the Chrome window so no browser process is left behind
        if driver is not None:
            driver.quit()

Adding the link:
https://www.skysports.com/football/cadiz-vs-sevilla/stats/450681
...
The extracted values:
['https://www.skysports.com/football/cadiz-vs-sevilla/stats/450681', '8:15pm, Monday 3rd January 2022.', 'Ramon de Carranza', '14342', '25.2', '74.8', '9', '14', '2', '4', '6', '9', '1', '1', '65', '90.8', '0', '2', '2', '8', '2', '4', '85.7', '57.1', '40.6', '59.4', '2', '2', '4', '9', '9', '4', '0', '1', '0', '0']
(Total of 36)

Total of 1 links scraped, 187 links remaining.

Adding the link:
https://www.skysports.com/football/getafe-vs-real-madrid/stats/450683
...
The extracted values:
['https://www.skysports.com/football/getafe-vs-real-madrid/stats/450683', '1:00pm, Sunday 2nd January 2022.', 'Coliseum Alfonso Perez', '11890', '26.5', '73.5', '6', '14', '3', '4', '1', '5', '2', '5', '69.5', '87.3', '0', '1', '2', '9', '0', '6', '70.6', '83.3', '34.4', '65.6', '4', '2', '20', '8', '8', '19', '3', '2', '0', '0']
(Total of 36)

Total of 2 links scraped, 186 links remaining.

Add