All data was downloaded from https://www.skysports.com/.

In [100]:
import pandas as pd

In [101]:
# Load the table of match-stat page links; the scraping cell below reads its 'stats_link' column.
links_df = pd.read_csv(filepath_or_buffer="./initial_ds/match_stat_links.csv")

In [138]:
# Resume from the previously scraped stats when the CSV exists; otherwise start
# from an empty frame with the 35-column schema the scraper fills positionally
# (link, venue, attendance, then 16 home/away stat pairs).
try:
    stats_df = pd.read_csv("./initial_ds/match_stats.csv")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty file means "nothing scraped yet". Any other failure
    # (parse error, permission problem, Ctrl-C) must surface instead of silently
    # replacing existing data with an empty frame that later gets written back.
    stats_df = pd.DataFrame(columns=[
        'stats_link',
        'venue',
        'attendance',
        # NOTE(review): "possiosson" looks like a typo for "possession"; kept as-is
        # because the existing CSV presumably uses these column names - confirm
        # before renaming.
        'home_possiosson',
        'away_possiosson',
        'home_total_shots',
        'away_total_shots',
        'home_on_target',
        'away_on_target',
        'home_off_target',
        'away_off_target',
        'home_blocked',
        'away_blocked',
        'home_passing_percent',
        'away_passing_percent',
        'home_clear_cut_chances',
        'away_clear_cut_chances',
        'home_corners',
        'away_corners',
        'home_offsides',
        'away_offsides',
        'home_tackles_percent',
        'away_tackles_percent',
        'home_aerial_duels',
        'away_aerial_duels',
        'home_saves',
        'away_saves',
        'home_fouls_committed',
        'away_fouls_committed',
        'home_fouls_won',
        'away_fouls_won',
        'home_yellow_cards',
        'away_yellow_cards',
        'home_red_cards',
        'away_red_cards',
    ])

print(f"Currently holding {stats_df.shape[0]} instances.")

Currently holding 115 instances.


In [139]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import re

# Getting the list of links to scrape: only links not already present in
# stats_df, capped at links_count per run
links_count = 500
stats_links = links_df[~links_df['stats_link'].isin(stats_df['stats_link'])]['stats_link'].head(links_count)

# Declared before the try so the cleanup below can tell whether a browser was started
driver = None
try:
    # Initiating a new Google Chrome window
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    # Going through the list of links and scraping them one by one
    for stats_link in stats_links:
        # Retrieving the URL in the Chrome window
        driver.get(stats_link)
        stats = BeautifulSoup(driver.page_source, 'lxml')

        # Extracting the match's venue (None when the element is missing)
        venue_element = stats.find('span', attrs={"class": 'sdc-site-match-header__detail-venue'})
        venue = venue_element and venue_element.text

        # Extracting the venue's attendance: the value is taken from the node right
        # after the label element, keeping only its digits
        attendance_element = stats.find('span', attrs={"class": 'sdc-site-match-header__detail-attendance-label'})
        attendance_label = attendance_element and attendance_element.next_sibling
        attendance = attendance_label and ''.join(re.findall(r'\d', attendance_label.get_text()))

        # Extracting all other stats, padded with None up to the 32 home/away stat
        # columns so the row always matches the frame's width
        other_stats = [s.text for s in stats.find_all('span', attrs={'class': 'sdc-site-match-stats__stats-label'})]
        other_stats.extend([None for i in range(32 - len(other_stats))])

        # Combining all the single stats into a list
        values = [
            stats_link,
            venue,
            attendance,
        ] + other_stats

        # Saving the list of stats into the dataframe
        print(f"Adding the link:\n{stats_link}\nThe values:\n{values}\n(Total of {len(values)})\n")
        stats_df.loc[len(stats_df)] = values
except Exception as e:
    # Best effort: report the failure and fall through so the rows collected so
    # far are still written to disk
    print(e)
finally:
    try:
        # Saving the newly added stats into file
        stats_df.to_csv("./initial_ds/match_stats.csv", index=False)
    finally:
        # Closing the browser and its chromedriver process, even if saving failed
        if driver is not None:
            driver.quit()

Adding the link:
https://www.skysports.com/football/getafe-vs-real-sociedad/stats/257554
The values:
['https://www.skysports.com/football/getafe-vs-real-sociedad/stats/257554', 'Coliseum Alfonso Perez', '11000', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
(Total of 35)

Adding the link:
https://www.skysports.com/football/granada-vs-sporting-gijon/stats/257552
The values:
['https://www.skysports.com/football/granada-vs-sporting-gijon/stats/257552', None, '12000', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
(Total of 35)

Adding the link:
https://www.skysports.com/football/real-zaragoza-vs-osasuna/stats/257553
The values:
['https://www.skysports.com/football/real-zaragoza-vs-osasuna/stats/257553', 'La Romareda', '25321', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',