In [3]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

# Widen the pandas display limits so scraped tables are shown without truncation.
for _option, _limit in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _limit)

We will start with scraping one match:

We want to get the following:

1. Team Names (Home - Away)
2. Goals
3. Number of missing players per team
4. Month
5. Goal difference (GD) per team prior to the match (or another indicator of the team's form)

In [11]:
def get_points(form):
    """Add up the points encoded by the coloured result markers in a team-form list.

    Parameters
    ----------
    form : element exposing ``find_all`` (a BeautifulSoup tag in this notebook)
        Container holding the ``<li>`` team-form rows.

    Returns
    -------
    int
        Sum of the per-row values: 3 for a green marker, 1 for grey and 0 for
        red (presumably win / draw / loss -- confirm against the site).
        Rows without a marker, rows whose marker has a horizontal-line ``div``
        as a previous sibling, and unknown colours contribute nothing.
    """
    points_by_color = {
        "var(--TeamForm-green)": 3,
        "var(--TeamForm-grey)": 1,
        "var(--TeamForm-red)": 0,
    }
    result_colors = list(points_by_color)
    row_class = 'css-tgulmi-TeamFormContainerRow e3w5gu44'
    divider_class = 'css-xw5oij-HorizontalLine ecz4wo11'

    points = 0
    for row in form.find_all('li', class_=row_class):
        marker = row.find('div', color=result_colors)
        if not marker:
            continue
        # Markers that follow the horizontal separator are not counted.
        if marker.find_previous_sibling('div', class_=divider_class):
            continue

        shade = marker.get('color')
        if not shade:
            continue

        earned = points_by_color.get(shade)
        if earned is not None:
            points += earned
    return points

def scrape_match(url, i):
    """Scrape a single match page into a one-row DataFrame.

    The page is opened in an Edge WebDriver session, the rendered HTML is
    parsed with BeautifulSoup, and the browser is always closed again, even
    when parsing fails.

    Parameters
    ----------
    url : str
        Address of the match page.
    i : hashable
        Index label given to the returned row.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``HomeTeam``, ``AwayTeam``, ``HomeScore``,
        ``AwayScore``, ``Time``, ``Ref``, ``HomeForm``, ``AwayForm``,
        ``HomeMissingPlayers``, ``AwayMissingPlayers`` and ``Round``.
        ``Time``, ``Ref`` and ``Round`` fall back to ``"N/A"`` when the
        corresponding element is absent.

    Raises
    ------
    AttributeError, IndexError, ValueError
        When a required element (team names, score, lineup extras or the
        team-form lists) cannot be found or parsed.

    Notes
    -----
    The head-to-head counts are printed for inspection only; they are not part
    of the returned frame.
    """
    team_name_class = 'css-q98y93-TeamNameItself-TeamNameOnMobile-hideOnTabletOrBigger'

    driver = webdriver.Edge()
    try:
        driver.get(url)

        # Extract home team name, away team name, and score
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        home_team_name = soup.find('span', class_=team_name_class).text.strip()
        away_team_name = soup.find_all('span', class_=team_name_class)[1].text.strip()
        score = soup.find('span', class_='css-ta04x1-MFHeaderStatusScore-topRow').text.strip()
        scores = score.split()
        home_score = int(scores[0])
        away_score = int(scores[-1])

        # Take a fresh snapshot of the page for the remaining sections, as the
        # original code did, before the browser is closed.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always release the browser; otherwise every failed scrape leaves an
        # orphaned Edge process behind.
        driver.quit()

    # Missing players: count the <a href> entries in each side of the last
    # "additional lineup" container (first side = home, second = away).
    injured_and_missing = soup.find_all('div', class_='css-1ou3v7p-LineupAdditionalContainer ejc6in01')[-1]
    injured_lists = injured_and_missing.find_all('div', class_='css-1xyusi4-LineupAdditionalSideContainer ejc6in00')
    home_missing = len(injured_lists[0].find_all('a', href=True))
    print(f"Number of <a href> elements within 'home_injured': {home_missing}")
    away_missing = len(injured_lists[1].find_all('a', href=True))
    print(f"Number of <a href> elements within 'away_injured': {away_missing}")

    # Team form: points summed over each team's recent-results list.
    wrap = soup.find('div', class_='css-qas03g-RightColumn-commonColumn eozzfav0')
    form = wrap.find('div', class_='css-1p6t12o-TeamFormColumnsWrapper eqza12c0')
    home_form = form.find('ul', class_='css-1mgoca0-TeamFormContainerList e3w5gu45')
    away_form = form.find('ul', class_='css-1g2soyd-TeamFormContainerList e3w5gu45')
    home_sum = get_points(home_form)
    away_sum = get_points(away_form)

    # Kick-off time: the last <time datetime=...> element on the page.
    # Check the list before indexing so the "N/A" fallback is reachable.
    time_elements = soup.find_all('time', datetime=True)
    match_time = time_elements[-1].text.strip() if time_elements else "N/A"

    # Referee: the fourth info-box value, when that many are present.
    info_values = soup.find_all('span', class_='css-1scisqg-InfoBoxValue ejh09150')
    referee_name = info_values[3].text.strip() if len(info_values) > 3 else "N/A"

    # Match round: text of the first <span> inside the middle grid item.
    match_round_element = soup.find('div', class_='css-1m0k67e-MiddleGridItem erezbcv3')
    round_span = match_round_element.find('span') if match_round_element else None
    match_round = (round_span.text.strip() if round_span else "") or "N/A"

    # Head-to-head stats; every count defaults to 0 when the section or the
    # individual counter is missing.
    h_2_h = soup.find('div', {'name': 'head to head'})

    def h2h_count(css_class):
        element = h_2_h.find('span', class_=css_class) if h_2_h else None
        return int(element.get_text()) if element else 0

    wins = h2h_count('css-q6dc64-NumberOfWins ew2zkhp10')
    draws = h2h_count('css-10nshmy-NumberOfWins')
    losses = h2h_count('css-x6i3js-NumberOfWins ew2zkhp10')

    # Print the results
    print(f"Wins: {wins}")
    print(f"Draws: {draws}")
    print(f"Losses: {losses}")
    print(f"Match Round: {match_round}")
    print(f"Match Time: {match_time}")
    print(f"Referee: {referee_name}")
    print(f"Total Sum for the home team: {home_sum}")
    print(f"Total Sum for the away team: {away_sum}")
    print(f"Home Team: {home_team_name}")
    print(f"Away Team: {away_team_name}")

    return pd.DataFrame(data={'HomeTeam' : home_team_name,
                         "AwayTeam" : away_team_name,
                         'HomeScore' : home_score,
                         'AwayScore' : away_score,
                         'Time' : match_time,
                         'Ref' : referee_name,
                         'HomeForm' : home_sum,
                         'AwayForm' : away_sum,
                         'HomeMissingPlayers' : home_missing,
                         'AwayMissingPlayers' : away_missing,
                         'Round' : match_round}, index=[i])


In [12]:
# Try the scraper on one match page; the resulting row gets index label 0.
match = scrape_match('https://www.fotmob.com/match/3901282/matchfacts/arsenal-vs-brighton-hove-albion', 0)

Number of <a href> elements within 'home_injured': 4
Number of <a href> elements within 'away_injured': 7
Wins: 6
Draws: 3
Losses: 5
Match Round: Premier League Round 36 2022/2023
Match Time: May 14, 2023, 6:30 PM
Referee: Andy Madley
Total Sum for the home team: 8
Total Sum for the home team: 6
Home Team: Arsenal
Away Team: Brighton


In [7]:
# Preview the scraped row(s).
match.head()

Unnamed: 0,HomeTeam,AwayTeam,HomeScore,AwayScore,Time,Ref,HomeForm,AwayForm,HomeMissingPlayers,AwayMissingPlayers,Round
0,Arsenal,Brighton,0,3,"May 14, 2023, 6:30 PM",Andy Madley,8,6,4,7,Premier League Round 36 2022/2023


In [8]:
# Stack the single-match frame on top of itself (index labels are repeated).
# NOTE(review): this cell is not idempotent -- each re-run doubles the rows of
# `match` again. It looks like a leftover concat experiment; confirm whether it
# is still needed.
match = pd.concat([match, match])

In [9]:
def create_pages(last_url):
    """Expand the URL of the last page into the URLs of every page down to 0.

    The page number is the run of digits at the very end of ``last_url``; it
    may have any number of digits (the previous version only handled exactly
    two).

    Parameters
    ----------
    last_url : str
        URL ending in the highest page number, e.g. ``'...&page=33'``.

    Returns
    -------
    list of str
        ``last_url`` itself followed by the same URL with the page number
        replaced by ``n-1, n-2, ..., 0``.

    Raises
    ------
    ValueError
        If ``last_url`` does not end in a digit.
    """
    base = last_url.rstrip('0123456789')
    digits = last_url[len(base):]
    if not digits:
        raise ValueError(f"URL does not end in a page number: {last_url!r}")

    last_page = int(digits)
    # Keep the caller's URL verbatim as the first entry, then count down.
    return [last_url] + [f"{base}{page}" for page in range(last_page - 1, -1, -1)]

def get_urls(url):
    """Collect the absolute match-page URLs listed on one league schedule page.

    The page is rendered in an Edge WebDriver session, which is closed as soon
    as the HTML has been captured, even if loading the page fails.

    Parameters
    ----------
    url : str
        Address of a league "matches" page.

    Returns
    -------
    list of str
        ``'https://www.fotmob.com'`` + the ``href`` of the first link found in
        every match container, in page order. Containers without a link are
        skipped.
    """
    driver = webdriver.Edge()
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Close the WebDriver no matter what, so failures do not leak browsers.
        driver.quit()

    # Find all the match containers
    match_containers = soup.find_all('div', class_='css-1fkfix2-LeagueMatchCSS e565gvj0')

    # Extract the href of the first <a> in each container that has one
    match_links = []
    for match_container in match_containers:
        match_link = match_container.find('a', href=True)
        if match_link:
            match_links.append('https://www.fotmob.com' + match_link['href'])

    return match_links

In [10]:
# Expand the last schedule page of each season into that season's full page list.
urls23, urls22, urls19, urls18 = [
    create_pages(last_page_url)
    for last_page_url in (
        'https://www.fotmob.com/leagues/47/matches/premier-league?season=2022-2023&page=33',
        'https://www.fotmob.com/leagues/47/matches/premier-league?season=2021-2022&page=34',
        'https://www.fotmob.com/leagues/47/matches/premier-league?season=2018-2019&page=35',
        'https://www.fotmob.com/leagues/47/matches/premier-league?season=2017-2018&page=35',
    )
]

In [11]:
def get_season_links(season_pages):
    """Gather the match links from every schedule page of a season.

    Parameters
    ----------
    season_pages : iterable of str
        Schedule-page URLs; each one is handed to ``get_urls``.

    Returns
    -------
    list of str
        All match links, concatenated in the order the pages were given.
    """
    return [
        match_link
        for page_url in season_pages
        for match_link in get_urls(page_url)
    ]

In [12]:
# Collect the match links of each season, one season per statement so that
# already-collected seasons survive if a later scrape fails.
link23 = get_season_links(urls23)
link22 = get_season_links(urls22)
link19 = get_season_links(urls19)
# The 2017-2018 pages (urls18) provide the links used for the `test` table.
test_urls = get_season_links(urls18)

In [16]:
def create_table(links, max_attempts=5, scraper=None):
    """Scrape every link and stack the resulting rows into one DataFrame.

    Each link is tried up to ``max_attempts`` times. A link that keeps failing
    is reported and skipped instead of being retried forever, and
    ``KeyboardInterrupt`` is no longer swallowed, so the run can be stopped.

    Parameters
    ----------
    links : iterable of str
        Match-page URLs.
    max_attempts : int, default 5
        Maximum number of scrape attempts per link (must be >= 1).
    scraper : callable, optional
        Function ``(link, index) -> DataFrame`` used for each link; defaults
        to ``scrape_match``.

    Returns
    -------
    pandas.DataFrame
        The rows of all successfully scraped links, indexed 0, 1, 2, ... in
        scrape order; an empty DataFrame when nothing was scraped.

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")
    if scraper is None:
        scraper = scrape_match

    # Collect the per-match frames and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    frames = []
    for link in links:
        for attempt in range(1, max_attempts + 1):
            try:
                # The row index is the number of matches scraped so far.
                frames.append(scraper(link, len(frames)))
                break
            except Exception as exc:
                if attempt < max_attempts:
                    print('refreshing')
                else:
                    print(f'giving up on {link} after {max_attempts} attempts: {exc!r}')

    return pd.concat(frames) if frames else pd.DataFrame()

Sanity check: we will use the create_table function on a single match to see if everything works well.

In [1]:
# Build one table per season from the collected match links. Each statement
# opens a browser for every link, so this cell is slow.
# NOTE(review): requires the cells defining create_table and the link lists to
# have been run in the current kernel first (the saved output is a NameError).
epl23 = create_table(link23)
epl22 = create_table(link22)
epl19 = create_table(link19)

# Table built from the 2017-2018 links (test_urls).
test = create_table(test_urls)

NameError: name 'create_table' is not defined

In [2]:
# Persist each season table as a CSV file in the current working directory.
# With pandas' default settings the index is written as the first column.
epl23.to_csv("Epl23.csv")
epl22.to_csv("Epl22.csv")
epl19.to_csv("Epl19.csv")

# Table built from the 2017-2018 links.
test.to_csv('test.csv')

NameError: name 'epl23' is not defined