In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

import pandas as pd

# Run Chrome without a visible window.
# The `Options.headless = True` setter was deprecated in Selenium 4.8 and
# dropped in later releases; on those versions the assignment just creates an
# unused attribute and the browser opens with a window. Passing the Chrome
# command-line switch works on old and new Selenium alike.
options = Options()
options.add_argument("--headless")

# ChromeDriverManager().install() supplies the chromedriver path for the service.
driver = webdriver.Chrome(options=options,service=ChromeService(ChromeDriverManager().install()))

In [2]:
def scrape_matches_list_fotmob(driver,leagueId=47,round=1,leagueSlug='premier-league'):
    """Collect the match links listed on one FotMob "by round" league page.

    Loads the round page in ``driver``, finds every match anchor and splits
    its ``href`` on '/' to obtain the match id (segment 4) and the match name
    (segment 6, only used for the progress print).

    Parameters
    ----------
    driver :
        Object exposing ``get(url)`` and ``find_elements(by, selector)``
        (a Selenium WebDriver in this notebook).
    leagueId : int
        Numeric league id placed in the URL path.
    round : int
        Round number placed in the ``round`` query parameter.
    leagueSlug : str
        League name segment of the URL. Defaults to ``'premier-league'``,
        the value that used to be hard-coded.
        NOTE(review): presumably the slug has to match ``leagueId`` for
        leagues other than 47 -- confirm against the site.

    Returns
    -------
    pandas.DataFrame
        Columns ``matchId`` and ``matchUrl``, one row per anchor found.
        Both values are None for an anchor whose href could not be parsed.
    """
    url = f'https://www.fotmob.com/leagues/{leagueId}/matches/{leagueSlug}/by-round?round={round}'

    driver.get(url)
    # NOTE(review): the class name looks auto-generated; presumably it changes
    # when the site is redeployed -- verify if no matches are found.
    matches = driver.find_elements(By.CSS_SELECTOR,".css-qg9kjw-LeagueMatchCSS-applyHover a")

    matchIdList = []
    matchURLList = []

    for i, match in enumerate(matches):
        try:
            # Read the attribute once; each call is a round trip to the browser.
            matchUrl = match.get_attribute('href')
            parts = matchUrl.split('/')
            matchId = parts[4]
            matchName = parts[6]
            print(f'{round}-{i+1} {matchName}')
        except Exception:
            # Best effort: keep a placeholder row so positions still line up.
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
            matchUrl,matchId = None,None
            print(f'error crawled {round}-{i+1}')
        matchURLList.append(matchUrl)
        matchIdList.append(matchId)

    match_list_by_round = pd.DataFrame({"matchId":matchIdList,"matchUrl":matchURLList})
    return match_list_by_round

In [3]:
def scrape_match_fotmob(driver,url):
    """Scrape the headline facts of one FotMob match page.

    Loads ``url`` in ``driver`` and reads club names, match date, xG, score,
    club ids and the player of the match. Each group of fields is read
    best-effort: when its elements are missing or cannot be parsed, every
    value in that group is set to None instead of raising.

    Parameters
    ----------
    driver :
        Object exposing ``get(url)`` and ``find_elements(by, selector)``
        (a Selenium WebDriver in this notebook).
    url : str
        Match page URL, e.g. ``'https://www.fotmob.com/match/3411351'``.
        The fifth '/'-separated segment is used as the match id.

    Returns
    -------
    dict
        Keys: ``match_id``, ``home_club_id``, ``away_club_id``,
        ``home_club_name``, ``away_club_name``, ``match_date``,
        ``home_score``, ``away_score``, ``home_xG``, ``away_xG``, ``potm``.
        xG values are floats; ids, names, date, scores and potm are strings.

    Raises
    ------
    IndexError
        If ``url`` has fewer than five '/'-separated segments.

    Notes
    -----
    The handlers catch ``Exception`` rather than using a bare ``except`` so
    that KeyboardInterrupt / SystemExit still stop a long crawl.
    NOTE(review): the CSS class names look auto-generated; presumably they
    change when the site is redeployed -- verify if fields come back None.
    """
    driver.get(url)

    match_id = url.split('/')[4]

    club_names = driver.find_elements(By.CSS_SELECTOR,".css-er0nau-TeamName span")
    try:
        # The third and fourth matches of this selector are used as home/away.
        # NOTE(review): presumably the first two are duplicates -- confirm on the page.
        home_club_name = club_names[2].text
        away_club_name = club_names[3].text
    except Exception:
        home_club_name,away_club_name = None,None

    match_dates = driver.find_elements(By.CSS_SELECTOR,"time")
    try:
        # Keep only the first 10 characters of the datetime attribute.
        match_date = match_dates[0].get_attribute('datetime')[0:10]
    except Exception:
        match_date = None

    xGs = driver.find_elements(By.CSS_SELECTOR,"ul.e683amr7 li.e683amr6 span.e683amr5")
    try:
        home_xG = float(xGs[0].text)
        away_xG = float(xGs[1].text)
    except Exception:
        # If either value is unusable, both are dropped.
        home_xG,away_xG=None,None

    scorediv = driver.find_elements(By.CSS_SELECTOR,".css-slmchi-wrapper .css-bw7eig-topRow")
    try:
        # innerHTML is expected to look like "1 - 0"; scores stay strings.
        score = scorediv[0].get_attribute('innerHTML').split(" - ")
        home_score = score[0]
        away_score = score[1]
    except Exception:
        home_score,away_score = None,None

    club_ids = driver.find_elements(By.CSS_SELECTOR,".e3q4wbq2 .e3q4wbq6 .e3q4wbq3 a")
    try:
        # The club id is the fifth '/'-separated segment of each team link.
        home_club_id = club_ids[0].get_attribute('href').split('/')[4]
        away_club_id = club_ids[1].get_attribute('href').split('/')[4]
    except Exception:
        home_club_id,away_club_id=None,None

    potmdiv = driver.find_elements(By.CSS_SELECTOR,".TopPlayersAndPlayerOfTheMatch a:has(#roundStar24Px) .e1ozyfg82")
    try:
        potm = potmdiv[0].text
    except Exception:
        potm = None

    match_stat = {
        'match_id' : match_id,
        'home_club_id':home_club_id,
        'away_club_id':away_club_id,
        'home_club_name':home_club_name,
        'away_club_name':away_club_name,
        'match_date' : match_date,
        'home_score' : home_score,
        'away_score' : away_score,
        'home_xG': home_xG,
        'away_xG': away_xG,
        'potm' : potm
    }

    return match_stat

In [4]:
# Try the match scraper on a single match page and print the resulting dict.
url = 'https://www.fotmob.com/match/3411351'
s = scrape_match_fotmob(driver,url)
print(s)

{'match_id': '3411351', 'home_club_id': '9826', 'away_club_id': '8466', 'home_club_name': 'Crystal Palace', 'away_club_name': 'Southampton', 'match_date': '2020-09-12', 'home_score': '1', 'away_score': '0', 'home_xG': 1.11, 'away_xG': 0.89, 'potm': 'Guaita'}


In [5]:
# Crawl the match list of rounds 1 through 38 and stack them into one frame.
# The per-round frames are collected first and concatenated once: calling
# pd.concat inside the loop re-copies every earlier row on each iteration,
# and seeding it with an empty DataFrame triggers a FutureWarning on recent
# pandas versions.
round_frames = []
for i in range(1,39):
    round_frames.append(scrape_matches_list_fotmob(driver,round=i))

# ignore_index=True yields the same fresh 0..n-1 index as reset_index(drop=True).
matchlist = pd.concat(round_frames,ignore_index=True)

1-1 crystal-palace-vs-arsenal
1-2 fulham-vs-liverpool
1-3 afc-bournemouth-vs-aston-villa
1-4 leeds-united-vs-wolverhampton-wanderers
1-5 newcastle-united-vs-nottingham-forest
1-6 tottenham-hotspur-vs-southampton
1-7 everton-vs-chelsea
1-8 leicester-city-vs-brentford
1-9 manchester-united-vs-brighton-&-hove-albion
1-10 west-ham-united-vs-manchester-city
2-1 aston-villa-vs-everton
2-2 arsenal-vs-leicester-city
2-3 brighton-&-hove-albion-vs-newcastle-united
2-4 manchester-city-vs-afc-bournemouth
2-5 southampton-vs-leeds-united
2-6 wolverhampton-wanderers-vs-fulham
2-7 brentford-vs-manchester-united
2-8 nottingham-forest-vs-west-ham-united
2-9 chelsea-vs-tottenham-hotspur
2-10 liverpool-vs-crystal-palace
3-1 tottenham-hotspur-vs-wolverhampton-wanderers
3-2 crystal-palace-vs-aston-villa
3-3 everton-vs-nottingham-forest
3-4 fulham-vs-brentford
3-5 leicester-city-vs-southampton
3-6 afc-bournemouth-vs-arsenal
3-7 leeds-united-vs-chelsea
3-8 west-ham-united-vs-brighton-&-hove-albion
3-9 newcast

In [6]:
# Scrape every collected match page in order, printing progress as we go.
match_stats = []
for i, match_url in enumerate(matchlist['matchUrl']):
    if pd.isna(match_url):
        # scrape_matches_list_fotmob stores None for links it could not read;
        # calling .split() on that would abort the crawl and lose all progress.
        print(f'{i+1}. skipped (no url)')
        continue
    stat = scrape_match_fotmob(driver,match_url)
    match_stats.append(stat)
    # The seventh '/'-separated segment is the human-readable match name;
    # fall back to the whole URL when the link is shorter than expected.
    url_parts = match_url.split('/')
    matchName = url_parts[6] if len(url_parts) > 6 else match_url
    print(f'{i+1}. {matchName}')
match_stats_df = pd.DataFrame(match_stats)

1. crystal-palace-vs-arsenal
2. fulham-vs-liverpool
3. afc-bournemouth-vs-aston-villa
4. leeds-united-vs-wolverhampton-wanderers
5. newcastle-united-vs-nottingham-forest
6. tottenham-hotspur-vs-southampton
7. everton-vs-chelsea
8. leicester-city-vs-brentford
9. manchester-united-vs-brighton-&-hove-albion
10. west-ham-united-vs-manchester-city
11. aston-villa-vs-everton
12. arsenal-vs-leicester-city
13. brighton-&-hove-albion-vs-newcastle-united
14. manchester-city-vs-afc-bournemouth
15. southampton-vs-leeds-united
16. wolverhampton-wanderers-vs-fulham
17. brentford-vs-manchester-united
18. nottingham-forest-vs-west-ham-united
19. chelsea-vs-tottenham-hotspur
20. liverpool-vs-crystal-palace
21. tottenham-hotspur-vs-wolverhampton-wanderers
22. crystal-palace-vs-aston-villa
23. everton-vs-nottingham-forest
24. fulham-vs-brentford
25. leicester-city-vs-southampton
26. afc-bournemouth-vs-arsenal
27. leeds-united-vs-chelsea
28. west-ham-united-vs-brighton-&-hove-albion
29. newcastle-united-v

In [20]:
# Keep only the matches for which a home score was scraped, then save them.
has_home_score = match_stats_df['home_score'].notna()
match_stats_df = match_stats_df[has_home_score]
match_stats_df.to_csv('data/epl_20221012.csv')

# Show every saved match in which club id '10260' played, home or away.
is_home_10260 = match_stats_df['home_club_id'].eq('10260')
is_away_10260 = match_stats_df['away_club_id'].eq('10260')
match_stats_df[is_home_10260 | is_away_10260].reset_index(drop=True)

Unnamed: 0,match_id,home_club_id,away_club_id,home_club_name,away_club_name,match_date,home_score,away_score,home_xG,away_xG,potm
0,3900940,10260,10204,Manchester United,Brighton & Hove Albion,2022-08-07,1,2,1.38,1.5,Gross
1,3900944,9937,10260,Brentford,Manchester United,2022-08-13,4,0,1.61,0.92,Jensen
2,3900958,10260,8650,Manchester United,Liverpool,2022-08-23,2,1,1.79,1.35,Sancho
3,3900970,8466,10260,Southampton,Manchester United,2022-08-27,0,1,1.45,1.4,Dalot
4,3900976,8197,10260,Leicester City,Manchester United,2022-09-02,0,1,0.73,1.55,Dalot
5,3900987,10260,9825,Manchester United,Arsenal,2022-09-04,3,1,1.55,1.32,Rashford
6,3901019,8456,10260,Manchester City,Manchester United,2022-10-02,6,3,3.3,1.67,Haaland
7,3901027,8668,10260,Everton,Manchester United,2022-10-10,1,2,0.65,1.45,Iwobi


In [24]:
# Reshape the per-match table into long form: four rows per match, namely
# xG for / xG against from the home side's view, then the same from the
# away side's view.
msdf = pd.read_csv('data/epl_20221012.csv',index_col=0)

# (venue label, column prefix of the team itself, column prefix of its opponent)
perspectives = (
    ('H', 'home', 'away'),
    ('A', 'away', 'home'),
)

msarr = []
for _, match in msdf.iterrows():
    for venue, own, opponent in perspectives:
        # "for" is the team's own xG, "against" is the opponent's xG.
        for variable, side in (('xG_for', own), ('xG_ag', opponent)):
            msarr.append({
                "match_id": match['match_id'],
                "date": match['match_date'],
                'variable': variable,
                'value': match[f'{side}_xG'],
                'venue': venue,
                'team_id': match[f'{own}_club_id'],
                'team_name': match[f'{own}_club_name'],
            })

epl_2223 = pd.DataFrame(msarr)
epl_2223.to_csv('data/epl_xg_2223.csv')