In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

import pandas as pd

# Run Chrome without a visible window. The flag is passed as an explicit
# command-line argument because the `Options.headless` setter is deprecated
# and later removed in Selenium 4, where assigning it no longer has any effect.
options = Options()
options.add_argument('--headless')

# ChromeDriverManager().install() supplies the chromedriver path that the
# Service object hands to Selenium.
driver = webdriver.Chrome(options=options,service=ChromeService(ChromeDriverManager().install()))
# Base URL to which a numeric FotMob match id is appended by later cells.
fotmob_match_url = 'https://www.fotmob.com/match/'

In [2]:
def scrape_matches_list_fotmob(driver, leagueId=47, round=1):
    """Collect the match links shown on a FotMob league "by round" page.

    Args:
        driver: Selenium WebDriver (any object exposing ``get`` and
            ``find_elements`` works).
        leagueId: FotMob league id. 47, 55, 54 and 87 map to the URL slugs
            'premier-league', 'serie-a', 'bundesliga' and 'laliga'; any other
            id falls back to 'premier-league'.
        round: Round number placed in the ``round`` query parameter. The name
            shadows the builtin but is kept so keyword callers keep working.

    Returns:
        pandas.DataFrame with columns ``matchId`` and ``matchUrl``, one row per
        link whose href could be parsed. Links that fail to parse are reported
        on stdout and skipped.
    """
    league_slugs = {
        47: 'premier-league',
        55: 'serie-a',
        54: 'bundesliga',
        87: 'laliga',
    }
    leagueName = league_slugs.get(leagueId, 'premier-league')

    url = f'https://www.fotmob.com/leagues/{leagueId}/matches/{leagueName}/by-round?round={round}'

    driver.get(url)
    matches = driver.find_elements(
        By.CSS_SELECTOR, ".css-qg9kjw-LeagueMatchCSS-applyHover a")

    matchIdList = []
    matchURLList = []

    for position, link in enumerate(matches, start=1):
        try:
            matchUrl = link.get_attribute('href')
            url_parts = matchUrl.split('/')
            # Segment 4 of the href is used as the match id.
            matchId = url_parts[4]
        except Exception:
            # Best effort: one unreadable link must not abort the whole round.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
            # working during long scrapes.
            print(f'error crawled {round}-{position}')
            continue

        # Segment 6 is used as a display name when present; otherwise the id.
        matchName = url_parts[6] if len(url_parts) > 6 else matchId
        print(f'{round}-{position} {matchName}')
        matchURLList.append(matchUrl)
        matchIdList.append(matchId)

    return pd.DataFrame({"matchId": matchIdList, "matchUrl": matchURLList})


In [3]:
def scrape_match_fotmob(driver,url):
    """Scrape headline statistics for one FotMob match page.

    Each field is extracted independently and on a best-effort basis: if its
    elements are missing or cannot be parsed, that field (or home/away pair)
    is set to None instead of raising. Only ``Exception`` is caught, so a
    keyboard/kernel interrupt still stops a long scraping loop.

    Args:
        driver: Selenium WebDriver (any object exposing ``get`` and
            ``find_elements`` works).
        url: Match page URL; segment 4 of ``url.split('/')`` is used as the
            match id, so a shorter URL raises IndexError.

    Returns:
        dict with keys ``match_id``, ``home_club_id``, ``away_club_id``,
        ``home_club_name``, ``away_club_name``, ``match_date``,
        ``home_score``, ``away_score``, ``home_xG``, ``away_xG`` and ``potm``.
        xG values are floats; scores and ids are strings as found on the page.
    """
    driver.get(url)
    print(f"scraping {url}....")

    match_id = url.split('/')[4]

    # The home/away names are read from the 3rd and 4th matching spans.
    # NOTE(review): presumably the first two spans are duplicates of the same
    # names elsewhere in the header - confirm against the page markup.
    club_names = driver.find_elements(By.CSS_SELECTOR,".css-er0nau-TeamName span")
    try:
        home_club_name = club_names[2].text
        away_club_name = club_names[3].text
    except Exception:
        home_club_name,away_club_name = None,None

    # Keep only the first 10 characters of the first <time> element's
    # `datetime` attribute.
    match_dates = driver.find_elements(By.CSS_SELECTOR,"time")
    try:
        match_date = match_dates[0].get_attribute('datetime')[0:10]
    except Exception:
        match_date = None

    xGs = driver.find_elements(By.CSS_SELECTOR,"ul.e683amr7 li.e683amr6 span.e683amr5")
    try:
        home_xG = float(xGs[0].text)
        away_xG = float(xGs[1].text)
    except Exception:
        home_xG,away_xG=None,None

    # The score is read from the element's innerHTML and split on " - ".
    scorediv = driver.find_elements(By.CSS_SELECTOR,".css-slmchi-wrapper .css-bw7eig-topRow")
    try:
        score = scorediv[0].get_attribute('innerHTML').split(" - ")
        home_score = score[0]
        away_score = score[1]
    except Exception:
        home_score,away_score = None,None

    # Club ids are segment 4 of the team links' hrefs.
    club_ids = driver.find_elements(By.CSS_SELECTOR,".e3q4wbq2 .e3q4wbq6 .e3q4wbq3 a")
    try:
        home_club_id = club_ids[0].get_attribute('href').split('/')[4]
        away_club_id = club_ids[1].get_attribute('href').split('/')[4]
    except Exception:
        home_club_id,away_club_id=None,None

    potmdiv = driver.find_elements(By.CSS_SELECTOR,".TopPlayersAndPlayerOfTheMatch a:has(#roundStar24Px) .e1ozyfg82")
    try:
        potm = potmdiv[0].text
    except Exception:
        potm = None

    match_stat = {
        'match_id' : match_id,
        'home_club_id':home_club_id,
        'away_club_id':away_club_id,
        'home_club_name':home_club_name,
        'away_club_name':away_club_name,
        'match_date' : match_date,
        'home_score' : home_score,
        'away_score' : away_score,
        'home_xG': home_xG,
        'away_xG': away_xG,
        'potm' : potm
    }

    return match_stat

In [4]:
def convert_to_variable_df(df):
    """Reshape per-match stats into a long table of xG records.

    Every input row yields four output rows, in this order: home xG for,
    home xG against, away xG for, away xG against. "Against" rows carry the
    opponent's xG as their value.

    Args:
        df: DataFrame with columns ``match_id``, ``match_date``, ``home_xG``,
            ``away_xG``, ``home_club_id``, ``away_club_id``,
            ``home_club_name`` and ``away_club_name``.

    Returns:
        DataFrame with columns ``match_id``, ``date``, ``variable``,
        ``value``, ``venue``, ``team_id`` and ``team_name``; an empty,
        column-less DataFrame when ``df`` has no rows.
    """
    # (variable, source xG column, venue code, team column prefix),
    # listed in the order the records are emitted for each match.
    layout = (
        ('xG_for', 'home_xG', 'H', 'home'),
        ('xG_ag', 'away_xG', 'H', 'home'),
        ('xG_for', 'away_xG', 'A', 'away'),
        ('xG_ag', 'home_xG', 'A', 'away'),
    )

    records = []
    for _, match in df.iterrows():
        for variable, value_col, venue, side in layout:
            records.append({
                "match_id": match['match_id'],
                "date": match['match_date'],
                'variable': variable,
                'value': match[value_col],
                'venue': venue,
                'team_id': match[f'{side}_club_id'],
                'team_name': match[f'{side}_club_name'],
            })

    return pd.DataFrame(records)

In [5]:
# Smoke test: scrape one known match page and print the resulting record.
url = 'https://www.fotmob.com/match/3411351'
s = scrape_match_fotmob(driver=driver, url=url)
print(s)

scraping https://www.fotmob.com/match/3411351....
{'match_id': '3411351', 'home_club_id': '9826', 'away_club_id': '8466', 'home_club_name': 'Crystal Palace', 'away_club_name': 'Southampton', 'match_date': '2020-09-12', 'home_score': '1', 'away_score': '0', 'home_xG': 1.11, 'away_xG': 0.89, 'potm': 'Guaita'}


Scrape EPL 2022/2023

In [None]:
# Fetch the fixture list for rounds 1-38 of the default league (leagueId=47,
# 'premier-league') and combine the per-round frames in a single concat
# rather than re-copying a growing frame on every iteration.
matchlist = pd.concat(
    [scrape_matches_list_fotmob(driver,round=i) for i in range(1,39)],
    ignore_index=True,
)

In [None]:
# Scrape every listed match in order and collect one stats dict per match.
match_stats = []
for i, (match_id, match_url) in enumerate(zip(matchlist['matchId'], matchlist['matchUrl'])):
    # Segment 6 of the URL is used as a display name. Some URLs have no such
    # segment (scrape_matches_list_fotmob allows for that too), so fall back
    # to the match id instead of raising IndexError mid-scrape.
    url_parts = match_url.split('/')
    matchName = url_parts[6] if len(url_parts) > 6 else match_id
    stat = scrape_match_fotmob(driver,match_url)
    match_stats.append(stat)
    print(f'{i+1}. {matchName}')
match_stats_df = pd.DataFrame(match_stats)

In [None]:
# Keep only matches for which a score was scraped, then persist them.
match_stats_df = match_stats_df.loc[match_stats_df['home_score'].notnull()]
match_stats_df.to_csv('data/epl_20221012.csv')
# Display the matches in which club id '10260' is the home or the away side.
match_stats_df.loc[
    match_stats_df['home_club_id'].eq('10260') | match_stats_df['away_club_id'].eq('10260')
].reset_index(drop=True)

In [None]:
# Reload the saved EPL match stats (first CSV column is the index),
# reshape them to the long xG format and write the result out.
msdf = pd.read_csv('data/epl_20221012.csv', index_col=0)
epl_2223 = msdf.pipe(convert_to_variable_df)
epl_2223.to_csv('data/epl_xg_2223.csv')

Scrape Serie A 2022/2023

In [None]:
# League id 55 selects the 'serie-a' slug in scrape_matches_list_fotmob.
leagueId = 55
# Fetch rounds 1-38 and combine the per-round frames in a single concat
# rather than re-copying a growing frame on every iteration.
ita_matchlist = pd.concat(
    [scrape_matches_list_fotmob(driver,leagueId=leagueId,round=i) for i in range(1,39)],
    ignore_index=True,
)

In [None]:
# Scrape at most the first 120 listed matches.
# NOTE(review): 120 is presumably the number of fixtures played when the
# notebook was run - confirm. `head` bounds the loop by the list length, so a
# shorter match list no longer raises KeyError.
ita_match_stats = []
for i, match_url in enumerate(ita_matchlist['matchUrl'].head(120)):
    stat = scrape_match_fotmob(driver,match_url)
    home_club = stat['home_club_name']
    away_club = stat['away_club_name']
    print(f'{i+1}. {home_club} vs {away_club}')
    ita_match_stats.append(stat)
ita_match_stats_df = pd.DataFrame(ita_match_stats)

In [16]:
# Drop matches without a scraped score, reshape to long xG format and save.
ita_match_stats_df = ita_match_stats_df.loc[ita_match_stats_df['home_score'].notna()]
seriea_2223 = ita_match_stats_df.pipe(convert_to_variable_df)
seriea_2223.to_csv('data/seriea_xg_2223.csv')

Scrape Serie A 2021/2022

In [19]:
# Build candidate match URLs from 400 consecutive ids starting at m1.
fotmob_match_url = 'https://www.fotmob.com/match/'
m1 = 3656991

seriea_2122_matchlist = [
    f'{fotmob_match_url}{match_id}' for match_id in range(m1, m1 + 400)
]

# Show the last generated URL as a sanity check of the id range.
seriea_2122_matchlist[-1]

'https://www.fotmob.com/match/3657390'

In [None]:
# Scrape each candidate URL in order, logging the fixture found on the page.
ita_match_stats = []
for position, match_url in enumerate(seriea_2122_matchlist, start=1):
    stat = scrape_match_fotmob(driver, match_url)
    print(f"{position}. {stat['home_club_name']} vs {stat['away_club_name']}")
    ita_match_stats.append(stat)
ita_match_stats_df = pd.DataFrame(ita_match_stats)

In [22]:
# Keep rows that have a scraped score, convert to long xG format, write CSV.
ita_match_stats_df = ita_match_stats_df.dropna(subset=['home_score'])
seriea_2122 = ita_match_stats_df.pipe(convert_to_variable_df)
seriea_2122.to_csv('data/seriea_xg_2122.csv')

Scrape Serie A 2020/2021

In [5]:
# Build candidate match URLs from 400 consecutive ids starting at m1.
fotmob_match_url = 'https://www.fotmob.com/match/'
m1 = 3428766

seriea_2021_matchlist = [
    f'{fotmob_match_url}{match_id}' for match_id in range(m1, m1 + 400)
]

# Show the last generated URL as a sanity check of the id range.
seriea_2021_matchlist[-1]

'https://www.fotmob.com/match/3429165'

In [None]:
# Scrape each candidate URL in order, logging the fixture found on the page.
ita_match_stats = []
for position, match_url in enumerate(seriea_2021_matchlist, start=1):
    stat = scrape_match_fotmob(driver, match_url)
    print(f"{position}. {stat['home_club_name']} vs {stat['away_club_name']}")
    ita_match_stats.append(stat)
ita_match_stats_df = pd.DataFrame(ita_match_stats)

In [None]:
# Drop matches without a scraped score, reshape to long xG format and save.
ita_match_stats_df = ita_match_stats_df.loc[ita_match_stats_df['home_score'].notna()]
seriea_2021 = ita_match_stats_df.pipe(convert_to_variable_df)
seriea_2021.to_csv('data/seriea_xg_2021.csv')

Scrape Bundesliga 2022/2023

In [None]:
# League id 54 selects the 'bundesliga' slug in scrape_matches_list_fotmob.
leagueId = 54
# Fetch rounds 1-34 and combine the per-round frames in a single concat
# rather than re-copying a growing frame on every iteration.
ger_matchlist = pd.concat(
    [scrape_matches_list_fotmob(driver,leagueId=leagueId,round=i) for i in range(1,35)],
    ignore_index=True,
)

In [None]:
# Scrape at most the first 120 listed matches.
# NOTE(review): 120 is presumably the number of fixtures played when the
# notebook was run - confirm. `head` bounds the loop by the list length, so a
# shorter match list no longer raises KeyError.
ger_match_stats = []
for i, match_url in enumerate(ger_matchlist['matchUrl'].head(120)):
    stat = scrape_match_fotmob(driver,match_url)
    home_club = stat['home_club_name']
    away_club = stat['away_club_name']
    print(f'{i+1}. {home_club} vs {away_club}')
    ger_match_stats.append(stat)
ger_match_stats_df = pd.DataFrame(ger_match_stats)

In [10]:
# Keep rows that have a scraped score, convert to long xG format, write CSV.
ger_match_stats_df = ger_match_stats_df.dropna(subset=['home_score'])
bundesliga_2223 = ger_match_stats_df.pipe(convert_to_variable_df)
bundesliga_2223.to_csv('data/bundesliga_xg_2223.csv')

Scrape Bundesliga 2021/2022

In [None]:
# Scrape 406 consecutive match ids starting at bundesliga_match1, logging the
# fixture found on each page.
ger_match_stats = []
fotmob_match_url = 'https://www.fotmob.com/match/'
bundesliga_match1 = 3624340

for position, candidate_id in enumerate(range(bundesliga_match1, bundesliga_match1 + 406), start=1):
    match_url = f'{fotmob_match_url}{candidate_id}'
    stat = scrape_match_fotmob(driver, match_url)
    print(f"{position}. {stat['home_club_name']} vs {stat['away_club_name']}")
    ger_match_stats.append(stat)
ger_match_stats_df = pd.DataFrame(ger_match_stats)

In [25]:
# Drop matches without a scraped score, reshape to long xG format and save.
ger_match_stats_df = ger_match_stats_df.loc[ger_match_stats_df['home_score'].notna()]
bundesliga_2122 = ger_match_stats_df.pipe(convert_to_variable_df)
bundesliga_2122.to_csv('data/bundesliga_xg_2122.csv')

Scrape Bundesliga 2020/2021

In [26]:
# Scrape 406 consecutive match ids starting at bundesliga_match1, logging the
# fixture found on each page. Relies on fotmob_match_url from an earlier cell.
ger_match_stats = []
bundesliga_match1 = 3399144

for position, candidate_id in enumerate(range(bundesliga_match1, bundesliga_match1 + 406), start=1):
    match_url = f'{fotmob_match_url}{candidate_id}'
    stat = scrape_match_fotmob(driver, match_url)
    print(f"{position}. {stat['home_club_name']} vs {stat['away_club_name']}")
    ger_match_stats.append(stat)
ger_match_stats_df = pd.DataFrame(ger_match_stats)

scraping https://www.fotmob.com/match/3399144....
1. Bayern München vs Schalke 04
scraping https://www.fotmob.com/match/3399145....
2. Borussia Dortmund vs Borussia M'Gladbach
scraping https://www.fotmob.com/match/3399146....
3. RB Leipzig vs Mainz 05
scraping https://www.fotmob.com/match/3399147....
4. Wolfsburg vs Bayer Leverkusen
scraping https://www.fotmob.com/match/3399148....
5. Eintracht Frankfurt vs Arminia Bielefeld
scraping https://www.fotmob.com/match/3399149....
6. Union Berlin vs Augsburg
scraping https://www.fotmob.com/match/3399150....
7. 1. FC Köln vs TSG Hoffenheim
scraping https://www.fotmob.com/match/3399151....
8. Werder Bremen vs Hertha BSC
scraping https://www.fotmob.com/match/3399152....
9. VfB Stuttgart vs Freiburg
scraping https://www.fotmob.com/match/3399153....
10. None vs None
scraping https://www.fotmob.com/match/3399154....
11. None vs None
scraping https://www.fotmob.com/match/3399155....
12. None vs None
scraping https://www.fotmob.com/match/3399156....


In [32]:
# Keep rows where a home club id was scraped, reshape to long xG format, save.
# NOTE: the name is rebound to the long-format frame, which has no
# 'home_club_id' column, so this cell cannot be re-run without first
# re-running the scraping cell that builds ger_match_stats_df.
ger_match_stats_df = ger_match_stats_df.dropna(subset=['home_club_id'])
ger_match_stats_df = ger_match_stats_df.pipe(convert_to_variable_df)
ger_match_stats_df.to_csv('data/bundesliga_xg_2021.csv')

Scrape LaLiga 2022/2023

In [5]:
# Scrape 400 consecutive match ids starting at laliga_match1, logging the
# fixture found on each page. Relies on fotmob_match_url from an earlier cell.
esp_match_stats = []
laliga_match1 = 3917938

for position, candidate_id in enumerate(range(laliga_match1, laliga_match1 + 400), start=1):
    match_url = f'{fotmob_match_url}{candidate_id}'
    stat = scrape_match_fotmob(driver, match_url)
    print(f"{position}. {stat['home_club_name']} vs {stat['away_club_name']}")
    esp_match_stats.append(stat)
esp_match_stats_df = pd.DataFrame(esp_match_stats)


scraping https://www.fotmob.com/match/3917938....
1. Athletic Club vs Valencia
scraping https://www.fotmob.com/match/3917939....
2. Atletico Madrid vs Villarreal
scraping https://www.fotmob.com/match/3917940....
3. Celta Vigo vs Real Madrid
scraping https://www.fotmob.com/match/3917941....
4. Espanyol vs Rayo Vallecano
scraping https://www.fotmob.com/match/3917942....
5. Sevilla vs Real Valladolid
scraping https://www.fotmob.com/match/3917943....
6. Real Sociedad vs Barcelona
scraping https://www.fotmob.com/match/3917944....
7. Mallorca vs Real Betis
scraping https://www.fotmob.com/match/3917945....
8. Girona vs Getafe
scraping https://www.fotmob.com/match/3917946....
9. Osasuna vs Cadiz
scraping https://www.fotmob.com/match/3917947....
10. Elche vs Almeria
scraping https://www.fotmob.com/match/3917948....
11. Barcelona vs Real Valladolid
scraping https://www.fotmob.com/match/3917949....
12. Real Betis vs Osasuna
scraping https://www.fotmob.com/match/3917950....
13. Elche vs Real Socie

In [13]:
# Keep rows where a home xG value was scraped, reshape to long format, save.
# NOTE: the name is rebound to the long-format frame, which has no 'home_xG'
# column, so this cell cannot be re-run without first re-running the scraping
# cell that builds esp_match_stats_df.
esp_match_stats_df = esp_match_stats_df.dropna(subset=['home_xG'])
esp_match_stats_df = esp_match_stats_df.pipe(convert_to_variable_df)
esp_match_stats_df.to_csv('data/laliga_2223.csv')

Scrape LaLiga 2021/2022

In [9]:
# Scrape 400 consecutive match ids starting at laliga_match1, logging the
# fixture found on each page. Relies on fotmob_match_url from an earlier cell.
esp_match_stats = []
laliga_match1 = 3629092

for position, candidate_id in enumerate(range(laliga_match1, laliga_match1 + 400), start=1):
    match_url = f'{fotmob_match_url}{candidate_id}'
    stat = scrape_match_fotmob(driver, match_url)
    print(f"{position}. {stat['home_club_name']} vs {stat['away_club_name']}")
    esp_match_stats.append(stat)
esp_match_stats_df = pd.DataFrame(esp_match_stats)

scraping https://www.fotmob.com/match/3629092....
1. Deportivo Alaves vs Real Madrid
scraping https://www.fotmob.com/match/3629093....
2. Barcelona vs Real Sociedad
scraping https://www.fotmob.com/match/3629094....
3. Celta Vigo vs Atletico Madrid
scraping https://www.fotmob.com/match/3629095....
4. Mallorca vs Real Betis
scraping https://www.fotmob.com/match/3629096....
5. Cadiz vs Levante
scraping https://www.fotmob.com/match/3629097....
6. Osasuna vs Espanyol
scraping https://www.fotmob.com/match/3629098....
7. Sevilla vs Rayo Vallecano
scraping https://www.fotmob.com/match/3629099....
8. Valencia vs Getafe
scraping https://www.fotmob.com/match/3629100....
9. Villarreal vs Granada
scraping https://www.fotmob.com/match/3629101....
10. Elche vs Athletic Club
scraping https://www.fotmob.com/match/3629102....
11. FC Krasnodar vs Sepsi OSK
scraping https://www.fotmob.com/match/3629103....
12. None vs None
scraping https://www.fotmob.com/match/3629104....
13. Deportivo Alaves vs Mallorca


In [12]:
# Keep rows that have a scraped score, reshape to long xG format, save.
# NOTE: the name is rebound to the long-format frame, which has no
# 'home_score' column, so this cell cannot be re-run without first re-running
# the scraping cell that builds esp_match_stats_df.
esp_match_stats_df = esp_match_stats_df.dropna(subset=['home_score'])
esp_match_stats_df = esp_match_stats_df.pipe(convert_to_variable_df)
esp_match_stats_df.to_csv('data/laliga_2122.csv')