In [9]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

import pandas as pd

# Run Chrome without a visible window. The flag is passed as a browser
# argument because the `Options.headless` setter is deprecated in Selenium 4
# and removed in later 4.x releases.
options = Options()
options.add_argument('--headless=new')

# One shared browser session for the whole notebook; webdriver_manager supplies
# the chromedriver binary used by the service.
driver = webdriver.Chrome(options=options, service=ChromeService(ChromeDriverManager().install()))
fotmob_match_url = 'https://www.fotmob.com/match/'

In [10]:
def scrape_matches_list_fotmob(driver, leagueId=47, round=1):
    """Collect the match links listed on a FotMob league round page.

    Loads the by-round fixtures page for ``leagueId``/``round`` in ``driver``
    and reads the ``href`` of every anchor matched by the league-match CSS
    selector.

    Args:
        driver: Selenium WebDriver (any object exposing ``get`` and
            ``find_elements``).
        leagueId: FotMob league id. 47, 55, 54 and 87 map to known URL slugs;
            any other id reuses the 'premier-league' slug.
        round: Round number placed in the ``round`` query parameter. The name
            shadows the builtin but is kept because callers pass it by keyword.

    Returns:
        pandas.DataFrame with columns ``matchId`` and ``matchUrl``, one row per
        link that could be parsed. Links that cannot be parsed are reported on
        stdout and skipped.
    """
    league_slugs = {
        47: 'premier-league',
        55: 'serie-a',
        54: 'bundesliga',
        87: 'laliga',
    }
    leagueName = league_slugs.get(leagueId, 'premier-league')

    url = f'https://www.fotmob.com/leagues/{leagueId}/matches/{leagueName}/by-round?round={round}'

    driver.get(url)
    matches = driver.find_elements(
        By.CSS_SELECTOR, ".css-qg9kjw-LeagueMatchCSS-applyHover a")

    matchIdList = []
    matchURLList = []

    for position, anchor in enumerate(matches, start=1):
        try:
            # Read the href once; each get_attribute call is a WebDriver round trip.
            matchUrl = anchor.get_attribute('href')
            segments = matchUrl.split('/')
            matchId = segments[4]
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C still stops a long scrape.
            print(f'error crawled {round}-{position}')
            continue

        # The 7th URL segment is used as a readable name; fall back to the id
        # when the URL has no such segment.
        matchName = segments[6] if len(segments) > 6 else matchId
        print(f'{round}-{position} {matchName}')
        matchURLList.append(matchUrl)
        matchIdList.append(matchId)

    return pd.DataFrame({"matchId": matchIdList, "matchUrl": matchURLList})


In [11]:
def scrape_match_fotmob(driver, url):
    """Scrape one FotMob match page into a flat dict of match facts.

    Every field is looked up by CSS selector and parsed on a best-effort
    basis: when the elements are missing or unparsable the field becomes
    ``None`` (``round`` becomes ``0``) instead of raising. Only ordinary
    exceptions are absorbed, so KeyboardInterrupt still aborts the scrape.

    Args:
        driver: Selenium WebDriver (any object exposing ``get`` and
            ``find_elements``).
        url: Match URL; its fifth ``/``-separated segment is used as the
            match id.

    Returns:
        dict with keys ``match_id``, ``home_club_id``, ``away_club_id``,
        ``home_club_name``, ``away_club_name``, ``match_date``,
        ``home_score``, ``away_score``, ``home_xG``, ``away_xG``, ``potm``,
        ``league_name`` and ``round``. Scores are strings, xG values floats,
        ``round`` an int.

    Raises:
        IndexError: if ``url`` has fewer than five ``/``-separated segments.
    """
    driver.get(url)
    print(f"scraping {url}....")

    match_id = url.split('/')[4]

    club_names = driver.find_elements(
        By.CSS_SELECTOR, ".css-er0nau-TeamName span")
    try:
        # The third and fourth matched spans are read as home and away names.
        home_club_name = club_names[2].text
        away_club_name = club_names[3].text
    except Exception:
        home_club_name, away_club_name = None, None

    match_dates = driver.find_elements(By.CSS_SELECTOR, "time")
    try:
        # Keep only the first 10 characters of the datetime attribute.
        match_date = match_dates[0].get_attribute('datetime')[0:10]
    except Exception:
        match_date = None

    xGs = driver.find_elements(
        By.CSS_SELECTOR, "ul.e683amr7 li.e683amr6 span.e683amr5")
    try:
        home_xG = float(xGs[0].text)
        away_xG = float(xGs[1].text)
    except Exception:
        home_xG, away_xG = None, None

    scorediv = driver.find_elements(
        By.CSS_SELECTOR, ".css-slmchi-wrapper .css-bw7eig-topRow")
    try:
        # The score markup is split on " - " into home and away parts.
        score = scorediv[0].get_attribute('innerHTML').split(" - ")
        home_score = score[0]
        away_score = score[1]
    except Exception:
        home_score, away_score = None, None

    club_ids = driver.find_elements(
        By.CSS_SELECTOR, ".e3q4wbq2 .e3q4wbq6 .e3q4wbq3 a")
    try:
        # Club id is the fifth segment of each team link.
        home_club_id = club_ids[0].get_attribute('href').split('/')[4]
        away_club_id = club_ids[1].get_attribute('href').split('/')[4]
    except Exception:
        home_club_id, away_club_id = None, None

    potmdiv = driver.find_elements(
        By.CSS_SELECTOR, ".TopPlayersAndPlayerOfTheMatch a:has(#roundStar24Px) .e1ozyfg82")
    try:
        potm = potmdiv[0].text
    except Exception:
        potm = None

    leaguediv = driver.find_elements(By.CSS_SELECTOR, ".e1fpx9th0")
    try:
        league_name = leaguediv[0].text
    except Exception:
        league_name = None

    rounddiv = driver.find_elements(
        By.CSS_SELECTOR, ".e1fpx9th1 span:nth-of-type(2)")
    try:
        # Text such as "Round 5" becomes 5; anything else falls back to 0.
        round_number = int(rounddiv[0].text.replace('Round ', ''))
    except Exception:
        round_number = 0

    return {
        'match_id': match_id,
        'home_club_id': home_club_id,
        'away_club_id': away_club_id,
        'home_club_name': home_club_name,
        'away_club_name': away_club_name,
        'match_date': match_date,
        'home_score': home_score,
        'away_score': away_score,
        'home_xG': home_xG,
        'away_xG': away_xG,
        'potm': potm,
        'league_name': league_name,
        'round': round_number,
    }


In [12]:
def convert_to_variable_df(df):
    """Reshape one-row-per-match stats into a long table of per-team xG values.

    Each input row produces four output rows, in this order: home ``xG_for``,
    home ``xG_ag``, away ``xG_for``, away ``xG_ag``. ``xG_for`` is the team's
    own xG and ``xG_ag`` is the opponent's.

    Args:
        df: DataFrame with columns ``match_id``, ``match_date``, ``home_xG``,
            ``away_xG``, ``home_club_id``, ``away_club_id``,
            ``home_club_name``, ``away_club_name``, ``league_name`` and
            ``round``.

    Returns:
        pandas.DataFrame with columns ``match_id``, ``date``, ``variable``,
        ``value``, ``venue``, ``team_id``, ``team_name``, ``league_name`` and
        ``round``. The columns are present even when ``df`` has no rows, so a
        CSV written from an empty result still carries a header.
    """
    columns = ['match_id', 'date', 'variable', 'value', 'venue',
               'team_id', 'team_name', 'league_name', 'round']
    # (venue code, column prefix of the team, own-xG column, opponent-xG column)
    perspectives = (
        ('H', 'home', 'home_xG', 'away_xG'),
        ('A', 'away', 'away_xG', 'home_xG'),
    )

    records = []
    # iterrows is kept deliberately so each value is read exactly as before.
    for _, row in df.iterrows():
        for venue, side, own_xg_col, opp_xg_col in perspectives:
            for variable, value_col in (('xG_for', own_xg_col), ('xG_ag', opp_xg_col)):
                records.append({
                    'match_id': row['match_id'],
                    'date': row['match_date'],
                    'variable': variable,
                    'value': row[value_col],
                    'venue': venue,
                    'team_id': row[f'{side}_club_id'],
                    'team_name': row[f'{side}_club_name'],
                    'league_name': row['league_name'],
                    'round': row['round'],
                })

    return pd.DataFrame(records, columns=columns)


In [13]:
def scrape_match_stat_by_season(first_match_id,csv_filename,start_index=0,match_count=400):
    """Scrape a run of consecutive FotMob match ids and save them as a long-format CSV.

    Pages ``first_match_id + i`` are scraped for every ``i`` in
    ``range(start_index, match_count)``; with a non-zero ``start_index``,
    ``match_count`` therefore acts as the exclusive upper offset rather than
    the number of pages. The notebook-level ``driver`` is used for the browser
    session.

    Args:
        first_match_id: Integer id of the first match page.
        csv_filename: File name (without extension) written under ``../../data/``.
        start_index: First offset added to ``first_match_id``.
        match_count: Exclusive upper bound of the offset.

    Returns:
        The DataFrame produced by ``convert_to_variable_df`` for all scraped
        pages. Pages with no parsed score are not filtered out.
    """
    base_url = 'https://www.fotmob.com/match/'
    scraped = []

    for offset in range(start_index, match_count):
        stat = scrape_match_fotmob(driver, f'{base_url}{first_match_id+offset}')
        home, away = stat['home_club_name'], stat['away_club_name']
        print(f'{offset+1}. {home} vs {away}')
        scraped.append(stat)

    long_df = convert_to_variable_df(pd.DataFrame(scraped))
    long_df.to_csv(f'../../data/{csv_filename}.csv')
    return long_df

In [16]:
# Smoke test: scrape one known match page and print the parsed record.
url = 'https://www.fotmob.com/match/3377503'
s = scrape_match_fotmob(driver,url)
print(s)

scraping https://www.fotmob.com/match/3377503....
{'match_id': '3377503', 'home_club_id': '8525', 'away_club_id': '8611', 'home_club_name': 'Willem II', 'away_club_name': 'FC Twente', 'match_date': '2020-10-18', 'home_score': '0', 'away_score': '3', 'home_xG': 1.8, 'away_xG': 3.11, 'potm': 'Danilo', 'league_name': 'Eredivisie', 'round': 5}


Scrape EPL 2022/2023

In [None]:
# Gather the per-round link tables for rounds 1-38 and concatenate once.
# Growing a DataFrame with pd.concat inside the loop re-copies every earlier
# row on each pass and starts from an empty frame.
round_frames = []
for i in range(1, 39):
    round_frames.append(scrape_matches_list_fotmob(driver, round=i))
matchlist = pd.concat(round_frames, ignore_index=True)

In [None]:
# Scrape every EPL match page collected in `matchlist`.
match_stats = []
for i, match_url in enumerate(matchlist['matchUrl']):
    url_parts = match_url.split('/')
    # Same fallback scrape_matches_list_fotmob applies: URLs without a 7th
    # segment are labelled by match id instead of raising IndexError.
    matchName = url_parts[6] if len(url_parts) > 6 else url_parts[4]
    stat = scrape_match_fotmob(driver, match_url)
    match_stats.append(stat)
    print(f'{i+1}. {matchName}')
match_stats_df = pd.DataFrame(match_stats)

In [None]:
# Keep only rows where a home score was parsed, then save the EPL table.
match_stats_df = match_stats_df.dropna(subset=['home_score'])
match_stats_df.to_csv('data/epl_20221012.csv')

# Cell output: every match in which club id '10260' is the home or away side.
match_stats_df[
    match_stats_df[['home_club_id', 'away_club_id']].eq('10260').any(axis=1)
].reset_index(drop=True)

In [None]:
# Reload the saved EPL match table (first CSV column is the index) and write
# its long-format xG version.
# NOTE(review): after the CSV round trip the id columns are presumably read
# back as integers rather than strings — confirm if downstream code compares ids.
msdf = pd.read_csv('data/epl_20221012.csv',index_col=0)
epl_2223 = convert_to_variable_df(msdf)
epl_2223.to_csv('data/epl_xg_2223.csv')

Scrape Serie A 2022/2023

In [None]:
# League id 55 maps to the 'serie-a' slug in scrape_matches_list_fotmob.
leagueId = 55

# Gather the per-round link tables for rounds 1-38 and concatenate once,
# rather than re-copying a growing DataFrame on every pass.
ita_round_frames = []
for i in range(1, 39):
    ita_round_frames.append(scrape_matches_list_fotmob(driver, leagueId=leagueId, round=i))
ita_matchlist = pd.concat(ita_round_frames, ignore_index=True)

In [None]:
# Scrape the first 120 Serie A match pages from `ita_matchlist`.
# .head(120) keeps the same cap but tolerates a shorter list, where indexing
# with range(120) raised KeyError.
ita_match_stats = []
for i, match_url in enumerate(ita_matchlist['matchUrl'].head(120)):
    stat = scrape_match_fotmob(driver, match_url)
    home_club = stat['home_club_name']
    away_club = stat['away_club_name']
    print(f'{i+1}. {home_club} vs {away_club}')
    ita_match_stats.append(stat)
ita_match_stats_df = pd.DataFrame(ita_match_stats)

In [None]:
# Keep rows with a parsed home score, reshape to long xG format and save.
ita_match_stats_df = ita_match_stats_df.dropna(subset=['home_score'])
seriea_2223 = convert_to_variable_df(ita_match_stats_df)
seriea_2223.to_csv('data/seriea_xg_2223.csv')

Scrape Serie A 2021/2022

In [None]:
fotmob_match_url = 'https://www.fotmob.com/match/'
m1 = 3656991

# Candidate URLs for 400 consecutive match ids starting at m1.
seriea_2122_matchlist = [f'{fotmob_match_url}{m1 + offset}' for offset in range(400)]

# Show the last candidate URL as the cell output.
seriea_2122_matchlist[-1]

In [None]:
# Scrape every candidate URL in `seriea_2122_matchlist`.
ita_match_stats = []
for i, match_url in enumerate(seriea_2122_matchlist):
    stat = scrape_match_fotmob(driver, match_url)
    home_club, away_club = stat['home_club_name'], stat['away_club_name']
    print(f'{i+1}. {home_club} vs {away_club}')
    ita_match_stats.append(stat)
ita_match_stats_df = pd.DataFrame(ita_match_stats)

In [None]:
# Keep rows with a parsed home score, reshape to long xG format and save.
ita_match_stats_df = ita_match_stats_df.dropna(subset=['home_score'])
seriea_2122 = convert_to_variable_df(ita_match_stats_df)
seriea_2122.to_csv('data/seriea_xg_2122.csv')

Scrape Serie A 2020/2021

In [None]:
fotmob_match_url = 'https://www.fotmob.com/match/'
m1 = 3428766

# Candidate URLs for 400 consecutive match ids starting at m1.
seriea_2021_matchlist = [f'{fotmob_match_url}{m1 + offset}' for offset in range(400)]

# Show the last candidate URL as the cell output.
seriea_2021_matchlist[-1]

In [None]:
# Scrape every candidate URL in `seriea_2021_matchlist`.
ita_match_stats = []
for i, match_url in enumerate(seriea_2021_matchlist):
    stat = scrape_match_fotmob(driver, match_url)
    home_club, away_club = stat['home_club_name'], stat['away_club_name']
    print(f'{i+1}. {home_club} vs {away_club}')
    ita_match_stats.append(stat)
ita_match_stats_df = pd.DataFrame(ita_match_stats)

In [None]:
# Keep rows with a parsed home score, reshape to long xG format and save.
ita_match_stats_df = ita_match_stats_df.dropna(subset=['home_score'])
seriea_2021 = convert_to_variable_df(ita_match_stats_df)
seriea_2021.to_csv('data/seriea_xg_2021.csv')

Scrape Bundesliga 2022/2023

In [None]:
# League id 54 maps to the 'bundesliga' slug in scrape_matches_list_fotmob.
leagueId = 54

# Gather the per-round link tables for rounds 1-34 and concatenate once,
# rather than re-copying a growing DataFrame on every pass.
ger_round_frames = []
for i in range(1, 35):
    ger_round_frames.append(scrape_matches_list_fotmob(driver, leagueId=leagueId, round=i))
ger_matchlist = pd.concat(ger_round_frames, ignore_index=True)

In [None]:
# Scrape the first 120 Bundesliga match pages from `ger_matchlist`.
# .head(120) keeps the same cap but tolerates a shorter list, where indexing
# with range(120) raised KeyError.
ger_match_stats = []
for i, match_url in enumerate(ger_matchlist['matchUrl'].head(120)):
    stat = scrape_match_fotmob(driver, match_url)
    home_club = stat['home_club_name']
    away_club = stat['away_club_name']
    print(f'{i+1}. {home_club} vs {away_club}')
    ger_match_stats.append(stat)
ger_match_stats_df = pd.DataFrame(ger_match_stats)

In [None]:
# Keep rows with a parsed home score, reshape to long xG format and save.
ger_match_stats_df = ger_match_stats_df.dropna(subset=['home_score'])
bundesliga_2223 = convert_to_variable_df(ger_match_stats_df)
bundesliga_2223.to_csv('data/bundesliga_xg_2223.csv')

Scrape Bundesliga 2021/2022

In [None]:
fotmob_match_url = 'https://www.fotmob.com/match/'
bundesliga_match1 = 3624340
ger_match_stats = []

# Probe 406 consecutive match ids starting at bundesliga_match1.
for i in range(406):
    match_url = f'{fotmob_match_url}{bundesliga_match1+i}'
    stat = scrape_match_fotmob(driver, match_url)
    home_club, away_club = stat['home_club_name'], stat['away_club_name']
    print(f'{i+1}. {home_club} vs {away_club}')
    ger_match_stats.append(stat)
ger_match_stats_df = pd.DataFrame(ger_match_stats)

In [None]:
# Keep rows with a parsed home score, reshape to long xG format and save.
ger_match_stats_df = ger_match_stats_df.dropna(subset=['home_score'])
bundesliga_2122 = convert_to_variable_df(ger_match_stats_df)
bundesliga_2122.to_csv('data/bundesliga_xg_2122.csv')

Scrape Bundesliga 2020/2021

In [None]:
bundesliga_match1 = 3399144
ger_match_stats = []

# Probe 406 consecutive match ids starting at bundesliga_match1.
for i in range(406):
    match_url = f'{fotmob_match_url}{bundesliga_match1+i}'
    stat = scrape_match_fotmob(driver, match_url)
    home_club, away_club = stat['home_club_name'], stat['away_club_name']
    print(f'{i+1}. {home_club} vs {away_club}')
    ger_match_stats.append(stat)
ger_match_stats_df = pd.DataFrame(ger_match_stats)

In [None]:
# Keep rows where a home club id was parsed, reshape to long xG format and save.
# The long table gets its own name: rebinding ger_match_stats_df to it made the
# cell fail on re-run, because the converted frame has no 'home_club_id' column.
ger_match_stats_df = ger_match_stats_df.dropna(subset=['home_club_id'])
bundesliga_2021 = convert_to_variable_df(ger_match_stats_df)
bundesliga_2021.to_csv('data/bundesliga_xg_2021.csv')

Scrape Laliga 2022/2023

In [None]:
laliga_match1 = 3917938
esp_match_stats = []

# Probe 400 consecutive match ids starting at laliga_match1.
for i in range(400):
    match_url = f'{fotmob_match_url}{laliga_match1+i}'
    stat = scrape_match_fotmob(driver, match_url)
    home_club, away_club = stat['home_club_name'], stat['away_club_name']
    print(f'{i+1}. {home_club} vs {away_club}')
    esp_match_stats.append(stat)
esp_match_stats_df = pd.DataFrame(esp_match_stats)


In [None]:
# Keep rows where a home xG was parsed, reshape to long xG format and save.
# The long table gets its own name: rebinding esp_match_stats_df to it made the
# cell fail on re-run, because the converted frame has no 'home_xG' column.
esp_match_stats_df = esp_match_stats_df.dropna(subset=['home_xG'])
laliga_2223 = convert_to_variable_df(esp_match_stats_df)
laliga_2223.to_csv('data/laliga_2223.csv')

Scrape Laliga 2021/2022

In [None]:
laliga_match1 = 3629092
esp_match_stats = []

# Probe 400 consecutive match ids starting at laliga_match1.
for i in range(400):
    match_url = f'{fotmob_match_url}{laliga_match1+i}'
    stat = scrape_match_fotmob(driver, match_url)
    home_club, away_club = stat['home_club_name'], stat['away_club_name']
    print(f'{i+1}. {home_club} vs {away_club}')
    esp_match_stats.append(stat)
esp_match_stats_df = pd.DataFrame(esp_match_stats)

In [None]:
# Keep rows with a parsed home score, reshape to long xG format and save.
# The long table gets its own name: rebinding esp_match_stats_df to it made the
# cell fail on re-run, because the converted frame has no 'home_score' column.
esp_match_stats_df = esp_match_stats_df.dropna(subset=['home_score'])
laliga_2122 = convert_to_variable_df(esp_match_stats_df)
laliga_2122.to_csv('data/laliga_2122.csv')

Scrape Laliga 2020/2021

In [None]:
laliga_match1 = 3424038
esp_match_stats = []

# Probe 400 consecutive match ids starting at laliga_match1.
for i in range(400):
    match_url = f'{fotmob_match_url}{laliga_match1+i}'
    stat = scrape_match_fotmob(driver, match_url)
    home_club, away_club = stat['home_club_name'], stat['away_club_name']
    print(f'{i+1}. {home_club} vs {away_club}')
    esp_match_stats.append(stat)
esp_match_stats_df = pd.DataFrame(esp_match_stats)

In [None]:
# Keep rows with a parsed home score, reshape to long xG format and save.
# The long table gets its own name: rebinding esp_match_stats_df to it made the
# cell fail on re-run, because the converted frame has no 'home_score' column.
esp_match_stats_df = esp_match_stats_df.dropna(subset=['home_score'])
laliga_2021 = convert_to_variable_df(esp_match_stats_df)
laliga_2021.to_csv('data/laliga_2021.csv')

Scrape Ligue 1 2022/2023, 2021/2022 and 2020/2021

In [None]:
# Ligue 1 2022/23: default window of offsets 0-399 from the first match id.
ligue1_2223 = scrape_match_stat_by_season(
    first_match_id=3904384,
    csv_filename='ligue1_2223',
)

In [None]:
# Ligue 1 2021/22: default window of offsets 0-399 from the first match id.
ligue1_2122 = scrape_match_stat_by_season(
    first_match_id=3625826,
    csv_filename='ligue1_2122',
)

In [None]:
# Ligue 1 2020/21: default window of offsets 0-399 from the first match id.
ligue1_2021 = scrape_match_stat_by_season(
    first_match_id=3361606,
    csv_filename='ligue1_2021',
)

In [None]:
# Second id range written to 'ligue1_2021x': offsets 0-99 from match id 3362194.
# NOTE(review): this rebinds `ligue1_2021`, replacing the frame returned by the
# call that starts at match id 3361606 — confirm that is intended.
ligue1_2021 = scrape_match_stat_by_season(
    first_match_id=3362194,
    csv_filename='ligue1_2021x',
    match_count=100,
)

Scrape Eredivisie 2022/2023, 2021/2022 and 2020/2021

In [None]:
# Eredivisie 2022/23: default window of offsets 0-399 from the first match id.
eredivisie_2223 = scrape_match_stat_by_season(
    first_match_id=3900341,
    csv_filename='eredivisie_2223',
)

In [None]:
# Eredivisie 2021/22: default window of offsets 0-399 from the first match id.
eredivisie_2122 = scrape_match_stat_by_season(
    first_match_id=3602472,
    csv_filename='eredivisie_2122',
)

In [15]:
# Eredivisie 2020/21: default window of offsets 0-399 from the first match id.
eredivisie_2021 = scrape_match_stat_by_season(
    first_match_id=3377484,
    csv_filename='eredivisie_2021',
)

scraping https://www.fotmob.com/match/3377484....
1. FC Twente vs FC Groningen
scraping https://www.fotmob.com/match/3377485....
2. PEC Zwolle vs Sparta Rotterdam
scraping https://www.fotmob.com/match/3377486....
3. Fortuna Sittard vs AZ Alkmaar
scraping https://www.fotmob.com/match/3377487....
4. SC Heerenveen vs VVV-Venlo
scraping https://www.fotmob.com/match/3377488....
5. Ajax vs Vitesse
scraping https://www.fotmob.com/match/3377489....
6. Feyenoord vs ADO Den Haag
scraping https://www.fotmob.com/match/3377490....
7. FC Utrecht vs RKC Waalwijk
scraping https://www.fotmob.com/match/3377491....
8. Heracles vs PSV Eindhoven
scraping https://www.fotmob.com/match/3377492....
9. FC Emmen vs Willem II
scraping https://www.fotmob.com/match/3377493....
10. FC Utrecht vs SC Heerenveen
scraping https://www.fotmob.com/match/3377494....
11. Vitesse vs Heracles
scraping https://www.fotmob.com/match/3377495....
12. RKC Waalwijk vs PEC Zwolle
scraping https://www.fotmob.com/match/3377496....
13. V

Scrape Liga Portugal 2022/2023

In [19]:
# Liga Portugal 2022/23: default window of offsets 0-399 from the first match id.
liga_portugal_2223 = scrape_match_stat_by_season(
    first_match_id=3937360,
    csv_filename='liga_portugal_2223',
)

scraping https://www.fotmob.com/match/3937360....
1. Santa Clara vs Casa Pia AC
scraping https://www.fotmob.com/match/3937361....
2. Famalicao vs Braga
scraping https://www.fotmob.com/match/3937362....
3. Maritimo vs Chaves
scraping https://www.fotmob.com/match/3937363....
4. Vitoria de Guimaraes vs Estoril
scraping https://www.fotmob.com/match/3937364....
5. Vizela vs FC Porto
scraping https://www.fotmob.com/match/3937365....
6. Sporting CP vs Rio Ave
scraping https://www.fotmob.com/match/3937366....
7. Santa Clara vs Arouca
scraping https://www.fotmob.com/match/3937367....
8. Benfica vs Pacos de Ferreira
scraping https://www.fotmob.com/match/3937368....
9. Braga vs Maritimo
scraping https://www.fotmob.com/match/3937369....
10. Chaves vs Vizela
scraping https://www.fotmob.com/match/3937370....
11. Estoril vs Rio Ave
scraping https://www.fotmob.com/match/3937371....
12. Casa Pia AC vs Benfica
scraping https://www.fotmob.com/match/3937372....
13. Pacos de Ferreira vs Portimonense
scrapi