In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager

import pandas as pd
import time
import json

# Run Chrome without a visible window. The flag is passed explicitly because
# the `Options.headless` setter is deprecated in newer Selenium 4 releases;
# adding `--headless` is what that setter did.
options = Options()
options.add_argument('--headless')

# One shared browser session for the whole notebook; ChromeDriverManager
# supplies the chromedriver path handed to the Service.
driver = webdriver.Chrome(
    service=ChromeService(ChromeDriverManager().install()),
    options=options,
)

In [7]:
def scrape_match_event(driver, match_id):
    """Scrape the shot list of one FotMob match page into a DataFrame.

    Loads ``https://www.fotmob.com/match/<match_id>`` in ``driver``, reads the
    two club names and ids from the page, and extracts the shots from the JSON
    stored in the page's ``#__NEXT_DATA__`` element.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session used to load the page; only ``get`` and
        ``find_elements`` are called on it.
    match_id : int or str
        Match identifier interpolated into the match URL.

    Returns
    -------
    pandas.DataFrame
        One row per shot with the columns ``teamId``, ``teamName``,
        ``playerId``, ``playerName``, ``min``, ``xG``, ``eventType``,
        ``teamColor``, ``match_id``, ``isOwnGoal``, ``x``, ``y`` and
        ``situation``. ``match_id`` is stored as a string. The frame has no
        columns when the shot list is empty.

    Raises
    ------
    IndexError
        If fewer than two club links/names are found, or the page has no
        ``#__NEXT_DATA__`` element.
    KeyError
        If the embedded JSON lacks the expected shot-map path or a shot
        field, or a shot's ``teamId`` matches neither club.
    """
    match_url = f'https://www.fotmob.com/match/{match_id}'
    # Take the id back out of the URL so it is a string whatever type was
    # passed in.
    match_id = match_url.split('/')[4]
    driver.get(match_url)

    # NOTE(review): these class names look auto-generated and may change when
    # the site is rebuilt -- confirm the selectors still match.
    club_div = driver.find_elements(
        By.CSS_SELECTOR, '.e3q4wbq3 a .e3q4wbq4 span')
    club_id_div = driver.find_elements(By.CSS_SELECTOR, '.e3q4wbq3 a')
    if len(club_div) < 2 or len(club_id_div) < 2:
        raise IndexError(f'could not find both clubs on {match_url}')

    # Map club id -> club name once, instead of filtering a DataFrame for
    # every shot. setdefault keeps the first name if an id were repeated.
    team_names = {}
    for link, label in zip(club_id_div[:2], club_div[:2]):
        # The club id is the fifth '/'-separated piece of the link target.
        team_id = link.get_attribute('href').split('/')[4]
        team_names.setdefault(team_id, label.get_attribute('innerHTML'))

    data = driver.find_elements(By.CSS_SELECTOR, '#__NEXT_DATA__')
    if not data:
        raise IndexError(f'no #__NEXT_DATA__ element on {match_url}')
    obje = json.loads(data[0].get_attribute('innerHTML'))
    shots = obje['props']['pageProps']['initialState']['matchFacts']['data']['content']['shotmap']['shots']

    match_events = [
        {
            'teamId': s['teamId'],
            # Ids scraped from hrefs are strings, so compare as strings.
            'teamName': team_names[str(s['teamId'])],
            'playerId': s['playerId'],
            'playerName': s['playerName'],
            'min': s['min'],
            'xG': s['expectedGoals'],
            'eventType': s['eventType'],
            'teamColor': s['teamColor'],
            'match_id': match_id,
            'isOwnGoal': s['isOwnGoal'],
            'x': s['x'],
            'y': s['y'],
            'situation': s['situation'],
        }
        for s in shots
    ]
    return pd.DataFrame(match_events)


In [8]:
# Try the scraper on a single match and show the first shot it recorded.
mancity_derby = scrape_match_event(driver, match_id=3901019)
mancity_derby.iloc[0]

teamId                        8456
teamName           Manchester City
playerId                    737066
playerName    Erling Braut Haaland
min                              3
xG                        0.065666
eventType                     Miss
teamColor                  #69A8D8
match_id                   3901019
isOwnGoal                    False
x                        101.11207
y                            26.69
situation              RegularPlay
Name: 0, dtype: object

In [9]:
def scrape_shots_by_season(driver, matchlist, filename, output_dir='../../data'):
    """Scrape the shots of every match in ``matchlist`` and save them as CSV.

    Each match is fetched with ``scrape_match_event``. After every match the
    shots gathered so far are written to ``<output_dir>/<filename>.csv``, so
    an interrupted run still leaves the matches completed so far on disk.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session passed through to ``scrape_match_event``.
    matchlist : sized iterable
        Match ids; each one is converted with ``int`` before scraping.
    filename : str
        Name of the CSV file, without the ``.csv`` extension.
    output_dir : str, optional
        Directory the CSV is written to. Defaults to ``'../../data'``; the
        directory is not created here.

    Returns
    -------
    pandas.DataFrame
        All scraped shots with a fresh 0..n-1 index. Empty, and no file is
        written, when ``matchlist`` is empty.
    """
    total = len(matchlist)
    # Collect the per-match frames in a list rather than seeding the
    # accumulator with an empty DataFrame: concatenating an empty frame is
    # deprecated in recent pandas and can affect the resulting dtypes.
    frames = []
    match_events_df = pd.DataFrame()
    for index, m in enumerate(matchlist):
        print(f'{index+1}/{total}. {m}')
        frames.append(scrape_match_event(driver, int(m)))
        match_events_df = pd.concat(frames, ignore_index=True)
        # Checkpoint after every match so progress survives a crash.
        match_events_df.to_csv(f'{output_dir}/{filename}.csv')
    return match_events_df


In [10]:
# Scrape shots for every distinct match_id listed in epl_xg_2223.csv.
epl_2223 = pd.read_csv('../../data/epl_xg_2223.csv')
epl_matches = epl_2223['match_id'].unique()
scrape_shots_by_season(
    driver=driver,
    matchlist=epl_matches,
    filename='epl_shots_2223',
)

1/87. 3900932
2/87. 3900933
3/87. 3900934
4/87. 3900935
5/87. 3900937
6/87. 3900938
7/87. 3900939
8/87. 3900936
9/87. 3900940
10/87. 3900941
11/87. 3900943
12/87. 3900942
13/87. 3900945
14/87. 3900948
15/87. 3900950
16/87. 3900951
17/87. 3900944
18/87. 3900949
19/87. 3900946
20/87. 3900947
21/87. 3900960
22/87. 3900953
23/87. 3900954
24/87. 3900955
25/87. 3900957
26/87. 3900952
27/87. 3900956
28/87. 3900961
29/87. 3900959
30/87. 3900958
31/87. 3900970
32/87. 3900964
33/87. 3900965
34/87. 3900966
35/87. 3900967
36/87. 3900968
37/87. 3900962
38/87. 3900963
39/87. 3900971
40/87. 3900969
41/87. 3900978
42/87. 3900974
43/87. 3900979
44/87. 3900975
45/87. 3900973
46/87. 3900972
47/87. 3900981
48/87. 3900977
49/87. 3900980
50/87. 3900976
51/87. 3900986
52/87. 3900983
53/87. 3900985
54/87. 3900988
55/87. 3900989
56/87. 3900990
57/87. 3900991
58/87. 3900982
59/87. 3900984
60/87. 3900987
61/87. 3901002
62/87. 3901009
63/87. 3901011
64/87. 3901008
65/87. 3901010
66/87. 3901003
67/87. 3901006
68/8

In [13]:
# Scrape shots for every distinct match_id listed in seriea_xg_2223.csv.
serie_a_2223 = pd.read_csv('../../data/seriea_xg_2223.csv')
matches = serie_a_2223['match_id'].unique()
scrape_shots_by_season(
    driver=driver,
    matchlist=matches,
    filename='serie_a_shots_2223',
)

1/90. 3919071
2/90. 3919073
3/90. 3919077
4/90. 3919072
5/90. 3919075
6/90. 3919074
7/90. 3919078
8/90. 3919076
9/90. 3919069
10/90. 3919070
11/90. 3919097
12/90. 3919098
13/90. 3919092
14/90. 3919096
15/90. 3919091
16/90. 3919093
17/90. 3919089
18/90. 3919090
19/90. 3919094
20/90. 3919095
21/90. 3919106
22/90. 3919103
23/90. 3919099
24/90. 3919102
25/90. 3919105
26/90. 3919108
27/90. 3919101
28/90. 3919107
29/90. 3919100
30/90. 3919104
31/90. 3919117
32/90. 3919112
33/90. 3919115
34/90. 3919111
35/90. 3919116
36/90. 3919118
37/90. 3919113
38/90. 3919114
39/90. 3919109
40/90. 3919110
41/90. 3919120
42/90. 3919123
43/90. 3919122
44/90. 3919119
45/90. 3919126
46/90. 3919121
47/90. 3919128
48/90. 3919124
49/90. 3919125
50/90. 3919127
51/90. 3919450
52/90. 3919446
53/90. 3919451
54/90. 3919442
55/90. 3919443
56/90. 3919449
57/90. 3919453
58/90. 3919448
59/90. 3919447
60/90. 3919445
61/90. 3919461
62/90. 3919454
63/90. 3919462
64/90. 3919463
65/90. 3919464
66/90. 3919455
67/90. 3919456
68/9

In [14]:
# Scrape shots for every distinct match_id listed in seriea_xg_2122.csv.
# Note: this rebinds `matches`, replacing any value an earlier cell assigned.
serie_a_2122 = pd.read_csv('../../data/seriea_xg_2122.csv')
matches = serie_a_2122['match_id'].unique()
scrape_shots_by_season(
    driver=driver,
    matchlist=matches,
    filename='serie_a_shots_2122',
)

1/380. 3656991
2/380. 3656992
3/380. 3656993
4/380. 3656994
5/380. 3656995
6/380. 3656996
7/380. 3656997
8/380. 3656998
9/380. 3656999
10/380. 3657000
11/380. 3657001
12/380. 3657002
13/380. 3657003
14/380. 3657004
15/380. 3657005
16/380. 3657006
17/380. 3657007
18/380. 3657008
19/380. 3657009
20/380. 3657010
21/380. 3657011
22/380. 3657012
23/380. 3657013
24/380. 3657014
25/380. 3657015
26/380. 3657016
27/380. 3657017
28/380. 3657018
29/380. 3657019
30/380. 3657020
31/380. 3657021
32/380. 3657022
33/380. 3657023
34/380. 3657024
35/380. 3657025
36/380. 3657026
37/380. 3657027
38/380. 3657028
39/380. 3657029
40/380. 3657030
41/380. 3657034
42/380. 3657035
43/380. 3657036
44/380. 3657037
45/380. 3657038
46/380. 3657039
47/380. 3657040
48/380. 3657041
49/380. 3657042
50/380. 3657043
51/380. 3657044
52/380. 3657045
53/380. 3657046
54/380. 3657047
55/380. 3657048
56/380. 3657049
57/380. 3657050
58/380. 3657051
59/380. 3657052
60/380. 3657053
61/380. 3657054
62/380. 3657055
63/380. 3657056
6