# Extract

In [1]:
# Make the project root folder accessible
# NOTE(review): notebook_init is a project helper whose body is not visible in
# this notebook; what exactly it changes (sys.path, working directory, ...) should
# be confirmed in domino.utils.jupyter. Later cells use paths relative to the
# current working directory ('data').
from domino.utils.jupyter import notebook_init
notebook_init()

In [2]:
import os
import os.path
import time
import json

from selenium.webdriver import Firefox, FirefoxProfile
from selenium.common.exceptions import StaleElementReferenceException

In [3]:
# First, setup the browser profile with which we will work
profile = FirefoxProfile()

# Download handling: save files straight into ./data without any dialog.
#   folderList = 2 -> use the custom folder given in browser.download.dir
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
profile.set_preference('browser.download.dir', os.path.join(os.getcwd(), 'data'))
# neverAsk.saveToDisk takes a comma-separated list of MIME types. The previous
# value 'text/csv/xls' is not a valid MIME type, so it never matched a download.
profile.set_preference(
    'browser.helperApps.neverAsk.saveToDisk',
    'text/csv,application/vnd.ms-excel'
)

# This opens a browser (Firefox) window that stays open for the rest of the
# notebook; call ff.quit() once the extraction is finished to release it.
ff = Firefox(profile)

In [4]:
# Define the url from where to extract data
# (player statistics page; the trailing slug reads "Spain-La-Liga-2015-2016")
url = 'https://www.whoscored.com/Regions/206/Tournaments/4/Seasons/5933/Stages/12647/PlayerStatistics/Spain-La-Liga-2015-2016'

In [5]:
# Load the statistics page in the Firefox window controlled by Selenium
ff.get(url) # open the url

In [6]:
# Extract the column names of the table:
# one (class attribute, visible text) pair per <th> of the statistics table head
header_cells = ff.find_elements_by_xpath(
    "//thead[@id='player-table-statistics-head']//th"
)
header = [(cell.get_attribute('class'), cell.text) for cell in header_cells]

In [7]:
# Inspect the raw (class attribute, heading text) pairs
header

[('rank', 'R'),
 ('rgn', ''),
 ('pn', 'Player'),
 ('global sortable ap   ', 'Apps'),
 ('global sortable minsPlayed   ', 'Mins'),
 ('global sortable goal   ', 'Goals'),
 ('global sortable assistTotal   ', 'Assists'),
 ('global sortable yellowCard   ', 'Yel'),
 ('global sortable redCard   ', 'Red'),
 ('global sortable shotsPerGame   ', 'SpG'),
 ('global sortable passSuccess   ', 'PS%'),
 ('global sortable aerialWonPerGame   ', 'AerialsWon'),
 ('global sortable manOfTheMatch   ', 'MotM'),
 ('global sortable rating', 'Rating')]

In [8]:
# Clean its names: remove the generic 'global' / 'sortable' CSS class words and
# the surrounding whitespace, keeping only the column-specific class as key

cleaned_header = []
for css_class, label in header:
    for noise in ('global', 'sortable'):
        css_class = css_class.replace(noise, '')
    cleaned_header.append((css_class.strip(), label))

header = cleaned_header

In [9]:
# Inspect the cleaned (column key, heading text) pairs
header

[('rank', 'R'),
 ('rgn', ''),
 ('pn', 'Player'),
 ('ap', 'Apps'),
 ('minsPlayed', 'Mins'),
 ('goal', 'Goals'),
 ('assistTotal', 'Assists'),
 ('yellowCard', 'Yel'),
 ('redCard', 'Red'),
 ('shotsPerGame', 'SpG'),
 ('passSuccess', 'PS%'),
 ('aerialWonPerGame', 'AerialsWon'),
 ('manOfTheMatch', 'MotM'),
 ('rating', 'Rating')]

In [10]:
# Select all players: click the link inside the second <dd> of the "apps" filter
all_players_link = ff.find_element_by_xpath(
    '//div[@class="statistics-table-filter"]//dl[@id="apps"]//dd[2]/a'
)
all_players_link.click()

# wait for some time to avoid problems with page loading
page_load_wait = 3
time.sleep(page_load_wait)

In [11]:
# Scrape every page of the player table into `rows` (one dict per player).
#
# A page is always read as a whole: if the table is re-rendered while it is being
# read (StaleElementReferenceException) the partial page is discarded and read
# again. Retries are paced and bounded so that a page which never settles cannot
# keep the kernel spinning forever.

MAX_PAGE_RETRIES = 10  # consecutive failed attempts tolerated for a single page
RETRY_DELAY = 1        # seconds to wait before reading the same page again


def _parse_player_row(tr):
    """Convert one <tr> of the statistics table into a dict.

    Cells are paired positionally with the cleaned ``header`` keys:

    * 'rgn' stores the class attribute of the cell's <span>
      (e.g. 'ui-icon country flg-ar');
    * 'pn' is expanded into 'pn player_url', 'pn player_name', 'pn team_url',
      'pn team_name' and 'pn player_metadata';
    * every other column stores the cell text unchanged.

    Raises StaleElementReferenceException if the row is replaced in the DOM
    while it is being read.
    """
    d = {}

    for (th, _title), td in zip(header, tr.find_elements_by_xpath(".//td")):
        if th == 'rgn':
            d[th] = td.find_element_by_xpath('./span').get_attribute('class')
        elif th == 'pn':
            player = td.find_element_by_xpath('.//a[@class="player-link"]')
            d['%s %s' % (th, 'player_url')] = player.get_attribute('href')
            d['%s %s' % (th, 'player_name')] = player.text

            team = td.find_element_by_xpath('.//a[@class="player-meta-data"]')
            d['%s %s' % (th, 'team_url')] = team.get_attribute('href')
            # The span text is stored verbatim (it comes with a trailing comma,
            # e.g. 'Barcelona,').
            d['%s %s' % (th, 'team_name')] = team.find_element_by_xpath(
                './/span'
            ).text

            d['%s %s' % (th, 'player_metadata')] = ''.join(
                element.text
                for element in td.find_elements_by_xpath(
                    './/span[@class="player-meta-data"]'
                )
            )
        else:
            d[th] = td.text

    return d


rows = []
previous_page = None  # rows of the last accepted page
retries = 0           # consecutive failed attempts for the current page

while True:
    try:
        subrows = [
            _parse_player_row(tr)
            for tr in ff.find_elements_by_xpath(
                "//tbody[@id='player-table-statistics-body']//tr"
            )
        ]
    except StaleElementReferenceException: # the loading time was slower than expected
        retries += 1
        if retries > MAX_PAGE_RETRIES:
            raise
        time.sleep(RETRY_DELAY)
        continue # repeat all rows again

    if subrows and subrows == previous_page:
        # "next" was clicked but the table still shows the previous page;
        # accepting it would append every player of that page a second time.
        retries += 1
        if retries > MAX_PAGE_RETRIES:
            raise RuntimeError(
                'The player table did not change after clicking "next" '
                '(%d attempts)' % MAX_PAGE_RETRIES
            )
        time.sleep(RETRY_DELAY)
        continue

    # everything worked, so add all loaded rows to rows
    retries = 0
    rows += subrows
    previous_page = subrows

    # Click the next button (its class contains 'disabled' on the last page)
    button = ff.find_element_by_xpath('//a[@id="next"]')
    if 'disabled' in (button.get_attribute('class') or ''):
        break

    button.click()
    time.sleep(1)

In [12]:
# Preview only the first records instead of dumping the whole list into the
# notebook output; the complete data is written to data/players.json afterwards
rows[:2]

[{'aerialWonPerGame': '0.2',
  'ap': '31(2)',
  'assistTotal': '16',
  'goal': '26',
  'manOfTheMatch': '13',
  'minsPlayed': '2730',
  'passSuccess': '81.9',
  'pn player_metadata': '29, AM(CR),FW',
  'pn player_name': 'Lionel Messi',
  'pn player_url': 'https://www.whoscored.com/Players/11119',
  'pn team_name': 'Barcelona,',
  'pn team_url': 'https://www.whoscored.com/Teams/65',
  'rank': '1',
  'rating': '8.46',
  'redCard': '-',
  'rgn': 'ui-icon country flg-ar',
  'shotsPerGame': '4.8',
  'yellowCard': '3'},
 {'aerialWonPerGame': '0.6',
  'ap': '34',
  'assistTotal': '12',
  'goal': '24',
  'manOfTheMatch': '7',
  'minsPlayed': '3057',
  'passSuccess': '80.9',
  'pn player_metadata': '24, AM(CLR),FW',
  'pn player_name': 'Neymar',
  'pn player_url': 'https://www.whoscored.com/Players/50835',
  'pn team_name': 'Barcelona,',
  'pn team_url': 'https://www.whoscored.com/Teams/65',
  'rank': '2',
  'rating': '8.43',
  'redCard': '-',
  'rgn': 'ui-icon country flg-br',
  'shotsPerGame'

In [13]:
# Save data to a file in json format
output_dir = 'data'
if not os.path.isdir(output_dir):
    # open() does not create missing folders, so make sure the target exists
    os.makedirs(output_dir)

with open(os.path.join(output_dir, 'players.json'), 'w') as f:
    json.dump(rows, f, indent=2)