# Crawling voetbaluitslagen.nl
This notebook crawls voetbaluitslagen.nl to automatically get the source code of pages for each match.

In [1]:
# IMPORT MODULES

# --- Standard library ---
# OS for checking and creating the target folders
import os
# Regular expressions
import re
# Time for sleep and current time
import time
# Random so they don't see us
from random import random

# --- Third-party ---
# Pandas
import pandas as pd
# Requests to get HTML source code
import requests
# Beautiful soup as html parser
from bs4 import BeautifulSoup as bs4
# Selenium (with PhantomJS installed separately) get HTML as in browser
from selenium import webdriver

In [18]:
# INITIALIZE FUNCTIONS

# Get the source code
def getSource(url):
    """Load ``url`` in the shared Selenium browser and return the rendered HTML.

    Uses the module-level ``browser`` object, which must already exist when
    this function is called. After navigating, the function sleeps for a
    random 2 to 8 seconds so the page can finish loading its content before
    the source is read.

    Parameters
    ----------
    url : the address to open in the browser.

    Returns
    -------
    The value of ``browser.page_source`` after the pause.
    """
    browser.get(url)

    # random() is in [0, 1), so the pause lies between 2 and 8 seconds
    delaySeconds = 2 + 6 * random()
    time.sleep(delaySeconds)

    return browser.page_source

# Return the soup (DOM object) to use the document structure
def makeSoup(html):
    """Parse an HTML string into a BeautifulSoup document using html5lib.

    Parameters
    ----------
    html : the HTML source code to parse.

    Returns
    -------
    The BeautifulSoup object built from ``html``.
    """
    # The built-in alternative would be the 'html.parser' backend;
    # this notebook uses the HTML5 parser instead.
    return bs4(html, 'html5lib')

# Reduce the big soup by selecting one league
def reduceSoupByLeague(bigSoup, leagueId):
    """Build a smaller soup containing only the tables of one league.

    Every ``tr`` element carrying both the ``league`` class and the
    ``leagueId`` class is looked up in ``bigSoup``. For each such row the
    enclosing ``table`` with class ``soccer`` is serialised to text, the
    texts are concatenated, and the result is parsed again with makeSoup().

    Parameters
    ----------
    bigSoup  : soup of the complete results page.
    leagueId : CSS class name identifying the league (e.g. Eredivisie).

    Returns
    -------
    A new soup parsed from the concatenated league tables.
    """
    headerRows = bigSoup.select('tr.league.' + leagueId)

    # One serialised parent table per league header row, glued together
    tablesHtml = ''.join(
        str(headerRow.find_parent('table', class_='soccer'))
        for headerRow in headerRows
    )

    return makeSoup(tablesHtml)

# Replace ID with identifier and info with {wedstrijdstatistieken, opstellingen} to get detailed stats about a game
def getGameUrl(identifier, page):
    """Return the URL of one tab of a match detail page.

    The URL is assembled by concatenation rather than by replacing
    placeholder words in a template: chained ``str.replace`` calls would also
    rewrite a literal 'page' occurring inside the identifier itself and
    produce a wrong address.

    Parameters
    ----------
    identifier : the match identifier as used by the website.
    page       : the tab to open, e.g. 'wedstrijdstatistieken' or 'opstellingen'.

    Returns
    -------
    The URL 'https://www.voetbaluitslagen.nl/wedstrijd/<identifier>/#<page>'.
    """
    return 'https://www.voetbaluitslagen.nl/wedstrijd/' + identifier + '/#' + page

# Get the soup! This is a beautiful DOM (Document Object Model) of the website
def getHomepageSoup(season):
    """Return the soup of a season's results page with all matches expanded.

    The results page of the requested season is opened through getSource().
    As long as the ``table#tournament-page-results-more`` element does not
    have the inline style 'display: none;', its link is clicked in the shared
    ``browser`` to load more matches, and the page is parsed again.

    Parameters
    ----------
    season : four-character season code, e.g. '1516' for 2015-2016.

    Returns
    -------
    The soup of the fully expanded results page.
    """
    # Convert e.g. '1516' into '2015-2016'
    startYear, endYear = season[:2], season[2:]
    seasonFull = '20' + startYear + '-20' + endYear

    # Visit the results ("uitslagen") page of that season
    url = 'https://www.voetbaluitslagen.nl/eredivisie-' + seasonFull + '/uitslagen/'
    print('Browsing homepage for season ' + seasonFull + ': ' + url)
    pageSoup = makeSoup(getSource(url))

    while True:
        # The 'Toon meer wedstrijden' table is hidden once everything is loaded
        moreTable = pageSoup.select_one('table#tournament-page-results-more')
        if moreTable['style'] == 'display: none;':
            break

        # Click on the loadMoreGames() anchor
        print('Clicking on loadMoreGames()')
        moreLink = browser.find_element_by_css_selector('table#tournament-page-results-more a')
        moreLink.click()

        # Give the browser 2-8 seconds to load the extra content
        time.sleep(2 + 6 * random())

        # Parse the updated page and check again
        pageSoup = makeSoup(browser.page_source)

    return pageSoup

# Get the league ID as used by voetbaluitslagen.nl
def getEredivisieLeagueId(season):
    """Return the Eredivisie league ID used by voetbaluitslagen.nl for a season.

    Parameters
    ----------
    season : four-character season code, e.g. '1718' for season 17/18.

    Returns
    -------
    The league ID string belonging to that season.

    Raises
    ------
    ValueError
        If no league ID is known for ``season``. Without this check an
        unknown season would surface as an UnboundLocalError on the return
        statement, which hides the real cause.
    """
    leagueIds = {
        '1718': 'l_1_4K9Dfl6U',    # Season 17/18
        '1617': 'l_1_Uuh1RiXn',    # Season 16/17
        '1516': 'l_1_6Ty1wfGO',    # Season 15/16
        '1415': 'l_1_zahr8PMr',    # Season 14/15
        '1314': 'l_1_z7xQ7ZEj',    # Season 13/14
        '1213': 'l_1_lbWpn3AO',    # Season 12/13
    }

    if season not in leagueIds:
        raise ValueError(
            'No Eredivisie league ID known for season ' + repr(season)
            + '; known seasons: ' + ', '.join(sorted(leagueIds))
        )
    return leagueIds[season]

# Reduce soup to only include matches from Eredivisie
def getEredivisieMatchIds(eredivisieId):
    """Collect the identifiers of all matches of one league.

    Reads the module-level ``homepageSoup``, which must already exist when
    this function is called, and narrows it down with reduceSoupByLeague().

    Parameters
    ----------
    eredivisieId : league ID (CSS class name) of the league to keep.

    Returns
    -------
    A list with the ``id`` attribute of every ``tr`` whose id starts with
    'g_1_', with that four-character prefix removed.
    """
    leagueSoup = reduceSoupByLeague(homepageSoup, eredivisieId)

    # Match rows are the table rows whose id starts with 'g_1_'
    matchRows = leagueSoup.find_all('tr', id=re.compile('^g_1_'))

    # Drop the leading 'g_1_' (4 characters) to keep only the identifier
    identifiers = [matchRow['id'][4:] for matchRow in matchRows]

    # Verbose
    print('Number of matches found: ' + str(len(identifiers)))

    return identifiers

# Crawl the webpage of each match for the given tab and store its source code in the target folder
def crawlPages(identifiers, tab, targetFolder, maxPages=399):
    """Download one tab of every match page and store the source code on disk.

    For each identifier the page is fetched with getSource(), parsed with
    makeSoup() and written to ``targetFolder + 'soup-' + identifier + '.html'``.
    A warning is printed when the text expected on the requested tab is not
    found in a ``td`` element, which suggests the tab did not load.

    Parameters
    ----------
    identifiers  : iterable of match identifiers.
    tab          : 'opstellingen' or 'wedstrijdstatistieken'.
    targetFolder : existing folder (with trailing separator) to write into.
    maxPages     : stop after this many pages (useful for debugging); pass
                   None to crawl every identifier. Defaults to 399.

    Raises
    ------
    ValueError
        If ``tab`` is not one of the supported tabs. This is checked before
        any page is requested, so no request is wasted on an unsupported tab.
    """
    # Text of the <td> that proves the requested tab was rendered
    expectedCells = {
        'opstellingen': 'Formatie',
        'wedstrijdstatistieken': 'Balbezit',
    }
    if tab not in expectedCells:
        raise ValueError(
            'Unsupported tab ' + repr(tab) + '; expected one of: '
            + ', '.join(sorted(expectedCells))
        )
    expectedCell = expectedCells[tab]

    # Store the source codes in...
    print('Storing files in folder: ' + targetFolder)

    for i, idx in enumerate(identifiers, start=1):
        # Fetch the page and turn it into a soup
        gameUrl = getGameUrl(idx, tab)
        soup = makeSoup(getSource(gameUrl))

        # CHECK that the tab-specific cell is present
        check = soup.find('td', string=expectedCell) is not None
        if not check:
            # The title can be missing on a page that failed to load
            if soup.title is not None and soup.title.string is not None:
                pageTitle = soup.title.string.split('|')[0]
            else:
                pageTitle = '(no title) '
            print('\nWarning: ' + gameUrl + ': ' + pageTitle + '-- ' + str(check), end='\n')

        # Write the source code to a file; an explicit encoding keeps
        # non-ASCII characters from failing on platforms whose default
        # encoding cannot represent them
        with open(targetFolder + 'soup-' + idx + '.html', 'w', encoding='utf-8') as f:
            print(soup, file=f)

        # Progress, and stop when the maximum is reached
        print('Running: ' + str(i) + ' | ' + idx, end='\r')
        if maxPages is not None and i >= maxPages:
            break

    # Nice work!
    print('\nDone!')

### Start our invisible browser
You only have to do this once after starting Jupyter or if you've interrupted the kernel.

In [16]:
# Use Selenium and PhantomJS to get usable HTML code (after JS).
# `browser` is a module-level object that getSource() and getHomepageSoup()
# read directly, so this cell has to run before either of them is called.
# NOTE(review): PhantomJS support was deprecated in later Selenium releases --
# confirm the installed Selenium version still provides webdriver.PhantomJS.
browser = webdriver.PhantomJS()

### Get the index number from all matches
All matches are listed on the homepage and they have a unique hash which is used for the hyperlink. We use this identifier as well for our data frame.

### Now we can start crawling!
We need to crawl the page of each game, which is addressed by the game's unique identifier. For every game we get the source code and store it in an HTML file.

In [13]:
# User input: which season to crawl, written as four digits.
# The value must be one of the seasons handled by getEredivisieLeagueId().
season = '1213'        # E.g. '1718' for season 2017-2018

# Get the source code of the season's results page, expanded to list all matches.
# NOTE: homepageSoup is read as a global inside getEredivisieMatchIds(), so this
# assignment has to run before the call below.
homepageSoup = getHomepageSoup(season)

# Reduce the page to the Eredivisie tables and collect the unique match identifiers
identifiers = getEredivisieMatchIds(getEredivisieLeagueId(season))

Browsing homepage for season 2012-2013: https://www.voetbaluitslagen.nl/eredivisie-2012-2013/uitslagen/
Clicking on loadMoreGames()
Clicking on loadMoreGames()
Clicking on loadMoreGames()
Number of matches found: 306


In [19]:
# Which tabs / info? Every match page is crawled once per tab.
# Each entry is (Dutch tab name used in the URL, English name used in the folder name).
tabs = [
    ('opstellingen', 'formations'),
    ('wedstrijdstatistieken', 'stats'),
]

for tabNl, tabEn in tabs:
    # Target folder for this tab, e.g. '1213/source-stats-1213/'
    targetFolder = season + '/source-' + tabEn + '-' + season + '/'

    # Create the folder if it doesn't exist yet. makedirs also creates the
    # parent season folder, which a plain mkdir would fail on when it is missing.
    os.makedirs(targetFolder, exist_ok=True)

    # Start crawling! This may take a while (~20 mins for 1 season)
    crawlPages(identifiers, tabNl, targetFolder)

Storing files in folder: 1213/source-stats-1213/

Running: 181 | A5rYBu0N
Running: 182 | 48uQD1VA
Running: 183 | QHtMEsG4
Running: 184 | 8tnHFN0b
Running: 185 | UkmDG3oh
Running: 186 | zTm9HqWo
Running: 187 | bVvUCLpH
Running: 188 | MPq5IPGu
Running: 189 | KIfaQJcJ
Running: 190 | tSeeRarD
Running: 191 | fBcmTLD0
Running: 192 | Y1diSuT6
Running: 193 | Cn8rU1bf
Running: 194 | bq4vVsql
Running: 195 | 4M4zWNTs
Running: 196 | pxO3yp6K
Running: 197 | 2DI8z4LQ
Running: 198 | 6oPaxQjE
Running: 199 | hryIDtED
Running: 200 | YsU5GrUf
Running: 201 | fcIEEMb7
Running: 202 | 48P1HOEl
Running: 203 | QHQcI4ar
Running: 204 | CYS9F2q1
Running: 205 | KzlVMpiR
Running: 206 | 6XqRNQyL
Running: 207 | CApNO6ME
Running: 208 | YJoJPn78
Running: 209 | 4nlR4oVr
Running: 210 | fysv2qF1
Running: 211 | YRur13U7
Running: 212 | 0Esz3Pae
Running: 213 | nBwW35pk
Running: 214 | 2w2a9RMR

KeyboardInterrupt: 

#### Now the HTML source codes are stored.