# Scrape HTML files
This notebook parses the HTML files into structured data. The HTML files were crawled beforehand from voetbaluitslagen.nl using the crawling notebook.

In [1]:
# IMPORT MODULES

# Soup maker
from bs4 import BeautifulSoup as bs4
# Pandas
import pandas as pd
# OS to read from paths etc
from os import listdir
from os.path import isfile, join
# Regular expressions
import re
# Time
import time
# Show outputs from all lines instead of last one only
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# INITIALIZE FUNCTIONS - Don't look, just run it ;)

# Turn HTML source code into a soup (DOM object) so the document structure can be queried
def makeSoup(html):
    """Parse HTML source into a BeautifulSoup document tree.

    html: the HTML source to parse (scrapeFiles passes the text read from a file).
    Returns the BeautifulSoup object built with the 'html5lib' parser.
    """
    parserName = 'html5lib'
    return bs4(html, parserName)

# Function to extract data corresponding to formation tab
def runForOpstellingen(currentSoup, df, idx):
    """Fill the formation-tab columns of row ``idx`` in ``df``.

    Parameters:
        currentSoup: parsed match page (soup object) of the formations tab.
        df: data frame collecting one row per match; it is modified in place.
        idx: row label of the match being processed.

    Returns:
        The same data frame with formationHome/Away, basePlayersHome/Away and
        trainerHome/Away set for row ``idx``. Values that cannot be found on
        the page are stored as None (formation, trainer) or as an empty string
        (base players) instead of aborting the whole scrape.
    """
    # The <td> labelled 'Formatie' sits between the home value (one <td> to
    # the left) and the away value (one <td> to the right).
    tdFormation = currentSoup.find('td', string='Formatie')
    if tdFormation is None:
        # find() returns None when the page has no formation row
        formationHome = None
        formationAway = None
    else:
        formationHome = tdFormation.previous_sibling.string
        formationAway = tdFormation.next_sibling.string

    # Query the player table once per team; the selection is reused for both
    # the base players and the trainer.
    playersHome = getPlayers(currentSoup, 'home')
    playersAway = getPlayers(currentSoup, 'away')

    # Assign in a fixed order so the column order of the data frame (and of
    # the exported CSV) stays: formation, base players, trainer.
    df.at[idx, 'formationHome'] = formationHome
    df.at[idx, 'formationAway'] = formationAway

    # The first eleven listed names are taken as the base players
    df.at[idx, 'basePlayersHome'] = ','.join(playersHome[:11])
    df.at[idx, 'basePlayersAway'] = ','.join(playersAway[:11])

    # The last listed name is taken as the trainer; guard against an empty list
    df.at[idx, 'trainerHome'] = playersHome[-1] if playersHome else None
    df.at[idx, 'trainerAway'] = playersAway[-1] if playersAway else None

    # Return the new data frame
    return df

# Function to extract data corresponding to stats tab
def runForWedstrijdstatistieken(currentSoup, df, idx):
    """Fill the match-statistics columns of row ``idx`` in ``df``.

    For every statistic label, the <td> holding that label is located; the
    home value is read from the <td> to its left and the away value from the
    <td> to its right. Column names are the lower-camel-cased label plus
    'Home' / 'Away' (e.g. 'schotenOpDoelHome').

    Parameters:
        currentSoup: parsed match page (soup object) of the stats tab.
        df: data frame collecting one row per match; it is modified in place.
        idx: row label of the match being processed.

    Returns:
        The same data frame. A statistic that cannot be extracted is stored
        as None for both teams.
    """
    # Get the following stats:
    stats = ['Balbezit', 'Doelpogingen', 'Corners', 'Vrije trappen', 'Schoten op doel', 'Schoten naast doel',
             'Geblockte schoten', 'Buitenspel', 'Reddingen doelman', 'Overtredingen', 'Gele kaarten', 'Rode kaarten']
    # NOTE: free kicks, yellow cards and red cards are not always present.

    for stat in stats:
        # Column name stem shared by the home and away columns
        column = toLowerCamelCase(stat)
        try:
            # Get the center <td>
            td = currentSoup.find('td', string=stat)
            # One <td> to the left, take first child
            df.at[idx, column + 'Home'] = list(td.previous_sibling.children)[0].string
            # One <td> to the right, take second child
            df.at[idx, column + 'Away'] = list(td.next_sibling.children)[1].string
        except Exception:
            # Best effort: a missing or differently shaped row yields empty
            # values. Catching Exception (not a bare except) keeps
            # KeyboardInterrupt and SystemExit working during long runs.
            df.at[idx, column + 'Home'] = None
            df.at[idx, column + 'Away'] = None

    # Return the new data frame
    return df

# Get all filenames in the folder
def listAllFilesInFolder(folder):
    """Return the names of all regular files directly inside ``folder``.

    Sub-directories are left out. The names are returned sorted because
    os.listdir() gives no ordering guarantee; a fixed order makes repeated
    runs process the files in the same sequence.

    folder: path of the directory to list.
    Returns a sorted list of file names (without the folder prefix) and
    prints how many were found.
    """
    onlyFiles = sorted(f for f in listdir(folder) if isfile(join(folder, f)))
    print('Files found: ', len(onlyFiles))
    return onlyFiles

# Convert a space separated label to lowerCamelCase
def toLowerCamelCase(string):
    """Return ``string`` in lowerCamelCase, e.g. 'Schoten op doel' -> 'schotenOpDoel'.

    The text is split on single spaces; the first word is lower-cased and
    every following word is capitalized before all parts are glued together.
    """
    head, *tail = string.split(' ')
    return head.lower() + ''.join(map(str.capitalize, tail))

# Create a new data frame
def initDataFrame():
    """Return an empty data frame that already has the 'datetime' and 'summary' columns."""
    frame = pd.DataFrame()
    # Assigning a scalar to a frame without rows only registers the column
    for columnName in ('datetime', 'summary'):
        frame[columnName] = '0'
    return frame

# Function to scrape all HTML files in folder which have info as extracted from tab
def scrapeFiles(folder, files, tab, maxFiles=399):
    """Parse the given HTML files into a data frame with one row per file.

    Parameters:
        folder: directory that contains the HTML files.
        files: iterable of file names inside ``folder``; each name must
            contain 'soup-<identifier>.html', the identifier becomes the row label.
        tab: 'formations' or 'stats'; selects which tab-specific columns are
            extracted. Any other value only yields 'summary' and 'datetime'.
        maxFiles: stop after this many files (handy while debugging). The
            limit is checked after a file has been processed. Pass None to
            process every file. The default keeps the previous fixed cap of 399.

    Returns:
        The data frame, indexed by the identifier taken from each file name.
    """
    # Start with a clean data frame
    df = initDataFrame()

    # Pattern that extracts the identifier from a file name
    filenamePattern = re.compile('soup-(.*).html')

    processed = 0
    for filename in files:
        # Read the HTML code; the with-block closes the file handle right away
        with open(join(folder, filename), 'r') as currentFile:
            currentSoup = makeSoup(currentFile.read())

        # Extract identifier from filename
        idx = filenamePattern.search(filename).group(1)

        # Get the summary once; it is stored and reused for the progress line
        summary = getSummary(currentSoup)
        df.at[idx, 'summary'] = summary

        # Get the date and time
        df.at[idx, 'datetime'] = getDateTime(currentSoup)

        # Choose which data to extract based on which tab is specified
        if tab == 'formations':
            df = runForOpstellingen(currentSoup, df, idx)
        elif tab == 'stats':
            df = runForWedstrijdstatistieken(currentSoup, df, idx)

        # Progress on a single, overwritten line ('\r' returns to line start)
        processed += 1
        print('Running: ' + str(processed) + ' | ' + idx + ' ' + summary, end='\r')

        # Check for max reached
        if maxFiles is not None and processed >= maxFiles:
            break

    # Nice work!
    print('\nDone!')
    return df

# Export data frame to CSV
def saveDfToCsv(df):
    """Write ``df`` to a timestamped CSV file and print the path that was used.

    The path is built from the notebook-level variables ``targetFolder``,
    ``season`` and ``tab`` (they must be defined before calling) plus the
    current local time formatted as YYYYMMDD-HHMM.
    """
    timestamp = time.strftime("%Y%m%d-%H%M")
    nameParts = [targetFolder, 'eredivisie', season, '-', timestamp, '-', tab, '.csv']
    csvFilename = ''.join(nameParts)
    df.to_csv(csvFilename)
    print('Data frame saved as: ' + csvFilename)
    
# Get the date and time
def getDateTime(soup):
    """Return the text of the <td id="utime"> cell of the page.

    soup: parsed match page.
    """
    timeCell = soup.select_one('td#utime')
    return timeCell.string

# Get the summary of the match. This holds home team, score and away team
def getSummary(soup):
    """Return the part of the page title in front of the first '|'.

    soup: parsed match page.
    The text is returned as is; surrounding whitespace is not stripped.
    """
    pageTitle = soup.title.string
    summary, _, _ = pageTitle.partition('|')
    return summary

# Get all the names listed in the table column of one team
# Includes starting line-up, substitutes, missing players and trainers
def getPlayers(soup, team):
    """Return the names listed for ``team`` on the page.

    soup: parsed match page.
    team: 'home' or 'away' (case-insensitive); home is read from the cells
        with class 'fl', away from the cells with class 'fr'.
    Returns a list with the first content node of every matching link.
    Raises Exception when ``team`` is neither 'home' nor 'away'.
    """
    # Map the team onto the CSS class of its column
    columnClassByTeam = {'home': 'fl', 'away': 'fr'}
    columnClass = columnClassByTeam.get(team.lower())
    if columnClass is None:
        raise Exception('Choose home or away team')

    # All name links in the column of the chosen team
    links = soup.select('td.' + columnClass + ' > .name > a')

    # Keep only contents[0] so the keeper/captain span is ignored
    names = []
    for link in links:
        names.append(link.contents[0])
    return names

## Extract data from the HTML files into a CSV file (using a data frame)

In [4]:
# Configuration for this run. NOTE: season, tab and targetFolder are also read
# as notebook-level variables by saveDfToCsv(), so they must keep these names.
season = '1718'        # E.g. 1718
#tab = 'formations'      # Opstellingen
tab = 'stats'          # Wedstrijdstatistieken

# Source folder with the crawled HTML files and target folder for the CSV,
# both relative to the working directory of the notebook
folder = season + '/source-' + tab + '-' + season + '/'
targetFolder = season + '/'
print('Source folder: ' + folder)
#print('Target folder: ' + targetFolder)

# Get all filenames in folder
allFiles = listAllFilesInFolder(folder)

# Parse every file into one data frame row (see scrapeFiles for the columns)
df = scrapeFiles(folder, allFiles, tab)

# Show the row count and the first 5 rows; the bare expression is displayed
# although it is not the last line because ast_node_interactivity is "all"
print('Number of rows is data frame: ' + str(len(df)))
df[:5]

# Save the data frame as CSV (file name is derived from targetFolder, season and tab)
saveDfToCsv(df)

Source folder: 1718/source-stats-1718/
Files found:  81
Running: 81 | zRIYHJ57 HER 1-1 VIT 
Done!
Number of rows is data frame: 81


Unnamed: 0,datetime,summary,balbezitHome,balbezitAway,doelpogingenHome,doelpogingenAway,cornersHome,cornersAway,vrijeTrappenHome,vrijeTrappenAway,...,buitenspelHome,buitenspelAway,reddingenDoelmanHome,reddingenDoelmanAway,overtredingenHome,overtredingenAway,geleKaartenHome,geleKaartenAway,rodeKaartenHome,rodeKaartenAway
08QLKLLl,14.10.2017 18:30,AJA 4-0 SPA,62%,38%,25,6,8,1,20,19,...,3,3,2,10,16,18,1,1,,
2DrZTPb2,01.10.2017 16:45,SPA 1-2 ROD,61%,39%,21,5,12,5,15,15,...,1,1,0,7,15,15,1,3,,
2FyC4UJF,12.08.2017 18:30,VVV 3-0 SPA,40%,60%,12,8,3,5,16,18,...,2,1,2,3,15,15,0,1,0.0,1.0
2iZ1h0sK,22.10.2017 14:30,AZ 3-0 UTR,46%,54%,14,10,5,5,15,16,...,0,0,3,3,16,14,3,2,,
2mD1BxXa,27.08.2017 12:30,FEY 5-0 WIL,69%,31%,16,4,11,1,11,11,...,1,2,1,3,9,10,0,1,,


Data frame saved as: 1718/eredivisie1718-20171027-1255-stats.csv
