**Outline:**

The purpose of this notebook is to scrape the ESPN website for international rugby player statistics. The script could easily be adapted for other sports, or for statistics about the games themselves, but I have not tested that: all I want at this point are stats for players featuring in the Six Nations Championship.

In [2]:
import time, bs4, requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
import re

from pprint import pprint
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = [12, 12]

The numbers below are the IDs that the ESPN system uses to identify the Six Nations teams.

Finding other teams should be as simple as going to their main page and checking the address bar.

In [3]:
# ESPN team identifiers, stored as strings.
# A results-page URL is built further down by concatenating one of these ids
# (see teamNo), which is why they are not integers.
england = "1"
scotland = "2"
ireland = "3"
wales = "4"
france = "9"
italy = "20"
lions = "32" #2017

First we go to the page summarising all the games that a team has played in a given year. 

We use a webdriver and Beautiful Soup to strip this table to provide us with the date of the match, teams that played, the score and a link to the report on the game where we can find more detailed stats.

In [62]:
# Set your team (an ESPN team id, e.g. "20" is the value of `italy` above) and season
teamNo = "20"
year = "2015"

teamSeason ="http://www.espn.co.uk/rugby/results/_/team/"+teamNo+"/season/"+year

browser = webdriver.Chrome(r"C:\Users\Maurice\Desktop\Python\chromedriver_win32\chromedriver.exe")
try:
    browser.get(teamSeason)
    seasonHTML = browser.page_source
finally:
    # quit() (rather than close()) also shuts down the chromedriver process,
    # and the finally clause guarantees that happens even if the page load fails.
    browser.quit()

# Use beautiful soup to search the HTML for the main table containing the results
seasonSoup = bs4.BeautifulSoup(seasonHTML, "html.parser")
schedule = seasonSoup.find("div", {"id": "sched-container"})
if schedule is None:
    # find() returns None when the container is missing; fail with a message
    # that names the page instead of an AttributeError on the next line.
    raise RuntimeError("No 'sched-container' element found on " + teamSeason)
tables = schedule.select('table')

# Function takes a table and converts it into an array of data.
def makeList(table):
    """Turn a parsed HTML table into nested lists of its text fragments.

    Returns one list per 'tr' found in ``table``; each of those holds one list
    per 'td' in that row, containing the text nodes found inside the cell.
    A row without any 'td' cells therefore becomes an empty list.
    """
    rows = []
    for tr in table.findAll('tr'):
        cells = []
        for td in tr.findAll('td'):
            # keep the individual text nodes rather than joining them
            cells.append(list(td.findAll(text=True)))
        rows.append(cells)
    return rows

# We use makeList in the function below to gather the fixture rows of every table
def teamFixtures(tableSet):
    """Collect the rows of several tables into one flat list.

    Every table in ``tableSet`` is converted with makeList, its first row is
    skipped, and the remaining rows of all tables are returned in order as a
    single list.
    """
    return [row for table in tableSet for row in makeList(table)[1:]]

# Flatten the tables scraped from the results page into one list of fixture rows
arrays = teamFixtures(tables)

# Three-letter month abbreviations mapped to two-digit month numbers.
# All twelve months are listed so that fixtures in any month can be converted.
monthDict = {
    "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
    "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
    "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
}


# Below function creates a date format that is useable in a Pandas dataframe
def dateToNum(date, season=None):
    """Convert a date string of the form "<anything>, Mon D" into "YYYYMMDD".

    Parameters:
        date: text such as "Sun, Oct 11"; everything before the first ", " is
            ignored and the remainder must be "<month abbreviation> <day>".
        season: the four-digit year to prefix. When omitted, the notebook-level
            ``year`` variable is used.

    Returns:
        A string made of the year, the two-digit month and the zero-padded day.

    Raises:
        KeyError: if the month abbreviation is not a key of ``monthDict``.
    """
    if season is None:
        season = year
    monDay = date.split(", ")[1]
    mon, day = monDay.split(" ")
    return str(season) + monthDict[mon] + day.zfill(2)

# We don't want all the information from the arrays table, just the date, teams and score.
# Each entry becomes [date as YYYYMMDD, first team, second team, score].
namesScore = [
    [dateToNum(m[0][0]), m[1][1], m[2][1], m[1][2]]
    for m in arrays
]

# We also search through the table for links to the more detailed Full Time report
links = [link['href'] for link in schedule.findAll('a', href=True, text='FT')]

# Some games only have a brief summary and no stats so we'll ditch those from our final list.
# duds / goodLinks hold the positions of the links without / with 'report' in the address.
duds = [i for i, href in enumerate(links) if 'report' not in href]
goodLinks = [i for i, href in enumerate(links) if 'report' in href]

# We're not going to follow the link to the full report. We just want the last segment which identifies the game.
# A link without a query string yields "" instead of raising IndexError.
links = [href.split("?")[1] if "?" in href else "" for href in links]

# Pair each fixture with its link once, then keep only the pairs with a full report.
# NOTE(review): this assumes the fixture rows and the 'FT' links line up one-to-one - confirm.
fixtureLinks = list(zip(namesScore, links))
full_reports = [fixtureLinks[i] for i in goodLinks]
pprint(full_reports)

[(['20151011', 'ROM', 'ITALY', '22 - 32'], 'gameId=182008&league=164205'),
 (['20151004', 'IRE', 'ITALY', '16 - 9'], 'gameId=181996&league=164205'),
 (['20150926', 'ITALY', 'CAN', '23 - 18'], 'gameId=181984&league=164205'),
 (['20150919', 'FRA', 'ITALY', '32 - 10'], 'gameId=181972&league=164205'),
 (['20150905', 'WALES', 'ITALY', '23 - 19'], 'gameId=252323&league=252321'),
 (['20150829', 'SCOT', 'ITALY', '48 - 7'], 'gameId=263333&league=252321'),
 (['20150823', 'ITALY', 'SCOT', '12 - 16'], 'gameId=263325&league=248937'),
 (['20150321', 'ITALY', 'WALES', '20 - 61'], 'gameId=180691&league=180659'),
 (['20150315', 'ITALY', 'FRA', '0 - 29'], 'gameId=180690&league=180659'),
 (['20150228', 'SCOT', 'ITALY', '19 - 22'], 'gameId=180685&league=180659'),
 (['20150214', 'ENG', 'ITALY', '47 - 17'], 'gameId=180682&league=180659'),
 (['20150207', 'ITALY', 'IRE', '3 - 26'], 'gameId=180680&league=180659')]


In [63]:
# Pick one match from full_reports to try the scraping functions on, and build
# the address of its player-stats page from the stored query string.
testGame = full_reports[1]
testPage = "http://www.espn.co.uk/rugby/playerstats?"+testGame[1]

# scrapePage (below) drives this module-level `browser` object.
# NOTE(review): the scrapePage call at the bottom of this cell is commented out,
# so the browser started here is never closed by this cell.
browser = webdriver.Chrome(r"C:\Users\Maurice\Desktop\Python\chromedriver_win32\chromedriver.exe")

# Text of the four tabs that scrapePage clicks, in the order they are visited.
tabs = ["Scoring", "Attacking", "Defending", "Discipline"]

#Function goes to an ESPN page of player stats for a game and cycles through the tabs of stats.
#It returns a full page of HTML for each tab it managed to open.
def scrapePage(address, max_attempts=5):
    """Load ``address`` in the module-level ``browser`` and capture one page per tab.

    The page is loaded, any element whose text contains 'Yes' is clicked
    (presumably a consent / confirmation prompt - TODO confirm), and then each
    entry of ``tabs`` is clicked in turn, storing ``browser.page_source`` after
    every click. Cycling stops at the first tab that cannot be found.

    Parameters:
        address: URL of the player-stats page.
        max_attempts: how many times to reload the page when not a single tab
            could be found, before giving up.

    Returns:
        A list of HTML strings, one per tab opened (possibly fewer than
        ``len(tabs)`` if a later tab was missing).

    Raises:
        RuntimeError: if no tab could be opened within ``max_attempts`` loads.

    The browser is always shut down before returning or raising.
    """
    try:
        for _ in range(max_attempts):
            browser.get(address)
            try:
                browser.find_element_by_xpath("//*[contains(text(),'Yes')]").click()
            except (NoSuchElementException, ElementNotVisibleException):
                # The prompt is absent or hidden: nothing to dismiss, carry on.
                pass
            pages = []
            for tab in tabs:
                try:
                    browser.find_element_by_xpath("//*[contains(text(),'"+tab+"')]").click()
                except NoSuchElementException:
                    break
                pages.append(browser.page_source)
            if pages:
                return pages
        raise RuntimeError(
            "No stats tabs found at " + address + " after " + str(max_attempts) + " attempts"
        )
    finally:
        # quit() closes the window and also stops the driver process.
        browser.quit()

#pageSet = scrapePage(testPage)
#print(len(pageSet))

In [64]:
# Function takes the HTML of a set of pages and returns the first two tables on each.
# We already know these are the player stats.
def gettables(sources):
    """Parse each HTML string in ``sources`` and keep its first two tables.

    Returns a list with one entry per page; each entry is the slice ``[0:2]``
    of the 'table' elements selected from that page.
    """
    return [
        bs4.BeautifulSoup(html, "html.parser").select('table')[0:2]
        for html in sources
    ]

#tables = gettables(pageSet)

# Function takes a table and converts it into an array of data.
# (Same behaviour as the makeList defined in the results-page cell.)
def makeList(table):
    """Turn a parsed HTML table into nested lists of its text fragments.

    Returns one list per 'tr' found in ``table``; each of those holds one list
    per 'td' in that row, containing the text nodes found inside the cell.
    A row without any 'td' cells therefore becomes an empty list.
    """
    rows = []
    for tr in table.findAll('tr'):
        cells = []
        for td in tr.findAll('td'):
            # keep the individual text nodes rather than joining them
            cells.append(list(td.findAll(text=True)))
        rows.append(cells)
    return rows

# We use makeList in the function below to organise the stats by team
# Function returns a list with two sublists
def teamTables(tableSet):
    """Split per-page table pairs into one list of tables per team.

    Each item of ``tableSet`` must hold two tables. Both are converted with
    makeList; the first tables of every item are gathered in one list and the
    second tables in another.

    Returns ``[team1, team2]`` where each is a list with one converted table
    per item of ``tableSet``.
    """
    per_page = [(makeList(pair[0]), makeList(pair[1])) for pair in tableSet]
    team1 = [first for first, _ in per_page]
    team2 = [second for _, second in per_page]
    return [team1, team2]

#tablePair = teamTables(tables)

# Due to an issue with the text scraping this function is required
# It runs over the arrays and splits the player names from their positions
def nameSplit(teamStats):
    """Flatten every row of every table into a single list of strings.

    The first cell of a row supplies the first two output values (name and
    position). When that cell holds only one string, "R" is used as the
    position (presumably marking a replacement - TODO confirm). The strings
    of all remaining cells follow in order. Empty rows are passed through
    untouched.

    Returns a new list of tables; the input tables are not modified.
    """
    result = []
    for table in teamStats:
        flattened = []
        for row in table:
            if len(row) == 0:
                flattened.append(row)
                continue
            name_cell = row[0]
            position = name_cell[1] if len(name_cell) > 1 else "R"
            stats = [text for cell in row[1:] for text in cell]
            flattened.append([name_cell[0], position] + stats)
        result.append(flattened)
    return result

#team1 = nameSplit(tablePair[0])

# Column names for each stats table, one list per tab.
scoringHeaders = [
    "Name", "Position", "Try", "Try Assist",
    "Conversion", "Penalty", "Drop Goal", "Points",
]
attackingHeaders = [
    "Name", "Position", "Blank", "Passes", "Runs", "Meters Run",
    "Clean Breaks", "Defenders Beaten", "Offloads", "Blank",
]
defendingHeaders = [
    "Name", "Position", "Turnovers Conceeded",
    "Tackles", "Missed Tackles", "Lineouts Won",
]
disciplineHeaders = [
    "Name", "Position", "Penalties", "Yellow Cards", "Red Cards",
]

# Same order as the tabs list: scoring, attacking, defending, discipline.
headers = [scoringHeaders, attackingHeaders, defendingHeaders, disciplineHeaders]


# Function adds headers to each table of a team's table set
def addHeaders(tableset):
    """Overwrite the first row of each table with the matching header list.

    The table at position ``n`` of ``tableset`` receives ``headers[n]``.
    The tables are modified in place and nothing is returned.
    """
    for position, stats_table in enumerate(tableset):
        stats_table[0] = headers[position]

#addHeaders(team1)
#addHeaders(team2)
#pprint(team1)


In [65]:
import pandas as pd
from functools import reduce

# Function takes one of our arrays of player stats and makes a dataframe
def tableToDF(table):
    """Build a DataFrame from a list of rows whose first row holds the column names.

    The first row is promoted to the column labels and then removed from the
    data, so the returned frame keeps the original row labels starting at 1.
    """
    frame = pd.DataFrame(table)
    frame.columns = frame.iloc[0]
    return frame.drop(index=0)


# Function takes all our tables for a team, converts to DF and merge them on the name and position
# Function also converts all numeric columns to numbers and drops some blank columns
def tablesToDFs(tables):
    """Merge a team's stats tables into one DataFrame with numeric stat columns.

    Parameters:
        tables: list of row lists, each starting with a header row that
            contains at least 'Name' and 'Position'.

    Returns:
        A DataFrame joined on ['Name', 'Position'] (pandas' default inner
        join), without any 'Blank' columns, and with every other column
        passed through ``pd.to_numeric(..., errors='coerce')`` so that
        unparseable values become NaN.

    Columns named 'Blank' are dropped only if present, so a table set that
    does not include any such column no longer raises KeyError.
    """
    dfs = [tableToDF(t) for t in tables]
    df_final = reduce(lambda left, right: pd.merge(left, right, on=['Name', 'Position']), dfs)
    # Drop the placeholder columns first; errors='ignore' tolerates their absence.
    df_final = df_final.drop(columns=['Blank'], errors='ignore')
    stat_cols = [c for c in df_final.columns if c not in ('Name', 'Position')]
    for col in stat_cols:
        df_final[col] = df_final[col].apply(pd.to_numeric, errors='coerce')
    return df_final

#teamDF1 = tablesToDFs(team1)

#addCols = testGame[0]
#print(addCols)

def homeTeamsData(teamDF, matchInfo=None):
    """Add match-level columns to the DataFrame of the first-listed (home) team.

    Parameters:
        teamDF: DataFrame of player stats; it is modified in place.
        matchInfo: sequence ``[date, home team, away team, score]`` where the
            score looks like "16 - 9". When omitted, the notebook-level
            ``addCols`` variable is used.

    The columns 'Team', 'Opposition', 'Points For', 'Points Against',
    'Home/Away' and 'Date' are set to the same value on every row. The points
    are stored as the strings taken from the score text. Returns None.
    """
    info = addCols if matchInfo is None else matchInfo
    points = info[3].split(" ")  # "16 - 9" -> ["16", "-", "9"]
    teamDF["Team"] = info[1]
    teamDF["Opposition"] = info[2]
    teamDF["Points For"] = points[0]
    teamDF["Points Against"] = points[2]
    teamDF["Home/Away"] = "Home"
    teamDF["Date"] = info[0]

def awayTeamsData(teamDF, matchInfo=None):
    """Add match-level columns to the DataFrame of the second-listed (away) team.

    Parameters:
        teamDF: DataFrame of player stats; it is modified in place.
        matchInfo: sequence ``[date, home team, away team, score]`` where the
            score looks like "16 - 9". When omitted, the notebook-level
            ``addCols`` variable is used.

    The columns 'Team', 'Opposition', 'Points For', 'Points Against',
    'Home/Away' and 'Date' are set to the same value on every row, with the
    team names and points swapped relative to the home side. The points are
    stored as the strings taken from the score text. Returns None.
    """
    info = addCols if matchInfo is None else matchInfo
    points = info[3].split(" ")  # "16 - 9" -> ["16", "-", "9"]
    teamDF["Team"] = info[2]
    teamDF["Opposition"] = info[1]
    teamDF["Points For"] = points[2]
    teamDF["Points Against"] = points[0]
    teamDF["Home/Away"] = "Away"
    teamDF["Date"] = info[0]

#homeTeamsData(teamDF1)

#teamDF1.to_csv('testFile.csv')

#print(teamDF1.head(3))

In [66]:
# Show the (match details, query string) pairs held in full_reports.
pprint(full_reports)

[(['20151011', 'ROM', 'ITALY', '22 - 32'], 'gameId=182008&league=164205'),
 (['20151004', 'IRE', 'ITALY', '16 - 9'], 'gameId=181996&league=164205'),
 (['20150926', 'ITALY', 'CAN', '23 - 18'], 'gameId=181984&league=164205'),
 (['20150919', 'FRA', 'ITALY', '32 - 10'], 'gameId=181972&league=164205'),
 (['20150905', 'WALES', 'ITALY', '23 - 19'], 'gameId=252323&league=252321'),
 (['20150829', 'SCOT', 'ITALY', '48 - 7'], 'gameId=263333&league=252321'),
 (['20150823', 'ITALY', 'SCOT', '12 - 16'], 'gameId=263325&league=248937'),
 (['20150321', 'ITALY', 'WALES', '20 - 61'], 'gameId=180691&league=180659'),
 (['20150315', 'ITALY', 'FRA', '0 - 29'], 'gameId=180690&league=180659'),
 (['20150228', 'SCOT', 'ITALY', '19 - 22'], 'gameId=180685&league=180659'),
 (['20150214', 'ENG', 'ITALY', '47 - 17'], 'gameId=180682&league=180659'),
 (['20150207', 'ITALY', 'IRE', '3 - 26'], 'gameId=180680&league=180659')]


The below cell takes all the above functions and puts them together to save CSVs of all the games a team has played in a given year.

In [68]:
# NOTE(review): the [1:] slice leaves out the first entry of full_reports, although the
# description says all games are saved - confirm whether skipping it is intended.
select_reports = full_reports[1:]

for m in select_reports:
    testPage = "http://www.espn.co.uk/rugby/playerstats?"+m[1]

    # scrapePage drives the module-level `browser`, so a fresh driver is bound for every match.
    browser = webdriver.Chrome(r"C:\Users\Maurice\Desktop\Python\chromedriver_win32\chromedriver.exe")
    try:
        pageSet = scrapePage(testPage)
    finally:
        # Make sure the driver is shut down even when scraping this match fails.
        browser.quit()

    # HTML pages -> first two tables per page -> one list of tables per team
    tables = gettables(pageSet)
    tablePair = teamTables(tables)

    # Build one merged DataFrame per team
    teamDFs = []
    for t in tablePair:
        team = nameSplit(t)
        addHeaders(team)
        teamDFs.append(tablesToDFs(team))

    # homeTeamsData / awayTeamsData read the match details from the global addCols
    addCols = m[0]
    homeTeamsData(teamDFs[0])
    awayTeamsData(teamDFs[1])

    # One CSV per match, named "<date> <home>-<away>.csv"
    result = pd.concat(teamDFs)
    result.to_csv(addCols[0]+" "+addCols[1]+"-"+addCols[2]+'.csv')
    