In [1]:
# DDL that (re)creates the player-stats table in MySQL. Every row needs a match and a player
# identifier; row_nk is the primary key that uniquely identifies one player in one match.
# Both statements name the table as scraping.p_stats so the drop and the create always hit
# the same table, whatever the connection's default schema is.
sql = """drop table if exists scraping.p_stats;
        create table scraping.p_stats( row_nk varchar(50) not null
                            , player_id bigint(7) not null
                            , match_id varchar(20) not null
                            , match_date varchar(20)
                            , name varchar(200)
                            , number varchar(10)
                            , position varchar(10)
                            , captain varchar(10)
                            , subbed varchar(10)
                            , homeAway varchar(10)
                            , subToolTip varchar(10)
                            , onPitch varchar(10)
                            , wasActive varchar(10)
                            , tries bigint(7)
                            , tryassists bigint(7)
                            , points bigint(7)
                            , kicks bigint(7)
                            , passes bigint(7)
                            , runs bigint(7)
                            , metres bigint(7)
                            , cleanbreaks bigint(7)
                            , defendersbeaten bigint(7)
                            , offload bigint(7)
                            , lineoutwonsteal bigint(7)
                            , turnoversconceded bigint(7)
                            , tackles bigint(7)
                            , missedtackles bigint(7)
                            , lineoutswon bigint(7)
                            , penaltiesconceded bigint(7)
                            , yellowcards bigint(7)
                            , redcards bigint(7)
                            , penalties bigint(7)
                            , penaltygoals bigint(7)
                            , conversiongoals bigint(7)
                            , dropgoalsconverted bigint(7)
                            , primary key(row_nk));"""

# Collapse the layout whitespace (newlines and indentation) into single spaces;
# the string contains no quoted literals, so this cannot change its meaning.
sql = " ".join(sql.split())

In [2]:
# Run this first: it opens the MySQL connection, then drops and re-creates p_stats so that
# more data can be loaded into an empty table.

import os

import pymysql

# Connection settings are read from the environment so credentials do not have to live in
# the notebook; the fallbacks reproduce the original local setup (root, empty password).
conn = pymysql.connect(host=os.environ.get('MYSQL_HOST', '127.0.0.1'),
                       port=int(os.environ.get('MYSQL_PORT', '3306')),
                       user=os.environ.get('MYSQL_USER', 'root'),
                       password=os.environ.get('MYSQL_PASSWORD', ''),
                       database='mysql')

cur = conn.cursor()
cur.execute("USE scraping")

# `sql` holds several ';'-separated statements. Send them one at a time: a single execute()
# of the whole string only works when the connection has multi-statement queries enabled
# (CLIENT.MULTI_STATEMENTS). The DDL contains no quoted literals, so splitting on ';' is safe.
for statement in sql.split(';'):
    if statement.strip():
        cur.execute(statement)

0

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
import datetime
import random
import re
import pprint
import json


# These functions are nearly identical and differ only in the URL prefix they put in front of
# their argument; they are kept separate for clarity rather than merged into one parameterised function

# Get the links for each match on a given date
def getLinks (articleUrl):
    """Fetch the fixtures page for one date and return its inline script tags.

    articleUrl is appended to "http://www.espn.co.uk/rugby/fixtures/_/date/"; the caller
    in this notebook passes date strings such as '20170304'.

    Returns every <script type="text/javascript"> tag found inside
    <section id="pane-main">.

    Raises ValueError if the page contains no such section.
    """
    # The context manager closes the HTTP response once the document has been parsed
    with urlopen("http://www.espn.co.uk/rugby/fixtures/_/date/" + articleUrl) as html:
        bsObj = bs(html, "lxml")
    pane = bsObj.find("section", {"id": "pane-main"})
    if pane is None:
        raise ValueError("no <section id='pane-main'> found in fixtures page for " + articleUrl)
    return pane.find_all("script", {"type": "text/javascript"})

# New function, very similar to the above, to extract the stats links given the above
def getStatsLinks (articleUrl):
    """Fetch a match page and return its inline script tags.

    articleUrl is a site-relative link appended to "http://www.espn.co.uk"; the caller
    in this notebook passes the result links collected from the fixtures page.

    Returns every <script type="text/javascript"> tag found inside
    <section id="pane-main">.

    Raises ValueError if the page contains no such section.
    """
    # The context manager closes the HTTP response once the document has been parsed
    with urlopen("http://www.espn.co.uk" + articleUrl) as html:
        bsObj = bs(html, "lxml")
    pane = bsObj.find("section", {"id": "pane-main"})
    if pane is None:
        raise ValueError("no <section id='pane-main'> found in page " + articleUrl)
    return pane.find_all("script", {"type": "text/javascript"})

# Extract the script tags containing the JSON with the actual stats data
def getStats(statsUrl):
    """Fetch a stats page and return its inline script tags.

    statsUrl is a site-relative link appended to "http://www.espn.co.uk"; the caller
    in this notebook passes the 'playerstats' link of a match.

    Returns every <script type="text/javascript"> tag found inside
    <section id="pane-main">. The stats JSON itself still has to be extracted
    from those tags by the caller.

    Raises ValueError if the page contains no such section.
    """
    # The context manager closes the HTTP response once the document has been parsed
    with urlopen("http://www.espn.co.uk" + statsUrl) as html:
        bsObj = bs(html, "lxml")
    pane = bsObj.find("section", {"id": "pane-main"})
    if pane is None:
        raise ValueError("no <section id='pane-main'> found in page " + statsUrl)
    return pane.find_all("script", {"type": "text/javascript"})

In [4]:
# Load the player stats of every completed match on the given dates into p_stats.
# For each date: read the fixtures page, collect the result link of each completed match
# (the list 'matches'), follow each match's 'playerstats' link, and insert one row per
# player listed in the line-up. Run the connection cell first: this cell closes the
# connection when it finishes or fails.

teams, players = ['home', 'away'], ['team', 'reserves']
dates = ['20170304', '20170305']

# Captures the JSON object assigned to window.__INITIAL_STATE__ inside a script tag
INITIAL_STATE_RE = re.compile(r'^\s*window\.__INITIAL_STATE__\s*=\s*({.*?})\s*;\s*$',
                              flags=re.DOTALL | re.MULTILINE)
# Captures the digits that follow 'gameId=' in a match link
GAME_ID_RE = re.compile(r'gameId=([0-9]+)')
# Only plain identifiers may be spliced into the INSERT statement as column names
COLUMN_NAME_RE = re.compile(r'^[A-Za-z_][A-Za-z0-9_]*$')
# Keys of a player dict that are not loaded: url is useless and eventTimes does not
# conform to the required structure
SKIPPED_KEYS = ('eventTimes', 'url')


def _initial_states(script_tags):
    """Decode window.__INITIAL_STATE__ from every script tag that assigns it.

    json.loads turns each JSON object into a Python dict. Tags that do not match
    (or whose ``.string`` is None) are skipped instead of raising.
    """
    states = []
    for tag in script_tags:
        found = INITIAL_STATE_RE.search(tag.string or '')
        if found:
            states.append(json.loads(found.group(1)))
    return states


def _build_insert(row):
    """Return ``(statement, values)`` that inserts one player dict into p_stats.

    The dict keys (minus SKIPPED_KEYS) become the column names. Values that are
    themselves dicts, e.g. {'name': 'Kicks', 'value': '0'}, are reduced to their 'value'.

    Raises ValueError if a key is not a plain identifier: column names cannot be
    passed as query parameters, so they must be checked before being spliced in.
    """
    cols = [key for key in row if key not in SKIPPED_KEYS]
    for col in cols:
        if not COLUMN_NAME_RE.match(col):
            raise ValueError("unsafe column name in scraped data: %r" % (col,))
    vals = [row[col]['value'] if isinstance(row[col], dict) else row[col] for col in cols]
    columns = ', '.join('`%s`' % col for col in cols)
    placeholders = ', '.join(['%s'] * len(cols))
    return "insert into p_stats (%s) values (%s)" % (columns, placeholders), vals


# One cursor for the whole load; cursor and connection are released even if a page fails
cur = conn.cursor()
try:
    cur.execute("USE scraping")

    for date in dates:
        schedule_states = _initial_states(getLinks(date))
        if not schedule_states:
            print("No schedule data found for %s - skipping" % date)
            continue
        # As before, the last script tag that carries the state is the one that is used
        schedule = schedule_states[-1]

        json_match = schedule['schedule']['groups']
        matches = [c['result']['href'] for m in json_match for c in m['complete']]

        # Iterate through all the matches on a given day and pull the data
        for match_link in matches:
            # findall returns a list; its first element is the match ID
            match_id = GAME_ID_RE.findall(match_link)
            if not match_id:
                print("No gameId in link %s - skipping" % match_link)
                continue

            json_out = _initial_states(getStatsLinks(match_link))
            if not json_out:
                print("No match data found for %s - skipping" % match_link)
                continue

            # Reset per match so a missing link can never reuse the previous match's link
            match_s = player_s = None
            for j in json_out[0]['gamePackage']['links']:
                if j['pageType'] == 'matchstats':
                    match_s = j['href']
                elif j['pageType'] == 'playerstats':
                    player_s = j['href']
            if player_s is None:
                print("No playerstats link for %s - skipping" % match_link)
                continue

            # Parse the script tags of the player-stats page itself
            stat_out = _initial_states(getStats(player_s))
            if not stat_out:
                print("No player stats found for %s - skipping" % match_link)
                continue

            lineup = stat_out[0]["gamePackage"]["matchLineUp"]
            for team in teams:
                for squad in players:
                    data = lineup[team][squad]

                    for dct in data:
                        # The player dict is extended in place, so `data` keeps these keys
                        dct['match_date'] = date
                        dct['match_id'] = match_id[0]
                        if 'id' in dct:
                            dct['player_id'] = dct.pop('id')  # rename 'id' to 'player_id'
                        if 'player_id' not in dct:
                            print("Player entry without an id in %s - skipping" % match_link)
                            continue
                        dct['row_nk'] = str(dct['player_id']) + dct['match_id']

                        sql_update, vals = _build_insert(dct)
                        cur.execute(sql_update, vals)

            # Commit once per match rather than once per row
            conn.commit()
finally:
    cur.close()
    conn.close()

['290088', '0', '0', '3', '0', '2', '', '0', 'Tom Homer', '82104', '0', '20170304', '2', '0', '0', True, '15', '0', '0', False, '15', '0', False, 1, 'FB', '6', 'home', '1', False, '82104290088', '1', '0', '6', '0', '0']
['290088', '1', '0', '0', '0', '5', '', '0', 'Semesa Rokoduguni', '172319', '2', '20170304', '1', '3', '0', True, '14', '0', '0', True, '55', '0', False, '0', 'W', '2', 'home', '1', False, '172319290088', '0', '1', '9', '0', '0']
['290088', '1', '0', '0', '0', '5', '', '0', 'Max Clark', '246113', '0', '20170304', '1', '1', '0', True, '13', '0', '0', True, '8', '0', False, '0', 'C', '5', 'home', '0', False, '246113290088', '0', '0', '6', '0', '1']
['290088', '0', '0', '0', '0', '7', '', '0', 'Ben Tapuai', '96363', '2', '20170304', '9', '1', '0', True, '12', '0', '0', True, '41', '0', False, '0', 'C', '12', 'home', '3', False, '96363290088', '0', '1', '8', '0', '0']
['290088', '0', '0', '0', '0', '6', '', '0', 'Anthony Watson', '149315', '0', '20170304', '2', '3', '0', Tr

In [32]:
# List the section names available under 'gamePackage', one per line
game_package = stat_out[0]['gamePackage']
for s in game_package:
    print(s)

gameStrip
gameState
meta
headToHead
gameStateClass
matchSummary
news
leagueUid
matchCommentary
polling
matchAttacking
analytics
matchHomeForm
matchAwayForm
matchStats
links
loading
matchGlossary
matchLineUp
matchDefending
showGameDetailFooter
article
matchEvents
matchDiscipline
commentaryFeedback
matchConversation
matchDetails
standings
HeadToHeadNode


In [41]:
# Next step: locate the match-level details (home and away team, date) so they can be included
from pprint import pprint
game_strip = stat_out[0]['gamePackage']['gameStrip']
pprint(game_strip)

{'date': '04/03',
 'gameState': 'final',
 'header': 'Super Rugby 2017',
 'network': '',
 'target': '',
 'teams': {'away': {'abbrev': 'CRUS',
                    'color': 'FF0000',
                    'href': '/rugby/team/_/id/25936/crusaders',
                    'id': '25936',
                    'logo': 'http://a1.espncdn.com/combiner/i?img=/i/teamlogos/rugby/teams/500/25936.png&h=42&w=42',
                    'name': 'Crusaders',
                    'overDetails': '',
                    'record': '',
                    'runDetails': '30',
                    'score': '30',
                    'scoreMarkup': '30',
                    'scoreMarkupMobile': '30',
                    'trackingName': '&lpos=rugby:game:game:clubhouse:team',
                    'uid': 's:300~t:25936',
                    'winner': True},
           'home': {'abbrev': 'HLAND',
                    'color': '000099',
                    'href': '/rugby/team/_/id/25938/highlanders',
                    'id': 

In [21]:
# List the keys available under 'gameStrip', one per line
game_strip = stat_out[0]['gamePackage']['gameStrip']
for g in game_strip:
    print(g)

winnerClass
network
header
winnerClassMobile
target
teams
time
gameState
date


In [31]:
# Scratch check: a new dict entry can be built by concatenating two existing string values
tst = {1: 'a', 2: 'b'}
tst.update({3: tst[1] + tst[2]})
print(tst)

{1: 'a', 2: 'b', 3: 'ab'}


In [49]:
# Bind both `d` and `dct` to the final entry of `data`; nothing changes when it is empty
if data:
    d = dct = data[-1]

{'captain': False,
 'cleanbreaks': {'name': 'Clean Breaks', 'value': '2'},
 'conversiongoals': {'name': 'Conversion Goals', 'value': '0'},
 'defendersbeaten': {'name': 'Defenders Beaten', 'value': '2'},
 'dropgoalsconverted': {'name': 'Drop Goals Converted', 'value': '0'},
 'eventTimes': {},
 'homeAway': 'home',
 'kicks': {'name': 'Kicks', 'value': '0'},
 'lineoutswon': {'name': 'Lineouts Won', 'value': '0'},
 'lineoutwonsteal': {'name': 'Lineout Won Steal', 'value': '0'},
 'match_date': '20170304',
 'match_id': ['290088'],
 'metres': {'name': 'Metres Run', 'value': '61'},
 'missedtackles': {'name': 'Missed Tackles', 'value': '2'},
 'name': 'Taulupe Faletau',
 'number': '8',
 'offload': {'name': 'Offload', 'value': '0'},
 'onPitch': True,
 'passes': {'name': 'Passes', 'value': '8'},
 'penalties': {'name': 'penalties', 'value': '0'},
 'penaltiesconceded': {'name': 'Penalties Conceded', 'value': '1'},
 'penaltygoals': {'name': 'Penalty Goals', 'value': '0'},
 'player_id': '104729',
 'poi