In [1]:
from urllib.request import urlopen
# Fetch the page and dump its raw bytes. The context manager closes the
# HTTP response once the body has been read, so the socket is not leaked.
with urlopen("http://pythonscraping.com/pages/page1.html") as html:
    print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


In [19]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Download the exercise page and parse it; the response is closed as soon as
# its body has been handed to the parser.
with urlopen("http://www.pythonscraping.com/exercises/exercise1.html") as html:
    bsObj = BeautifulSoup(html.read(), "lxml")
# Attribute access on the soup looks up the tag of that name in the document
print(bsObj.h1)

<h1>An Interesting Title</h1>


In [11]:
# Guard against tags that are not in the document: chaining a second tag
# lookup onto a lookup that found nothing raises AttributeError, while a
# single failed lookup simply comes back as None.
try:
    badContent = bsObj.nonExisting.anotherTag
except AttributeError:
    print("Tag not found")
else:
    print("Tag was not found" if badContent is None else badContent)

Tag not found


In [26]:
# Adding it all together
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
def getTitle(url):
    """Return the first <h1> tag of the page at `url`, or None.

    None is returned when the server answers with an HTTP error (after
    printing "Page not found") or when parsing raises AttributeError (after
    printing "No such tag"). The result can also be None when the page
    simply has no <h1>, so callers must check for None either way.
    """
    try:
        html = urlopen(url)
    except HTTPError as e:
        print ("Page not found")
        # Was `return none`, which raised NameError instead of returning
        return None
    try:
        bsObj = BeautifulSoup(html.read(), "lxml")
        title = bsObj.h1
    except AttributeError as e:
        print ("No such tag")
        return None
    finally:
        # Release the HTTP response whether or not parsing succeeded
        html.close()
    return title

title = getTitle("http://www.pythonscraping.com/exercises/exercise1.html")
# None signals that no title could be retrieved
print("Title couldn't be found" if title is None else title)

<h1>An Interesting Title</h1>


In [3]:
# Code to start at Kevin Bacon's wiki page and crawl to 10 random wiki pages
# based on links from that page
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
import datetime
import random
import re

# regex used in getLinks below: (?!:) is a negative lookahead, so the path after /wiki/ must NOT contain a colon
# Seed from the current time. On Python 3.11+ random.seed() only accepts
# None, int, float, str, bytes or bytearray, so pass the POSIX timestamp
# (a float) rather than the datetime object itself.
random.seed(datetime.datetime.now().timestamp())
def getLinks (articleUrl):
    """Return the article links found in the body of a Wikipedia page.

    `articleUrl` is a site-relative path (e.g. "/wiki/Kevin_Bacon") that is
    appended to http://en.wikipedia.org. The result holds every <a> inside
    the div with id "bodyContent" whose href starts with /wiki/ and contains
    no colon.
    """
    page = urlopen("http://en.wikipedia.org" + articleUrl)
    soup = bs(page, "lxml")
    body = soup.find("div", {"id":"bodyContent"})
    article_href = re.compile("^(/wiki/)((?!:).)*$")
    return body.findAll("a", href = article_href)

# Starting point of the crawl: the article links on the Kevin Bacon page
links = getLinks("/wiki/Kevin_Bacon")

In [7]:
# Random walk: hop to a randomly chosen article link ten times, printing the
# path of every article visited.
for i in range(10):
    if not links:
        # A page without any matching article link used to crash here,
        # because random.randint(0, -1) raises ValueError.
        print("No article links found - stopping the crawl")
        break
    newArticle = random.choice(links).attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)

/wiki/Republican_National_Convention
/wiki/California_Republican_Party
/wiki/Charles_Evans_Hughes
/wiki/2016_Republican_National_Convention
/wiki/Multi-level_marketing
/wiki/Statute
/wiki/Attorney_at_law
/wiki/Republic_of_Ireland
/wiki/List_of_sovereign_states_and_dependent_territories_by_population_density
/wiki/Belarus


In [1]:
# TEST - go to the scrum.com fixtures page and extract the results links. Info is in JSON format, so pull that 
# from the HTML then parse
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
import datetime
import random
import re
import pprint
import json


# Date segment appended to the fixtures URL by getLinks (looks like YYYYMMDD - TODO confirm).
# This works for pages that have a single match, probably not for multiple matches
date = '20170302'
def getLinks (articleUrl):
    """Return the inline JavaScript <script> tags of an ESPN rugby fixtures page.

    `articleUrl` is appended to http://www.espn.co.uk/rugby/fixtures/_/date/,
    so the callers in this notebook pass a date string such as '20170302'.
    Only <script type="text/javascript"> tags inside the element
    <section id="pane-main"> are returned.
    """
    html = urlopen("http://www.espn.co.uk/rugby/fixtures/_/date/" + articleUrl)
    bsObj = bs(html, "lxml")
    # A second, unreachable `return` used to follow this one; it was dead code.
    return bsObj.find("section", {"id":"pane-main"}).findAll("script", {"type":"text/javascript"})


tst = getLinks(date)

# Pull the JSON object assigned to window.__INITIAL_STATE__ out of the
# script tags. When several tags match, the last one wins (as before).
json_text = None
for t in tst:
    # t.string can be None and re.search returns None when the assignment is
    # absent; skip such tags instead of crashing on .group(1).
    state = re.search(r'^\s*window\.__INITIAL_STATE__\s*=\s*({.*?})\s*;\s*$',
                      t.string or '', flags=re.DOTALL | re.MULTILINE)
    if state is not None:
        json_text = state.group(1)

if json_text is None:
    raise ValueError("window.__INITIAL_STATE__ not found on the fixtures page for " + date)

json_out = json.loads(json_text)
# First completed game of the first group: link to its result page
match = json_out['schedule']['groups'][0]['complete'][0]['result']['href']
print(match)

/rugby/report?gameId=290778&league=242041


In [2]:
# Same lookup as in the previous cell, shown here as the cell's output value
json_out['schedule']['groups'][0]['complete'][0]['result']['href']

'/rugby/report?gameId=290778&league=242041'

In [3]:
# Testing a day with multiple matches
# TEST - go to the scrum.com fixtures page and extract the results links. Info is in JSON format, so pull that 
# from the HTML then parse
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
import datetime
import random
import re
import pprint


# Date of a fixtures page that lists several matches (this cell tests that case);
# the fetch-and-parse code below is the same as for the single-match page.
date = '20170304'
def getLinks (articleUrl):
    """Fetch an ESPN rugby fixtures page and return its inline script tags.

    `articleUrl` is the trailing URL segment (the notebook passes a date
    string). The tags returned are the <script type="text/javascript">
    elements found inside <section id="pane-main">.
    """
    page = urlopen("http://www.espn.co.uk/rugby/fixtures/_/date/" + articleUrl)
    soup = bs(page, "lxml")
    main_pane = soup.find("section", {"id":"pane-main"})
    return main_pane.findAll("script", {"type":"text/javascript"})

tst = getLinks(date)

# Pull the JSON object assigned to window.__INITIAL_STATE__ out of the
# script tags. When several tags match, the last one wins (as before).
json_text = None
for t in tst:
    # t.string can be None and re.search returns None when the assignment is
    # absent; skip such tags instead of crashing on .group(1).
    state = re.search(r'^\s*window\.__INITIAL_STATE__\s*=\s*({.*?})\s*;\s*$',
                      t.string or '', flags=re.DOTALL | re.MULTILINE)
    if state is not None:
        json_text = state.group(1)

if json_text is None:
    raise ValueError("window.__INITIAL_STATE__ not found on the fixtures page for " + date)

json_out = json.loads(json_text)


In [4]:
# Structure like so: json_out['schedule']['groups'][m]['complete'][n]['result']['href']
# m is the league in question and n is the game within that league, iterate through both for all results

json_match = json_out['schedule']['groups']

# One result link per completed game, league by league
matches = [game['result']['href']
           for league in json_match
           for game in league['complete']]

for href in matches:
    print(href)

/rugby/report?gameId=290088&league=267979
/rugby/report?gameId=290089&league=267979
/rugby/match?gameId=290364&league=270559
/rugby/match?gameId=290363&league=270559
/rugby/match?gameId=290366&league=270559
/rugby/match?gameId=290365&league=270559
/rugby/report?gameId=290524&league=270557
/rugby/report?gameId=290520&league=270557
/rugby/report?gameId=290523&league=270557
/rugby/report?gameId=290786&league=242041
/rugby/report?gameId=290785&league=242041
/rugby/report?gameId=290784&league=242041
/rugby/match?gameId=290783&league=242041
/rugby/report?gameId=290782&league=242041
/rugby/report?gameId=290781&league=242041


In [5]:
# Just pick a single match link to look at (the first href collected in `matches`)
match_link = matches[0]

# New function, very similar to the above, to extract the stats links
def getStatsLinks (articleUrl):
    """Fetch an ESPN page by site-relative path and return its inline script tags.

    `articleUrl` is appended to http://www.espn.co.uk. The tags returned are
    the <script type="text/javascript"> elements found inside
    <section id="pane-main">.
    """
    page = urlopen("http://www.espn.co.uk" + articleUrl)
    soup = bs(page, "lxml")
    main_pane = soup.find("section", {"id":"pane-main"})
    return main_pane.findAll("script", {"type":"text/javascript"})

# Saved to a list this time, but still only one element is expected
tst = getStatsLinks(match_link)
json_out = []
# The loop variable keeps the name `t` in case later cells rely on the
# leaked module-level variable.
for t in tst:
    state = re.search(r'^\s*window\.__INITIAL_STATE__\s*=\s*({.*?})\s*;\s*$',
                      t.string or '', flags=re.DOTALL | re.MULTILINE)
    if state is None:
        # No inline text or no __INITIAL_STATE__ assignment in this tag:
        # skip it instead of crashing on .group(1).
        continue
    json_text = state.group(1)
    json_out.append(json.loads(json_text))
    


In [6]:
# These are all the match-related links; we are interested in the match
# and player stats, as well as lineups and possibly the table
game_links = json_out[0]['gamePackage']['links']

# Remember the hrefs of the two stats pages
for link in game_links:
    page_type = link['pageType']
    if page_type == 'matchstats':
        match_s = link['href']
    elif page_type == 'playerstats':
        player_s = link['href']

In [7]:
def getStats(statsUrl):
    """Fetch an ESPN stats page and return its inline script tags.

    `statsUrl` is a site-relative path appended to http://www.espn.co.uk.
    The tags returned are the <script type="text/javascript"> elements found
    inside <section id="pane-main">.
    """
    page = urlopen("http://www.espn.co.uk" + statsUrl)
    soup = bs(page, "lxml")
    main_pane = soup.find("section", {"id":"pane-main"})
    return main_pane.findAll("script", {"type":"text/javascript"})

p_stats = getStats(player_s)

# json.loads turns the JSON object into a Python dict - so it's now a dict and follows those rules
stat_out = []
# p_stats is expected to hold a single tag, but it is a result set that has
# to be iterated before its tags can be worked with. For each tag, take the
# __INITIAL_STATE__ variable, grab the JSON object and turn it into a dict.
for p in p_stats:
    # Parse the current tag `p`. This used to read `t.string`, a loop
    # variable left over from an earlier cell, so the player-stats page that
    # was just fetched was never actually parsed.
    state = re.search(r'^\s*window\.__INITIAL_STATE__\s*=\s*({.*?})\s*;\s*$',
                      p.string or '', flags=re.DOTALL | re.MULTILINE)
    if state is None:
        # No inline text or no __INITIAL_STATE__ assignment in this tag
        continue
    json_text = state.group(1)
    stat_out.append(json.loads(json_text))

In [8]:
# Print every second-level key next to the top-level section it lives in
for section, content in stat_out[0].items():
    for key in content:
        print(section, key)

page appProps
page edition
page template
page params
routing location
gamePackage matchStats
gamePackage matchAwayForm
gamePackage leagueUid
gamePackage matchLineUp
gamePackage matchDiscipline
gamePackage standings
gamePackage HeadToHeadNode
gamePackage meta
gamePackage gameState
gamePackage matchDetails
gamePackage matchGlossary
gamePackage article
gamePackage polling
gamePackage commentaryFeedback
gamePackage matchConversation
gamePackage matchHomeForm
gamePackage matchAttacking
gamePackage matchEvents
gamePackage gameStateClass
gamePackage matchSummary
gamePackage links
gamePackage loading
gamePackage headToHead
gamePackage analytics
gamePackage matchCommentary
gamePackage gameStrip
gamePackage matchDefending
gamePackage news
gamePackage showGameDetailFooter


In [9]:
# NOTE(review): the key listing printed by the previous cell shows "news"
# under "gamePackage" only. If there is no top-level "news" entry this lookup
# raises KeyError - confirm whether stat_out[0]["gamePackage"]["news"] was intended.
for g in stat_out[0]["news"]:
    print(g)

In [10]:
# NOTE(review): this list is not read in this cell, and `vals` is rebuilt
# from scratch in a later cell - it looks like a leftover.
vals = ['name', 'cleanbreaks']
# Player records under matchLineUp -> home -> team (presumably the starting
# players, as opposed to the separate 'reserves' list - TODO confirm)
data = stat_out[0]["gamePackage"]["matchLineUp"]["home"]["team"]

# Rebinds the name `pprint` from the module (imported in earlier cells) to the function
from pprint import pprint

# Pretty-print the first player's record to see which fields are available
pprint(data[0])

{'captain': False,
 'cleanbreaks': {'name': 'Clean Breaks', 'value': '0'},
 'conversiongoals': {'name': 'Conversion Goals', 'value': '0'},
 'defendersbeaten': {'name': 'Defenders Beaten', 'value': '0'},
 'dropgoalsconverted': {'name': 'Drop Goals Converted', 'value': '0'},
 'eventTimes': {'3': ["41'+3"]},
 'homeAway': 'home',
 'id': '82104',
 'kicks': {'name': 'Kicks', 'value': '2'},
 'lineoutswon': {'name': 'Lineouts Won', 'value': '0'},
 'lineoutwonsteal': {'name': 'Lineout Won Steal', 'value': '0'},
 'metres': {'name': 'Metres Run', 'value': '15'},
 'missedtackles': {'name': 'Missed Tackles', 'value': '1'},
 'name': 'Tom Homer',
 'number': '15',
 'offload': {'name': 'Offload', 'value': '0'},
 'onPitch': False,
 'passes': {'name': 'Passes', 'value': '6'},
 'penalties': {'name': 'penalties', 'value': 1},
 'penaltiesconceded': {'name': 'Penalties Conceded', 'value': '0'},
 'penaltygoals': {'name': 'Penalty Goals', 'value': '1'},
 'points': {'name': 'Points', 'value': '3'},
 'position':

In [32]:
# Show which entries the home side's lineup record contains
home_lineup = stat_out[0]["gamePackage"]["matchLineUp"]["home"]
for section in home_lineup:
    print(section)

reserves
logo
name
team


In [14]:
# dct contains player data for a single game
# Record of one player (the first entry of `data`)
dct = data[0]

# Table column names: every key except the ones we don't want - url is
# useless and eventTimes doesn't conform to the required structure
cols = [key for key in dct.keys() if key != 'eventTimes' and key != 'url']

# Pieces of the SQL insert, so everything gets inserted at once
columns = ', '.join(cols)
placeholders = ', '.join(['%s' for key in cols])

# Values to insert, in column order; nested stat dicts contribute their 'value'
vals = [dct[key]['value'] if type(dct[key]) is dict else dct[key]
        for key in cols]

In [41]:
# query to create the table in MySQL - this will need a match and player identifier
# The string holds TWO statements separated by ';': a DROP of scraping.r_stats
# followed by a CREATE of r_stats. The CREATE is not schema-qualified, so it
# targets whatever database is current when the string is executed.
# `id` is the sole primary key, so the table can hold one row per id value.
# .replace("\n", "") removes only the newlines; the indentation spaces stay
# in the statement text.
# NOTE(review): sending two statements through a single execute() call needs
# multi-statement support in the driver - confirm for the installed PyMySQL version.
sql = """drop table if exists scraping.r_stats;
        create table r_stats(id bigint(7) not null
                            , name varchar(200)
                            , number varchar(10)
                            , position varchar(10)
                            , captain varchar(10)
                            , subbed varchar(10)
                            , homeAway varchar(10)
                            , subToolTip varchar(10)
                            , onPitch varchar(10)
                            , wasActive varchar(10)
                            , tries bigint(7)
                            , tryassists bigint(7)
                            , points bigint(7)
                            , kicks bigint(7)
                            , passes bigint(7)
                            , runs bigint(7)
                            , metres bigint(7)
                            , cleanbreaks bigint(7)
                            , defendersbeaten bigint(7)
                            , offload bigint(7)
                            , lineoutwonsteal bigint(7)
                            , turnoversconceded bigint(7)
                            , tackles bigint(7)
                            , missedtackles bigint(7)
                            , lineoutswon bigint(7)
                            , penaltiesconceded bigint(7)
                            , yellowcards bigint(7)
                            , redcards bigint(7)
                            , penalties bigint(7)
                            , penaltygoals bigint(7)
                            , conversiongoals bigint(7)
                            , dropgoalsconverted bigint(7)
                            , primary key(id));""".replace("\n", "")

In [42]:
# Insert the data into the table

# Set up the connection
import pymysql
# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook. The fallbacks
# reproduce the local development setup used so far (root user, empty
# password). `password`/`database` replace the deprecated `passwd`/`db` aliases.
import os

conn = pymysql.connect(host = os.environ.get('MYSQL_HOST', '127.0.0.1'),
                       port = int(os.environ.get('MYSQL_PORT', '3306')),
                       user = os.environ.get('MYSQL_USER', 'root'),
                       password = os.environ.get('MYSQL_PASSWORD', ''),
                       database = 'mysql')

cur = conn.cursor()
# Switch to the schema that holds the scraped data
cur.execute("USE scraping")
# (Re)create the r_stats table from the DDL held in `sql`
cur.execute(sql)

#sql_update = "insert into r_stats (%s) values (%s)" % (columns, placeholders)
#cur.execute(sql_update, vals)

0

In [44]:
# Do the same insert for every player in the match: both sides
# ('home', 'away') and both lists of each side ('team', 'reserves').

def _player_row(player):
    """Return (column_names, values) for one player dict.

    'eventTimes' and 'url' are dropped - url is useless and eventTimes doesn't
    conform to the required structure. Entries whose value is a dict
    contribute that dict's 'value'; everything else is stored as-is.
    Raises ValueError when a key is not a plain identifier.
    """
    cols = [key for key in player if key not in ('eventTimes', 'url')]
    for key in cols:
        # Column names are spliced into the SQL text (they cannot be bound as
        # parameters), so refuse anything that is not a plain identifier.
        if not key.isidentifier():
            raise ValueError("Unexpected column name in player data: %r" % (key,))
    vals = [player[key]['value'] if isinstance(player[key], dict) else player[key]
            for key in cols]
    return cols, vals


teams, players = ['home', 'away'], ['team', 'reserves']

try:
    # One cursor for the whole batch instead of a new, never-closed cursor per row
    cur = conn.cursor()
    try:
        cur.execute("USE scraping")

        for t in teams:
            for p in players:
                data = stat_out[0]["gamePackage"]["matchLineUp"][t][p]

                # dct contains player data for a single game
                for dct in data:
                    cols, vals = _player_row(dct)

                    # these are used for the SQL query, so everything gets inserted at once
                    placeholders = ', '.join(['%s'] * len(cols))
                    columns = ', '.join(cols)

                    sql_update = "insert into r_stats (%s) values (%s)" % (columns, placeholders)
                    cur.execute(sql_update, vals)
                    conn.commit()
    finally:
        cur.close()
finally:
    # Close the connection even when an insert fails part-way through
    conn.close()