## Routine declarations 

In [25]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from datetime import timedelta, date
import sys
import pandas as pd
import time


def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
    
add_link = "bulli/2033"
race_date = "11-11-1111"


col = ['date','race_name','race_number','race_place', 'rug', 'dog_name', 'dog_trainer', 'time', 'mgn', 'split', 'inRun', 'wgt', 'sire', 'dam', 'sp']
all_data = pd.DataFrame([],columns=col)

## Function to scrape from the internet race_name-wise

In [26]:
def race_scrape():
    base_link = "https://thegreyhoundrecorder.com.au/results/"

    race_name = add_link
    link = base_link+add_link
    print(link)

    raw_html = simple_get(link) #Connection complete
    print(len(raw_html))

    html = BeautifulSoup(raw_html, 'html.parser')

    html_file_name='race.html'
    orig_stdout = sys.stdout
    f = open(html_file_name, 'w')
    sys.stdout = f

    print(html.prettify())

    sys.stdout = orig_stdout
    f.close()





    raceContent = html.findAll('div',{"class": "resultsDesktopContent tabs"})[0]

    raceTable = raceContent.findAll('table')
    no_of_races = int(len(raceTable)/2)
    #print(no_of_races)


    i=0 # race iterator
    while(i<no_of_races):

        # ##printing the unique race header

        raceHeader = raceTable[2*i]
        raceHeader = raceHeader.findAll('td')
        #print(len(raceHeader))

        raceNumber = raceHeader[0].decode_contents()
        raceSubName = raceHeader[1].decode_contents()
        raceLength = raceHeader[2].decode_contents()
        raceHeaderCategory = raceHeader[3].decode_contents()

        raceBets = raceHeader[4].decode_contents()
        raceBets = raceBets.replace(' ','')
        raceBets = raceBets[:(raceBets.rfind('-')+1)]+raceBets[raceBets.rfind('$'):] #output formating

        raceSplits = raceHeader[5].decode_contents()
        #print(raceNumber+raceSubName+raceLength+raceHeaderCategory+raceBets+raceSplits)

        #raceHeader is not included in the CSV at the moment
        
        raceBody = raceTable[2*i+1]
        raceBody = raceBody.find('tbody')
        rows = raceBody.findAll('tr')
        no_of_rows = len(rows)

        #printing the rest of the table

        j = 0 #row iterator
        while(j<no_of_rows):
            current_row = rows[j].findAll('td')

            #print(current_row)

            race_place = current_row[0].decode_contents()
            rug = current_row[1].decode_contents()
            dog_name = current_row[2].find('a')['href'][12:]
            dog_trainer = current_row[3].find('a')['href'][10:]
            
            if(race_place!='SCR'): 
                time = current_row[4].decode_contents()
                mgn = current_row[5].decode_contents()
                split = current_row[6].decode_contents()
                inRun = current_row[7].decode_contents()
                wgt = current_row[8].decode_contents()
                sire = current_row[9].find('a')['href'][12:]
                dam = current_row[10].find('a')['href'][12:]
                sp = current_row[11].find('p').decode_contents()[2:]
                
            else: #if its SCR all these contents do not exist
                time = 'blank'
                mgn = 'blank'
                split = 'blank'
                inRun = 'blank'
                wgt = 'blank'
                sire = 'blank'
                dam = 'blank'
                sp = 'blank'

            #print(race_place+' '+rug+' '+dog_name+' '+dog_trainer+' '+time+' '+mgn+' '+split+' '+inRun+' '+wgt+' '+sire+' '+dam+' '+sp)

            df2 = pd.DataFrame([[race_date, race_name, (i+1), race_place, rug, dog_name, dog_trainer, time, mgn, split, inRun, wgt, sire, dam, sp]],columns=col)
            global all_data # to prevent local creation
            all_data = all_data.append(df2) #appending the table row
            j=j+1
        i=i+1
    

In [27]:
print(all_data) # to check if the df is created properly and has no prior content on it

Empty DataFrame
Columns: [date, race_name, race_number, race_place, rug, dog_name, dog_trainer, time, mgn, split, inRun, wgt, sire, dam, sp]
Index: []


### fetching the date-list 

In [28]:
dateFile = pd.read_csv('date.csv') 
print(dateFile)

                       0
0             2016-01-01
1             bulli/2033
2         cambridge/2035
3            darwin/2032
4             dubbo/2031
5           geelong/2025
6           gosford/2027
7           ipswich/2029
8          mandurah/2034
9       strathalbyn/2030
10        traralgon/2026
11            wagga/2028
12            2016-01-02
13        addington/2020
14        bundaberg/2022
15          ipswich/2023
16         mandurah/2016
17         richmond/2015
18         tamworth/2017
19      the-gardens/2014
20      the-meadows/2012
21      warrnambool/2013
22         wauchope/2021
23   wentworth-park/2018
24            young/2019
25            2016-01-03
26      albion-park/2011
27           cairns/2009
28           gawler/2010
29      healesville/2005
..                   ...
344         hatrick/1685
345         ipswich/1690
346        mandurah/1688
347            sale/1683
348     strathalbyn/1684
349     the-gardens/1692
350           2016-01-30
351        armidale/1674


## All-date iterator 

In [30]:
t = time.process_time() # keeping a track of time

len_of_date = len(dateFile)
k=0
while(k<len_of_date): #iterate among all the dates in the dateFile
    print(dateFile.iloc[k,0])
    if(dateFile.iloc[k,0].find('/')==-1): #if the line is a date or a link
        race_date = dateFile.iloc[k,0] 
    else:
        add_link = dateFile.iloc[k,0]
        race_scrape()
    k=k+1
    
    
elapsed_time = time.process_time() - t

2016-01-01
bulli/2033
https://thegreyhoundrecorder.com.au/results/bulli/2033
291626
cambridge/2035
https://thegreyhoundrecorder.com.au/results/cambridge/2035
327685
darwin/2032
https://thegreyhoundrecorder.com.au/results/darwin/2032
230821
dubbo/2031
https://thegreyhoundrecorder.com.au/results/dubbo/2031
306420
geelong/2025
https://thegreyhoundrecorder.com.au/results/geelong/2025
335506
gosford/2027
https://thegreyhoundrecorder.com.au/results/gosford/2027
292359
ipswich/2029
https://thegreyhoundrecorder.com.au/results/ipswich/2029
294054
mandurah/2034
https://thegreyhoundrecorder.com.au/results/mandurah/2034
365933
strathalbyn/2030
https://thegreyhoundrecorder.com.au/results/strathalbyn/2030
293616
traralgon/2026
https://thegreyhoundrecorder.com.au/results/traralgon/2026
337466
wagga/2028
https://thegreyhoundrecorder.com.au/results/wagga/2028
287309
2016-01-02
addington/2020
https://thegreyhoundrecorder.com.au/results/addington/2020
349315
bundaberg/2022
https://thegreyhoundrecorder.co

321802
2016-01-10
albion-park/1926
https://thegreyhoundrecorder.com.au/results/albion-park/1926
343009
canberra/1928
https://thegreyhoundrecorder.com.au/results/canberra/1928
312633
gawler/1925
https://thegreyhoundrecorder.com.au/results/gawler/1925
281837
healesville/1922
https://thegreyhoundrecorder.com.au/results/healesville/1922
338837
manukau/1927
https://thegreyhoundrecorder.com.au/results/manukau/1927
338875
sale/1923
https://thegreyhoundrecorder.com.au/results/sale/1923
326915
sandown-park/1924
https://thegreyhoundrecorder.com.au/results/sandown-park/1924
305879
2016-01-11
albion-park/1916
https://thegreyhoundrecorder.com.au/results/albion-park/1916
283534
angle-park/1920
https://thegreyhoundrecorder.com.au/results/angle-park/1920
293612
ballarat/1912
https://thegreyhoundrecorder.com.au/results/ballarat/1912
342731
bulli/1918
https://thegreyhoundrecorder.com.au/results/bulli/1918
285496
dubbo/1919
https://thegreyhoundrecorder.com.au/results/dubbo/1919
254629
launceston/1915
htt

276090
mandurah/1804
https://thegreyhoundrecorder.com.au/results/mandurah/1804
344903
mandurah/1810
https://thegreyhoundrecorder.com.au/results/mandurah/1810
86844
townsville/1805
https://thegreyhoundrecorder.com.au/results/townsville/1805
259226
2016-01-20
albion-park/1796
https://thegreyhoundrecorder.com.au/results/albion-park/1796
288160
angle-park/1797
https://thegreyhoundrecorder.com.au/results/angle-park/1797
329521
ballarat/1787
https://thegreyhoundrecorder.com.au/results/ballarat/1787
322873
cranbourne/1788
https://thegreyhoundrecorder.com.au/results/cranbourne/1788
338068
gawler/1798
https://thegreyhoundrecorder.com.au/results/gawler/1798
39331
hatrick/1790
https://thegreyhoundrecorder.com.au/results/hatrick/1790
427869
northam/1793
https://thegreyhoundrecorder.com.au/results/northam/1793
36974
northam/1795
https://thegreyhoundrecorder.com.au/results/northam/1795
314250
richmond/1794
https://thegreyhoundrecorder.com.au/results/richmond/1794
281726
rockhampton/1791
https://theg

346426
shepparton/1695
https://thegreyhoundrecorder.com.au/results/shepparton/1695
341275
warragul/1696
https://thegreyhoundrecorder.com.au/results/warragul/1696
328806
warrnambool/1694
https://thegreyhoundrecorder.com.au/results/warrnambool/1694
338944
2016-01-29
addington/1691
https://thegreyhoundrecorder.com.au/results/addington/1691
349506
bendigo/1681
https://thegreyhoundrecorder.com.au/results/bendigo/1681
342705
bulli/1686
https://thegreyhoundrecorder.com.au/results/bulli/1686
277743
casino/1689
https://thegreyhoundrecorder.com.au/results/casino/1689
267781
darwin/1687
https://thegreyhoundrecorder.com.au/results/darwin/1687
191773
geelong/1682
https://thegreyhoundrecorder.com.au/results/geelong/1682
339095
hatrick/1685
https://thegreyhoundrecorder.com.au/results/hatrick/1685
292779
ipswich/1690
https://thegreyhoundrecorder.com.au/results/ipswich/1690
278022
mandurah/1688
https://thegreyhoundrecorder.com.au/results/mandurah/1688
375557
sale/1683
https://thegreyhoundrecorder.com.a

### storing to CSV

In [None]:
all_data.to_csv('january_2016.csv',index=False)


In [33]:
print(all_data)

          date          race_name race_number race_place rug  \
0   2016-01-01         bulli/2033           1          1   3   
0   2016-01-01         bulli/2033           1          2   2   
0   2016-01-01         bulli/2033           1          3   1   
0   2016-01-01         bulli/2033           1          4   6   
0   2016-01-01         bulli/2033           1          5   7   
0   2016-01-01         bulli/2033           1          6   8   
0   2016-01-01         bulli/2033           1          7   5   
0   2016-01-01         bulli/2033           1          8   9   
0   2016-01-01         bulli/2033           2          1   3   
0   2016-01-01         bulli/2033           2          2   8   
0   2016-01-01         bulli/2033           2          3   1   
0   2016-01-01         bulli/2033           2          4   9   
0   2016-01-01         bulli/2033           2          5   7   
0   2016-01-01         bulli/2033           2          6   2   
0   2016-01-01         bulli/2033       

## Total number of minutes it took 

In [36]:
print(elapsed_time/60) # amount of time it took for all of this calculation

27.501826993783332
