### 1. Web Scraping - Chicago Marathon Results 2014-2019

__Project - BSTN Capstone__
<br>__Beth McGregor__

#### Introduction

This notebook contains the code used to web-scrape the results of the Chicago Marathon from 2014 to 2019. The results for each year are published as paginated HTML pages. The pages were downloaded with `requests`, parsed with BeautifulSoup, and the extracted results were saved to one CSV file per year.

In [2]:
# Import the required libraries

import pandas as pd
import html5lib
import lxml
from bs4 import BeautifulSoup
import requests
from time import sleep
from random import randint

#### Explore the format of the results

In [6]:
# Use requests to get the data

URL = 'https://chicago-history.r.mikatiming.com/2019/?page=1&event=MAR_999999107FA31100000000C9&lang=EN_CAP&num_results=1000&pid=list&pidp=start&search%5Bsex%5D=M&search%5Bage_class%5D=%25'

# requests has no default timeout, so set one to keep the cell from hanging
# forever on a stalled connection.
content = requests.get(URL, timeout=60)

# Fail here on an HTTP error instead of silently parsing an error page below.
content.raise_for_status()

In [7]:
# Parse the downloaded page with Python's built-in HTML parser
soup = BeautifulSoup(markup=content.text, features='html.parser')

# Collect every <li> element that carries both the list-group-item and row CSS classes
elements = soup.select("li.list-group-item.row")

In [126]:
# Experiment with retrieving the results from a single row
sample_row = elements[1]

# Both time cells of the row: index 0 is read as the half split, index 1 as the finish time
time_cells = sample_row.select('.type-time')

runner = {
    'place_overall': sample_row.select_one('.place-secondary').get_text(),
    'place_gender': sample_row.select_one('.place-primary').get_text(),
    'full_name': sample_row.select_one('.type-fullname').get_text(),
    # contents[1] skips the first child node of the cell and keeps the value that follows it
    'bib_number': sample_row.select_one('.type-field').contents[1],
    'age_class': sample_row.select_one('.type-age_class').contents[1],
    'half_split': time_cells[0].contents[1],
    'finish_time': time_cells[1].contents[1],
}
runner

{'place_overall': '1',
 'place_gender': '1',
 'full_name': 'Cherono, Lawrence (KEN)',
 'bib_number': '4',
 'age_class': '30-34',
 'half_split': '01:02:15',
 'finish_time': '02:05:45'}

#### Create a function to retrieve the results of the Chicago Marathon

In [5]:
# Define Chicago Marathon web scraping function
def _fetch_chicago_page(year, event_code, gender, page):
    """Download one page of the result list and return it as a BeautifulSoup tree."""
    url = ('https://chicago-history.r.mikatiming.com/' + str(year)
           + '/?page=' + str(page)
           + '&event=' + event_code
           + '&lang=EN_CAP&num_results=1000&pid=list&pidp=start'
           + '&search%5Bsex%5D=' + gender
           + '&search%5Bage_class%5D=%25')
    # requests has no default timeout; without one a stalled connection
    # would block the whole multi-year scrape indefinitely.
    response = requests.get(url, timeout=60)
    # Surface HTTP errors here rather than parsing an error page as results.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def _parse_runner_row(row, year, gender):
    """Turn one result-list row into a dict describing a single runner."""
    time_cells = row.select('.type-time')
    # When a row has two time cells the first is the half split and the second
    # is the finish time (as seen when exploring the 2019 page), so index 0
    # is only the finish time when it is the sole time cell.
    finish_cell = time_cells[1] if len(time_cells) > 1 else time_cells[0]
    return {
        'place_overall': row.select_one('.place-secondary').get_text(),
        'place_gender': row.select_one('.place-primary').get_text(),
        'full_name': row.select_one('.type-fullname').get_text(),
        # contents[1] skips the first child node of the cell and keeps the value after it
        'bib_number': row.select_one('.type-field').contents[1],
        'age_class': row.select_one('.type-age_class').contents[1],
        'finish_time': finish_cell.contents[1],
        'gender': gender,
        'year': year,
    }


def get_chicago_results(year, event_code, gender):
    """Scrape every result page of one Chicago Marathon for one gender.

    Parameters
    ----------
    year : str or int
        Race year; it becomes part of the URL path and is stored unchanged
        in each runner record.
    event_code : str
        Event identifier used by the results site for that year.
    gender : str
        Value of the site's sex filter (the notebook uses 'M' and 'W').

    Returns
    -------
    list of dict
        One dict per runner with the keys place_overall, place_gender,
        full_name, bib_number, age_class, finish_time, gender and year.

    Raises
    ------
    requests.HTTPError
        If the site answers any page request with an error status.
    requests.Timeout
        If a page request does not complete within the timeout.
    """
    first_page = _fetch_chicago_page(year, event_code, gender, 1)

    # The last pagination entry holds the number of the final page. A result
    # list short enough to have no pagination entries is treated as one page
    # (assumption about the site's markup -- TODO confirm).
    page_links = first_page.select('div.pages li.hidden-xs')
    max_page = int(page_links[-1].get_text()) if page_links else 1

    race_results = []
    for p in range(1, max_page + 1):
        print("Page: ", p, "/", max_page)
        if p == 1:
            # Reuse the page already downloaded to read the page count.
            html_soup = first_page
        else:
            # Pause a random 2-5 seconds between requests to go easy on the server.
            sleep(randint(2, 5))
            html_soup = _fetch_chicago_page(year, event_code, gender, p)

        elements = html_soup.select("li.list-group-item.row")

        # Extract runner information; the first matching element is skipped
        # (presumably the list header row -- TODO confirm).
        for row in elements[1:]:
            race_results.append(_parse_runner_row(row, year, gender))
    return race_results

# Event codes used by the results site, keyed by marathon year
year = {
    "2014": "MAR_999999107FA3090000000065",
    "2015": "MAR_999999107FA3090000000079",
    "2016": "MAR_999999107FA309000000008D",
    "2017": "MAR_999999107FA30900000000A1",
    "2018": "MAR_999999107FA30900000000B5",
    "2019": "MAR_999999107FA31100000000C9",
}


# Scrape each year (men first, then women) and write one CSV file per year
for y, event_code in year.items():
    print("Year: ", y)  # This helps to monitor progress
    men_results = get_chicago_results(year=y, event_code=event_code, gender='M')
    women_results = get_chicago_results(year=y, event_code=event_code, gender='W')
    year_race_results = men_results + women_results
    race_results_year_df = pd.DataFrame(year_race_results)
    race_results_year_df.to_csv(f'Chicago_race_results_{y}.csv')

        

Year:  2016
Page:  1 / 23
Page:  2 / 23
Page:  3 / 23
Page:  4 / 23
Page:  5 / 23
Page:  6 / 23
Page:  7 / 23
Page:  8 / 23
Page:  9 / 23
Page:  10 / 23
Page:  11 / 23
Page:  12 / 23
Page:  13 / 23
Page:  14 / 23
Page:  15 / 23
Page:  16 / 23
Page:  17 / 23
Page:  18 / 23
Page:  19 / 23
Page:  20 / 23
Page:  21 / 23
Page:  22 / 23
Page:  23 / 23
Page:  1 / 19
Page:  2 / 19
Page:  3 / 19
Page:  4 / 19
Page:  5 / 19
Page:  6 / 19
Page:  7 / 19
Page:  8 / 19
Page:  9 / 19
Page:  10 / 19
Page:  11 / 19
Page:  12 / 19
Page:  13 / 19
Page:  14 / 19
Page:  15 / 19
Page:  16 / 19
Page:  17 / 19
Page:  18 / 19
Page:  19 / 19
Year:  2017
Page:  1 / 23
Page:  2 / 23
Page:  3 / 23
Page:  4 / 23
Page:  5 / 23
Page:  6 / 23
Page:  7 / 23
Page:  8 / 23
Page:  9 / 23
Page:  10 / 23
Page:  11 / 23
Page:  12 / 23
Page:  13 / 23
Page:  14 / 23
Page:  15 / 23
Page:  16 / 23
Page:  17 / 23
Page:  18 / 23
Page:  19 / 23
Page:  20 / 23
Page:  21 / 23
Page:  22 / 23
Page:  23 / 23
Page:  1 / 22
Page:  2 / 22
