### 3. Web Scraping - London Marathon Results 2014-2019

__Project - BSTN Capstone__
<br>__Beth McGregor__

#### Introduction

This notebook contains the code used for web scraping the results of the London Marathon from 2014-2019. Depending on the year, the results were published either as an HTML list or as an HTML table. The results were retrieved, processed and saved into CSV files, one per year.

In [2]:
# Import required libraries

import pandas as pd
import html5lib
import lxml
from bs4 import BeautifulSoup
import requests
from time import sleep
from random import randint

#### Retrieve the London Marathon results where results are stored in a table format

In [None]:
# This will retrieve years prior to 2019 where the results are stored in a table format
def get_london_results(year, event_code, gender):
    """Scrape one year/event/gender of London Marathon results published as HTML tables.

    The first results page is requested to discover how many pages exist, then
    every remaining page is requested in turn and the first table on each page
    is collected.

    Parameters
    ----------
    year : str
        Year segment of the results URL (e.g. "2014").
    event_code : str
        Value of the ``event`` query parameter (the notebook uses "MAS" and "ELIT").
    gender : str
        Value of the ``search[sex]`` query parameter (the notebook uses "M" and "W").

    Returns
    -------
    pandas.DataFrame
        Rows of the first table of every page, in page order, with two extra
        columns ``gender`` and ``year`` holding the arguments passed in.
        The index is not reset, so index labels repeat from page to page.
    """
    # Built from the `year` argument (not from a notebook-level loop variable)
    # so the function returns the year it was asked for regardless of caller state.
    url_template = (
        'https://results.virginmoneylondonmarathon.com/{year}/?page={page}'
        '&event={event_code}&num_results=1000&pid=list'
        '&search%5Bsex%5D={gender}&search%5Bage_class%5D=%25'
    )

    first_url = url_template.format(year=year, page=1, event_code=event_code, gender=gender)
    response = requests.get(first_url)
    html_soup = BeautifulSoup(response.text, 'html.parser')

    # find the maximum number of result pages
    max_page = 1

    page_elements = html_soup.select('div.pages a')
    if len(page_elements) > 0:
        # NOTE(review): presumably the last link is a "next" control and the
        # one before it carries the highest page number - confirm against the site.
        max_page = int(page_elements[-2].get_text())
    else:
        page_elements = html_soup.select('div.pages li.hidden-xs')
        if len(page_elements) > 0:
            max_page = int(page_elements[-1].get_text())

    first_page_df = pd.read_html(response.content)[0]
    first_page_df['gender'] = gender
    first_page_df['year'] = year

    # Collect the per-page frames and concatenate once at the end rather than
    # re-copying the accumulated frame on every iteration.
    page_frames = [first_page_df]

    for p in range(2, max_page + 1):
        print("Page: ", p, "/", max_page)
        # making a request
        page_url = url_template.format(year=year, page=p, event_code=event_code, gender=gender)
        html_doc = requests.get(page_url).content
        race_output_df = pd.read_html(html_doc)[0]
        race_output_df['gender'] = gender
        race_output_df['year'] = year
        page_frames.append(race_output_df)
        # pausing the script for a random 2-5 seconds between requests
        sleep(randint(2, 5))

    return pd.concat(page_frames)
    


year = ["2014","2015", "2016", "2017", "2018"]

# Columns to drop from the scraped tables. Not every label is present in every
# year's table, so missing ones are ignored when dropping.
drop_list = ["Unnamed: 4", "Club", "Unnamed: 10", "Place category", "Unnamed: 9", "Place cat."]

#looping over years
for y in year:
    print("Year: ", y)
    # Scrape the mass-participation ("MAS") and elite ("ELIT") events for both genders
    year_race_results_M = get_london_results(year = y, event_code = "MAS", gender = "M")
    year_race_results_W = get_london_results(year = y, event_code = "MAS", gender = "W")
    year_race_results_ME = get_london_results(year = y, event_code = "ELIT", gender = "M")
    year_race_results_WE = get_london_results(year = y, event_code = "ELIT", gender = "W")
    race_results_year_df = pd.concat([year_race_results_M, year_race_results_W, year_race_results_ME, year_race_results_WE])

    # Drop whichever of the unwanted columns exist, returning a new frame
    # instead of mutating in place.
    race_results_year_df = race_results_year_df.drop(columns=drop_list, errors="ignore")

    # Strip the literal "» " marker from runner names (plain-text match, not a regex)
    race_results_year_df["Name"] = race_results_year_df["Name"].str.replace("» ", "", regex=False)
    race_results_year_df.to_csv(f'London_race_results_{y}.csv')
    


#### Retrieve the results for 2019 (and other years) where the results are not stored in a table format: 

In [4]:
# This will retrieve the results for 2019 (and other years) where the results are not stored in a table format
def get_london_results(year, event_code, gender):
    """Scrape one year/event/gender of London Marathon results published as an HTML list.

    Note: this definition has the same name as the table-format scraper defined
    earlier in the notebook and replaces it once this cell has been run.

    The first results page is requested to discover how many pages exist, then
    every page (including page 1 again) is requested and each result row is
    converted into a dictionary.

    Parameters
    ----------
    year : str
        Year segment of the results URL (e.g. "2019").
    event_code : str
        Value of the ``event`` query parameter (the notebook uses "MAS" and "ELIT").
    gender : str
        Value of the ``search[sex]`` query parameter (the notebook uses "M" and "W").

    Returns
    -------
    list of dict
        One dictionary per runner with the keys ``place_overall``,
        ``place_gender``, ``full_name``, ``bib_number``, ``age_class``,
        ``half_split``, ``finish_time``, ``gender`` and ``year``.
    """
    # Built from the `year` argument (not from a notebook-level loop variable)
    # so the function returns the year it was asked for regardless of caller state.
    url_template = (
        'https://results.virginmoneylondonmarathon.com/{year}/?page={page}'
        '&event={event_code}&num_results=1000&pid=list'
        '&search%5Bsex%5D={gender}&search%5Bage_class%5D=%25'
    )

    race_results = []

    first_url = url_template.format(year=year, page=1, event_code=event_code, gender=gender)
    content = requests.get(first_url)
    html_soup = BeautifulSoup(content.text, 'html.parser')

    # find the maximum number of result pages
    max_page = 1

    page_elements = html_soup.select('div.pages a')
    if len(page_elements) > 0:
        # NOTE(review): presumably the last link is a "next" control and the
        # one before it carries the highest page number - confirm against the site.
        max_page = int(page_elements[-2].get_text())
    else:
        page_elements = html_soup.select('div.pages li.hidden-xs')
        if len(page_elements) > 0:
            max_page = int(page_elements[-1].get_text())

    for p in range(1, max_page + 1):
        print("Page: ", p, "/", max_page)
        # making a request
        page_url = url_template.format(year=year, page=p, event_code=event_code, gender=gender)
        content = requests.get(page_url)
        html_soup = BeautifulSoup(content.text, 'html.parser')
        elements = html_soup.select("li.list-group-item.row")

        # pausing the script for a random 2-5 seconds between requests
        sleep(randint(2, 5))

        print(len(elements))
        # The first matched list item is skipped.
        # NOTE(review): presumably it is the header row - confirm against the page markup.
        for row in elements[1:]:
            # Both times live in '.type-time' elements: half split first, finish second
            time_cells = row.select('.type-time')
            runner = {
                'place_overall': row.select_one('.place-secondary').get_text(),
                'place_gender': row.select_one('.place-primary').get_text(),
                'full_name': row.select_one('.type-fullname').get_text(),
                # contents[1] takes the second child node of the element
                # NOTE(review): presumably the first child is a label - confirm.
                'bib_number': row.select('.type-field')[1].contents[1],
                'age_class': row.select_one('.type-age_class').contents[1],
                'half_split': time_cells[0].contents[1],
                'finish_time': time_cells[1].contents[1],
                'gender': gender,
                'year': year,
            }
            race_results.append(runner)

    return race_results


year = ["2019"]


# Scrape every (event, gender) combination for each year and write one CSV per year
for y in year:
    print("Year: ", y)
    year_race_results = []
    # Same order as before: mass event men, mass event women, elite men, elite women
    for event_code, gender in (("MAS", "M"), ("MAS", "W"), ("ELIT", "M"), ("ELIT", "W")):
        year_race_results.extend(
            get_london_results(year = y, event_code = event_code, gender = gender)
        )
    # One row per runner dictionary
    race_results_year_df = pd.DataFrame(year_race_results)
    race_results_year_df.to_csv(f'London_race_results_{y}.csv')


Year:  2019
Page:  1 / 25
1001
Page:  2 / 25
1001
Page:  3 / 25
1001
Page:  4 / 25
1001
Page:  5 / 25
1001
Page:  6 / 25
1001
Page:  7 / 25
1001
Page:  8 / 25
1001
Page:  9 / 25
1001
Page:  10 / 25
1001
Page:  11 / 25
1001
Page:  12 / 25
1001
Page:  13 / 25
1001
Page:  14 / 25
1001
Page:  15 / 25
1001
Page:  16 / 25
1001
Page:  17 / 25
1001
Page:  18 / 25
1001
Page:  19 / 25
1001
Page:  20 / 25
1001
Page:  21 / 25
1001
Page:  22 / 25
1001
Page:  23 / 25
1001
Page:  24 / 25
1001
Page:  25 / 25
780
Page:  1 / 18
1001
Page:  2 / 18
1001
Page:  3 / 18
1001
Page:  4 / 18
1001
Page:  5 / 18
1001
Page:  6 / 18
1001
Page:  7 / 18
1001
Page:  8 / 18
1001
Page:  9 / 18
1001
Page:  10 / 18
1001
Page:  11 / 18
1001
Page:  12 / 18
1001
Page:  13 / 18
1001
Page:  14 / 18
1001
Page:  15 / 18
1001
Page:  16 / 18
1001
Page:  17 / 18
1001
Page:  18 / 18
783
Page:  1 / 1
37
Page:  1 / 1
22
