# WSL Machine Learning Project

## Build athlete dataset with web scraping.


In [1]:
import pandas as pd
import numpy as np
from selenium import webdriver
#from selenium.webdriver.common.by import By
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import requests
from bs4 import BeautifulSoup

### Create a list of links to individual athlete pages using Selenium.

#### Using Selenium allows for a different list to be generated by each year without that data having to be provided. For now, we are just looking at the 2023 tour.

In [2]:
url = "https://www.worldsurfleague.com/athletes/tour/mct?year=2023"
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          options=chrome_options)
driver.get(url)

time.sleep(5)

links = []

# identify element with link to athlete
lnks = driver.find_elements("class name", "athlete-name")

for lnk in lnks:
   # get_attribute() to get all href
   links.append(lnk.get_attribute("href"))
    
driver.quit()

print(links)

['https://www.worldsurfleague.com/athletes/1456/filipe-toledo', 'https://www.worldsurfleague.com/athletes/4133/ethan-ewing', 'https://www.worldsurfleague.com/athletes/3165/griffin-colapinto', 'https://www.worldsurfleague.com/athletes/9167/joao-chianca', 'https://www.worldsurfleague.com/athletes/3442/jack-robinson', 'https://www.worldsurfleague.com/athletes/1085/gabriel-medina', 'https://www.worldsurfleague.com/athletes/3994/yago-dora', 'https://www.worldsurfleague.com/athletes/199/john-john-florence', 'https://www.worldsurfleague.com/athletes/2656/leonardo-fioravanti', 'https://www.worldsurfleague.com/athletes/1760/ryan-callinan', 'https://www.worldsurfleague.com/athletes/2838/connor-oleary', 'https://www.worldsurfleague.com/athletes/3962/barron-mamiya', 'https://www.worldsurfleague.com/athletes/1737/italo-ferreira', 'https://www.worldsurfleague.com/athletes/3896/kanoa-igarashi', 'https://www.worldsurfleague.com/athletes/2618/ian-gentil', 'https://www.worldsurfleague.com/athletes/564/j

### Scrape athlete data using BeautifulSoup.

#### These are a few metrics to start. Specific data for each stop on tour will be examined after.

In [3]:
athlete_information = []

for link in links:
    url = link
    
    #send an http request to the URL

    response = requests.get(url)

    soup = BeautifulSoup(response.content, "html.parser")
    
    #metrics

    name = soup.find("div", {"class": "avatar-text-primary"}).get_text(strip=True)
    
    nationality = soup.find("div", class_= "country-name").get_text(strip=True)
    
    stance = soup.find("div", class_="label", text="Stance").find_next("div", class_="value").get_text(strip=True)
    
    current_ranking = soup.find("div", class_="value").get_text(strip=True)
    
    try:
        age = soup.find("div", class_="label", text="Age").find_next("span", class_="imperial").get_text(strip=True)
    except:
        age = "Age not found."
        
    first_season = soup.find("div", class_="label", text="First season").find_next("div", class_="value").get_text(strip=True)
    
    athlete_information.append({'Name': name, 'Nationality': nationality, 'Stance': stance, 'Ranking': current_ranking, 'Age': age, 'First Season': first_season})
    
driver.quit()
    
#print(athlete_information)   

athlete_df = pd.DataFrame(athlete_information)

athlete_df.head(34)

  stance = soup.find("div", class_="label", text="Stance").find_next("div", class_="value").get_text(strip=True)
  age = soup.find("div", class_="label", text="Age").find_next("span", class_="imperial").get_text(strip=True)
  first_season = soup.find("div", class_="label", text="First season").find_next("div", class_="value").get_text(strip=True)


Unnamed: 0,Name,Nationality,Stance,Ranking,Age,First Season
0,Filipe Toledo,Brazil,Regular,#1,28,2009 Men's QS
1,Ethan Ewing,Australia,Regular,#2,25,2012 Men's JR
2,Griffin Colapinto,United States,Regular,#3,25,2011 Men's JR
3,Joao Chianca,Brazil,Regular,#4,23,2015 SPEC
4,Jack Robinson,Australia,Regular,#5,25,2011 Men's JR
5,Gabriel Medina,Brazil,Goofy,#6,29,2008 Men's QS
6,Yago Dora,Brazil,Goofy,#7,27,2012 Men's JR
7,John John Florence,Hawaii,Regular,#8,30,2008 Men's QS
8,Leonardo Fioravanti,Italy,Regular,#9,25,2011 Men's JR
9,Ryan Callinan,Australia,Goofy,#10,31,2009 Men's QS


#### Event data for each athlete. To be continued...

In [9]:
for link in links:
    url = link
    
    #send an http request to the URL
    response = requests.get(url)
    
    #extract data from the table using Pandas read_html() function
    tables = pd.read_html(response.text)
    
    #assuming the first table on the page contains the desired data
    if tables:
        #Get the first table as a DataFrame
        df = tables[0]
        
        print(df)
        
    else:
        print("No tables found on the page.")
        

                         Name Place Points        Defeated By
0      Billabong Pro Pipeline   5th   4745       Joao Chianca
1     Hurley Pro Sunset Beach   1st  10000               None
2   MEO Rip Curl Portugal Pro  17th   1330          Joan Duru
3    Rip Curl Pro Bells Beach   3rd   6085        Ethan Ewing
4          Margaret River Pro   5th   4745     Gabriel Medina
5              Surf Ranch Pro   3rd   6085  Griffin Colapinto
6   Surf City El Salvador Pro   1st  10000               None
7                VIVO Rio Pro   9th   3320       Jadson Andre
8           Corona Open J-Bay   1st  10000               None
9         SHISEIDO Tahiti Pro   9th   3320     Mihimana Braye
10        Rip Curl WSL Finals   1st      -               None
                         Name Place Points         Defeated By
0      Billabong Pro Pipeline  17th   1330        Liam O'Brien
1     Hurley Pro Sunset Beach   5th   4745   Griffin Colapinto
2   MEO Rip Curl Portugal Pro   9th   3320        Joao Chianca
3   