# WSL Machine Learning Project

## Build athlete dataset with web scraping.


In [None]:
import pandas as pd
import numpy as np
from selenium import webdriver
#from selenium.webdriver.common.by import By
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import requests
from bs4 import BeautifulSoup

### Create a list of links to individual athlete pages using Selenium.

#### Using Selenium lets the athlete list be generated for each year directly from the site, without that data having to be supplied by hand. For now, we are looking at the 2013–2023 seasons of the tour (11 seasons).

In [None]:
# Preview the tour-roster URL of every season, newest (2023) to oldest (2013).
year = [str(season) for season in range(2023, 2012, -1)]
for years in year:
    url = f"https://www.worldsurfleague.com/athletes/tour/mct?year={years}"
    print(url)

In [None]:
# Collect the href of every "athlete-name" element found on the tour roster
# page of each season. Duplicates across seasons are expected at this stage.
links = []

year = ['2023', '2022', '2021', '2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013']

# Resolve the driver binary and start Chrome once, then reuse the session for
# every season instead of launching a new browser per iteration.
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          options=chrome_options)
try:
    for season in year:
        url = "https://www.worldsurfleague.com/athletes/tour/mct?year=" + season
        driver.get(url)

        # Give the page time to load before querying it.
        time.sleep(5)

        # identify elements with a link to an athlete
        for lnk in driver.find_elements("class name", "athlete-name"):
            href = lnk.get_attribute("href")
            # get_attribute() returns None when the element has no href;
            # skip those so later cells only ever see real URLs.
            if href:
                links.append(href)

        print(f"{season}: {len(links)} links collected so far")
finally:
    # Always release the browser, even if a page load or lookup failed.
    driver.quit()

print(links)

In [None]:
# Remove duplicate athlete links. dict.fromkeys() keeps the first-seen order,
# so the result is the same on every run (iteration order of a set of strings
# is not). Empty/None entries are dropped because they cannot be requested.
links = [link for link in dict.fromkeys(links) if link]
print(links)

### Scrape athlete data using BeautifulSoup.

#### These are a few metrics to start. Specific data for each stop on tour will be examined after.

In [None]:
# Scrape a handful of profile metrics from every athlete page in `links` and
# assemble them into `athlete_df`. Pages are fetched with requests and parsed
# with BeautifulSoup; no Selenium driver is involved in this cell.


def _text_of(tag, default):
    """Return the stripped text of *tag*, or *default* when *tag* is None."""
    return tag.get_text(strip=True) if tag is not None else default


def _labelled_text(soup, label, tag_name, css_class, default):
    """Return the text of the element that follows a given label div.

    Looks up the ``<div class="label">`` whose text is *label*, then the next
    *tag_name* element with class *css_class* after it. Returns *default* if
    either element is missing.
    """
    # `string=` is the current name of the argument formerly called `text=`.
    label_tag = soup.find("div", class_="label", string=label)
    if label_tag is None:
        return default
    return _text_of(label_tag.find_next(tag_name, class_=css_class), default)


athlete_information = []

for link in links:
    # send an http request to the URL; skip pages that cannot be fetched
    # instead of aborting the whole scrape
    try:
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"Skipping {link}: {err}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")

    # metrics
    athlete_information.append({
        'Name': _text_of(soup.find("div", {"class": "avatar-text-primary"}),
                         "Name not found."),
        'Nationality': _text_of(soup.find("div", class_="country-name"),
                                "Nationality not found."),
        'Stance': _labelled_text(soup, "Stance", "div", "value",
                                 "Stance not found."),
        # first "value" div on the page, exactly as before
        'Last_Ranking': _text_of(soup.find("div", class_="value"),
                                 "Last ranking not found."),
        'Age': _labelled_text(soup, "Age", "span", "imperial",
                              "Age not found."),
        'First Season': _labelled_text(soup, "First season", "div", "value",
                                       "First season not found."),
    })

#print(athlete_information)

athlete_df = pd.DataFrame(athlete_information)

athlete_df.head(100)

In [None]:
# Write the athlete table to an Excel workbook at a fixed local path.
excel_path = r"/Users/carmenhoyt/Downloads/athlete_df.xlsx"
athlete_df.to_excel(excel_path)

#### Event data for each athlete. To be continued...

In [None]:
# Display the current contents of `links`.
print(links)

In [None]:
from io import StringIO

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import Select

# For every athlete page, step through each entry of the year drop-down,
# read the first HTML table shown for that year and tag it with the surfer's
# name and the selected year. The URL of every visited year view is recorded
# in `event_links`.

# Location of the year <select> on an athlete page.
YEAR_SELECT_XPATH = "//*[@id='primary']/div/div/div[2]/div/div[1]/div[3]/div[1]/div[2]/form[2]/select"

event_links = []

# One browser session is shared by all athletes and always closed at the end.
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          options=chrome_options)
try:
    for link in links:
        driver.get(link)

        time.sleep(5)

        try:
            option_count = len(
                Select(driver.find_element("xpath", YEAR_SELECT_XPATH)).options
            )
        except NoSuchElementException:
            print("Could not find.")
            continue

        for index in range(option_count):
            try:
                # Look the <select> up again on every pass: choosing an option
                # may reload the page and invalidate the previous element.
                select = Select(driver.find_element("xpath", YEAR_SELECT_XPATH))
                year = select.options[index].text.strip()
                select.select_by_index(index)
            except (NoSuchElementException, IndexError):
                print("Could not find.")
                continue

            time.sleep(5)

            event_links.append(driver.current_url)

            # Parse the page the browser is showing now, not a stale soup
            # left over from an earlier cell.
            page_html = driver.page_source
            soup = BeautifulSoup(page_html, "html.parser")
            name_tag = soup.find("div", {"class": "avatar-text-primary"})
            name = (name_tag.get_text(strip=True)
                    if name_tag is not None else "Name not found.")

            # read_html raises ValueError (it does not return an empty list)
            # when the document contains no table.
            try:
                tables = pd.read_html(StringIO(page_html))
            except ValueError:
                print("No tables found on the page.")
                continue

            #assuming the first table on the page contains the desired data
            df = tables[0]
            df['Surfer'] = name
            df['Year'] = year

            print(df)
finally:
    driver.quit()

print(event_links)

In [None]:
#url = 'https://www.worldsurfleague.com/athletes/4133/ethan-ewing'

from io import StringIO

# Fetch every athlete page with requests, read its first HTML table into a
# DataFrame and tag the rows with the surfer's name. `df` holds the table of
# the last page that had one.
for link in links:
    #send an http request to the URL; skip pages that cannot be fetched
    try:
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"Skipping {link}: {err}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    name_tag = soup.find("div", {"class": "avatar-text-primary"})
    name = (name_tag.get_text(strip=True)
            if name_tag is not None else "Name not found.")

    #extract data from the table using Pandas read_html() function.
    #read_html raises ValueError (it does not return an empty list) when the
    #page has no table, and expects literal HTML wrapped in a file-like object.
    try:
        tables = pd.read_html(StringIO(response.text))
    except ValueError:
        print("No tables found on the page.")
        continue

    #assuming the first table on the page contains the desired data
    df = tables[0]
    df['Surfer'] = name

    print(df)

In [None]:
# Preview the first rows of `df`.
df.head()