# ESPN PGA Tour: 2022 Rocket Mortgage Tournament Scraper
## Final Project
## Adie Maki

In [2]:
# import libraries 
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Build the Chrome options: every command-line switch below is passed to the
# browser when it starts ("--headless" is what the notebook relies on to run
# without a visible window).
options = webdriver.ChromeOptions()
for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(flag)

# Start the Chrome driver. The Service wraps whatever
# ChromeDriverManager().install() returns, and `driver` is the session
# object used by every later cell.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)


In [3]:
# set the URL for the PGA Tour Rocket Mortgage Leaderboard July 28-31 2022
url = "https://www.espn.com/golf/leaderboard/_/tournamentId/401353214"
# open that page in the browser session held by `driver`
driver.get(url)

In [4]:
# Wait (up to 15 seconds) until a <table> element is present on the page.
#  presence of the element does not by itself prove that its rows have been
#  rendered, so the scraped rows are validated further down.
wait = WebDriverWait(driver, 15)
table = wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))

# scrape the table headers
#  one label per <th> cell, in document order
header_elements = table.find_elements(By.TAG_NAME, "th")
headers = [h.text for h in header_elements]

# scrape the table rows
#  each <tr> is a row; its <td> cells are the values of that row
#  rows without any <td> (e.g. header rows made of <th> cells) are ignored
#  rows whose cell count differs from the header count are ignored as well:
#  shorter rows would otherwise be padded with None by pandas and end up in
#  the output as junk records, and longer rows would make the DataFrame
#  constructor fail
rows = []
for tr in table.find_elements(By.TAG_NAME, "tr")[1:]:  # skip header
    cells = [td.text for td in tr.find_elements(By.TAG_NAME, "td")]
    if cells and len(cells) == len(headers):
        rows.append(cells)

# fail loudly instead of silently building (and later saving) an empty table
if not rows:
    raise RuntimeError(
        "No leaderboard rows matching the %d header column(s) were scraped; "
        "the page may not have finished loading or its layout may have changed."
        % len(headers)
    )

# convert the scraped data into a dataframe
df = pd.DataFrame(rows, columns=headers)

# show the first 5 rows to confirm the table looks right
df.head()



Unnamed: 0,Unnamed: 1,POS,PLAYER,SCORE,R1,R2,R3,R4,TOT,EARNINGS,FEDEX PTS
0,,1,T. Finau,-26,64,66,65,67,262,"$1,512,000",500
1,,T2,P. Cantlay,-21,70,65,66,66,267,"$635,600",208
2,,T2,C. Young,-21,71,63,65,68,267,"$635,600",208
3,,T2,T. Pendrith,-21,64,65,66,72,267,"$635,600",208
4,,5,S. Jaeger,-20,67,68,65,68,268,"$344,400",110


In [5]:
# Save the scraped leaderboard as a CSV file so it can be cleaned and merged later.
#  index=False leaves the DataFrame's row index out of the file.
df.to_csv(path_or_buf="espn_pga_leaderboard.csv", index=False)

In [6]:
# quit the driver
#  last step of the notebook: the scraped data is already held in `df` and
#  saved to CSV, so the browser session created in the setup cell is no longer needed
driver.quit()