In [2]:
# Scraper for Davidson basketball games (first attempt)
# only works with URLs from davidsonwildcats.com,
#      ex. https://davidsonwildcats.com/sports/mens-basketball/stats/2023-24/richmond/boxscore/9122
# If there's an error, try running it again. Sometimes doesn't work the first time for some reason
# By Luke Horne (luhorne@davidson.edu)
# make sure you've installed required packages
# pip install requests
# pip install beautifulsoup4
# pip install selenium

import csv
import sys
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# clicks the inputted button
# returns true if successfully clicked, false if not
def click_button(xpath):
    """Click the first element matching ``xpath`` using the global ``driver``.

    Waits up to 10 seconds for the element to be present in the DOM, then
    triggers the click through JavaScript (``arguments[0].click()``) instead
    of a native WebDriver click.

    Args:
        xpath: XPath expression locating the button to click.

    Returns:
        True if the element was found and the click script ran without
        raising; False if the wait timed out or any other error occurred.
    """
    # Fixed pauses kept from the original; presumably they give the page
    # time to settle between interactions -- TODO confirm they are needed.
    time.sleep(1)
    try:
        btn = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
        time.sleep(1)
        driver.execute_script("arguments[0].click();", btn)
    except Exception:
        # ``Exception`` (not a bare ``except``) so that Ctrl-C and
        # SystemExit still propagate instead of being reported as a miss.
        return False
    return True

# returns the scraped data from the current page as a nested list
# input the current score, for continuity between halves
def getData(score):
    """Scrape the play-by-play entries on the current page into ``rows``.

    Collects every non-empty element matching the score/time/action XPath
    and groups them into rows by vertical position. Each completed row has
    six fields: half, time, two action columns (one of them empty), and the
    two running score values. Rows that do not end up with exactly six
    fields are discarded.

    Args:
        score: Two-element list holding the running score. It is updated in
            place as score elements are encountered, which carries the
            score over between successive calls.

    Returns:
        The module-level ``rows`` list (all rows gathered so far). The list
        is returned even when scraping fails, so callers can always iterate
        the result; the error is printed in that case.
    """
    global global_score

    # One XPath union covering action text, anything containing a colon
    # (times), and the team score paragraphs.
    xpath = (
        "//*[@class[contains(., 'text-theme-safe-light')]]"
        " | //*[contains(text(), ':')]"
        " | //p[contains(@class, 's-team__team-score')"
        " and contains(@class, 's-text-large-bold')]"
    )

    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
        elements = driver.find_elements(By.XPATH, xpath)

        prev_y = 0
        current_row = []
        actions = []  # the two action columns, held until the row is flushed
        for element in elements:
            text = element.text.strip()
            if not text:
                continue

            # Read the position once; each access is a WebDriver round-trip.
            y = element.location['y']

            # A vertical jump of more than 25px is treated as a new line
            # (new action): flush the row being built and start a fresh one.
            if abs(y - prev_y) > 25:
                current_row += actions
                current_row += score
                if len(current_row) == 6:  # only keep complete rows
                    rows.append(current_row)
                actions = []
                current_row = [global_half]

            if len(text) < 4:
                # Short text is a score: drop the older value, keep the newer.
                score.pop(0)
                score.append(text)
            elif text[2] == ":":
                # Third character is a colon, so this is a time.
                current_row.append(text)
            elif len(current_row) == 1:
                # Action seen before the time on this line: first column.
                actions.extend((text, ""))
            else:
                # Action seen after the time on this line: second column.
                actions.extend(("", text))

            prev_y = y

        # Flush the final row, which has no following line to trigger it.
        current_row += actions
        current_row += score
        if len(current_row) == 6:
            rows.append(current_row)
        global_score = score
    except Exception as e:
        print("Error finding new content:", e)

    return rows

rows = []  # accumulated play-by-play rows; filled in by getData
global_score = ["0", "0"]  # starting score
global_half = "1"  # first half
url = input("URL: ")

# Setting up the driver. Chrome is the browser being launched, so the
# options object must be Chrome's own (not the Firefox one).
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

# Everything that uses the browser runs inside try/finally so the browser
# process is shut down on every path, including the early error exits.
try:
    driver.get(url)

    time.sleep(1)  # wait a second

    # getting opponent's name
    try:
        opp_name = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//*[contains(@class, 's-teams__opponent-name')]")
            )
        ).text
    except Exception:
        print("Couldn't find opponent name. Is the URL the correct format?")
        sys.exit(1)

    # going to the play-by-play page
    if not click_button("//button[text()='Play-by-play']"):
        print("Error clicking play by play button")
        sys.exit(1)

    # getting 1st half data
    getData(global_score)

    # going to second half page
    if not click_button("//button[text()='2nd']"):
        print("Error clicking second half")
        sys.exit(1)
    global_half = "2"

    # 2nd half data
    getData(global_score)

    # if we have overtime
    if click_button("//button[text()='3rd']"):
        global_half = "3"
        getData(global_score)
    if click_button("//button[text()='4th']"):
        global_half = "4"
        getData(global_score)
finally:
    driver.quit()

output = opp_name.replace(" ", "_") + ".csv"

# newline='' is what the csv module expects; without it, Windows output
# gets an extra blank line after every row.
with open(output, 'w', newline='') as f:
    writer = csv.writer(f)
    header = ["Half", "Time", "Davidson", opp_name, "Davidson_Score", opp_name + "_Score"]
    writer.writerow(header)
    # rows holds the data from every period scraped above
    writer.writerows(rows)

print("File is ready")

URL:  https://davidsonwildcats.com/sports/mens-basketball/stats/2023-24/charlotte/boxscore/9103


File is ready
