In [None]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
def extract_standings_table(soup):
    
    table = soup.find('table')
    
     # Extract the table data into a list of dictionaries
    data = []
    rows = table.find_all('tr')
    headers = [header.get_text(strip=True) for header in rows[0].find_all('th')]
    
    for row in rows[1:]:
        values = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
        data.append(dict(zip(headers, values)))

    # Creating DataFrame
    team_standings_df = pd.DataFrame(data)

    # Cleaning up DataFrame
    team_standings_df = team_standings_df.replace(r'\n', '', regex=True)  # Removes newline characters
    team_standings_df['TEAM'] = team_standings_df['TEAM'].str.strip()  # Strips leading/trailing spaces

    return(team_standings_df)

In [None]:
URL = 'https://nisaofficial.com/standings'

## 2023 Regular Season Team Standings Scrapping

In [None]:
response2023 = requests.get(URL)
soup2023 = BS(response2023.text)

In [None]:
team_standings_2023_df = extract_standings_table(soup2023)
print(team_standings_2023_df)

## Web scrapping with dynamic content rendering
The NISA website URL does not change when looking at different seasons for the team standings. This is most likely due to the website using JavaScript to fetch and load the data asynchronously. To get around this we will use a library called 'selenium'.

## 2022 Regular Season Team Standings Scrapping

In [None]:

chrome_driver_path = 'C:\Program Files (x86)\Google\Chrome\Application\chrome'

# Starting Chrome with Selenium
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)

# Navigating to the page
driver.get(URL)

# Interacting with the page
button1 = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#selectSeason')))
button1.click()

button2 = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#select2-year-container')))
button2.click()

In [None]:
# The button selector id for each season is dynamic and changes each time the page is opened
# As of now I am inspecting the page to find the dynamic part

In [None]:
# '#select2-year-result-{dynamic_part}-33'
season_select2022 = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#select2-year-result-9yzv-33')))
season_select2022.click()


In [None]:
soup2022 = BS(driver.page_source,'html.parser')

In [None]:
team_standings_2022_df = extract_standings_table(soup2022)
print(team_standings_2022_df)

In [None]:
driver.quit()

## 2021 (Fall) Regular Season Team Standings Scrapping

In [None]:
chrome_driver_path = 'C:\Program Files (x86)\Google\Chrome\Application\chrome'

# Starting Chrome with Selenium
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)

# Navigating to the page
driver.get(URL)

# Interacting with the page
button1 = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#selectSeason')))
button1.click()

button2 = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#select2-year-container')))
button2.click()

In [None]:
# '#select2-year-result-{dynamic_part}-17'
season_select2021 = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#select2-year-result-36kv-17')))
season_select2021.click()

In [None]:
soup2021 = BS(driver.page_source,'html.parser')

In [None]:
team_standings_2021_df = extract_standings_table(soup2021)
print(team_standings_2021_df)

In [None]:
driver.quit()

## 2020 (Spring and Fall) Regular Season Team Standings Scrapping

In [None]:
chrome_driver_path = 'C:\Program Files (x86)\Google\Chrome\Application\chrome'

# Starting Chrome with Selenium
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)

# Navigating to the page
driver.get(URL)

# Interacting with the page
button1 = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#selectSeason')))
button1.click()

button2 = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#select2-year-container')))
button2.click()

In [None]:
# '#select2-year-result-{dynamic_part}-17'
season_select2020 = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#select2-year-result-74h1-6')))
season_select2020.click()

In [None]:
soup2020 = BS(driver.page_source,'html.parser')

In [None]:
team_standings_2020_df = extract_standings_table(soup2020)
print(team_standings_2020_df)

In [None]:
driver.quit()

## 2019 (Spring) Regular Season Team Standings Scrapping

#### Saving data frames locally as CSVs

In [None]:
team_standings_2023_df.to_csv('data/team_standings_2023.csv', index=False)
team_standings_2022_df.to_csv('data/team_standings_2022.csv', index=False)
team_standings_2021_df.to_csv('data/team_standings_2021.csv', index=False)
team_standings_2020_df.to_csv('data/team_standings_2020.csv', index=False)