# This notebook gathers data from the NISA league stats tables

In [None]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# Optional cell: stretches the notebook cells to the full browser width for better readability.
# Both names are imported from the public `IPython.display` module, and `display` is imported
# explicitly so the cell does not depend on it being pre-loaded into the namespace.
from IPython.display import HTML, display

# CSS overrides that make the notebook container and the code cells use the full width
custom_css = """
<style>
.container { width: 100% !important; }
.code_cell { flex-grow: 1; width: 100% !important; }
.code_cell .input_area { width: 100% !important; }
</style>
"""

# Rendering the <style> element as cell output applies the rules to the page
display(HTML(custom_css))

In [None]:
def extract_stats_table(soup):
    """Convert the first HTML table found in *soup* into a DataFrame.

    The ``<th>`` cells of the first row are used as column names. Every later
    row becomes one record built from the stripped text of its ``<td>`` and
    ``<th>`` cells, paired with the column names in order.

    Args:
        soup: Parsed page (e.g. a BeautifulSoup object) offering ``find`` and
            ``find_all``.

    Returns:
        A pandas DataFrame with one row per table row after the header row.
        All values are strings.

    Raises:
        ValueError: If the page has no ``<table>`` element or the table has
            no rows.
    """
    table = soup.find('table')
    if table is None:
        # Fail with a clear message instead of an AttributeError on None
        raise ValueError('No <table> element found in the page source.')

    rows = table.find_all('tr')
    if not rows:
        # Fail with a clear message instead of an IndexError on rows[0]
        raise ValueError('The stats table does not contain any rows.')

    headers = [header.get_text(strip=True) for header in rows[0].find_all('th')]

    # One dictionary per data row, keyed by the header text
    data = [
        dict(zip(headers, (cell.get_text(strip=True) for cell in row.find_all(['td', 'th']))))
        for row in rows[1:]
    ]

    return pd.DataFrame(data)

In [None]:
def create_df(csv_filename):
    """Scrape the table currently shown in the browser and save it as a CSV.

    Reads the page source from the module-level ``driver``, converts the table
    with ``extract_stats_table`` and writes the result to ``data/<csv_filename>``.
    The DataFrame is also printed.

    Args:
        csv_filename: Name of the CSV file to create inside the ``data`` folder.
    """
    soup = BS(driver.page_source, 'html.parser')

    league_stats_df = extract_stats_table(soup)

    # Create the "data" folder if needed; exist_ok avoids the check-then-create race
    data_folder = 'data'
    os.makedirs(data_folder, exist_ok=True)

    # Save the DataFrame locally as a CSV, without the index column
    csv_filepath = os.path.join(data_folder, csv_filename)
    league_stats_df.to_csv(csv_filepath, index=False)

    print(league_stats_df)

In [None]:
def download_table(desired_table, desired_year, file_name):
    """Pick a year and a table type on the open page, then save the table as a CSV.

    Works on the module-level ``driver``, which must already have the
    league-stats page loaded. The browser is always closed at the end, even
    when a step fails, so a new ``driver`` has to be started before each call.

    Args:
        desired_table: Exact text of the entry to choose in the table-type drop-down.
        desired_year: Exact text of the entry to choose in the year drop-down.
        file_name: Base name of the output file; ``_<desired_year>.csv`` is
            appended before it is passed to ``create_df``.
    """
    wait = WebDriverWait(driver, 15)

    try:
        # Open the year drop-down once its button is clickable
        year_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'span#select2-year-container')))
        year_button.click()

        # Choose the requested year from the opened list
        year_option = wait.until(EC.element_to_be_clickable((By.XPATH, f'//li[text()="{desired_year}"]')))
        year_option.click()

        # Open the table-type drop-down once its button is clickable
        table_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'span#select2-type-container')))
        table_button.click()

        # Choose the requested table type from the opened list
        table_option = wait.until(EC.element_to_be_clickable((By.XPATH, f'//li[text()="{desired_table}"]')))
        table_option.click()

        # Scrape the table that is now displayed and write it to disk
        create_df(f"{file_name}_{desired_year}.csv")
    finally:
        # Close the browser even if a step above raised, so no Chrome process is left behind
        driver.quit()

In [None]:
def print_options_for_year(desired_year):
    """Print the table types that can be selected for the given year.

    Starts its own Chrome session (a local ``driver``, separate from any
    module-level one), selects the year on the league-stats page, opens the
    table-type drop-down and prints the text of every ``<option>`` of the
    ``select#type`` element. The browser is closed afterwards, even when a
    step fails.

    Args:
        desired_year: Exact text of the entry to choose in the year drop-down.
    """
    driver = webdriver.Chrome()

    try:
        wait = WebDriverWait(driver, 15)

        # Navigating to the page
        URL = 'https://nisaofficial.com/league-stats'
        driver.get(URL)

        # Find and click the button to open the drop-down for selecting the year
        year_button = driver.find_element(By.CSS_SELECTOR, 'span#select2-year-container')
        year_button.click()

        # Find and click the specific year option in the drop-down
        year_option = wait.until(EC.element_to_be_clickable((By.XPATH, f'//li[text()="{desired_year}"]')))
        year_option.click()

        # Wait for the table-type button to be clickable, then click it through JavaScript
        type_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'span#select2-type-container')))
        driver.execute_script("arguments[0].click();", type_button)

        # Wait for the underlying <select> element to be present
        table_dropdown = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'select#type')))

        # Print the text of every option of the drop-down
        for option in table_dropdown.find_elements(By.TAG_NAME, 'option'):
            print(option.text)
    finally:
        # Always close the browser so a failed step does not leave Chrome running
        driver.quit()

## Run the cell below to see all the available years.

In [None]:
# Start Chrome with Selenium
driver = webdriver.Chrome()

try:
    # Navigate to the page with player stats
    player_stats_url = 'https://nisaofficial.com/league-stats'
    driver.get(player_stats_url)

    # Interaction with the first button (year selection)
    year_button = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.ID, 'select2-year-container')))
    year_button.click()

    # Wait until the drop-down entries exist before reading them, so the list
    # is not collected before the menu has been rendered
    year_options = WebDriverWait(driver, 15).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'li.select2-results__option'))
    )
    years = [option.text for option in year_options]
    print(years)
finally:
    # Quit the browser, even if one of the steps above failed
    driver.quit()

## Run the cell below to see the filters you can choose based on the year you're interested in.

In [None]:
# Prints the table types offered for the given year; change '2023' to inspect another year
print_options_for_year('2023')

## Table Selection
### Run both cells below for every table you want to download. All you need to change are the arguments of the 'download_table()' function. The first argument is the type of table you want to download (the available options come from the 'print_options_for_year()' function). The second argument is the year you want to pull a table from, and the third is what you want the file to be called. You do not need to end the file name with '.csv'. The file will be saved into a directory called 'data'. You do not have to create this directory yourself; the 'create_df()' function will do this for you. Finally, make sure your arguments are enclosed in apostrophes or quotation marks.

In [None]:
# Address of the league-stats page
URL = 'https://nisaofficial.com/league-stats'

# Launch a new Chrome session through Selenium and load the page in it
driver = webdriver.Chrome()
driver.get(URL)

In [None]:
# Arguments: table type, year, base file name (the year and '.csv' are appended automatically)
download_table('Regular Season', '2023', 'regseason_leaguestat')

## Use the below cell to download all Regular Season Tables
### You have to choose 'Choose Type' to get the regular season stats of years besides 2023. This is the only way to refresh the table. If you have the program click on 'Regular Season' it will not refresh the table and will keep giving you the data from 2023.

In [None]:
# Loop through the years and download tables
for year in years:
    
    print(f'\nLeague Stats for {year}')
    
    # Start Chrome with Selenium
    driver = webdriver.Chrome()

    # Navigate to the page
    URL = 'https://nisaofficial.com/league-stats'
    driver.get(URL)
    
    download_table('Choose Type', year, 'regseason_leaguestat')

## The cell below concatenates all the tables into one, provided you used the same file naming for every year.

In [None]:
# Combine the per-year CSV files into one table
data_frames = []

# Read and preprocess data for each year
for year in years:
    filename = f'data/regseason_leaguestat_{year}.csv'

    # Drop the first column and add a column for the year. assign() returns a
    # new DataFrame, which avoids the SettingWithCopyWarning pandas can emit
    # when a column is added directly to an iloc slice.
    df = pd.read_csv(filename).iloc[:, 1:].assign(Year=year)

    # Append the modified DataFrame to the list
    data_frames.append(df)

# Concatenate all DataFrames into a single DataFrame
complete_regseason_leaguestat_df = pd.concat(data_frames, ignore_index=True)

# Change column names ('Year' already has its final name, so it needs no entry)
column_name_mapping = {
    'TEAM': 'Team Name',
    'GP': 'Games Played',
    'GOALS': 'Goals Scored',
    'SHOTS': 'Total Shots',
    'YEL': 'Yellow Cards',
    'RED': 'Red Cards',
}

complete_regseason_leaguestat_df = complete_regseason_leaguestat_df.rename(columns=column_name_mapping)

# Save the concatenated DataFrame to a CSV file
complete_regseason_leaguestat_df.to_csv('complete_regseason_leaguestat.csv', index=False)

print(complete_regseason_leaguestat_df)