# This notebook gathers data from the NISA player stats tables

In [1]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

In [2]:
# Optional cell: stretches the notebook cells to the full page width for better readability.
# HTML and display are imported from IPython.display (the documented public module) so the
# cell does not depend on the kernel having injected `display` into the namespace.
from IPython.display import HTML, display

# CSS overrides applied to the notebook page containers.
custom_css = """
<style>
.container { width: 100% !important; }
.code_cell { flex-grow: 1; width: 100% !important; }
.code_cell .input_area { width: 100% !important; }
</style>
"""

display(HTML(custom_css))

In [3]:
def extract_stats_table(soup):
    """Convert the first HTML table found in ``soup`` into a pandas DataFrame.

    Column names are taken from the ``<th>`` cells of the first row inside
    ``<thead>``; every ``<tr>`` inside ``<tbody>`` becomes one DataFrame row
    built from the stripped text of its ``<td>`` cells. All values are kept
    as strings.

    Parameters
    ----------
    soup : parsed document exposing ``find`` (e.g. a BeautifulSoup object)

    Returns
    -------
    pandas.DataFrame
        One row per table body row. The header names are always used as the
        columns, so a table without body rows yields an empty DataFrame that
        still carries the column names.

    Raises
    ------
    ValueError
        If the document has no ``<table>`` or the table has no header row.
    """
    table = soup.find('table')
    if table is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            'No <table> element found in the page source; '
            'the stats table may not have loaded yet.'
        )

    thead = table.find('thead')
    header_row = thead.find('tr') if thead is not None else None
    if header_row is None:
        raise ValueError('The stats table has no header row (<thead><tr>).')
    headers = [header.get_text(strip=True) for header in header_row.find_all('th')]

    # A table without a <tbody> is treated as having no data rows.
    tbody = table.find('tbody')
    data_rows = tbody.find_all('tr') if tbody is not None else []

    # Pair each cell with its header; zip stops at the shorter of the two,
    # so rows with missing cells simply leave the remaining columns empty.
    data = [
        dict(zip(headers, (cell.get_text(strip=True) for cell in row.find_all('td'))))
        for row in data_rows
    ]

    # De-duplicate header names while keeping their order (the row dicts can
    # only hold one value per name) and pass them explicitly so the columns
    # are present even when there are no rows.
    columns = list(dict.fromkeys(headers))
    return pd.DataFrame(data, columns=columns)

In [4]:
def create_df(csv_filename):
    """Save the stats table currently shown in the browser as a CSV file.

    Parses the page source of the notebook-level Selenium ``driver``, turns
    the table into a DataFrame with ``extract_stats_table``, writes it to
    ``data/<csv_filename>`` (without the index column) and prints it.

    Parameters
    ----------
    csv_filename : str
        Name of the CSV file to create inside the ``data`` folder.

    Notes
    -----
    Relies on a global variable named ``driver`` holding an open browser
    session; it is not passed in as an argument.
    """
    soup = BS(driver.page_source, 'html.parser')

    player_stats_df = extract_stats_table(soup)

    # Create the "data" folder if it doesn't exist. exist_ok=True avoids the
    # separate existence check, which could race with another process
    # creating the folder in between.
    data_folder = 'data'
    os.makedirs(data_folder, exist_ok=True)

    # Save the DataFrame locally as a CSV
    csv_filepath = os.path.join(data_folder, csv_filename)
    player_stats_df.to_csv(csv_filepath, index=False)

    print(player_stats_df)

In [5]:
def download_table(desired_table, desired_year, file_name):
    """Select a year and table type on the stats page and save the table as CSV.

    Operates on the notebook-level Selenium ``driver``, which must already be
    on the player-stats page. The browser is always closed when the function
    finishes, even if a step fails, so a new ``driver`` has to be started
    before every call.

    Parameters
    ----------
    desired_table : str
        Exact text of the table-type option to pick (e.g. 'Regular Season').
    desired_year : str
        Exact text of the year option to pick (e.g. '2023').
    file_name : str
        Name of the CSV file; it is passed on to ``create_df``.
    """
    wait = WebDriverWait(driver, 15)

    try:
        # Open the year drop-down. The button is waited for like the options
        # are, instead of being looked up immediately.
        year_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'span#select2-year-container')))
        year_button.click()

        # Pick the requested year from the opened drop-down
        year_option = wait.until(EC.element_to_be_clickable((By.XPATH, f'//li[text()="{desired_year}"]')))
        year_option.click()

        # Open the table-type drop-down
        table_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'span#select2-type-container')))
        table_button.click()

        # Pick the requested table type from the opened drop-down
        table_option = wait.until(EC.element_to_be_clickable((By.XPATH, f'//li[text()="{desired_table}"]')))
        table_option.click()

        # NOTE(review): the page source is read right after the click; if the
        # site refreshes the table asynchronously an extra wait may be needed
        # here - confirm against the live page.
        create_df(file_name)
    finally:
        # Close the browser on success and on failure so that a failed wait
        # or click does not leave a Chrome process running.
        driver.quit()

In [6]:
def print_options_for_year(desired_year):
    """Print the table-type options the stats page offers for a given year.

    Starts its own Chrome session, opens the player-stats page, selects
    ``desired_year`` in the year drop-down, then prints the text of every
    ``<option>`` of the ``select#type`` element. The browser is closed
    afterwards, even if one of the steps fails.

    Parameters
    ----------
    desired_year : str
        Exact text of the year option to pick (e.g. '2023').
    """
    # Raw string: the backslashes of the Windows path are not valid escape
    # sequences, which newer Python versions warn about. The value itself is
    # unchanged.
    # NOTE(review): this looks like the path of the Chrome browser rather
    # than of a chromedriver executable - confirm.
    chrome_driver_path = r'C:\Program Files (x86)\Google\Chrome\Application\chrome'

    # Starting Chrome with Selenium
    service = Service(chrome_driver_path)
    driver = webdriver.Chrome(service=service)

    try:
        # Navigating to the page
        URL = 'https://nisaofficial.com/player-stats'
        driver.get(URL)

        wait = WebDriverWait(driver, 15)

        # Open the year drop-down once its button is clickable
        year_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'span#select2-year-container')))
        year_button.click()

        # Pick the requested year from the opened drop-down
        year_option = wait.until(EC.element_to_be_clickable((By.XPATH, f'//li[text()="{desired_year}"]')))
        year_option.click()

        # Wait for the table-type button, then click it through JavaScript
        third_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'span#select2-type-container')))
        driver.execute_script("arguments[0].click();", third_button)

        # The available table types are the <option> children of the
        # underlying <select> element
        table_dropdown = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'select#type')))

        # Print the text of every option
        options = table_dropdown.find_elements(By.TAG_NAME, 'option')
        for option in options:
            print(option.text)
    finally:
        # Always close the browser so a failed step does not leave a Chrome
        # process running.
        driver.quit()

## Run the cell below to see all the available years.

In [7]:
# Start Chrome with Selenium.
# Raw string: the backslashes of the Windows path are not valid escape sequences,
# which newer Python versions warn about. The value itself is unchanged.
chrome_driver_path = r'C:\Program Files (x86)\Google\Chrome\Application\chrome'
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)

try:
    # Navigate to the page with player stats
    player_stats_url = 'https://nisaofficial.com/player-stats'
    driver.get(player_stats_url)

    # Interaction with the first button (year selection)
    year_button = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.ID, 'select2-year-container')))
    year_button.click()

    # Get the available year options from the drop-down menu
    year_options = driver.find_elements(By.CSS_SELECTOR, 'li.select2-results__option')
    for option in year_options:
        print(option.text)  # Print the available year options
finally:
    # Quit the browser even if one of the steps above fails, so no Chrome
    # process is left running.
    driver.quit()

2019-2020
2020-2021
2021
2022
2023


## Run the cell below to see the filters you can choose from for the year you're interested in.

In [8]:
# Print the table-type options offered for the 2023 season. The function opens and closes
# its own browser session, so no driver has to be started beforehand.
print_options_for_year('2023')

Choose Type
Championship
NISA Challenge Cup
Playoffs
Quarterfinals
Regular Season
Semifinals
US Open Cup
All


## Table Selection
### Run both cells below for every table you want to download. All you need to change are the arguments to the 'download_table()' function. The first argument is the type of table you want to download (the available options come from the 'print_options_for_year()' function). The second argument is the year you want to pull a table from, and the third is what you want the file to be called. The file name must end with '.csv', and the file will be saved into a directory called 'data'. You do not have to create this directory yourself; the 'create_df()' function will do this for you. Finally, make sure your arguments are enclosed in apostrophes or quotation marks.

In [9]:
# Raw string: the backslashes of the Windows path are not valid escape sequences,
# which newer Python versions warn about. The value itself is unchanged.
chrome_driver_path = r'C:\Program Files (x86)\Google\Chrome\Application\chrome'

# Start Chrome with Selenium. This notebook-level `driver` is the one that
# download_table() and create_df() read; download_table() quits it when it is
# done, so this cell has to be re-run before every download.
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)

# Navigate to the page
URL = 'https://nisaofficial.com/player-stats'
driver.get(URL)

In [10]:
# Select the 2023 'Regular Season' table in the browser session started by the previous cell,
# save it to data/regseason_playerstat_2023.csv and close the browser.
download_table('Regular Season', '2023', 'regseason_playerstat_2023.csv')

      #         Pos                Name  Team  GP  GS   MIN   G  A  SH  Y  R
0    99     Forward   Naglestad, Markus   CFC  14  14  1122  11  0  23  1  0
1    11     Forward     Espinal, Darwin  MBFC  17  17  1395   9  5  35  0  0
2     9     Forward        Maric , Leon  MICH  15  15  1321   6  2   8  3  0
3    09     Forward  Ten Lopez, Ignacio   CDL  10  10   810   6  0  11  5  0
4     9     Forward   Diakhate, Alioune   FCU  11  10   863   5  1   9  2  0
..   ..         ...                 ...   ...  ..  ..   ...  .. ..  .. .. ..
222   2    Defender    Villatoro, Erick   LAF  13  10   885   0  1   1  1  0
223   3    Defender      Wichmann, Theo   SAV   9   8   621   0  0   2  2  0
224   1  Goalkeeper      Wilson, Trevor   FCU   2   2   180   0  0   0  0  0
225  24     Forward         Woods, Alex   SAV   8   1   106   0  0   0  1  0
226  22    Defender      Yriarte, Tomas   CDL  15   7   677   0  0   1  2  0

[227 rows x 12 columns]
