In [1]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC



In [35]:
import pandas as pd

def process_csv(seasons: list, overwrite: bool = True):
    """
    Procesa archivos CSV para múltiples temporadas:
    1. Agregar encabezados correctos.
    2. Insertar columnas 'SEASON' y 'PLAYER ID'.
    
    Args:
        seasons (list): Lista de temporadas a asignar en la columna 'SEASON'.
        overwrite (bool): Si es True, sobrescribe el archivo original. 
                          Si es False, guarda un nuevo archivo con '_updated'.
    """
    for season in seasons:
        # Definir el nombre del archivo original
        csv_file = f'holy_grail/labels/1st_try_OREB/OREB_{season}_labels.csv'

        # Leer el archivo CSV sin encabezado explícito
        df = pd.read_csv(csv_file, header=None)

        # Definir el nuevo encabezado
        new_header = ['PLACEHOLDER', 'PLAYER NAME', 'PLAY DESCRIPTION', 
                      'REBOUND TYPE', 'BOXSCORE', 'VTM', 'HTM', 
                      'GAME DATE', 'PERIOD', 'Video Link']

        # Asignar encabezados
        df.columns = new_header

        # Reemplazar la columna vacía por 'SEASON' y asignarle el valor de la temporada
        df.rename(columns={'PLACEHOLDER': 'SEASON'}, inplace=True)
        df['SEASON'] = season

        # Crear un diccionario asignando un ID único en el orden de aparición
        unique_players = {name: idx + 1 for idx, name in enumerate(df['PLAYER NAME'].unique())}

        # Asignar los IDs a la columna 'PLAYER ID'
        df.insert(1, 'PLAYER ID', df['PLAYER NAME'].map(unique_players))

        # Definir el nombre del archivo de salida
        output_file = csv_file if overwrite else f'holy_grail/labels/OREB_{season}_labels_updated.csv'

        # Guardar el DataFrame actualizado
        df.to_csv(output_file, index=False)

        # Imprimir las primeras filas para verificar
        print(f"Processed {season}:")

# Llamada a la función con las temporadas deseadas
seasons = ["2014-15", "2015-16", "2016-17", "2017-18", "2018-19", 
           "2019-20", "2020-21", "2021-22", "2022-23", "2023-24"]
process_csv(seasons, overwrite=True)


Processed 2014-15:
Processed 2015-16:
Processed 2016-17:
Processed 2017-18:
Processed 2018-19:
Processed 2019-20:
Processed 2020-21:
Processed 2021-22:
Processed 2022-23:
Processed 2023-24:


Concatenate dfs

In [None]:
import pandas as pd

# Lista de temporadas
seasons = ["2023-24", "2022-23", "2021-22", "2020-21", "2019-20", 
           "2018-19", "2017-18", "2016-17", "2015-16", "2014-15"]

# Lista para almacenar los DataFrames individuales
dfs = []

# Leer y concatenar los archivos CSV
for season in seasons:
    file_name = f"holy_grail/labels/1st_try_OREB/OREB_{season}_labels.csv"  # Nombre del archivo

    try:
        df = pd.read_csv(file_name)  # Leer el CSV
        df["Season"] = season  # Agregar columna de la temporada
        dfs.append(df)  # Guardarlo en la lista
        print(f"✅ Cargado: {file_name}")
    except FileNotFoundError:
        print(f"⚠️ Archivo no encontrado: {file_name}")

# Concatenar todos los DataFrames en uno solo
if dfs:
    final_df = pd.concat(dfs, ignore_index=True)
    print("✅ Todos los archivos concatenados en un solo DataFrame.")
else:
    print("⚠️ No se encontraron archivos para concatenar.")


✅ Cargado: holy_grail/labels/1st_try_OREB/OREB_2023-24_labels.csv
✅ Cargado: holy_grail/labels/1st_try_OREB/OREB_2022-23_labels.csv
✅ Cargado: holy_grail/labels/1st_try_OREB/OREB_2021-22_labels.csv
✅ Cargado: holy_grail/labels/1st_try_OREB/OREB_2020-21_labels.csv
✅ Cargado: holy_grail/labels/1st_try_OREB/OREB_2019-20_labels.csv
✅ Cargado: holy_grail/labels/1st_try_OREB/OREB_2018-19_labels.csv
✅ Cargado: holy_grail/labels/1st_try_OREB/OREB_2017-18_labels.csv
✅ Cargado: holy_grail/labels/1st_try_OREB/OREB_2016-17_labels.csv
✅ Cargado: holy_grail/labels/1st_try_OREB/OREB_2015-16_labels.csv
✅ Cargado: holy_grail/labels/1st_try_OREB/OREB_2014-15_labels.csv
✅ Todos los archivos concatenados en un solo DataFrame.
   Unnamed: 0  Joel Embiid  Embiid REBOUND (Off:2 Def:2)  OREB  PHI @ DET  \
0         NaN  Joel Embiid  Embiid REBOUND (Off:3 Def:2)  OREB  PHI @ DET   
1         NaN  Joel Embiid  Embiid REBOUND (Off:4 Def:4)  OREB  PHI @ DET   
2         NaN  Joel Embiid  Embiid REBOUND (Off:5 Def

## 1. Scrape labels for FGA: Includes info of FGM (2p & 3p) and AST.

In [None]:
driver = initialize_driver(season="2023-24")

general_players_table_xpath = '//*[@id="__next"]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[3]/table/'

# Create the empty dataframe for the labels, depending on the type of play
play_type_indices = {"FGA": 11, "REB": 21, "TOV": 23, "STL": 24, "BLK": 25}
play_type = 'FGA'

df_path = '/Users/arnaubarrera/Desktop/MSc Computer Vision/TFM/labeled_plays_NBA/holy_grail/FGA_labels.csv'
labels_df = pd.read_csv(df_path)

for i in range(25, 100):  # Iterate over the list of players

    # Clicar en cierta columna (tipo de jugada) de un jugador en concreto
    try:

        link_xpath = general_players_table_xpath + f'tbody/tr[{i}]/td[{play_type_indices[play_type]}]/a'
        link_element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, link_xpath))
        )

        link_url = link_element.get_attribute("href")
        driver.get(link_url)

    except Exception as e:
        print(f"Error: {e}")

    video_display_xpath = '//*[@id="vjs_video_3_html5_api"]'

    # Load all the rows in one page before iterating
    dropdown_xpath = '//*[@class="DropDown_select__4pIg9"]'
    option_value = '-1'

    dropdown_element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, dropdown_xpath))
    )
    select = Select(dropdown_element)
    select.select_by_value(option_value)

    rows_number = count_rows_table(driver, table_xpath='//*[@id="__next"]/div[2]/div[2]/div[3]/section/div/div/div[3]/table/tbody')

    for i in range(1, rows_number):  # Iterate over all the plays of a particular player
        line_play = f'//*[@id="__next"]/div[2]/div[2]/div[3]/section/div/div/div[3]/table/tbody/tr[{i}]/td'

        play = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, line_play))
        )

        columns_play = play.find_elements(By.XPATH, line_play)
        
        row_data = []
        for col in columns_play:
            row_data.append(col.text)
        
        play.click()

        # Verify if the shot is assisted
        ast_xpath = '//*[@id="__next"]/div[2]/div[2]/div[3]/section/div/main/section[1]/div/div[2]/h2'

        ast_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, ast_xpath))
        )

        ast_present, assisted_by = extract_assist_info(ast_element.text)
        row_data.extend([ast_present, assisted_by])

        # Video display element
        video_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, video_display_xpath))
        )
        video_src = video_element.get_attribute('src')
        row_data.append(video_src) 

        # La fila se agrega al DataFrame
        labels_df.loc[i - 1] = row_data  
    
    # Actualizar el csv con los nuevos datos
    labels_df.to_csv(df_path, mode='a', index=True, header=False)

    # Cuando se ha guardado, click en la flecha de ir para atrás
    driver.back()
    time.sleep(2)


driver.quit()

## 3. Scrape BLOCKS

In [None]:
season = "2023-24"
play_type = "BLK"

driver = initialize_driver(season=season)

general_players_table_xpath = '//*[@id="__next"]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[3]/table/'

# Create the empty dataframe for the labels, depending on the type of play
play_type_indices = {"FGA": 11, "OREB": 19, "DREB": 20, "REB": 21, "TOV": 23, "STL": 24, "BLK": 25}

labels_df = create_dataframe(play_type)
df_path = f'/Users/arnaubarrera/Desktop/MSc Computer Vision/TFM/labeled_plays_NBA/holy_grail/labels/{play_type}_labels.csv'
labels_df.to_csv(df_path, mode='w', index=True, header=True)

for i in range(1, 100):  # Iterate over the list of players

    # Player name
    xpath_player_name = f'//*[@id="__next"]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[3]/table/tbody/tr[{i}]/td[2]/a'
    player_name = driver.find_element(By.XPATH, xpath_player_name).text

    # Clicar en cierta columna (tipo de jugada) de un jugador en concreto
    link_xpath = general_players_table_xpath + f'tbody/tr[{i}]/td[{play_type_indices[play_type]}]/a'
    link_element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, link_xpath))
    )

    link_url = link_element.get_attribute("href")
    driver.get(link_url)

    video_display_xpath = '//*[@id="vjs_video_3_html5_api"]'

    # Load all the rows in one page before iterating
    dropdown_xpath = '//*[@class="DropDown_select__4pIg9"]'
    option_value = '-1'

    try:
        dropdown_element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, dropdown_xpath))
        )
        select = Select(dropdown_element)
        select.select_by_value(option_value)
    except:
        print("There's only one page")
        

    rows_number = count_rows_table(driver, table_xpath='//*[@id="__next"]/div[2]/div[2]/div[3]/section/div/div/div[3]/table/tbody')

    for i in range(1, rows_number):  # Iterate over all the plays of a particular player
        line_play = f'//*[@id="__next"]/div[2]/div[2]/div[3]/section/div/div/div[3]/table/tbody/tr[{i}]/td'

        play = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, line_play))
        )

        columns_play = play.find_elements(By.XPATH, line_play)
        
        row_data = []
        for col in columns_play:
            row_data.append(col.text)

        row_data.insert(1, player_name)
        
        play.click()

        # Video display element
        video_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, video_display_xpath))
        )
        video_src = video_element.get_attribute('src')
        row_data.append(video_src) 

        # La fila se agrega al DataFrame
        labels_df.loc[i - 1] = row_data  
    
    # Actualizar el csv con los nuevos datos
    labels_df.to_csv(df_path, mode='a', index=True, header=False)

    # Cuando se ha guardado, click en la flecha de ir para atrás
    driver.back()
    time.sleep(2)


driver.quit()

Cookies banner closed.
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page
There's only one page


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="__next"]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[3]/table/tbody/tr[47]/td[2]/a"}
  (Session info: chrome=131.0.6778.265); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x00000001009fa138 cxxbridge1$str$ptr + 3653888
1   chromedriver                        0x00000001009f2988 cxxbridge1$str$ptr + 3623248
2   chromedriver                        0x0000000100458968 cxxbridge1$string$len + 89228
3   chromedriver                        0x000000010049cd4c cxxbridge1$string$len + 368752
4   chromedriver                        0x00000001004d64f0 cxxbridge1$string$len + 604180
5   chromedriver                        0x0000000100491564 cxxbridge1$string$len + 321672
6   chromedriver                        0x00000001004921b4 cxxbridge1$string$len + 324824
7   chromedriver                        0x00000001009c4fc0 cxxbridge1$str$ptr + 3436424
8   chromedriver                        0x00000001009c82dc cxxbridge1$str$ptr + 3449508
9   chromedriver                        0x00000001009abe60 cxxbridge1$str$ptr + 3333672
10  chromedriver                        0x00000001009c8b9c cxxbridge1$str$ptr + 3451748
11  chromedriver                        0x000000010099d678 cxxbridge1$str$ptr + 3274304
12  chromedriver                        0x00000001009e32b4 cxxbridge1$str$ptr + 3560060
13  chromedriver                        0x00000001009e3430 cxxbridge1$str$ptr + 3560440
14  chromedriver                        0x00000001009f25fc cxxbridge1$str$ptr + 3622340
15  libsystem_pthread.dylib             0x000000019f9f72e4 _pthread_start + 136
16  libsystem_pthread.dylib             0x000000019f9f20fc thread_start + 8
