# Scraper para obtener las estadísticas de todos los jugadores del top 100 en sus enfrentamientos contra los 75 rivales contra los que más veces han jugado

In [None]:
# pip install BeautifulSoup4
# pip install pandas
# pip install numpy
# pip install selenium
# pip install webdriver_manager (este se usa para descargar automáticamente, mediante ChromeDriverManager, el ChromeDriver de la misma versión que tu Google Chrome)
# y es necesario porque usamos todo el rato el ChromeDriver

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import os

# Path to the unpacked uBlock Origin extension folder
extension_path = "/Users/usuario/Downloads/uBlock-Origin"  # Change this to the folder where the extension is unpacked

# User-agent string passed to Chrome
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

# Selenium driver configuration: the Chrome arguments are added in the listed order
options = Options()
for chrome_flag in (
    "--disable-notifications",
    "--no-first-run",
    "--disable-blink-features=AutomationControlled",
    f"--load-extension={extension_path}",  # Load the unpacked uBlock Origin extension
    f"user-agent={user_agent}",
):
    options.add_argument(chrome_flag)
s = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s, options=options)

# Accumulator for scraped rows; filled by process_rival_buttons and flushed to the CSV per player
all_players_rivalries_stats = list()

# Column layout of the output DataFrame / CSV
columnas = [
    # Identifiers
    "Player ID",
    "Opponent ID",
    # Serve figures
    "Ace",
    "Double Fault",
    "1st Serve",
    "1st Serve Won",
    "2nd Serve Won",
    "Break Points Saved",
    "Service Points Won",
    "Service Games Won",
    # Return figures
    "Ace Against",
    "Double Fault Against",
    "1st Serve Return Won",
    "2nd Serve Return Won",
    "Break Points Won",
    "Return Points Won",
    "Return Games Won",
    # Totals
    "Total Points Won",
    "Games Won",
    "Sets Won",
    "Matches Won",
]

# CSV file the statistics are written to
csv_file = 'tennis_players_rivalries_stats_top100.csv'

# Write a header-only CSV the first time, so later appends can omit the header
if not os.path.exists(csv_file):
    df = pd.DataFrame(columns=columnas)
    df.to_csv(csv_file, index=False)

# Extract the statistics shown in the Overview tables
def extract_overview_stats(soup):
    """Collect the non-empty figures from the first three Overview tables.

    Args:
        soup: Parsed page (BeautifulSoup object) to search for the tables.

    Returns:
        list[str]: Text of every ``th.text-right.pct-data`` cell found in the
        first three matching tables (serve, return and totals), stripped of
        surrounding whitespace and with ``%`` removed. Cells whose text ends up
        empty are dropped.
    """
    overview_tables = soup.find_all(
        "table", class_="table table-condensed table-hover table-striped"
    )[:3]
    collected = []
    for overview_table in overview_tables:
        for cell in overview_table.select("th.text-right.pct-data"):
            value = cell.text.strip().replace('%', '')
            if value:
                collected.append(value)
    return collected

# Find and process the rivalry-statistics buttons on the current page
def process_rival_buttons(player_id):
    """Open each rival's statistics on the current page and record one row per rival.

    Uses the module-level Selenium ``driver`` and appends
    ``[player_id, opponent_id, *stats]`` to the module-level
    ``all_players_rivalries_stats`` list.

    Args:
        player_id: ID of the player whose rivalries page is currently loaded.
    """
    stats_links = driver.find_elements(By.XPATH, '//a[contains(@id, "rivalryStats-")]')
    for stats_link in stats_links:
        # The element id has the form "rivalryStats-<opponent id>"
        element_id = stats_link.get_attribute("id")
        opponent_id = element_id.split('-')[1]

        # Scroll the link into view before clicking it
        driver.execute_script("arguments[0].scrollIntoView(true);", stats_link)
        time.sleep(1)

        # Open the statistics; on failure skip this rival
        try:
            stats_link.click()
            time.sleep(2)
        except Exception as click_error:
            print(f"Error clicking on button: {click_error}")
            continue

        page = BeautifulSoup(driver.page_source, "html.parser")
        row = [player_id, opponent_id, *extract_overview_stats(page)]
        all_players_rivalries_stats.append(row)

        # Click again to close the statistics before moving on to the next rival
        try:
            stats_link.click()
            time.sleep(1)
        except Exception as close_error:
            print(f"Error closing stats modal: {close_error}")

# Collect the profile URLs of the top-100 players
def get_top_100_player_urls(pages=5):
    """Return the profile URLs linked from the first ``pages`` pages of the rankings table.

    Uses the module-level Selenium ``driver``: loads the rankings page, tries to
    dismiss the cookie banners, then reads every ``a[title^="Show "]`` link on
    each page and advances with the "next" pager button.

    Args:
        pages: Number of ranking pages to read. The default of 5 is what the
            original script used to cover the top 100.

    Returns:
        list[str]: Absolute profile URLs in the order they appear on the pages.
        The list is shorter than expected if the "next" button cannot be clicked.
    """
    site_root = 'https://www.ultimatetennisstatistics.com'
    player_urls = []
    driver.get(site_root + '/rankingsTable')
    time.sleep(3)
    # Best-effort dismissal of the two cookie banners. The buttons may not be present,
    # so failures are deliberately ignored. `Exception` (not a bare except) keeps
    # Ctrl-C / SystemExit working.
    try:
        driver.find_element(By.XPATH, '//*[@class="fc-button fc-cta-consent fc-primary-button"]').click()
        driver.find_element(By.XPATH, '//*[@class="btn btn-warning margin-left"]').click()
    except Exception:
        pass

    for page_index in range(pages):
        soup = BeautifulSoup(driver.page_source, "html.parser")
        player_urls.extend(site_root + link['href'] for link in soup.select('a[title^="Show "]'))

        # Nothing is read after the last requested page, so skip the extra click and wait
        if page_index == pages - 1:
            break

        # Move on to the next rankings page
        try:
            next_ranking_button = driver.find_element(By.XPATH, '//a[@data-page="next" and @class="button"]')
            next_ranking_button.click()
            time.sleep(3)
        except Exception:
            print("No more ranking pages or failed to click next ranking button")
            break
    return player_urls

# try/finally guarantees the browser is closed even when scraping aborts midway
try:
    # Collect the profile URLs of the top-100 players
    top_100_player_urls = get_top_100_player_urls()

    # Visit every top-100 player profile
    for player_url in top_100_player_urls:
        # The player ID is the value after "=" in the profile URL
        player_id = player_url.split('=')[1]

        driver.get(player_url)
        time.sleep(3)
        # Best-effort dismissal of the two cookie banners. The buttons may not be present,
        # so failures are deliberately ignored. `Exception` (not a bare except) keeps
        # Ctrl-C / SystemExit working.
        try:
            driver.find_element(By.XPATH, '//*[@class="fc-button fc-cta-consent fc-primary-button"]').click()
            driver.find_element(By.XPATH, '//*[@class="btn btn-warning margin-left"]').click()
        except Exception:
            pass

        # Open the Rivalries tab; without it there is nothing to scrape for this player
        try:
            rivalries_tab = driver.find_element(By.XPATH, '//a[@id="rivalriesPill"]')
            rivalries_tab.click()
            time.sleep(3)
        except Exception:
            print(f"Could not find Rivalries tab for player: {player_id}")
            continue

        # Process the first 5 pages of rivals
        for _ in range(5):
            process_rival_buttons(player_id)
            try:
                next_button = driver.find_element(By.XPATH, '//a[@data-page="next" and @class="button"]')
                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                time.sleep(1)
                next_button.click()
                time.sleep(3)
            except Exception:
                print("No more pages or failed to click next button")
                break

        # Append this player's rows to the CSV, then empty the buffer for the next player
        if all_players_rivalries_stats:
            df = pd.DataFrame(all_players_rivalries_stats, columns=columnas)
            df.to_csv(csv_file, mode='a', header=False, index=False)
            all_players_rivalries_stats.clear()

        print(f"Finished processing player with ID {player_id}")
finally:
    driver.quit()

In [None]:
# Standalone cell that closes the Selenium browser session held in `driver`
# (presumably for manual cleanup when the scraping cell stops before its own driver.quit()).
driver.quit()

# En caso de fallar por un baneo temporal, aquí hay una versión modificada para retomar el scrapeo a partir del último jugador procesado

Si el scraper falla, en los logs empezará a aparecer que no se han encontrado los rivales de ningún jugador a partir del que falló. Para retomar el scrapeo, sustituye 4920 por el ID del jugador a partir del cual ha fallado el script, tanto en la línea `df = df[df["Player ID"] != 4920]` como en la variable `continue_from_id = '4920'`. Por defecto está puesto el primer jugador del top 100; cámbialo si el script falla.

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import os

# Path to the unpacked uBlock Origin extension folder
extension_path = "/Users/usuario/Downloads/uBlock-Origin"  # Change this to the folder where the uBlock Origin extension (used to block pop-ups) is unpacked

# Selenium driver configuration
options = Options()
options.add_argument("--disable-notifications")
options.add_argument("--no-first-run")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument(f"--load-extension={extension_path}")  # Load the unpacked uBlock Origin extension
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
options.add_argument(f"user-agent={user_agent}")
s = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s, options=options)

# Column layout of the output DataFrame / CSV
columnas = ["Player ID", "Opponent ID", "Ace", "Double Fault", "1st Serve", "1st Serve Won",
            "2nd Serve Won", "Break Points Saved", "Service Points Won", "Service Games Won", "Ace Against",
            "Double Fault Against", "1st Serve Return Won", "2nd Serve Return Won", "Break Points Won",
            "Return Points Won", "Return Games Won", "Total Points Won", "Games Won", "Sets Won", "Matches Won"]

# CSV file the statistics are written to
csv_file = 'tennis_players_rivalries_stats_top100.csv'

# Read the players already scraped and drop every row of the player being resumed:
# the run may have failed halfway through that player and left partial records behind
if os.path.exists(csv_file):
    df = pd.read_csv(csv_file)
    df = df[df["Player ID"] != 4920]  # Defaults to the first top-100 player; change it to the player the scraper failed on (shown in the logs)
    df.to_csv(csv_file, index=False)
    # The scraping loop tests string IDs parsed from profile URLs against this collection,
    # while read_csv infers a numeric dtype for the column. Store the IDs as strings so
    # the membership test can actually match.
    processed_players = set(df["Player ID"].astype(str))
else:
    processed_players = set()

# Accumulator for scraped rows; flushed to the CSV after each player
all_players_rivalries_stats = []

# Extract the statistics shown in the Overview tables
def extract_overview_stats(soup):
    """Return the cleaned, non-empty figures of the first three Overview tables.

    Args:
        soup: Parsed page (BeautifulSoup object) to search for the tables.

    Returns:
        list[str]: One entry per ``th.text-right.pct-data`` cell of the first
        three matching tables (serve, return and totals), whitespace-stripped
        and with ``%`` removed. Entries that are empty after cleaning are omitted.
    """
    table_css_classes = "table table-condensed table-hover table-striped"
    first_three = soup.find_all("table", class_=table_css_classes)[:3]
    cleaned = (
        header_cell.text.strip().replace('%', '')
        for tbl in first_three
        for header_cell in tbl.select("th.text-right.pct-data")
    )
    return [text for text in cleaned if text]

# Find and process the rivalry-statistics buttons on the current page
def process_rival_buttons(player_id):
    """Record one statistics row for every rival listed on the current page.

    Works on the module-level Selenium ``driver``. For each link whose id
    contains ``rivalryStats-`` it opens the statistics, parses them with
    ``extract_overview_stats`` and appends ``[player_id, opponent_id, *stats]``
    to the module-level ``all_players_rivalries_stats`` list.

    Args:
        player_id: ID of the player whose rivalries page is currently loaded.
    """
    for toggle in driver.find_elements(By.XPATH, '//a[contains(@id, "rivalryStats-")]'):
        # The opponent ID is the second "-"-separated piece of the element id
        opponent_id = toggle.get_attribute("id").split('-')[1]

        # Bring the link into view before clicking it
        driver.execute_script("arguments[0].scrollIntoView(true);", toggle)
        time.sleep(1)

        # Open the statistics; a failed click skips this rival
        try:
            toggle.click()
            time.sleep(2)
        except Exception as open_error:
            print(f"Error clicking on button: {open_error}")
            continue

        parsed_page = BeautifulSoup(driver.page_source, "html.parser")
        rival_row = [player_id, opponent_id]
        rival_row.extend(extract_overview_stats(parsed_page))
        all_players_rivalries_stats.append(rival_row)

        # Click again to close the statistics before handling the next rival
        try:
            toggle.click()
            time.sleep(1)
        except Exception as close_error:
            print(f"Error closing stats modal: {close_error}")

# Collect the profile URLs of the top-100 players
def get_top_100_player_urls(pages=5):
    """Return the profile URLs linked from the first ``pages`` pages of the rankings table.

    Uses the module-level Selenium ``driver``: loads the rankings page, tries to
    dismiss the cookie banners, then reads every ``a[title^="Show "]`` link on
    each page and advances with the "next" pager button.

    Args:
        pages: Number of ranking pages to read. The default of 5 is what the
            original script used to cover the top 100.

    Returns:
        list[str]: Absolute profile URLs in the order they appear on the pages.
        The list is shorter than expected if the "next" button cannot be clicked.
    """
    site_root = 'https://www.ultimatetennisstatistics.com'
    player_urls = []
    driver.get(site_root + '/rankingsTable')
    time.sleep(3)
    # Best-effort dismissal of the two cookie banners. The buttons may not be present,
    # so failures are deliberately ignored. `Exception` (not a bare except) keeps
    # Ctrl-C / SystemExit working.
    try:
        driver.find_element(By.XPATH, '//*[@class="fc-button fc-cta-consent fc-primary-button"]').click()
        driver.find_element(By.XPATH, '//*[@class="btn btn-warning margin-left"]').click()
    except Exception:
        pass

    for page_index in range(pages):
        soup = BeautifulSoup(driver.page_source, "html.parser")
        player_urls.extend(site_root + link['href'] for link in soup.select('a[title^="Show "]'))

        # Nothing is read after the last requested page, so skip the extra click and wait
        if page_index == pages - 1:
            break

        # Move on to the next rankings page
        try:
            next_ranking_button = driver.find_element(By.XPATH, '//a[@data-page="next" and @class="button"]')
            next_ranking_button.click()
            time.sleep(3)
        except Exception:
            print("No more ranking pages or failed to click next ranking button")
            break
    return player_urls

# ID of the player to resume from (inclusive). Defaults to the first top-100 player; after a
# failure, set it to the player from which scraping should restart.
continue_from_id = '4920'
start_processing = False

# IDs parsed from URLs are strings, while IDs loaded from the CSV may be integers.
# Normalise them to strings so the "already processed" test can match.
already_processed_ids = {str(done_id) for done_id in processed_players}

# try/finally guarantees the browser is closed even when scraping aborts midway
try:
    # Collect the profile URLs of the top-100 players
    top_100_player_urls = get_top_100_player_urls()

    # Walk the top-100 profiles, skipping everything before the resume point
    for player_url in top_100_player_urls:
        # The player ID is the value after "=" in the profile URL
        player_id = player_url.split('=')[1]

        if player_id == continue_from_id:
            start_processing = True

        if not start_processing or player_id in already_processed_ids:
            print(f"Skipping player with ID {player_id}")
            continue

        driver.get(player_url)
        time.sleep(3)
        # Best-effort dismissal of the two cookie banners. The buttons may not be present,
        # so failures are deliberately ignored. `Exception` (not a bare except) keeps
        # Ctrl-C / SystemExit working.
        try:
            driver.find_element(By.XPATH, '//*[@class="fc-button fc-cta-consent fc-primary-button"]').click()
            driver.find_element(By.XPATH, '//*[@class="btn btn-warning margin-left"]').click()
        except Exception:
            pass

        # Open the Rivalries tab; without it there is nothing to scrape for this player
        try:
            rivalries_tab = driver.find_element(By.XPATH, '//a[@id="rivalriesPill"]')
            rivalries_tab.click()
            time.sleep(3)
        except Exception:
            print(f"Could not find Rivalries tab for player: {player_id}")
            continue

        # Process the first 5 pages of rivals
        for _ in range(5):
            process_rival_buttons(player_id)
            try:
                next_button = driver.find_element(By.XPATH, '//a[@data-page="next" and @class="button"]')
                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                time.sleep(1)
                next_button.click()
                time.sleep(3)
            except Exception:
                print("No more pages or failed to click next button")
                break

        # Append this player's rows to the CSV, then empty the buffer for the next player
        if all_players_rivalries_stats:
            df = pd.DataFrame(all_players_rivalries_stats, columns=columnas)
            df.to_csv(csv_file, mode='a', header=False, index=False)
            all_players_rivalries_stats.clear()

        print(f"Finished processing player with ID {player_id}")

    # Without this message a mistyped resume ID would skip every player silently
    if not start_processing:
        print(f"Player ID {continue_from_id} was not found among the collected URLs; nothing was scraped")
finally:
    driver.quit()

In [None]:
# Display the list of player profile URLs returned by get_top_100_player_urls()
top_100_player_urls

['https://www.ultimatetennisstatistics.com/playerProfile?playerId=4920',
 'https://www.ultimatetennisstatistics.com/playerProfile?playerId=52602',
 'https://www.ultimatetennisstatistics.com/playerProfile?playerId=6407',
 'https://www.ultimatetennisstatistics.com/playerProfile?playerId=50810',
 'https://www.ultimatetennisstatistics.com/playerProfile?playerId=25897',
 'https://www.ultimatetennisstatistics.com/playerProfile?playerId=26577',
 'https://www.ultimatetennisstatistics.com/playerProfile?playerId=644',
 'https://www.ultimatetennisstatistics.com/playerProfile?playerId=52642',
 'https://www.ultimatetennisstatistics.com/playerProfile?playerId=27834',
 'https://www.ultimatetennisstatistics.com/playerProfile?playerId=26006',
 'https://www.ultimatetennisstatistics.com/playerProfile?playerId=34553',
 'https://www.ultimatetennisstatistics.com/playerProfile?playerId=45105',
 'https://www.ultimatetennisstatistics.com/playerProfile?playerId=26008',
 'https://www.ultimatetennisstatistics.com