# Setup

## Installing and Importing Libraries

Updating all existing Installing libraries and installing required libraries that doesn't come with google colab.  

**Please run this cell before importing the libraries in order to avoid errors.**

In [None]:
!apt update
!apt install -yq chromium-chromedriver
!pip install selenium

Importing libraries.

In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.relative_locator import locate_with
from bs4 import BeautifulSoup

## Connecting to driver and loading website

Establishing connection to browser:

In [3]:
# Set up options for Chrome in headless mode
options = webdriver.ChromeOptions()
options.add_argument('--headless') # allows Chrome to run without a graphical user interface
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Start the WebDriver
driver = webdriver.Chrome(options=options)

# Creating Functions to Scarpe Data

## Retrieving the link to the player's profile


**Example of getting profile link of one player:** The following chunk of code is an example on how to get one link to the player's profile page.

In [None]:
main_url = "https://www.atptour.com/en/players"
search_term = "roger federer"

try:
    # Step 1: Navigate to the URL
    driver.get(main_url)

    # Step 2: Find the search bar by ID and input the player's name
    driver.implicitly_wait(5)

    search_bar = driver.find_element(By.ID, "playerInput")
    search_bar.send_keys(search_term)

    # Step 3: Wait for the dropdown to appear and find the first 'no-padding' class
    driver.implicitly_wait(5)

    dropdown = driver.find_element(By.ID, "playerDropdown")
    first_option = dropdown.find_element(By.CLASS_NAME,"no-padding")

    # Step 4: Retrieve the link inside the first 'no-padding' class; which is the link to the player's profile
    player_link_class = first_option.find_element(By.TAG_NAME,"a")
    player_link = str(player_link_class.get_attribute("href"))
    print(player_link)

finally:
    # Optional: Close the browser window when you are done
    driver.quit()

**Creating a function to retrieve multiple profiles links:** In following code I created a function that takes a list of players names as an argument and returns a list of links to their profiles.

In [4]:
def retrieving_players_profile_links(players_names: list[str], main_url = "https://www.atptour.com/en/players"):
  """This function takes a list with the names of tennis players and returns a list with links to their profile
  on the ATP website"""

  try:
    players_links_list = []
    for i in range(len(players_names)):
      # Step 1: Navigate to the URL
      driver.get(main_url)

      # Step 2: Find the search bar by ID and input the player's name
      driver.implicitly_wait(5)

      search_bar = driver.find_element(By.ID, "playerInput")
      search_bar.send_keys(players_names[i])

      # Step 3: Wait for the dropdown to appear and find the first 'no-padding' class
      driver.implicitly_wait(5)

      dropdown = driver.find_element(By.ID, "playerDropdown")
      first_option = dropdown.find_element(By.CLASS_NAME,"no-padding")

      # Step 4: Retrieve the link inside the first 'no-padding' class; which is the link to the player's profile
      player_link_class = first_option.find_element(By.TAG_NAME,"a")
      player_link = str(player_link_class.get_attribute("href"))
      # print(player_link)
      players_links_list.append(player_link)

  finally:
    # Optional: Close the browser window when you are done
    driver.quit()
    return players_links_list

An example on how to use the function:

In [None]:
players_profile_links = retrieving_players_profile_links(players_names=["roger federer","rafael nadal"])
players_profile_links

---

## Retrieving player's data from the profile link

**Example of getting the data of one player:** The following chunk of code is an example on how to get the data of one player from his profile page.

In [None]:
player_variabels = []
try:
  # Step 1: Navigate to the URL
  driver.get(player_link)

  # Step 2: Getting the html element that contains the player's static data (birthdate, year turned pro, weight, etc.)
  driver.implicitly_wait(5)

  players_data = driver.find_element(By.CLASS_NAME, "player-profile-hero-table") # This element contains all the static data
  players_data_html = BeautifulSoup(players_data.get_attribute("innerHTML"), "html.parser") # Turning it to a readable HTML code

  # Step 3: Getting the nested data
  player_attri = players_data_html.find_all(class_=["table-big-value","table-value"])

  # Step 4: Inserting it to a container
  for el in player_attri:
    player_variabels.append(el.get_text().strip())

  # Step 5: Cleaning the birthdate variable
  player_variabels[0] = player_variabels[0][-11:-1]
  print(player_variabels)

finally:
    # Optional: Close the browser window when you are done
    driver.quit()

**Creating a function to retrieve data for multiple players:** In following code I created a function that takes a list of players names and their profile links as arguments and returns a dataframe with the player's data.

In [5]:
def retrieving_players_data_from_profile_links(players_names: list[str], profile_links: list[str]):

  players_features_df = pd.DataFrame(columns=["Name", "Birthdate", "Year_Turned_Pro", "Birth_Place", "Weight", "Height", "Hands", "Coach"])

  for i in range(len(profile_links)):
    player_variabels = []
    # Step 1: Navigate to the URL
    driver.get(profile_links[i])

    # Step 2: Getting the html element that contains the player's static data (birthdate, year turned pro, weight, etc.)
    driver.implicitly_wait(5)

    players_data = driver.find_element(By.CLASS_NAME, "player-profile-hero-table") # This element contains all the static data
    players_data_html = BeautifulSoup(players_data.get_attribute("innerHTML"), "html.parser") # Turning it to a readable HTML code

    # Step 3: Getting the nested data
    player_attri = players_data_html.find_all(class_=["table-big-value","table-value"])

    # Step 4: Inserting it to a container
    for el in player_attri:
      player_variabels.append(el.get_text().strip())

    # Step 5: Cleaning the birthdate variable and adding the player's name
    player_variabels[0] = player_variabels[0][-11:-1]
    player_variabels.insert(0, players_names[i])

    # Step 6: Inserting the container to the DF
    players_features_df.loc[len(players_features_df.index)] = player_variabels

  return players_features_df

An example on how to use the function:

In [None]:
retrieving_players_data_from_profile_links(players_names=["roger federer","rafael nadal"], profile_links=players_profile_links)

---

# Importing Data about all the players I have

Importing matches dataset:

In [8]:
matches_df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/Tennis_Analysis/atp_tennis.csv", header=0)
matches_df.tail()

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score
61142,Zhuhai Championships,2023-09-24,ATP250,Outdoor,Hard,Quarterfinals,3,Karatsev A.,Norrie C.,Karatsev A.,63,17,853,1985,2.5,1.53,7-6 7-6
61143,Zhuhai Championships,2023-09-24,ATP250,Outdoor,Hard,Quarterfinals,3,Struff J.L.,Nishioka Y.,Nishioka Y.,23,46,1474,1025,2.2,1.67,4-6 5-7
61144,Zhuhai Championships,2023-09-25,ATP250,Outdoor,Hard,Semifinals,3,Khachanov K.,Korda S.,Khachanov K.,15,33,2135,1295,2.2,1.67,7-5 6-4
61145,Zhuhai Championships,2023-09-25,ATP250,Outdoor,Hard,Semifinals,3,Karatsev A.,Nishioka Y.,Nishioka Y.,63,46,853,1025,1.73,2.1,4-6 4-6
61146,Zhuhai Championships,2023-09-26,ATP250,Outdoor,Hard,The Final,3,Khachanov K.,Nishioka Y.,Khachanov K.,15,46,2135,1025,1.57,2.38,7-6 6-1


Importing player names data:

In [14]:
male_players_names_df = pd.read_csv("/content/drive/MyDrive/Tennis_Analysis/male_players_names.csv")
male_players_names_df.shape

(648, 4)

Grabbing only the player columns:

In [16]:
all_players_names_series = pd.concat([matches_df["Player_1"], matches_df["Player_2"]], ignore_index=True)
all_players_names_list = list(set(all_players_names_series))
all_players_names_list

['Prieto S.',
 'Monaco J.',
 'Wang J.',
 'Authom M.',
 'Slabinsky A.',
 'Biryukov M.',
 'Galarneau A.',
 'Taino E.',
 'Evans D.',
 'Fidirko N.',
 'Bourgue M.',
 'Mesaros K.',
 'Kopriva V.',
 'Etcheverry T.',
 'Venus M.',
 'Molleker R.',
 'Chaki R.',
 'Williams R.',
 'De Loore J.',
 'Choinski J.',
 'Hsu Y.',
 'Montanes A.',
 'Struvay E.',
 'Wolf J.J.',
 'Economidis K.',
 'Ryderstedt M.',
 'Matosevic M.',
 'Diallo G.',
 'Gasquet R. ',
 'Motomura G.',
 'Simoni A.',
 'Ball C.',
 'Pavel A.',
 'Daniel M.',
 'Healey N.',
 'Gabashvili T.',
 'Vaisse M.',
 'Poljicak M.',
 'Zhang Z.',
 'Stoliarov A.',
 'Weinzierl J.',
 'Podlipnik H.',
 'Ciorcila P.',
 'Davydenko N.',
 'Vliegen K.',
 'Gooding J.',
 'Zeppieri G.',
 'Ruevski P.',
 'Granollers-Pujol G.',
 'Estrella Burgos V.',
 'Berrettini M. ',
 'Mathieu P.',
 'Baluda V.',
 'Fitz S.',
 'Andersen J.F.',
 'Napolitano S.',
 'Martin A. ',
 'Mitchell B.',
 'Kiefer N.',
 'Coria F.',
 'Cakl T.',
 'Joyce M.',
 'Bailly G.',
 'Montanes A. ',
 'Rodionov J.',
 