# ATP Web Scraping

In [4]:
from bs4 import BeautifulSoup
import requests
import re 
import pandas as pd
import numpy as np
from time import sleep
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import sys

# Create Scraping Procedure

In [5]:
#create connection
url = "https://www.atptour.com/en/rankings/singles?rankRange=0-100&rankDate=1973-08-23"
headers={'User-Agent': ''}
page = requests.get(url,timeout=15, headers= headers)
print(page)

<Response [200]>


In [6]:
soup = BeautifulSoup(page.content, "html.parser")

In [7]:
rows = soup.find("div", {"class": "table-rankings-wrapper"}).find_all('tr')
# rows

In [8]:
#Retrieve the ranking
ranking = rows[1].find("td", {"class": "rank-cell"}).get_text().strip()
ranking

'1'

In [10]:
Retrieve Country
country = rows[1].find("div", {"class": "country-item"}).find("img")['alt']
country

'ROU'

In [11]:
# Retrieve player name
player = rows[1].find("td", {"class": "player-cell"}).get_text().strip()
player

'Ilie Nastase'

In [12]:
# Retrieve player age
age = rows[1].find("td", {"class": "age-cell"}).get_text().strip()
age

'27'

In [13]:
# Retrieve player points
points = rows[1].find("a", {"ga-label": "rankings-breakdown"}).get_text().strip()
points

'0'

In [14]:
# Retrieve tournaments
tournaments = rows[1].find("td", {"class": "tourn-cell"}).get_text().strip()
tournaments

'0'

In [15]:
# Retrieve points dropping
points_dropping = rows[1].find("td", {"class": "pts-cell"}).get_text().strip()
points_dropping

'0'

In [16]:
# Retrieve next best
next_best = rows[1].find("td", {"class": "next-cell"}).get_text().strip()
next_best

'0'

In [17]:
# Retrieve all dates
date = soup.find('div', class_= 'dropdown-holder-wrapper').find_all("li")[1].get('data-value')

date




'1-5000'

In [18]:
soup.find('div', class_= 'dropdown-layout-wrapper rank-detail-filter').find_all('ul')[2].find_all("li")

[<li class="dropdown-default-label" style="display: none">
                     2022.09.26
                 </li>,
 <li class="" data-value="2022-09-26">
                         2022.09.26
                     </li>,
 <li class="" data-value="2022-09-19">
                         2022.09.19
                     </li>,
 <li class="" data-value="2022-09-12">
                         2022.09.12
                     </li>,
 <li class="" data-value="2022-08-29">
                         2022.08.29
                     </li>,
 <li class="" data-value="2022-08-22">
                         2022.08.22
                     </li>,
 <li class="" data-value="2022-08-15">
                         2022.08.15
                     </li>,
 <li class="" data-value="2022-08-08">
                         2022.08.08
                     </li>,
 <li class="" data-value="2022-08-01">
                         2022.08.01
                     </li>,
 <li class="" data-value="2022-07-25">
                      

In [19]:
nr_rows = len(soup.find('table', class_="mega-table" ).find('tbody').find_all('tr'))
nr_rows

100

In [20]:
# Test the data for one entry
df = pd.DataFrame([{
    "ranking": ranking,
    "country": country,
    "player": player,
    "age": age,
    "points": points,
    "tournaments": tournaments,
    "points_dropping": points_dropping,
    "next_best": next_best,
    "date": date
}])

df

Unnamed: 0,ranking,country,player,age,points,tournaments,points_dropping,next_best,date
0,1,ROU,Ilie Nastase,27,0,0,0,0,1-5000


# 2. Create links

In [21]:
drop_down=soup.find('div', {'class': 'main-content'}).find('ul',{'class': 'dropdown'})
#first one repeats so removing 0th element
date_list=[]

dates =soup.find('div', class_= 'dropdown-layout-wrapper rank-detail-filter').find_all('ul')[2].find_all("li")

for date in dates[1:]:
    date_list.append(date.get('data-value'))
        

In [38]:
len(date_list)*100

221400

# 3. Scraping

In [23]:
# df = pd.DataFrame()
df = pd.DataFrame()

def scrape(date):
    urlpattern = "https://www.atptour.com/en/rankings/singles?rankDate={}&rankRange=0-200"
    url = urlpattern.format(date)
    headers={'User-Agent': ''}
    response = requests.get(url,timeout=15, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        rows = soup.find("div", {"class": "table-rankings-wrapper"}).find_all('tr') 
        lst = []
        for row in rows[1:]:
            #Ranking
            try:
                ranking = row.find("td", {"class": "rank-cell"}).get_text().strip()
            except:
                ranking = np.nan 
#             #Move
#             try:
#                 move = row.find("div", {"class": "move-text"}).get_text().strip()
#             except:
#                 move = np.nan
            #Country
            try:
                country = row.find("div", {"class": "country-item"}).find("img")['alt']
            except:
                country = np.nan
            #Player
            try:
                player = row.find("td", {"class": "player-cell"}).get_text().strip()
            except:
                player = np.nan
            #Age
            try:
                age = row.find("td", {"class": "age-cell"}).get_text().strip()
            except:
                age = np.nan
            #Points
            try:
                points = row.find("a", {"ga-label": "rankings-breakdown"}).get_text().strip()
            except:
                points = np.nan
            #Tournaments
            try:
                tournaments = row.find("td", {"class": "tourn-cell"}).get_text().strip()
            except:
                tournaments = np.nan
            #Points dropping
            try:
                points_dropping = row.find("td", {"class": "pts-cell"}).get_text().strip()
            except:
                points_dropping = np.nan
            #Next best
            try:
                next_best = row.find("td", {"class": "next-cell"}).get_text().strip()
            except:
                next_best = np.nan
            
            
            temp = {
                "ranking": ranking,
#                 "move": move,
                "country": country,
                "player": player,
                "age": age,
                "points": points,
                "tournaments": tournaments,
                "points_dropping": points_dropping,
                "next_best": next_best,
                "date": date
            }
            lst.append(temp)
            
#             df = df.append(temp, ignore_index=True)
                
    else:
        print('Scraper is down!')
            
    return pd.DataFrame(lst)

In [24]:
df = scrape(date)
df

Unnamed: 0,ranking,country,player,age,points,tournaments,points_dropping,next_best,date
0,1,ESP,Carlos Alcaraz,19,6740,17,0,0,[\r\n 1973.08.23\r\n ...
1,2,NOR,Casper Ruud,23,5850,23,250,45,[\r\n 1973.08.23\r\n ...
2,3,ESP,Rafael Nadal,36,5810,10,0,0,[\r\n 1973.08.23\r\n ...
3,4,RUS,Daniil Medvedev,26,5065,20,0,0,[\r\n 1973.08.23\r\n ...
4,5,GER,Alexander Zverev,25,5040,19,0,0,[\r\n 1973.08.23\r\n ...
...,...,...,...,...,...,...,...,...,...
195,196,BIH,Nerman Fatic,27,268,26,0,0,[\r\n 1973.08.23\r\n ...
196,197,CHN,Juncheng Shang,17,265,22,10,0,[\r\n 1973.08.23\r\n ...
197,198,AUT,Gerald Melzer,32,265,19,29,0,[\r\n 1973.08.23\r\n ...
198,199,BEL,Kimmer Coppejans,28,263,27,0,0,[\r\n 1973.08.23\r\n ...


## Scrape for all available dates

In [36]:
# with tqdm(total=len(date_list), file=sys.stdout) as pbar:
dates_df = df['date'].unique()
for date in tqdm(date_list):
    if date not in dates_df:
        temp_df = scrape(date)
        df = pd.concat([df, temp_df])
    else:
        pass
df

  0%|          | 0/2214 [00:00<?, ?it/s]

Unnamed: 0,ranking,country,player,age,points,tournaments,points_dropping,next_best,date
0,1,ESP,Carlos Alcaraz,19,6740,17,0,0,[\r\n 1973.08.23\r\n ...
1,2,NOR,Casper Ruud,23,5850,23,250,45,[\r\n 1973.08.23\r\n ...
2,3,ESP,Rafael Nadal,36,5810,10,0,0,[\r\n 1973.08.23\r\n ...
3,4,RUS,Daniil Medvedev,26,5065,20,0,0,[\r\n 1973.08.23\r\n ...
4,5,GER,Alexander Zverev,25,5040,19,0,0,[\r\n 1973.08.23\r\n ...
...,...,...,...,...,...,...,...,...,...
181,181,VEN,Humphrey Hose,26,0,0,0,0,1973-08-23
182,182,USA,Franklin Robbins,23,0,0,0,0,1973-08-23
183,183,GBR,David Lloyd,25,0,0,0,0,1973-08-23
184,184,SRB,Nicola Spear,29,0,0,0,0,1973-08-23


In [32]:
df.shape

(3000, 9)

In [33]:
df[df['date']=='1981-04-27']

Unnamed: 0,ranking,country,player,age,points,tournaments,points_dropping,next_best,date


In [34]:
df.tail()

Unnamed: 0,ranking,country,player,age,points,tournaments,points_dropping,next_best,date
195,196,ITA,Alessandro Giannessi,32,311,25,35,0,2022-06-06
196,197,GBR,Ryan Peniston,26,306,29,10,5,2022-06-06
197,198,AUT,Dominic Thiem,28,305,14,180,0,2022-06-06
198,199,ITA,Luca Nardi,18,304,27,0,0,2022-06-06
199,200,SUI,Dominic Stricker,19,303,22,45,0,2022-06-06


# 4. Export Data

Due to the large size of the DataFrame, it is recommended that the data is exported to CSV or to JSON.

CSV

In [None]:
df.to_csv('atp-rankings.csv', index=False)

JSON

In [None]:
with open('rankings.json', 'w') as f:
    f.write(df.to_json(orient='records', lines=True))