In [1]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
import altair as alt

In [2]:
# Base URL; the two placeholders are the tour name and the season code used in the path
url_base = "https://www.eurosport.com/tennis/{}/standings_sea{}.shtml"

# Tours and season codes to scrape (one page per combination)
orgs = ['atp', 'wta']
seasons = ['340', '338', '336', '334', '99']

# Without a timeout, requests waits forever on a stalled connection and hangs the kernel
REQUEST_TIMEOUT_SECONDS = 30

# One list of extracted rows per (tour, season) page, in request order
data = []

# scrape data
for org in orgs:
    for season in seasons:
        URL = url_base.format(org, season)
        html = requests.get(URL, timeout=REQUEST_TIMEOUT_SECONDS)
        # Stop on HTTP errors instead of parsing an error page into zero rows
        html.raise_for_status()
        soup = BeautifulSoup(html.content, 'html.parser')

        # Extract data from every table row that has at least four <td> cells;
        # rows with fewer cells (e.g. header rows without <td>) are skipped
        extracted_data = []
        for row in soup.find_all("tr"):
            cells = row.find_all('td')
            if len(cells) > 3:
                rank = cells[1].text.strip()
                player = cells[2].text.strip()
                points = cells[3].text.strip()
                extracted_data.append([rank, player, points, org, season])

        data.append(extracted_data)

# Flatten the per-page lists into a single list of rows
all_data = [record for page_rows in data for record in page_rows]


In [3]:
# Build the standings table from the scraped rows
column_names = ['Rank', 'Player', 'Points', 'Org', 'Season']
df = pd.DataFrame(all_data, columns=column_names)

# Translate each season code into its calendar year
season_years = {'340': 2024, '338': 2023, '336': 2022, '334': 2021, '99': 2020}
df['Season'] = df['Season'].replace(season_years)

In [4]:
# Cast the numeric columns to int, then display the resulting dtypes
for numeric_column in ('Season', 'Points', 'Rank'):
    df[numeric_column] = df[numeric_column].astype(int)
df.dtypes

Rank       int64
Player    object
Points     int64
Org       object
Season     int64
dtype: object

Option 1:

In [25]:
def _tour_top10_layers(rankings, org, color, title, label_align, label_dx):
    """Build the scatter layer and the top-3 label layer for one tour.

    Parameters:
        rankings: DataFrame with 'Rank', 'Player', 'Points', 'Org' and 'Season' columns.
        org: value of the 'Org' column to keep (e.g. 'atp').
        color: colour used for this tour's points.
        title: chart title.
        label_align: horizontal alignment of the player-name labels.
        label_dx: horizontal pixel offset of the player-name labels.

    Returns:
        A (points, labels) tuple of Altair charts; the points layer is interactive.
    """
    tour_rows = rankings[rankings['Org'] == org]

    points = alt.Chart(tour_rows).mark_circle(size=100).encode(
        x=alt.X('Season:O', title=None),
        y=alt.Y('Points:Q', title=None),
        color=alt.Color('Org:N',
            title='Tour',
            scale=alt.Scale(domain=[org],
                           range=[color]),
            legend=None),
        tooltip=['Player', 'Points', 'Rank', 'Season', 'Org']
    ).properties(
        width=300,
        height=400,
        title=title
    ).interactive()

    # Only the three best-ranked players per season get a name label
    labels = alt.Chart(tour_rows[tour_rows['Rank'] <= 3]).mark_text(
        align=label_align,
        baseline='middle',
        dx=label_dx,
        dy=-10
    ).encode(
        x='Season:O',
        y='Points:Q',
        text='Player',
        color=alt.value('black')
    )
    return points, labels


# Keep only the top 10 ranks (no season filter is applied here)
recent_data = df[(df['Rank'] <= 10)]

# ATP: scatter of points per season plus labels for the top 3 players
atp_chart, atp_labels = _tour_top10_layers(
    recent_data, 'atp', '#2ecc71', 'ATP Tennis Rankings Distribution',
    label_align='right', label_dx=10
)
final_atp_chart = atp_chart + atp_labels

# WTA: same layout, with its own colour and label placement
wta_chart, wta_labels = _tour_top10_layers(
    recent_data, 'wta', '#e74c3c', 'WTA Tennis Rankings Distribution',
    label_align='center', label_dx=1
)
final_wta_chart = wta_chart + wta_labels

# Combine the ATP and WTA charts side by side, each with its own colour scale
final_chart = alt.hconcat(final_atp_chart, final_wta_chart).resolve_scale(
    color=alt.ResolveMode('independent')
)

final_chart


In [6]:
# Player names from the first ten 2024 rows of each tour, in the table's existing row order
season_2024 = df[df['Season'] == 2024]
current_atp = season_2024[season_2024['Org'] == 'atp']['Player'].iloc[:10].tolist()
current_wta = season_2024[season_2024['Org'] == 'wta']['Player'].iloc[:10].tolist()

In [7]:
# Inspect the ATP player names selected for the 2024 season
current_atp

['J. Sinner',
 'A. Zverev',
 'C. Alcaraz',
 'T. Fritz',
 'D. Medvedev',
 'C. Ruud',
 'N. Djokovic',
 'A. Rublev',
 'A. De Minaur',
 'G. Dimitrov']

In [8]:

# Filter data for the selected players across the seasons from 2020 onwards
atp_history = df[
    (df['Org'] == 'atp') &
    (df['Player'].isin(current_atp)) &
    (df['Season'] >= 2020)
]

wta_history = df[
    (df['Org'] == 'wta') &
    (df['Player'].isin(current_wta)) &
    (df['Season'] >= 2020)
]

# Combine the data
plot_data = pd.concat([atp_history, wta_history])

# Create the base chart shared by both tours
base = alt.Chart(plot_data).encode(
    x=alt.X('Season:O', title='Year'),
    # An ascending domain with reverse=True draws rank 1 at the top. A descending
    # domain combined with reverse=True would flip the axis twice and put rank 1
    # at the bottom.
    y=alt.Y('Rank:Q',
            scale=alt.Scale(domain=[1, 30], reverse=True),
            title='Ranking'),
    color=alt.Color('Player:N',
                   legend=alt.Legend(title='Players')),
    tooltip=['Player', 'Rank', 'Points', 'Season', 'Org']
)


def _history_chart(org, title):
    """Return the line chart of `base` restricted to rows whose 'Org' equals `org`."""
    return base.transform_filter(
        alt.datum.Org == org
    ).mark_line(point=True).properties(
        width=400,
        height=300,
        title=title
    )


# Create separate charts for ATP and WTA; the titles match the data actually
# plotted (ten players per tour, seasons 2020-2024)
atp_chart = _history_chart('atp', 'ATP Top 10 Players Ranking History (2020-2024)')
wta_chart = _history_chart('wta', 'WTA Top 10 Players Ranking History (2020-2024)')

# Combine the charts vertically, each with its own player colour scale
final_chart = alt.vconcat(atp_chart, wta_chart).resolve_scale(
    color=alt.ResolveMode('independent')
)

final_chart