In [1]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
import altair as alt

In [2]:
# Base URL 
url_base = "https://www.eurosport.com/tennis/{}/standings_sea{}.shtml"

# Arrays for ATP/WTA and seasons
orgs = ['atp', 'wta']
seasons = ['340', '338', '336', '334', '99']

# Storing the data
length = len(orgs) * len(seasons)
data = np.empty(length, dtype='object')

# scrape data
index = 0
for org in orgs:
    for season in seasons:
        URL = url_base.format(org, season)
        html = requests.get(URL)
        soup = BeautifulSoup(html.content, 'html.parser')
        
        # Find table rows
        table = soup.find_all("tr")
        extracted_data = []
        
        # Extract data from the table
        for row in table:
            cells = row.find_all('td')
            if len(cells) > 3: 
                rank = cells[1].text.strip()
                player = cells[2].text.strip()
                points = cells[3].text.strip()
                extracted_data.append([rank, player, points, org, season])
        
        
        data[index] = extracted_data
        index += 1


all_data = []
for item in data:
    all_data.extend(item)


In [3]:
# Saving dataframe and processing
df = pd.DataFrame(all_data, columns=['Rank', 'Player', 'Points', 'Org', 'Season'])
df['Season'] = df['Season'].replace(['340', '338', '336', '334', '99'], [2024 ,2023, 2022, 2021, 2020])

In [4]:
df['Season'] = df['Season'].astype(int)
df['Points'] = df['Points'].astype(int)
df['Rank'] = df['Rank'].astype(int)
df.dtypes

Rank       int64
Player    object
Points     int64
Org       object
Season     int64
dtype: object

Option 1:

In [7]:
# Filter the data for recent seasons and top 10 ranks
recent_data = df[(df['Rank'] <= 10)]

# Create the visualization for ATP
atp_chart = alt.Chart(recent_data[recent_data['Org'] == 'atp']).mark_circle(size=100).encode(
    x=alt.X('Season:O', title=None),
    y=alt.Y('Points:Q', title=None),
    color=alt.Color('Org:N', 
        title='Tour',
        scale=alt.Scale(domain=['atp'], 
                       range=['#2ecc71']),
        legend=None),
    tooltip=['Player', 'Points', 'Rank', 'Season', 'Org']
).properties(
    width=300,
    height=400,
    title='ATP'
).interactive()



# Add text labels for the top 3 ATP players each year
atp_labels = alt.Chart(recent_data[(recent_data['Org'] == 'atp') & (recent_data['Rank'].astype(int) <= 3)]).mark_text(
    align='right',
    baseline='middle',
    dx=10,
    dy=-10
).encode(
    x='Season:O',
    y='Points:Q',
    text='Player',
    color=alt.value('black')
)

# Combine the ATP charts
final_atp_chart = atp_chart + atp_labels

# Create the visualization for WTA
wta_chart = alt.Chart(recent_data[recent_data['Org'] == 'wta']).mark_circle(size=100).encode(
    x=alt.X('Season:O', title=None),
    y=alt.Y('Points:Q', title=None),
    color=alt.Color('Org:N', 
        title='Tour',
        scale=alt.Scale(domain=['wta'], 
                       range=['#e74c3c']),
        legend=None),
    tooltip=['Player', 'Points', 'Rank', 'Season', 'Org']
).properties(
    width=300,
    height=400,
    title='WTA'
).interactive()

# Add text labels for the top 3 WTA players each year
wta_labels = alt.Chart(recent_data[(recent_data['Org'] == 'wta') & (recent_data['Rank'].astype(int) <= 3)]).mark_text(
    align='center',
    baseline='middle',
    dx=1,
    dy=-10
).encode(
    x='Season:O',
    y='Points:Q',
    text='Player',
    color=alt.value('black')
)

# Combine the WTA charts
final_wta_chart = wta_chart + wta_labels

# Add title and subtitle to the final chart
final_chart = alt.hconcat(final_atp_chart, final_wta_chart).resolve_scale(
    color=alt.ResolveMode('independent')
).properties(
    title={
        "text": "Distribution of Tennis ranking points",
        "subtitle": "Difference in year-end ranking points obtained by top 10 players in the past 5 years, by tour type"
    }
).to_json()


with open('CC5_fig1.json', 'w') as f:
    f.write(final_chart)

final_chart


'{\n  "$schema": "https://vega.github.io/schema/vega-lite/v5.20.1.json",\n  "config": {\n    "view": {\n      "continuousHeight": 300,\n      "continuousWidth": 300\n    }\n  },\n  "datasets": {\n    "data-9eecd7599f9d4e7c456f6bbaf6d53bca": [\n      {\n        "Org": "atp",\n        "Player": "J. Sinner",\n        "Points": 11830,\n        "Rank": 1,\n        "Season": 2024\n      },\n      {\n        "Org": "atp",\n        "Player": "A. Zverev",\n        "Points": 7915,\n        "Rank": 2,\n        "Season": 2024\n      },\n      {\n        "Org": "atp",\n        "Player": "C. Alcaraz",\n        "Points": 7010,\n        "Rank": 3,\n        "Season": 2024\n      },\n      {\n        "Org": "atp",\n        "Player": "T. Fritz",\n        "Points": 5100,\n        "Rank": 4,\n        "Season": 2024\n      },\n      {\n        "Org": "atp",\n        "Player": "D. Medvedev",\n        "Points": 5030,\n        "Rank": 5,\n        "Season": 2024\n      },\n      {\n        "Org": "atp",\n      