# Analysis of ATP Tennis Matches (2000-2024)

Import the cleaned match data into a DataFrame.

In [None]:
import numpy as np
import pandas as pd
import os

# Locate the cleaned CSV relative to the current working directory
# (one level up, then data/processed/) and load it into `df`, the
# DataFrame every later cell reads from.
cwd = os.getcwd()
path_parts = ("..", "data", "processed", "tennis_cleaned_data.csv")
file = os.path.join(cwd, *path_parts)
df = pd.read_csv(file)



1) List all players who won a tournament as a lucky loser.

In [None]:
# Columns to display; the next cell reuses this list.
show_columns = ['winner_name','winner_entry','tourney_name','year','tourney_level']

# The winner of a final-round ("F") row is treated as the tournament winner;
# keep only those rows whose winner_entry is "LL".
won_final = df['round'].eq("F")
entered_as_ll = df['winner_entry'].eq("LL")
lucky_losers = df.loc[entered_as_ll & won_final, show_columns]

print("Players who won a tournament as a lucky loser:")
display(lucky_losers.reset_index(drop=True))

2) List all players who won a big tournament (Grand Slam or Masters) as a qualifier or wildcard.

In [None]:
# Winner entered as "Q" or "WC" (qualifier / wildcard, per the printed title).
qual_or_wild = df['winner_entry'].isin(["Q", "WC"])

# Tournament level "G" or "M"; this mask is reused by later cells.
big_tournament = df['tourney_level'].isin(["G","M"])

# Restrict to final-round rows so each row is a tournament win.
final_round = df['round'].eq("F")
winners = df.loc[qual_or_wild & big_tournament & final_round, show_columns]

print("Players who won a big tournament (Grand Slam or Masters) as a qualifier or wildcard:")
display(winners.reset_index(drop=True))

3) List the top 10 youngest and oldest big tournament winners.

In [None]:
show_columns = ['winner_name','winner_age','tourney_name','year','tourney_level']

# Final-round rows of big tournaments (mask `big_tournament` comes from an earlier cell).
big_finals = big_tournament & df['round'].eq("F")
winners = df.loc[big_finals, show_columns]

# --- Top 10 oldest: sort by age, largest first ---
oldest_first = winners.sort_values(by='winner_age', ascending=False)
oldest_winners = oldest_first.head(10)

print("Top 10 oldest big tournament (Grand Slam or Masters) champions:")

display(oldest_winners.reset_index(drop=True))


# --- Top 10 youngest ---

# Exclude any tournament whose name contains both "next" and "gen"
# (case-insensitive), i.e. the Next Gen Finals.
tourney_names = winners['tourney_name']
next_gen_mask = (
    tourney_names.str.contains('next', case=False, na=False)
    & tourney_names.str.contains('gen', case=False, na=False)
)

winners_filtered = winners.loc[~next_gen_mask]

youngest_first = winners_filtered.sort_values(by='winner_age', ascending=True)
youngest_winners = youngest_first.head(10)

print("Top 10 youngest big tournament (Grand Slam or Masters) champions:")

display(youngest_winners.reset_index(drop=True))


4) List the top 10 lowest ranked big tournament winners.

In [None]:

show_columns = ['winner_name','winner_rank','tourney_name','year','tourney_level']

# Final-round rows of big tournaments (mask `big_tournament` comes from an earlier cell).
is_big_final = big_tournament & df['round'].eq("F")
winners = df.loc[is_big_final, show_columns]

# A larger rank number means a lower-ranked player, so sort descending.
lowest_ranked = winners.sort_values(by='winner_rank', ascending=False).iloc[:10]

print("Top 10 lowest ranked big tournament (Grand Slam or Masters) winners:")
display(lowest_ranked.reset_index(drop=True))

5) List the top 10 players with the most wins in the 2000s, 2010s and 2020s.

In [None]:
# Rows whose score is exactly 'W/O' are excluded so they are not counted as wins.
filtered_df = df[df['score'] != 'W/O']

# Decade label -> inclusive (first_year, last_year) range.
decades = {
    '2000s': (2000, 2009),
    '2010s': (2010, 2019),
    '2020s': (2020, 2024)
}

for label, (start, end) in decades.items():
    in_decade = filtered_df['year'].between(start, end)

    # Count wins per player and keep the ten highest. The columns are named
    # explicitly because the labels produced by value_counts().reset_index()
    # differ between pandas versions.
    table = (
        filtered_df.loc[in_decade, 'winner_name']
        .value_counts()
        .head(10)
        .rename_axis('player')
        .reset_index(name='wins')
    )

    print(f"Top 10 Winners in the {label}:")
    display(table)
    print("\n" + "="*40 + "\n")

6) List the top 10 players with the most wins on each surface.

In [None]:
# Drop rows with no recorded surface and rows whose score is exactly 'W/O'.
filtered_df = df[(df['surface'].notnull()) & (df['score'] != 'W/O')]

# Surfaces to report on, in order of first appearance. Missing values are
# removed explicitly instead of slicing off the last element of unique(),
# which only works if NaN exists and happens to come last.
unique_surfaces = df['surface'].dropna().unique()

for surface in unique_surfaces:
    on_surface = filtered_df['surface'] == surface

    # Count wins per player and keep the ten highest. The columns are named
    # explicitly because the labels produced by value_counts().reset_index()
    # differ between pandas versions.
    table = (
        filtered_df.loc[on_surface, 'winner_name']
        .value_counts()
        .head(10)
        .rename_axis('player')
        .reset_index(name='wins')
    )

    print(f"Top 10 Winners on {surface}:")
    display(table)
    print("\n" + "="*40 + "\n")
    


7) Has a player ever beaten the Big 3 (Federer, Nadal, Djokovic) in the same tournament?

In [None]:
show_columns = ['year','tourney_name','tourney_level','surface','round','winner_name','loser_name','score']

# Which Big 3 surname (if any) appears in loser_name, case-insensitively.
# NaN where none matches. NOTE(review): this is a substring match, so any
# other player whose name contains one of these surnames is matched too.
big_3_surname = (
    df['loser_name']
    .str.extract(r'(?i)(federer|nadal|djokovic)', expand=False)
    .str.lower()
)
big_3_lost_mask = big_3_surname.notna()

big_3_lost_filtered = df.loc[big_3_lost_mask, show_columns]

# Keep (tournament, year, winner) groups in which all three surnames appear
# among the losers. Counting rows alone is not enough: if the same opponent
# is beaten twice in one event (e.g. round robin and a later round), three
# rows can cover only two different opponents.
big_3_lost = (
    big_3_lost_filtered
    .assign(_big_3=big_3_surname[big_3_lost_mask])
    .groupby(['tourney_name','year','winner_name'])
    .filter(lambda g: g['_big_3'].nunique() == 3)
    .drop(columns='_big_3')
)

print("Instances where a player beat the Big 3 (Federer, Nadal, Djokovic) in the same tournament:")
display(big_3_lost.reset_index(drop=True))

8) List all big tournament matches that went 6-0, 6-0 or 6-0, 6-0, 6-0.

In [None]:
# `big_tournament` and `show_columns` are defined in earlier cells.
# Match scores that are exactly two or three 6-0 sets.
all_bagel_score = df['score'].isin(['6-0 6-0','6-0 6-0 6-0'])
bageled = df.loc[big_tournament & all_bagel_score, show_columns]

print("Big tournament matches (Grand Slam or Masters) that went 6-0, 6-0 or 6-0, 6-0, 6-0.")
display(bageled.reset_index(drop=True))

9) Who are the youngest and oldest players to beat the world number one?

In [None]:
show_columns = ['year','tourney_name','tourney_level','surface','round','winner_name','winner_age','winner_rank','loser_name','loser_age','loser_rank','score']

# Matches in which the loser's rank was 1.
loser_was_no_1 = df['loser_rank'].eq(1)
number_one_lost = df.loc[loser_was_no_1, show_columns]

# Youngest winners: smallest winner_age first.
by_age_ascending = number_one_lost.sort_values(by='winner_age', ascending=True)
youngest_winners = by_age_ascending.head(10)

print("Top 10 youngest players to beat the world no. 1")

display(youngest_winners.reset_index(drop=True))

# Oldest winners: largest winner_age first.
by_age_descending = number_one_lost.sort_values(by='winner_age', ascending=False)
oldest_winners = by_age_descending.head(10)

print("Top 10 oldest players to beat the world no. 1")

display(oldest_winners.reset_index(drop=True))

10) List the top 10 players who have the most career aces.

In [None]:
# Aces per player from matches they won (w_ace) and matches they lost (l_ace).
aces_won = df.groupby('winner_name')['w_ace'].sum()
aces_lost = df.groupby('loser_name')['l_ace'].sum()

# Add the two tallies; fill_value=0 keeps players who appear on only one
# side. Then rank from most to fewest aces and shape the result into a
# two-column table with whole-number totals.
total_aces = (
    aces_won.add(aces_lost, fill_value=0)
    .sort_values(ascending=False)
    .astype(int)
    .rename_axis('player')
    .reset_index(name='total_aces')
)

print("Top 10 player career aces:")

display(total_aces.head(10))

