# Analysis of ATP Tennis Matches (2000–2024)

Import the cleaned match data into a DataFrame.

In [75]:
import numpy as np
import pandas as pd
import os

# Build the path to the cleaned CSV relative to the current working
# directory, then load it into the DataFrame the later cells query.
cwd = os.getcwd()
relative_parts = ("..", "data", "processed", "tennis_cleaned_data.csv")
file = os.path.join(cwd, *relative_parts)
df = pd.read_csv(file)



1) List all players who won a tournament as a lucky loser.

In [76]:
# Columns to show in the result table.
show_columns = ['winner_name','winner_entry','tourney_name','year','tourney_level']

# Keep only final-round rows ("F") whose winner has entry code "LL".
is_final = df['round'] == "F"
entered_as_ll = df['winner_entry'] == "LL"
lucky_losers = df.loc[entered_as_ll & is_final, show_columns]

print("Players who won a tournament as a lucky loser:")
display(lucky_losers.reset_index(drop=True))

Players who won a tournament as a lucky loser:


Unnamed: 0,winner_name,winner_entry,tourney_name,year,tourney_level
0,Sergiy Stakhovsky,LL,Zagreb,2008,A
1,Rajeev Ram,LL,Newport,2009,A
2,Andrey Rublev,LL,Umag,2017,A
3,Leonardo Mayer,LL,Hamburg,2017,A
4,Marco Cecchinato,LL,Budapest,2018,A
5,Soon Woo Kwon,LL,Adelaide 2,2023,A


2) List all players who won a big tournament (Grand Slam or Masters) as a qualifier or wildcard.

In [77]:
# Winner entry code is either "Q" or "WC".
qual_or_wild = df['winner_entry'].isin(["Q", "WC"])
# Tournament level is either "G" or "M".
big_tournament = df['tourney_level'].isin(["G","M"])
# Restrict to final-round rows so each row corresponds to a title.
is_final = df['round'] == "F"
winners = df.loc[qual_or_wild & big_tournament & is_final, show_columns]

print("Players who won a big tournament (Grand Slam or Masters) as a qualifier or wildcard:")
display(winners.reset_index(drop=True))

Players who won a big tournament (Grand Slam or Masters) as a qualifier or wildcard:


Unnamed: 0,winner_name,winner_entry,tourney_name,year,tourney_level
0,Albert Portas,Q,Hamburg Masters,2001,M
1,Goran Ivanisevic,WC,Wimbledon,2001,G


3) List the top 10 youngest and oldest big tournament winners.

In [78]:
show_columns = ['winner_name','winner_age','tourney_name','year','tourney_level']

# Final-round rows of level "G"/"M" tournaments, i.e. the champions.
is_final = df['round'] == "F"
winners = df.loc[big_tournament & is_final, show_columns]

# --- Top 10 oldest champions ---
oldest_winners = winners.sort_values('winner_age', ascending=False).head(10)

print("Top 10 oldest big tournament (Grand Slam or Masters) champions:")

display(oldest_winners.reset_index(drop=True))


# --- Top 10 youngest champions ---

# Exclude any tournament whose name contains both "next" and "gen"
# (case-insensitive); rows with a missing name are kept.
tourney_names = winners['tourney_name']
has_next = tourney_names.str.contains('next', case=False, na=False)
has_gen = tourney_names.str.contains('gen', case=False, na=False)
next_gen_mask = has_next & has_gen

winners_filtered = winners.loc[~next_gen_mask]

youngest_winners = winners_filtered.sort_values('winner_age').head(10)

print("Top 10 youngest big tournament (Grand Slam or Masters) champions:")

display(youngest_winners.reset_index(drop=True))


Top 10 oldest big tournament (Grand Slam or Masters) champions:


Unnamed: 0,winner_name,winner_age,tourney_name,year,tourney_level
0,Roger Federer,37.6,Miami Masters,2019,M
1,Roger Federer,36.4,Australian Open,2018,G
2,Novak Djokovic,36.4,Paris Masters,2023,M
3,Novak Djokovic,36.2,Us Open,2023,G
4,Novak Djokovic,36.2,Cincinnati Masters,2023,M
5,Roger Federer,36.1,Shanghai Masters,2017,M
6,Novak Djokovic,36.0,Roland Garros,2023,G
7,Roger Federer,35.9,Wimbledon,2017,G
8,Rafael Nadal,35.9,Roland Garros,2022,G
9,Novak Djokovic,35.6,Australian Open,2023,G


Top 10 youngest big tournament (Grand Slam or Masters) champions:


Unnamed: 0,winner_name,winner_age,tourney_name,year,tourney_level
0,Carlos Alcaraz,18.8,Miami Masters,2022,M
1,Rafael Nadal,18.8,Monte Carlo Masters,2005,M
2,Rafael Nadal,18.9,Rome Masters,2005,M
3,Carlos Alcaraz,18.9,Madrid Masters,2022,M
4,Rafael Nadal,18.9,Roland Garros,2005,G
5,Rafael Nadal,19.1,Canada Masters,2005,M
6,Carlos Alcaraz,19.3,Us Open,2022,G
7,Rafael Nadal,19.3,Madrid Masters,2005,M
8,Holger Rune,19.5,Paris Masters,2022,M
9,Carlos Alcaraz,19.8,Indian Wells Masters,2023,M


4) List the top 10 lowest ranked big tournament winners.

In [79]:

show_columns = ['winner_name','winner_rank','tourney_name','year','tourney_level']

# Champions of level "G"/"M" tournaments (final-round rows only).
is_final = df['round'] == "F"
winners = df.loc[big_tournament & is_final, show_columns]

# "Lowest ranked" means the largest rank number, hence the descending sort.
lowest_ranked = winners.sort_values('winner_rank', ascending=False).head(10)

print("Top 10 lowest ranked big tournament (Grand Slam or Masters) winners:")
display(lowest_ranked.reset_index(drop=True))

Top 10 lowest ranked big tournament (Grand Slam or Masters) winners:


Unnamed: 0,winner_name,winner_rank,tourney_name,year,tourney_level
0,Borna Coric,152.0,Cincinnati Masters,2022,M
1,Goran Ivanisevic,125.0,Wimbledon,2001,G
2,Alexei Popyrin,62.0,Canada Masters,2024,M
3,Tomas Berdych,50.0,Paris Masters,2005,M
4,Felix Mantilla,47.0,Rome Masters,2003,M
5,Gaston Gaudio,44.0,Roland Garros,2004,G
6,Andrei Pavel,43.0,Canada Masters,2001,M
7,Albert Portas,42.0,Hamburg Masters,2001,M
8,Hubert Hurkacz,37.0,Miami Masters,2021,M
9,Tim Henman,31.0,Paris Masters,2003,M


5) List the top 10 players with the most wins in the 2000s, 2010s and 2020s.

In [80]:
# Drop rows whose score is recorded as 'W/O' so they are not counted as wins.
filtered_df = df.loc[df['score'] != 'W/O']

# Decade label -> inclusive (first year, last year) range.
decades = {
    '2000s': (2000, 2009),
    '2010s': (2010, 2019),
    '2020s': (2020, 2024),
}

for label, bounds in decades.items():
    start, end = bounds
    in_decade = filtered_df['year'].between(start, end)
    decade_df = filtered_df[in_decade]

    # value_counts() is already sorted from most to fewest wins.
    win_counts = decade_df['winner_name'].value_counts()
    table = win_counts.reset_index()

    print(f"Top 10 Winners in the {label}:")
    display(table.head(10))
    print("\n" + "="*40 + "\n")

Top 10 Winners in the 2000s:


Unnamed: 0,winner_name,count
0,Roger Federer,663
1,Andy Roddick,505
2,Lleyton Hewitt,466
3,Juan Carlos Ferrero,405
4,Rafael Nadal,401
5,Carlos Moya,376
6,Tommy Robredo,376
7,Nikolay Davydenko,375
8,Marat Safin,366
9,Ivan Ljubicic,360




Top 10 Winners in the 2010s:


Unnamed: 0,winner_name,count
0,Novak Djokovic,630
1,Rafael Nadal,576
2,Roger Federer,559
3,Andy Murray,452
4,David Ferrer,441
5,Tomas Berdych,423
6,Marin Cilic,398
7,Kei Nishikori,380
8,John Isner,378
9,Stan Wawrinka,374




Top 10 Winners in the 2020s:


Unnamed: 0,winner_name,count
0,Jannik Sinner,252
1,Daniil Medvedev,248
2,Andrey Rublev,244
3,Alexander Zverev,240
4,Stefanos Tsitsipas,239
5,Novak Djokovic,231
6,Casper Ruud,217
7,Carlos Alcaraz,209
8,Taylor Fritz,200
9,Alex De Minaur,178






6) List the top 10 players with the most wins on each surface.

In [81]:
# All distinct surface values in order of first appearance; this array may
# include a missing-value marker if some matches have no surface recorded.
find_unique_surfaces = df['surface'].unique()

# Ignore matches with no recorded surface and matches scored as 'W/O'.
filtered_df = df[(df['surface'].notnull()) & (df['score']!='W/O')]

# Remove missing values explicitly instead of slicing off the last element:
# slicing only works if NaN happens to be last, and would silently drop a
# real surface if the column had no missing values at all.
unique_surfaces = find_unique_surfaces[pd.notna(find_unique_surfaces)]

for surface in unique_surfaces:
    surface_df = filtered_df[filtered_df['surface'] == surface]
    # value_counts() is already sorted from most to fewest wins.
    win_counts = surface_df['winner_name'].value_counts()
    table = win_counts.reset_index()

    print(f"Top 10 Winners on {surface}:")
    display(table.head(10))
    print("\n" + "="*40 + "\n")
    


Top 10 Winners on Hard:


Unnamed: 0,winner_name,count
0,Roger Federer,774
1,Novak Djokovic,709
2,Rafael Nadal,518
3,Andy Murray,503
4,Andy Roddick,424
5,Tomas Berdych,400
6,Gael Monfils,377
7,Marin Cilic,366
8,Richard Gasquet,357
9,John Isner,351




Top 10 Winners on Clay:


Unnamed: 0,winner_name,count
0,Rafael Nadal,484
1,David Ferrer,335
2,Novak Djokovic,286
3,Nicolas Almagro,279
4,Tommy Robredo,259
5,Fabio Fognini,240
6,Juan Carlos Ferrero,239
7,Fernando Verdasco,236
8,Juan Monaco,236
9,Roger Federer,226




Top 10 Winners on Grass:


Unnamed: 0,winner_name,count
0,Roger Federer,192
1,Novak Djokovic,120
2,Andy Murray,119
3,Lleyton Hewitt,118
4,Feliciano Lopez,87
5,Andy Roddick,86
6,Marin Cilic,80
7,Sam Querrey,77
8,Rafael Nadal,76
9,Richard Gasquet,75




Top 10 Winners on Carpet:


Unnamed: 0,winner_name,count
0,Ivan Ljubicic,53
1,Marat Safin,47
2,Roger Federer,44
3,Max Mirnyi,37
4,Mikhail Youzhny,36
5,Nikolay Davydenko,34
6,Arnaud Clement,32
7,Tim Henman,30
8,Thomas Johansson,30
9,Yevgeny Kafelnikov,30






7) Has a player ever beaten all of the Big 3 (Federer, Nadal, Djokovic) in the same tournament?

In [82]:
show_columns = ['year','tourney_name','tourney_level','surface','round','winner_name','loser_name','score']

# Match the three players by full name (case-insensitive). A bare surname
# substring match would also pick up any other player who shares a surname.
big_3_pattern = 'roger federer|rafael nadal|novak djokovic'
big_3_lost_mask = df['loser_name'].str.fullmatch(big_3_pattern, case=False, na=False)

big_3_lost_filtered = df[big_3_lost_mask][show_columns]

# Keep (tournament, year, winner) groups in which all three *distinct* Big 3
# players appear as the loser. Counting rows is not enough: beating the same
# player twice in one event plus one other would also give three rows.
big_3_lost = big_3_lost_filtered.groupby(['tourney_name','year','winner_name']).filter(
    lambda g: g['loser_name'].str.lower().nunique() == 3
)

print("Instances where a player beat the Big 3 (Federer, Nadal, Djokovic) in the same tournament:")
display(big_3_lost.reset_index(drop=True))

Instances where a player beat the Big 3 (Federer, Nadal, Djokovic) in the same tournament:


Unnamed: 0,year,tourney_name,tourney_level,surface,round,winner_name,loser_name,score
0,2007,Madrid Masters,M,Hard,QF,David Nalbandian,Rafael Nadal,6-1 6-2
1,2007,Madrid Masters,M,Hard,SF,David Nalbandian,Novak Djokovic,6-4 7-6(4)
2,2007,Madrid Masters,M,Hard,F,David Nalbandian,Roger Federer,1-6 6-3 6-3


8) List all big tournament matches that went 6-0, 6-0 or 6-0, 6-0, 6-0.

In [83]:
# Score strings in which every set (two or three of them) finished 6-0.
bagel_scores = ['6-0 6-0', '6-0 6-0 6-0']
is_bagel = df['score'].isin(bagel_scores)
bageled = df.loc[big_tournament & is_bagel, show_columns]

print("Big tournament matches (Grand Slam or Masters) that went 6-0, 6-0 or 6-0, 6-0, 6-0.")
display(bageled.reset_index(drop=True))

Big tournament matches (Grand Slam or Masters) that went 6-0, 6-0 or 6-0, 6-0, 6-0.


Unnamed: 0,year,tourney_name,tourney_level,surface,round,winner_name,loser_name,score
0,2002,Cincinnati Masters,M,Hard,R64,Lleyton Hewitt,Robby Ginepri,6-0 6-0
1,2005,Indian Wells Masters,M,Hard,R64,Nicolas Kiefer,Sargis Sargsian,6-0 6-0
2,2005,Rome Masters,M,Clay,R32,Guillermo Canas,Juan Monaco,6-0 6-0
3,2006,Paris Masters,M,Carpet,R32,Nikolay Davydenko,Christophe Rochus,6-0 6-0
4,2006,Miami Masters,M,Hard,R128,Carlos Berlocq,Donald Young,6-0 6-0
5,2006,Miami Masters,M,Hard,R64,James Blake,Carlos Berlocq,6-0 6-0
6,2007,Miami Masters,M,Hard,R64,Jarkko Nieminen,Evgeny Korolev,6-0 6-0
7,2016,Rome Masters,M,Clay,R16,David Goffin,Tomas Berdych,6-0 6-0
8,2023,Monte Carlo Masters,M,Clay,R32,Lorenzo Musetti,Luca Nardi,6-0 6-0


9) Who are the youngest and oldest players to beat the world number one?

In [84]:
show_columns = [
    'year','tourney_name','tourney_level','surface','round',
    'winner_name','winner_age','winner_rank',
    'loser_name','loser_age','loser_rank','score',
]

# Matches in which the losing player's rank is 1.
loser_is_number_one = df['loser_rank'] == 1
number_one_lost = df.loc[loser_is_number_one, show_columns]

# Youngest winners first.
youngest_winners = number_one_lost.sort_values('winner_age').head(10)

print("Top 10 youngest players to beat the world no. 1")

display(youngest_winners.reset_index(drop=True))

# Oldest winners first.
oldest_winners = number_one_lost.sort_values('winner_age', ascending=False).head(10)

print("Top 10 oldest players to beat the world no. 1")

display(oldest_winners.reset_index(drop=True))

Top 10 youngest players to beat the world no. 1


Unnamed: 0,year,tourney_name,tourney_level,surface,round,winner_name,winner_age,winner_rank,loser_name,loser_age,loser_rank,score
0,2004,Miami Masters,M,Hard,R32,Rafael Nadal,17.8,34.0,Roger Federer,22.6,1.0,6-3 6-3
1,2005,Monte Carlo Masters,M,Clay,QF,Richard Gasquet,18.8,101.0,Roger Federer,23.6,1.0,6-7(1) 6-2 7-6(8)
2,2001,Canada Masters,M,Hard,R16,Andy Roddick,18.9,35.0,Gustavo Kuerten,24.8,1.0,6-7(4) 6-4 6-2
3,2022,Madrid Masters,M,Clay,SF,Carlos Alcaraz,18.9,9.0,Novak Djokovic,34.9,1.0,6-7(5) 7-5 7-6(5)
4,2005,Roland Garros,G,Clay,SF,Rafael Nadal,18.9,5.0,Roger Federer,23.7,1.0,6-3 4-6 6-4 6-3
5,2004,Athens Olympics,A,Hard,R32,Tomas Berdych,18.9,79.0,Roger Federer,23.0,1.0,4-6 7-5 7-5
6,2014,Wimbledon,G,Grass,R16,Nick Kyrgios,19.1,144.0,Rafael Nadal,28.0,1.0,7-6(5) 5-7 7-6(5) 6-3
7,2006,Cincinnati Masters,M,Hard,R32,Andy Murray,19.2,21.0,Roger Federer,25.0,1.0,7-5 6-4
8,2022,Paris Masters,M,Hard,QF,Holger Rune,19.5,18.0,Carlos Alcaraz,19.4,1.0,6-3 6-6 RET
9,2006,Dubai,A,Hard,F,Rafael Nadal,19.7,2.0,Roger Federer,24.5,1.0,2-6 6-4 6-4


Top 10 oldest players to beat the world no. 1


Unnamed: 0,year,tourney_name,tourney_level,surface,round,winner_name,winner_age,winner_rank,loser_name,loser_age,loser_rank,score
0,2023,Cincinnati Masters,M,Hard,F,Novak Djokovic,36.2,2.0,Carlos Alcaraz,20.2,1.0,5-7 7-6(7) 7-6(4)
1,2017,Shanghai Masters,M,Hard,F,Roger Federer,36.1,2.0,Rafael Nadal,31.3,1.0,6-4 6-3
2,2023,Roland Garros,G,Clay,SF,Novak Djokovic,36.0,3.0,Carlos Alcaraz,20.0,1.0,6-3 5-7 6-1 6-1
3,2022,Roland Garros,G,Clay,QF,Rafael Nadal,35.9,5.0,Novak Djokovic,35.0,1.0,6-2 4-6 6-2 7-6(4)
4,2015,Doha,A,Hard,QF,Ivo Karlovic,35.8,27.0,Novak Djokovic,27.6,1.0,6-7(2) 7-6(6) 6-4
5,2022,Indian Wells Masters,M,Hard,R32,Gael Monfils,35.5,28.0,Daniil Medvedev,26.0,1.0,4-6 6-3 6-1
6,2019,Indian Wells Masters,M,Hard,R32,Philipp Kohlschreiber,35.3,39.0,Novak Djokovic,31.7,1.0,6-4 6-4
7,2013,Miami Masters,M,Hard,R16,Tommy Haas,34.9,18.0,Novak Djokovic,25.8,1.0,6-2 6-4
8,2000,Queen's Club,A,Grass,R16,Gianluca Pozzi,34.9,76.0,Andre Agassi,30.1,1.0,4-6 3-2 RET
9,2021,Rome Masters,M,Clay,F,Rafael Nadal,34.9,3.0,Novak Djokovic,33.9,1.0,7-5 1-6 6-3


10) List the top 10 players who have the most career aces.

In [85]:
# Sum aces when player won
aces_won = df.groupby('winner_name')['w_ace'].sum()

# Sum aces when player lost
aces_lost = df.groupby('loser_name')['l_ace'].sum()

# Combine total aces per player
total_aces = aces_won.add(aces_lost, fill_value=0)

# Sort descending to find the highest
total_aces_sorted = total_aces.sort_values(ascending=False)
total_aces = total_aces_sorted.reset_index()
total_aces.columns = ['player', 'total_aces']

total_aces['total_aces'] = total_aces['total_aces'].astype(int)

print("Top 10 player career aces:")

display(total_aces.head(10))



Top 10 player career aces:


Unnamed: 0,player,total_aces
0,John Isner,14663
1,Ivo Karlovic,13762
2,Roger Federer,11271
3,Feliciano Lopez,10259
4,Andy Roddick,9040
5,Sam Querrey,8939
6,Milos Raonic,8513
7,Marin Cilic,8459
8,Ivan Ljubicic,7901
9,Kevin Anderson,7729
