#### FreeAgent job application - Data Analyst (Mid/Senior)<br>
Applicant: Rocio Martinez<br>
Date: March 2023<br>
Task 2 - Most exciting tennis tournament


In [3]:
import pandas as pd

pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [4]:
def standardise(column):
    return (column - column.mean()) / column.std()

data = pd.read_csv("test_data__atp_matches_2018.csv").assign(  
    new_tourney_name=lambda _df: _df.tourney_name.str.replace(r"^Davis Cup.*", "Davis Cup", regex=True),
    new_best_of= lambda _df: _df.best_of.where(_df.new_tourney_name!='NextGen Finals',5),
    n_sets=lambda _df: _df.score.str.split(" ").apply(len),
    n_sets_std=lambda _df: standardise(_df.n_sets),
    diff_winner_loser_rank=lambda _df: _df.winner_rank - _df.loser_rank,
    diff_winner_loser_rank_std=lambda _df: standardise(_df.diff_winner_loser_rank),
    rank=lambda _df: -(_df.winner_rank + _df.loser_rank) / 2,
    rank_std=lambda _df: standardise(_df["rank"])  
)

data = data.merge(
    data[(data.n_sets / data.new_best_of) >= 0.8][[
        "new_best_of",
        "minutes"
    ]]
    .groupby("new_best_of")
    .mean()
    .rename(columns=dict(minutes="mean_minutes_many_sets")), 
    on=["new_best_of"], 
    how="left"
).assign(
    is_long_game=lambda _df: (_df.minutes > _df.mean_minutes_many_sets).astype(int),
    is_long_game_std=lambda _df: standardise(_df.is_long_game),
    total_score=lambda _df: _df.n_sets_std + _df.diff_winner_loser_rank_std + _df.rank_std + _df.is_long_game_std
)

Data notes: 
- The tournament name variable for the Davis Cup did not match the rest of the tournaments as it had extra information in it which was making it look as it was different cups, this had to be cleaned and a new variable was created called: **new_tourney_name**, this was also addressed in the SQL task 
- The Davis Cup best_of variable had a value of 3, but the number of sets was up to 5. Therefore best_of was changed to be equal to 5. The new variable created was called: **new_best_of**
- The Davis Cup has all 3 surfaces in place (Clay, Grass and Hard), this was not modified but it is good to know as this will affect question 3 of the SQL task (task 1)

In [5]:
data.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,new_tourney_name,new_best_of,n_sets,n_sets_std,diff_winner_loser_rank,diff_winner_loser_rank_std,rank,rank_std,mean_minutes_many_sets,is_long_game,is_long_game_std,total_score
0,2018-M020,Brisbane,Hard,32,A,20180101,271,105992,,,Ryan Harrison,R,183.0,USA,25.653662,104919,,,Leonardo Mayer,R,188.0,ARG,30.633812,6-4 3-6 6-2,3,R32,123.0,9.0,2.0,82.0,49.0,39.0,20.0,13.0,8.0,9.0,10.0,3.0,80.0,47.0,33.0,19.0,14.0,1.0,4.0,47.0,1010.0,52.0,909.0,Brisbane,3,3,0.433651,-5.0,0.180387,-49.5,0.326004,128.288,0,-0.516733,0.423309
1,2018-M020,Brisbane,Hard,32,A,20180101,272,111577,,,Jared Donaldson,R,,USA,21.229295,111442,,WC,Jordan Thompson,R,183.0,AUS,23.701574,6-2 6-4,3,R32,90.0,5.0,3.0,58.0,32.0,25.0,14.0,9.0,4.0,5.0,3.0,5.0,62.0,41.0,25.0,7.0,9.0,7.0,11.0,54.0,890.0,94.0,593.0,Brisbane,3,2,-0.826729,-40.0,-0.033119,-74.0,0.150846,128.288,0,-0.516733,-1.225735
2,2018-M020,Brisbane,Hard,32,A,20180101,273,104797,,,Denis Istomin,R,188.0,UZB,31.318275,106000,7.0,,Damir Dzumhur,R,172.0,BIH,25.61807,6-7(4) 6-3 6-2,3,R32,145.0,7.0,0.0,94.0,66.0,48.0,12.0,14.0,9.0,11.0,8.0,6.0,120.0,53.0,37.0,29.0,15.0,10.0,16.0,63.0,809.0,30.0,1391.0,Brisbane,3,3,0.433651,33.0,0.412193,-46.5,0.347452,128.288,1,1.934566,3.127863
3,2018-M020,Brisbane,Hard,32,A,20180101,275,200282,,WC,Alex De Minaur,R,183.0,AUS,18.872005,105449,,,Steve Johnson,R,188.0,USA,28.021903,7-6(7) 6-4,3,R32,104.0,9.0,3.0,66.0,37.0,32.0,17.0,11.0,2.0,3.0,6.0,2.0,80.0,43.0,33.0,17.0,11.0,4.0,6.0,208.0,245.0,44.0,1055.0,Brisbane,3,2,-0.826729,164.0,1.211316,-126.0,-0.220919,128.288,0,-0.516733,-0.353064
4,2018-M020,Brisbane,Hard,32,A,20180101,276,111581,,Q,Michael Mmoh,R,,USA,19.975359,105643,,,Federico Delbonis,L,190.0,ARG,27.241615,6-3 6-4,3,R32,69.0,5.0,4.0,55.0,38.0,32.0,11.0,10.0,3.0,3.0,4.0,0.0,45.0,35.0,28.0,5.0,9.0,0.0,2.0,175.0,299.0,68.0,755.0,Brisbane,3,2,-0.826729,107.0,0.863606,-121.5,-0.188747,128.288,0,-0.516733,-0.668602


In [6]:
# Calculating excitement score
(
    data[[
        "new_tourney_name",
        "total_score"
    ]]
    .groupby("new_tourney_name")
    .median()
    .sort_values(by="total_score", ascending=False)
    .head(10)
)

Unnamed: 0_level_0,total_score
new_tourney_name,Unnamed: 1_level_1
NextGen Finals,1.358709
US Open,0.985379
Wimbledon,0.849429
Roland Garros,0.798881
Australian Open,0.772508
Vienna,0.400114
Shenzhen,0.226333
Sao Paulo,-0.077646
Brisbane,-0.159568
Quito,-0.258477


#### Further data investigations

In [7]:
# number of tournaments using new variable
data.new_tourney_name.nunique()

69

In [26]:
# surface information
data.groupby(['new_tourney_name','surface'])[['surface']].count().loc[["Davis Cup"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,surface
new_tourney_name,surface,Unnamed: 2_level_1
Davis Cup,Clay,87
Davis Cup,Grass,7
Davis Cup,Hard,142


In [21]:
# round
data['round'].unique()

array(['R32', 'R16', 'QF', 'SF', 'F', 'R128', 'R64', 'RR', 'BR'],
      dtype=object)

In [22]:
# best_of investigation
data['best_of'].unique()

array([3, 5])

In [24]:
# mean of minutes played for games with 4 games or more (best of 5) and for 3 games out of 3 (best of 3).
means_df = (
    data[(data.n_sets / data.new_best_of) >= 0.8][[
        "new_best_of",
        "minutes"
    ]]
    .groupby("new_best_of")
    .mean()
    .rename(columns=dict(minutes="mean_minutes_many_sets"))
)
means_df

Unnamed: 0_level_0,mean_minutes_many_sets
new_best_of,Unnamed: 1_level_1
3,128.288
5,185.303797


In [27]:
data.total_score.max()

7.095777817410691

In [33]:
#data.total_score.idxmax()

In [32]:
#data.iloc[data.total_score.idxmax()]