In [None]:
# Pricing Inefficiencies in Professional Tennis Match Odds
## Data Loading and Initial Exploration

This notebook loads and explores historical ATP match data from the 2024 season.
The goal is to understand the structure, key variables, and data quality before
any feature engineering or probabilistic modelling.


In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv("../data/raw/2024matches.csv")


In [5]:
df.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2024-0339,Brisbane,Hard,32,A,20240101,300,105777,2.0,,...,58.0,44.0,16.0,11.0,8.0,9.0,14.0,2570.0,8.0,3660.0
1,2024-0339,Brisbane,Hard,32,A,20240101,299,208029,1.0,,...,35.0,31.0,10.0,11.0,5.0,7.0,8.0,3660.0,39.0,1122.0
2,2024-0339,Brisbane,Hard,32,A,20240101,298,105777,2.0,,...,39.0,24.0,14.0,10.0,5.0,7.0,14.0,2570.0,55.0,902.0
3,2024-0339,Brisbane,Hard,32,A,20240101,297,208029,1.0,,...,51.0,31.0,16.0,10.0,3.0,5.0,8.0,3660.0,116.0,573.0
4,2024-0339,Brisbane,Hard,32,A,20240101,296,126128,,,...,37.0,27.0,16.0,10.0,5.0,8.0,39.0,1122.0,44.0,1021.0


In [6]:
df.shape

(3076, 49)

In [7]:
df.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'],
      dtype='object')

In [8]:
df.info



<bound method DataFrame.info of                             tourney_id                  tourney_name surface  \
0                            2024-0339                      Brisbane    Hard   
1                            2024-0339                      Brisbane    Hard   
2                            2024-0339                      Brisbane    Hard   
3                            2024-0339                      Brisbane    Hard   
4                            2024-0339                      Brisbane    Hard   
...                                ...                           ...     ...   
3071  2024-M-DC-2024-WG2-PO-URU-MDA-01  Davis Cup WG2 PO: URU vs MDA    Clay   
3072  2024-M-DC-2024-WG2-PO-VIE-RSA-01  Davis Cup WG2 PO: VIE vs RSA    Hard   
3073  2024-M-DC-2024-WG2-PO-VIE-RSA-01  Davis Cup WG2 PO: VIE vs RSA    Hard   
3074  2024-M-DC-2024-WG2-PO-VIE-RSA-01  Davis Cup WG2 PO: VIE vs RSA    Hard   
3075  2024-M-DC-2024-WG2-PO-VIE-RSA-01  Davis Cup WG2 PO: VIE vs RSA    Hard   

      d

In [9]:
df.isna().mean().sort_values(ascending=False).head(10)

winner_entry    0.844928
loser_entry     0.766580
loser_seed      0.753901
winner_seed     0.579324
minutes         0.077373
l_SvGms         0.019831
w_SvGms         0.019831
l_svpt          0.019506
w_svpt          0.019506
w_df            0.019506
dtype: float64

In [10]:
### Initial Observations
- The dataset contains match-level data for ATP matches from the 2024 season
- Each row represents a single match, with winner and loser information stored in separate columns
- The dataset does not contain bookmaker odds; odds data will be sourced separately and merged at a later stage
- Player rankings and other performance-related variables contain missing values
- Match context variables such as surface and tournament level are available


SyntaxError: invalid syntax (4096812804.py, line 2)

In [11]:
### Feature Considerations
- The dataset contains both pre-match contextual variables and post-match performance statistics
- Only variables known prior to match start (e.g. rankings, surface, tournament level) are suitable for probability estimation
- Post-match statistics (e.g. aces, break points, match duration) will be excluded to avoid data leakage
- Match outcomes will be restructured from a winner/loser format into a player/opponent format for modelling


SyntaxError: invalid syntax (3010535469.py, line 2)

In [12]:
# Pre-match contextual columns only
base_cols = [
    "tourney_date", "surface", "tourney_level", "round", "best_of",
    
    "winner_id", "winner_name", "winner_age", "winner_ht",
    "winner_hand", "winner_rank", "winner_rank_points",
    
    "loser_id", "loser_name", "loser_age", "loser_ht",
    "loser_hand", "loser_rank", "loser_rank_points"
]

df_base = df[base_cols].copy()


In [13]:
winner_view = df_base.rename(columns={
    "winner_id": "player_id",
    "winner_name": "player_name",
    "winner_age": "player_age",
    "winner_ht": "player_ht",
    "winner_hand": "player_hand",
    "winner_rank": "player_rank",
    "winner_rank_points": "player_rank_points",
    
    "loser_id": "opp_id",
    "loser_name": "opp_name",
    "loser_age": "opp_age",
    "loser_ht": "opp_ht",
    "loser_hand": "opp_hand",
    "loser_rank": "opp_rank",
    "loser_rank_points": "opp_rank_points"
})

winner_view["win"] = 1


In [14]:
loser_view = df_base.rename(columns={
    "loser_id": "player_id",
    "loser_name": "player_name",
    "loser_age": "player_age",
    "loser_ht": "player_ht",
    "loser_hand": "player_hand",
    "loser_rank": "player_rank",
    "loser_rank_points": "player_rank_points",
    
    "winner_id": "opp_id",
    "winner_name": "opp_name",
    "winner_age": "opp_age",
    "winner_ht": "opp_ht",
    "winner_hand": "opp_hand",
    "winner_rank": "opp_rank",
    "winner_rank_points": "opp_rank_points"
})

loser_view["win"] = 0


In [15]:
df_long = pd.concat([winner_view, loser_view], ignore_index=True)


In [16]:
df_long.shape


(6152, 20)

In [17]:
df_long["win"].value_counts(normalize=True)


win
1    0.5
0    0.5
Name: proportion, dtype: float64