In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

The goal here is to measure the accuracy of a Logistic Regression model that does not use Elo ratings as a feature. First we load the cleaned match data in which missing values have already been handled (Elo ratings, if present in that file, are deliberately not used as features). We then engineer difference features that compare each aspect of player 1 with the same aspect of player 2.

In [3]:
# Load the cleaned ATP match data (2010-2024 with missing values handled, per the file name).
modelLR_df = pd.read_csv("atp_matches_2010_2024_missing_handled.csv")

In [4]:
modelLR_df.head() # verification: preview the first rows to confirm the file loaded as expected

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_name,winner_hand,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2010-339,Brisbane,Hard,32,A,20100103,1,104053,Andy Roddick,R,...,34.0,29.0,11.0,10.0,3.0,5.0,7.0,4410.0,77.0,598.0
1,2010-339,Brisbane,Hard,32,A,20100103,30,103285,Radek Stepanek,R,...,27.0,14.0,7.0,7.0,3.0,7.0,12.0,2625.0,13.0,2610.0
2,2010-339,Brisbane,Hard,32,A,20100103,29,104053,Andy Roddick,R,...,43.0,34.0,21.0,13.0,10.0,12.0,7.0,4410.0,20.0,1655.0
3,2010-339,Brisbane,Hard,32,A,20100103,28,103285,Radek Stepanek,R,...,40.0,25.0,11.0,10.0,6.0,10.0,12.0,2625.0,105.0,521.0
4,2010-339,Brisbane,Hard,32,A,20100103,27,104792,Gael Monfils,R,...,50.0,38.0,17.0,14.0,3.0,6.0,13.0,2610.0,44.0,935.0


Features to engineer (each one is the winner's value minus the loser's value):

rank_diff = winner_rank - loser_rank

ace_diff = w_ace - l_ace

df_diff = w_df - l_df

svpt_diff = w_svpt - l_svpt

1stIn_diff = w_1stIn - l_1stIn

1stWon_diff = w_1stWon - l_1stWon

2ndWon_diff = w_2ndWon - l_2ndWon

SvGms_diff = w_SvGms - l_SvGms

bpSaved_diff = w_bpSaved - l_bpSaved

bpFaced_diff = w_bpFaced - l_bpFaced

age_diff = winner_age - loser_age

In [5]:
# Winner-minus-loser difference features, driven by a single lookup table:
# new column name -> (winner column, loser column).
# NOTE(review): the w_*/l_* serve columns look like totals recorded during the
# match itself; if so they are unknown before the match and any accuracy built
# on them reflects outcome leakage rather than predictive skill - confirm.
diff_sources = {
    "rank_diff": ("winner_rank", "loser_rank"),
    "ace_diff": ("w_ace", "l_ace"),
    "df_diff": ("w_df", "l_df"),
    "svpt_diff": ("w_svpt", "l_svpt"),
    "1stIn_diff": ("w_1stIn", "l_1stIn"),
    "1stWon_diff": ("w_1stWon", "l_1stWon"),
    "2ndWon_diff": ("w_2ndWon", "l_2ndWon"),
    "SvGms_diff": ("w_SvGms", "l_SvGms"),
    "bpSaved_diff": ("w_bpSaved", "l_bpSaved"),
    "bpFaced_diff": ("w_bpFaced", "l_bpFaced"),
    "age_diff": ("winner_age", "loser_age"),
}
for diff_name, (winner_col, loser_col) in diff_sources.items():
    modelLR_df[diff_name] = modelLR_df[winner_col] - modelLR_df[loser_col]

In [7]:
# Preview the frame again to confirm the new *_diff columns were appended.
modelLR_df.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_name,winner_hand,...,ace_diff,df_diff,svpt_diff,1stIn_diff,1stWon_diff,2ndWon_diff,SvGms_diff,bpSaved_diff,bpFaced_diff,age_diff
0,2010-339,Brisbane,Hard,32,A,20100103,1,104053,Andy Roddick,R,...,11.0,-2.0,7.0,8.0,7.0,3.0,0.0,0.0,-2.0,-3.0
1,2010-339,Brisbane,Hard,32,A,20100103,30,103285,Radek Stepanek,R,...,2.0,-3.0,5.0,4.0,6.0,8.0,1.0,-2.0,-6.0,7.8
2,2010-339,Brisbane,Hard,32,A,20100103,29,104053,Andy Roddick,R,...,-5.0,-1.0,-7.0,17.0,9.0,-10.0,0.0,-6.0,-6.0,3.1
3,2010-339,Brisbane,Hard,32,A,20100103,28,103285,Radek Stepanek,R,...,4.0,1.0,-8.0,-7.0,-1.0,6.0,-1.0,-5.0,-8.0,7.0
4,2010-339,Brisbane,Hard,32,A,20100103,27,104792,Gael Monfils,R,...,9.0,-1.0,2.0,5.0,3.0,1.0,0.0,4.0,3.0,-6.7


In [9]:
# Encode categorical variable: surface
# drop_first=True removes the first surface level so that it acts as the
# baseline. dtype=int makes every remaining dummy column a 0/1 integer, so no
# dummy is left as bool and no hard-coded list of surface names is needed
# (a hard-coded list raises KeyError when one of the named surfaces is absent).
modelLR_df = pd.get_dummies(modelLR_df, columns=["surface"], drop_first=True, dtype=int)

In [10]:
# Features to use: the eleven difference columns plus two surface dummies
feature_cols = [
    "rank_diff", "ace_diff", "df_diff", "svpt_diff", "1stIn_diff", "1stWon_diff",
    "2ndWon_diff", "SvGms_diff", "bpSaved_diff", "bpFaced_diff", "age_diff",
    # NOTE(review): the encoded frame also carries a surface_Clay column, so
    # Clay was not the level removed by drop_first. Because surface_Clay is
    # left out of this list, the baseline is Clay together with the dropped
    # level, not Clay alone.
    "surface_Grass", "surface_Hard"
]

Debugging/Verification methods

In [11]:
# Sanity check on the model inputs: show each feature's dtype and NaN count.
feature_frame = modelLR_df[feature_cols]
print("Data types of features:")
print(feature_frame.dtypes)
print("\nNaN check in features:")
print(feature_frame.isna().sum())

Data types of features:
rank_diff        float64
ace_diff         float64
df_diff          float64
svpt_diff        float64
1stIn_diff       float64
1stWon_diff      float64
2ndWon_diff      float64
SvGms_diff       float64
bpSaved_diff     float64
bpFaced_diff     float64
age_diff         float64
surface_Grass      int64
surface_Hard       int64
dtype: object

NaN check in features:
rank_diff        0
ace_diff         0
df_diff          0
svpt_diff        0
1stIn_diff       0
1stWon_diff      0
2ndWon_diff      0
SvGms_diff       0
bpSaved_diff     0
bpFaced_diff     0
age_diff         0
surface_Grass    0
surface_Hard     0
dtype: int64


In [12]:
# Target: every recorded row is written from the winner's point of view, so it is labelled 1.
# The mirrored rows that carry label 0 are created in the flipping step that follows.
modelLR_df["target"] = 1

In [13]:
# Create a symmetric dataset by flipping winner/loser:
# copy the frame, negate every difference feature in one vectorised step and
# label the mirrored rows 0. The surface dummies are not touched.
df_flipped = modelLR_df.copy()

mirrored_cols = [
    "rank_diff", "ace_diff", "df_diff", "svpt_diff", "1stIn_diff", "1stWon_diff",
    "2ndWon_diff", "SvGms_diff", "bpSaved_diff", "bpFaced_diff", "age_diff",
]
df_flipped[mirrored_cols] = -df_flipped[mirrored_cols]
df_flipped["target"] = 0

In [15]:
# Combine original and flipped data; ignore_index=True renumbers the rows so
# the two halves do not share index labels.
df_symmetric = pd.concat([modelLR_df, df_flipped], ignore_index=True)

In [17]:
# Step 2: Train-Test Split
# Chronological split: seasons up to and including 2022 train the model,
# later seasons are held out for testing.
df_symmetric = (
    df_symmetric
    .assign(
        tourney_date=lambda frame: pd.to_datetime(
            frame["tourney_date"], format="%Y%m%d", errors="coerce"
        )
    )
    .dropna(subset=["tourney_date"])  # dates that failed to parse were coerced to NaT
    .assign(year=lambda frame: frame["tourney_date"].dt.year)
)

in_train_years = df_symmetric["year"] <= 2022
train_data = df_symmetric[in_train_years]
test_data = df_symmetric[~in_train_years]

X_train, y_train = train_data[feature_cols].values, train_data["target"].values
X_test, y_test = test_data[feature_cols].values, test_data["target"].values

# Debug: Check array shape and type
print("\nX_train shape:", X_train.shape)
print("X_train dtype:", X_train.dtype)
print("X_test shape: ", X_test.shape)


X_train shape: (73010, 13)
X_train dtype: float64
X_test shape:  (12018, 13)


In [18]:
# First rows of the combined frame: these come from the original orientation (target = 1).
df_symmetric.head()

Unnamed: 0,tourney_id,tourney_name,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_name,winner_hand,winner_ht,...,2ndWon_diff,SvGms_diff,bpSaved_diff,bpFaced_diff,age_diff,surface_Clay,surface_Grass,surface_Hard,target,year
0,2010-339,Brisbane,32,A,2010-01-03,1,104053,Andy Roddick,R,188.0,...,3.0,0.0,0.0,-2.0,-3.0,False,0,1,1,2010
1,2010-339,Brisbane,32,A,2010-01-03,30,103285,Radek Stepanek,R,185.0,...,8.0,1.0,-2.0,-6.0,7.8,False,0,1,1,2010
2,2010-339,Brisbane,32,A,2010-01-03,29,104053,Andy Roddick,R,188.0,...,-10.0,0.0,-6.0,-6.0,3.1,False,0,1,1,2010
3,2010-339,Brisbane,32,A,2010-01-03,28,103285,Radek Stepanek,R,185.0,...,6.0,-1.0,-5.0,-8.0,7.0,False,0,1,1,2010
4,2010-339,Brisbane,32,A,2010-01-03,27,104792,Gael Monfils,R,193.0,...,1.0,0.0,4.0,3.0,-6.7,False,0,1,1,2010


In [19]:
# Last rows of the combined frame: these are the flipped copies (target = 0).
df_symmetric.tail()

Unnamed: 0,tourney_id,tourney_name,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_name,winner_hand,winner_ht,...,2ndWon_diff,SvGms_diff,bpSaved_diff,bpFaced_diff,age_diff,surface_Clay,surface_Grass,surface_Hard,target,year
85023,2024-7696,Next Gen Finals,8,F,2024-12-18,387,210460,Nishesh Basavareddy,R,180.0,...,9.0,1.0,1.0,5.0,0.2,False,0,1,0,2024
85024,2024-7696,Next Gen Finals,8,F,2024-12-18,388,209414,Luca Van Assche,R,178.0,...,-2.0,-1.0,9.0,9.0,-0.7,False,0,1,0,2024
85025,2024-7696,Next Gen Finals,8,F,2024-12-18,389,210506,Alex Michelsen,R,193.0,...,3.0,-0.0,2.0,2.0,-0.7,False,0,1,0,2024
85026,2024-7696,Next Gen Finals,8,F,2024-12-18,391,210506,Alex Michelsen,R,193.0,...,-3.0,-1.0,1.0,2.0,-0.5,False,0,1,0,2024
85027,2024-7696,Next Gen Finals,8,F,2024-12-18,395,211663,Joao Fonseca,R,185.0,...,4.0,-1.0,4.0,6.0,2.2,False,0,1,0,2024


In [20]:
# Step 3: Normalize features
# The column statistics come from the training split only and are reused for
# the test split, so the test data never influences the scaling parameters.
X_train_mean = np.nanmean(X_train, axis=0)
X_train_std = np.nanstd(X_train, axis=0)
X_train_std[X_train_std == 0] = 1e-10  # Avoid division by zero with small epsilon
# Fill missing entries with the training mean of their column so they become
# exactly 0 after standardisation. Filling with a raw 0 (the previous
# behaviour) placed them at -mean/std, an arbitrary point that disagrees with
# the NaN-ignoring statistics computed above.
X_train = np.where(np.isnan(X_train), X_train_mean, X_train)
X_train = (X_train - X_train_mean) / X_train_std
X_test = np.where(np.isnan(X_test), X_train_mean, X_test)
X_test = (X_test - X_train_mean) / X_train_std

In [21]:
# Add intercept term: prepend a column of ones to each design matrix so the
# first coefficient acts as the bias.
train_ones = np.ones((len(X_train), 1))
test_ones = np.ones((len(X_test), 1))
X_train = np.hstack([train_ones, X_train])
X_test = np.hstack([test_ones, X_test])

In [23]:
# Step 4: Implement Logistic Regression
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise.

    The input is limited to the range [-500, 500] before exponentiation so
    that very large magnitudes cannot overflow ``np.exp``.
    """
    bounded = np.clip(z, -500, 500)  # Clip to avoid overflow
    return 1 / (1 + np.exp(-bounded))

def compute_loss(X, y, beta):
    """Mean binary cross-entropy of the predictions ``sigmoid(np.dot(X, beta))``.

    A constant 1e-10 is added inside both logarithms so that a predicted
    probability of exactly 0 or 1 does not evaluate ``log(0)``.
    """
    probs = sigmoid(np.dot(X, beta))
    positive_term = y * np.log(probs + 1e-10)
    negative_term = (1 - y) * np.log(1 - probs + 1e-10)
    return -np.mean(positive_term + negative_term)

def compute_gradient(X, y, beta):
    """Return ``X.T @ (sigmoid(X @ beta) - y) / len(y)``.

    This is the gradient of the mean cross-entropy with respect to ``beta``
    (without the 1e-10 smoothing that compute_loss adds inside its logs).
    """
    residual = sigmoid(np.dot(X, beta)) - y
    return np.dot(X.T, residual) / len(y)

# Gradient Descent
# Full-batch updates with a fixed step size, starting from all-zero coefficients.
learning_rate = 0.01
n_iterations = 5000
beta = np.zeros(X_train.shape[1])  # one coefficient per column of X_train

for i in range(n_iterations):
    gradient = compute_gradient(X_train, y_train, beta)
    beta -= learning_rate * gradient
    if i % 100 != 0:
        continue
    # Every 100th iteration: report the loss measured after that iteration's update.
    loss = compute_loss(X_train, y_train, beta)
    print(f"Iteration {i}, Loss: {loss:.4f}")

Iteration 0, Loss: 0.6906
Iteration 100, Loss: 0.5261
Iteration 200, Loss: 0.4503
Iteration 300, Loss: 0.4062
Iteration 400, Loss: 0.3767
Iteration 500, Loss: 0.3552
Iteration 600, Loss: 0.3385
Iteration 700, Loss: 0.3250
Iteration 800, Loss: 0.3137
Iteration 900, Loss: 0.3042
Iteration 1000, Loss: 0.2959
Iteration 1100, Loss: 0.2886
Iteration 1200, Loss: 0.2821
Iteration 1300, Loss: 0.2762
Iteration 1400, Loss: 0.2709
Iteration 1500, Loss: 0.2661
Iteration 1600, Loss: 0.2617
Iteration 1700, Loss: 0.2576
Iteration 1800, Loss: 0.2538
Iteration 1900, Loss: 0.2503
Iteration 2000, Loss: 0.2470
Iteration 2100, Loss: 0.2439
Iteration 2200, Loss: 0.2410
Iteration 2300, Loss: 0.2383
Iteration 2400, Loss: 0.2357
Iteration 2500, Loss: 0.2333
Iteration 2600, Loss: 0.2310
Iteration 2700, Loss: 0.2289
Iteration 2800, Loss: 0.2268
Iteration 2900, Loss: 0.2248
Iteration 3000, Loss: 0.2229
Iteration 3100, Loss: 0.2212
Iteration 3200, Loss: 0.2195
Iteration 3300, Loss: 0.2178
Iteration 3400, Loss: 0.21

In [24]:
# Step 5: Predict and Evaluate
def predict(X, beta):
    """Return hard 0/1 labels: 1 where ``sigmoid(np.dot(X, beta))`` is at least 0.5."""
    probabilities = sigmoid(np.dot(X, beta))
    return (probabilities >= 0.5).astype(int)

# NOTE(review): accuracy is measured on the mirrored dataset, so every match
# contributes two rows (one per orientation) to whichever split it falls in.

# Training accuracy
y_train_pred = predict(X_train, beta)
train_accuracy = (y_train_pred == y_train).mean()
print(f"\nTraining Accuracy: {train_accuracy:.2%}")

# Test accuracy
y_test_pred = predict(X_test, beta)
test_accuracy = (y_test_pred == y_test).mean()
print(f"Test Accuracy: {test_accuracy:.2%}")


Training Accuracy: 92.96%
Test Accuracy: 93.26%
