In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [13]:
modelLR_df = pd.read_csv("atp_matches_2010_2024_missing_handled.csv")

In [14]:
modelLR_df.head() # verification

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_name,winner_hand,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2010-339,Brisbane,Hard,32,A,20100103,1,104053,Andy Roddick,R,...,34.0,29.0,11.0,10.0,3.0,5.0,7.0,4410.0,77.0,598.0
1,2010-339,Brisbane,Hard,32,A,20100103,30,103285,Radek Stepanek,R,...,27.0,14.0,7.0,7.0,3.0,7.0,12.0,2625.0,13.0,2610.0
2,2010-339,Brisbane,Hard,32,A,20100103,29,104053,Andy Roddick,R,...,43.0,34.0,21.0,13.0,10.0,12.0,7.0,4410.0,20.0,1655.0
3,2010-339,Brisbane,Hard,32,A,20100103,28,103285,Radek Stepanek,R,...,40.0,25.0,11.0,10.0,6.0,10.0,12.0,2625.0,105.0,521.0
4,2010-339,Brisbane,Hard,32,A,20100103,27,104792,Gael Monfils,R,...,50.0,38.0,17.0,14.0,3.0,6.0,13.0,2610.0,44.0,935.0


In [15]:
# Encode categorical variable: surface
modelLR_df = pd.get_dummies(modelLR_df, columns=["surface"], drop_first=True)

# Convert bool to int for one-hot encoded columns
for col in ["surface_Grass", "surface_Hard"]:
    modelLR_df[col] = modelLR_df[col].astype(int)

In [16]:
modelLR_df.columns

Index(['tourney_id', 'tourney_name', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_name', 'winner_hand',
       'winner_ht', 'winner_ioc', 'winner_age', 'loser_id', 'loser_name',
       'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of',
       'round', 'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced', 'winner_rank', 'winner_rank_points', 'loser_rank',
       'loser_rank_points', 'surface_Clay', 'surface_Grass', 'surface_Hard'],
      dtype='object')

In [19]:
feature_cols = ['w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced', 'winner_rank', 'loser_rank', 'surface_Grass', 'surface_Hard']

In [20]:
print("Data types of features:")
print(modelLR_df[feature_cols].dtypes)
print("\nNaN check in features:")
print(modelLR_df[feature_cols].isna().sum())

Data types of features:
w_ace            float64
w_df             float64
w_svpt           float64
w_1stIn          float64
w_1stWon         float64
w_2ndWon         float64
w_SvGms          float64
w_bpSaved        float64
w_bpFaced        float64
l_ace            float64
l_df             float64
l_svpt           float64
l_1stIn          float64
l_1stWon         float64
l_2ndWon         float64
l_SvGms          float64
l_bpSaved        float64
l_bpFaced        float64
winner_rank      float64
loser_rank       float64
surface_Grass      int64
surface_Hard       int64
dtype: object

NaN check in features:
w_ace            0
w_df             0
w_svpt           0
w_1stIn          0
w_1stWon         0
w_2ndWon         0
w_SvGms          0
w_bpSaved        0
w_bpFaced        0
l_ace            0
l_df             0
l_svpt           0
l_1stIn          0
l_1stWon         0
l_2ndWon         0
l_SvGms          0
l_bpSaved        0
l_bpFaced        0
winner_rank      0
loser_rank       0
surface_

In [21]:
# Target: 1 if winner is Player A (recorded winner), 0 if loser wins (flip for symmetry later)
modelLR_df["target"] = 1

In [22]:
df_symmetric = modelLR_df.copy()

df_symmetric["tourney_date"] = pd.to_datetime(df_symmetric["tourney_date"], format="%Y%m%d", errors="coerce")
df_symmetric = df_symmetric.dropna(subset=["tourney_date"])
df_symmetric["year"] = df_symmetric["tourney_date"].dt.year
train_data = df_symmetric[df_symmetric["year"] <= 2022]
test_data = df_symmetric[df_symmetric["year"] > 2022]

X_train = train_data[feature_cols].values
y_train = train_data["target"].values
X_test = test_data[feature_cols].values
y_test = test_data["target"].values

# Debug: Check array shape and type
print("\nX_train shape:", X_train.shape)
print("X_train dtype:", X_train.dtype)
print("X_test shape: ", X_test.shape)


X_train shape: (36505, 22)
X_train dtype: float64
X_test shape:  (6009, 22)


In [23]:
# Step 3: Normalize features
X_train_mean = np.nanmean(X_train, axis=0)
X_train_std = np.nanstd(X_train, axis=0)
X_train_std[X_train_std == 0] = 1e-10  # Avoid division by zero with small epsilon
X_train = np.where(np.isnan(X_train), 0, X_train)
X_train = (X_train - X_train_mean) / X_train_std
X_test = np.where(np.isnan(X_test), 0, X_test)
X_test = (X_test - X_train_mean) / X_train_std

In [24]:
# Add intercept term
X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

In [25]:
# Step 4: Implement Logistic Regression
def sigmoid(z):
    return 1 / (1 + np.exp(-np.clip(z, -500, 500)))  # Clip to avoid overflow

def compute_loss(X, y, beta):
    z = np.dot(X, beta)
    y_pred = sigmoid(z)
    loss = -np.mean(y * np.log(y_pred + 1e-10) + (1 - y) * np.log(1 - y_pred + 1e-10))
    return loss

def compute_gradient(X, y, beta):
    z = np.dot(X, beta)
    y_pred = sigmoid(z)
    gradient = np.dot(X.T, (y_pred - y)) / len(y)
    return gradient

# Gradient Descent
beta = np.zeros(X_train.shape[1])  # Initialize coefficients
learning_rate = 0.01
n_iterations = 5000

for i in range(n_iterations):
    gradient = compute_gradient(X_train, y_train, beta)
    beta -= learning_rate * gradient
    if i % 100 == 0:
        loss = compute_loss(X_train, y_train, beta)
        print(f"Iteration {i}, Loss: {loss:.4f}")

Iteration 0, Loss: 0.6907
Iteration 100, Loss: 0.4943
Iteration 200, Loss: 0.3724
Iteration 300, Loss: 0.2931
Iteration 400, Loss: 0.2390
Iteration 500, Loss: 0.2003
Iteration 600, Loss: 0.1716
Iteration 700, Loss: 0.1497
Iteration 800, Loss: 0.1324
Iteration 900, Loss: 0.1186
Iteration 1000, Loss: 0.1072
Iteration 1100, Loss: 0.0977
Iteration 1200, Loss: 0.0897
Iteration 1300, Loss: 0.0829
Iteration 1400, Loss: 0.0770
Iteration 1500, Loss: 0.0719
Iteration 1600, Loss: 0.0673
Iteration 1700, Loss: 0.0633
Iteration 1800, Loss: 0.0598
Iteration 1900, Loss: 0.0566
Iteration 2000, Loss: 0.0537
Iteration 2100, Loss: 0.0511
Iteration 2200, Loss: 0.0487
Iteration 2300, Loss: 0.0466
Iteration 2400, Loss: 0.0446
Iteration 2500, Loss: 0.0428
Iteration 2600, Loss: 0.0411
Iteration 2700, Loss: 0.0395
Iteration 2800, Loss: 0.0381
Iteration 2900, Loss: 0.0367
Iteration 3000, Loss: 0.0355
Iteration 3100, Loss: 0.0343
Iteration 3200, Loss: 0.0332
Iteration 3300, Loss: 0.0322
Iteration 3400, Loss: 0.03

In [26]:
# Step 5: Predict and Evaluate
def predict(X, beta):
    z = np.dot(X, beta)
    y_pred = sigmoid(z)
    return (y_pred >= 0.5).astype(int)

# Training accuracy
y_train_pred = predict(X_train, beta)
train_accuracy = np.mean(y_train_pred == y_train)
print(f"\nTraining Accuracy: {train_accuracy:.2%}")

# Test accuracy
y_test_pred = predict(X_test, beta)
test_accuracy = np.mean(y_test_pred == y_test)
print(f"Test Accuracy: {test_accuracy:.2%}")


Training Accuracy: 100.00%
Test Accuracy: 100.00%
