# Bayesian Logistic Regression 


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

np.random.seed(42)

In [None]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    The naive formula overflows in ``exp(-z)`` for large negative ``z``.
    This version only ever exponentiates a non-positive number, so it is
    free of overflow for any finite input.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s); non-float input is converted to float.

    Returns
    -------
    NumPy scalar or ndarray
        Values in [0, 1] with the same shape as ``z``.
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        z = z.astype(float)
    # exp(-|z|) always lies in [0, 1], so it can never overflow.
    decay = np.exp(-np.abs(z))
    # For z >= 0 use 1 / (1 + e^-z); for z < 0 use the algebraically equal
    # e^z / (1 + e^z). Both are expressed through `decay`.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d result into a NumPy scalar and leaves
    # n-d arrays unchanged, so scalar callers still get a scalar back.
    return result[()]


def log_likelihood(w, X, y):
    """Return the Bernoulli log-likelihood of labels ``y`` under weights ``w``.

    Computes ``sum(y * log(s) + (1 - y) * log(1 - s))`` with
    ``s = sigmoid(X @ w)``, evaluated directly in log-space so that large
    logits neither overflow nor get truncated by probability clipping.

    Parameters
    ----------
    w : ndarray, shape (n_features,)
        Weight vector.
    X : ndarray, shape (n_samples, n_features)
        Feature matrix.
    y : ndarray, shape (n_samples,)
        Binary labels (0 or 1).

    Returns
    -------
    float
        The summed log-likelihood over all samples.
    """
    z = np.dot(X, w)
    # log(sigmoid(z))     = -log(1 + e^-z) = -logaddexp(0, -z)
    # log(1 - sigmoid(z)) = -log(1 + e^z)  = -logaddexp(0,  z)
    # logaddexp is stable for any finite z, so no clipping is required.
    log_sig = -np.logaddexp(0, -z)
    log_one_minus_sig = -np.logaddexp(0, z)
    return np.sum(y * log_sig + (1 - y) * log_one_minus_sig)


def log_prior(w, tau):
    """Return the log-density of a zero-mean Gaussian prior on ``w``.

    The prior has standard deviation ``tau`` on every weight; the additive
    normalization constant (independent of ``w``) is omitted.
    """
    squared_norm = np.sum(np.square(w))
    variance = tau ** 2
    return -0.5 * squared_norm / variance


def log_posterior(w, X, y, tau):
    """Return the unnormalized log-posterior of ``w``.

    This is the data log-likelihood plus the Gaussian log-prior with
    scale ``tau``.
    """
    data_term = log_likelihood(w, X, y)
    prior_term = log_prior(w, tau)
    return data_term + prior_term


def grad_log_posterior(w, X, y, tau):
    """Return the gradient of the log-posterior with respect to ``w``.

    The likelihood contributes ``X.T @ (y - sigmoid(X @ w))`` and the
    Gaussian prior contributes ``-w / tau**2``.
    """
    residuals = y - sigmoid(np.dot(X, w))
    likelihood_grad = np.dot(X.T, residuals)
    prior_grad = w / (tau ** 2)
    return likelihood_grad - prior_grad


def gradient_ascent(X, y, tau, learning_rate=0.01, n_iter=1000):
    """Maximize the log-posterior over ``w`` with fixed-step gradient ascent.

    Starts from the all-zeros weight vector, takes ``n_iter`` steps of size
    ``learning_rate`` along the log-posterior gradient, and prints the
    current log-posterior after every 100th step.

    Returns the final weight vector (one entry per column of ``X``).
    """
    n_features = X.shape[1]
    w = np.zeros(n_features)
    for i in range(n_iter):
        w += learning_rate * grad_log_posterior(w, X, y, tau)
        # Progress is only reported on every 100th iteration.
        if (i + 1) % 100 != 0:
            continue
        current_lp = log_posterior(w, X, y, tau)
        print(f"Iteration {i+1}, Log-Posterior: {current_lp:.4f}")
    return w


def predict_proba(X, w):
    """Return the predicted probability of class 1 for each row of ``X``."""
    logits = np.dot(X, w)
    return sigmoid(logits)


def predict(X, w, threshold=0.5):
    """Return hard 0/1 predictions for each row of ``X``.

    A sample is labelled 1 when its predicted probability is at least
    ``threshold``; otherwise it is labelled 0.
    """
    is_positive = predict_proba(X, w) >= threshold
    return is_positive.astype(int)


In [None]:

# Load the match data and take a quick look at its contents.
df = pd.read_csv("atp_matches_2010_2024_missing_handled.csv")

print("Dataset Head:", df.head(), sep="\n")
print("\nColumns in Dataset:", df.columns, sep="\n")

# Every column other than the target is used as a model feature.
target_column = "match_winner"
feature_columns = [name for name in df.columns if name != target_column]

# The model functions operate on NumPy arrays rather than DataFrames.
y = df[target_column].values
X = df[feature_columns].values

# Sanity-check the shapes of the feature matrix and target vector.
print("\nFeature Matrix Shape:", X.shape)
print("Target Vector Shape:", y.shape)

# Training the Bayesian Logistic Regression (BLR) Model (MAP estimate via gradient ascent)

In [None]:
# Hyperparameters: prior scale, step size, and number of ascent steps
tau, learning_rate, n_iter = 1.0, 0.01, 1000

# Fit the weights by maximizing the log-posterior with gradient ascent
w_est = gradient_ascent(X, y, tau, learning_rate=learning_rate, n_iter=n_iter)
print("Estimated weights:", w_est)

# Evaluating the Model

In [None]:

# Hard predictions and predicted probabilities on the training data
y_pred = predict(X, w_est)
probs_pred = predict_proba(X, w_est)

# Fraction of training samples whose predicted label matches the true label
accuracy = np.mean(y_pred == y)
print("Training Accuracy:", accuracy)

# Optional: overlay the predicted probabilities and the true labels,
# both plotted against the sample index
sample_index = range(len(y))
plt.figure(figsize=(8, 4))
plt.scatter(sample_index, probs_pred, label='Predicted Probabilities', alpha=0.7)
plt.scatter(sample_index, y, label='True Labels', marker='x')
plt.title('Predicted Probabilities vs True Labels')
plt.xlabel('Sample Index')
plt.ylabel('Probability / Label')
plt.legend()
plt.show()