# In [1]:
import numpy as np
from collections import Counter
from scipy.stats import mode
import pandas as pd

# Simple Decision Tree using sklearn-like API
class SimpleDecisionTree:
    """Classification tree grown greedily by information gain (entropy).

    Every instance is a single node. After ``fit`` a node is either a leaf
    (it has a ``leaf_value`` attribute) or an internal node (it has
    ``feat_idx``, ``threshold``, ``left`` and ``right``). Samples whose
    feature value is ``<= threshold`` go to ``left``, the rest to ``right``.

    Parameters
    ----------
    max_depth : int or None
        Maximum depth of the tree; ``None`` means no depth limit.
    min_samples_split : int
        A node holding fewer samples than this becomes a leaf.
    max_features : int or None
        Number of features drawn at random (via the global ``np.random``
        state) at each node; ``None`` means every feature is considered.

    Notes
    -----
    Feature values are assumed to be comparable and free of NaN.
    """

    def __init__(self, max_depth=None, min_samples_split=2, max_features=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features

    def fit(self, X, y, depth=0):
        """Grow the (sub)tree rooted at this node.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels.
        depth : int
            Depth of this node; callers normally leave it at 0, it is
            incremented by the recursive calls.

        Raises
        ------
        ValueError
            If ``X`` contains no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit a tree node on zero samples")
        self.n_classes = len(np.unique(y))

        # Drop state from an earlier fit so a node that used to be a leaf
        # is not still treated as one by _predict.
        for stale in ("leaf_value", "feat_idx", "threshold", "left", "right"):
            self.__dict__.pop(stale, None)

        # Stop conditions (max_depth=None means unlimited depth).
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if (depth_exhausted or n_samples < self.min_samples_split
                or self.n_classes <= 1):
            self.leaf_value = self._majority_label(y)
            return

        # Select random subset of features
        if self.max_features is not None:
            n_candidates = min(self.max_features, n_features)
            feats_idx = np.random.choice(n_features, n_candidates, replace=False)
        else:
            feats_idx = np.arange(n_features)

        # Find best split
        best_feat, best_thresh, best_gain = None, None, -1
        for feat in feats_idx:
            column = X[:, feat]
            # The largest value would send every sample to the left child,
            # so it is never a usable threshold.
            for threshold in np.unique(column)[:-1]:
                gain = self._information_gain(y, column, threshold)
                if gain > best_gain:
                    best_feat, best_thresh, best_gain = feat, threshold, gain

        # No candidate feature separates the samples: make this a leaf.
        if best_feat is None:
            self.leaf_value = self._majority_label(y)
            return

        # Store best split parameters
        self.feat_idx = best_feat
        self.threshold = best_thresh

        # Split data
        goes_left = X[:, best_feat] <= best_thresh
        goes_right = X[:, best_feat] > best_thresh

        # Recursive building
        self.left = SimpleDecisionTree(
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            max_features=self.max_features,
        )
        self.left.fit(X[goes_left], y[goes_left], depth + 1)

        self.right = SimpleDecisionTree(
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            max_features=self.max_features,
        )
        self.right.fit(X[goes_right], y[goes_right], depth + 1)

    def predict(self, X):
        """Return an array with the predicted label for each row of ``X``."""
        return np.array([self._predict(inputs) for inputs in np.asarray(X)])

    def _predict(self, inputs):
        """Route one sample down the tree and return the leaf's label."""
        if hasattr(self, 'leaf_value'):
            return self.leaf_value
        if inputs[self.feat_idx] <= self.threshold:
            return self.left._predict(inputs)
        return self.right._predict(inputs)

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the label array ``y``."""
        # np.unique only reports labels that occur, so every probability is
        # strictly positive and log2 is always defined.
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities))

    def _information_gain(self, y, X_col, split_thresh):
        """Return the entropy reduction from splitting at ``split_thresh``.

        Samples with ``X_col <= split_thresh`` form the left child, the
        others the right child. Returns 0 when either child would be empty.
        """
        left_mask = X_col <= split_thresh
        right_mask = X_col > split_thresh
        n_left = np.count_nonzero(left_mask)
        n_right = np.count_nonzero(right_mask)

        if n_left == 0 or n_right == 0:
            return 0

        n = len(y)
        e_left = self._entropy(y[left_mask])
        e_right = self._entropy(y[right_mask])
        child_entropy = (n_left / n) * e_left + (n_right / n) * e_right

        return self._entropy(y) - child_entropy

# Random Forest classifier built from scratch
class CustomRandomForest:
    """Bagged ensemble of ``SimpleDecisionTree`` classifiers with majority vote.

    Parameters
    ----------
    n_estimators : int
        Number of trees to grow.
    max_depth : int or None
        Passed through to every tree.
    min_samples_split : int
        Passed through to every tree.
    max_features : {'sqrt', 'log2'} or other
        ``'sqrt'`` and ``'log2'`` pick that function of the feature count
        (never fewer than one feature); any other value makes every tree
        consider all features at each split.
    random_state : int or None
        Seed handed to ``np.random.seed`` at the start of ``fit``.
    """

    def __init__(self, n_estimators=10, max_depth=10, min_samples_split=2, max_features='sqrt', random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.trees = []
        self.random_state = random_state

    def fit(self, X, y):
        """Grow ``n_estimators`` trees, each on a bootstrap resample of (X, y).

        Note: this reseeds the global ``np.random`` state, which is also the
        source of randomness for the bootstrap draws below.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        np.random.seed(self.random_state)
        self.trees = []
        n_samples, n_features = X.shape

        # Clamp to at least one feature: int(log2(1)) would otherwise be 0
        # and the trees could never split.
        if self.max_features == 'sqrt':
            max_features = max(1, int(np.sqrt(n_features)))
        elif self.max_features == 'log2':
            max_features = max(1, int(np.log2(n_features)))
        else:
            max_features = n_features

        for _ in range(self.n_estimators):
            # Bootstrap: draw n_samples row indices with replacement.
            idxs = np.random.choice(n_samples, n_samples, replace=True)
            tree = SimpleDecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                max_features=max_features,
            )
            tree.fit(X[idxs], y[idxs])
            self.trees.append(tree)

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Ties are resolved in favour of the smallest label.

        Raises
        ------
        ValueError
            If the forest has no trees (``fit`` was not called).
        """
        if not self.trees:
            raise ValueError("CustomRandomForest has no trees; call fit() first")

        # Shape (n_trees, n_samples): one row of predictions per tree.
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        labels = np.unique(tree_preds)
        if labels.size == 0:
            # No samples to predict.
            return tree_preds[0]

        # votes[i, j] = number of trees that predicted labels[i] for sample j.
        # Counting with NumPy keeps the result one label per sample
        # regardless of the installed SciPy version.
        votes = np.stack([(tree_preds == label).sum(axis=0) for label in labels])
        return labels[votes.argmax(axis=0)]

    def accuracy(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``."""
        preds = self.predict(X)
        return np.mean(preds == np.asarray(y))

# Evaluation functions
def custom_accuracy_score(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both arguments may be any array-like (lists, NumPy arrays, ...).

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Convert first: comparing two plain lists with == yields one bool for
    # the whole list rather than an element-wise result.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )
    return np.mean(y_true == y_pred)

def custom_classification_report(y_true, y_pred, target_names, labels=None):
    """Compute per-class precision, recall, F1 and support.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels.
    target_names : sequence of str
        Display names, paired positionally with the labels in sorted order
        (or in the order of ``labels`` when given). Surplus names or labels
        are ignored.
    labels : array-like, optional
        Labels to report on. Defaults to the sorted union of the labels
        found in ``y_true`` and ``y_pred``, so a class missing from
        ``y_true`` does not shift the names onto the wrong labels.

    Returns
    -------
    (dict, float)
        ``{name: {"precision", "recall", "f1-score", "support"}}`` and the
        overall accuracy from ``custom_accuracy_score``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if labels is None:
        labels = np.union1d(y_true, y_pred)

    report = {}
    for label, name in zip(labels, target_names):
        is_true = y_true == label
        is_pred = y_pred == label
        support = np.sum(is_true)
        tp = np.sum(is_true & is_pred)
        fp = np.sum(~is_true & is_pred)
        fn = np.sum(is_true & ~is_pred)
        # Each ratio falls back to 0.0 when its denominator is zero.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        report[name] = {"precision": precision, "recall": recall, "f1-score": f1, "support": support}
    overall_accuracy = custom_accuracy_score(y_true, y_pred)
    return report, overall_accuracy

# Data preprocessing
modelLR_df = pd.read_csv("atp_matches_2010_2024_missing_handled.csv")

# Engineered features: winner-minus-loser difference for each paired column.
# Keeping the pairs in one table means the list of columns to negate below
# cannot drift out of sync with the columns created here.
diff_features = {
    "rank_diff": ("winner_rank", "loser_rank"),
    "ace_diff": ("w_ace", "l_ace"),
    "df_diff": ("w_df", "l_df"),
    "svpt_diff": ("w_svpt", "l_svpt"),
    "1stIn_diff": ("w_1stIn", "l_1stIn"),
    "1stWon_diff": ("w_1stWon", "l_1stWon"),
    "2ndWon_diff": ("w_2ndWon", "l_2ndWon"),
    "SvGms_diff": ("w_SvGms", "l_SvGms"),
    "bpSaved_diff": ("w_bpSaved", "l_bpSaved"),
    "bpFaced_diff": ("w_bpFaced", "l_bpFaced"),
    "age_diff": ("winner_age", "loser_age"),
}
for diff_col, (winner_col, loser_col) in diff_features.items():
    modelLR_df[diff_col] = modelLR_df[winner_col] - modelLR_df[loser_col]

# One-hot encode the categorical variable: surface.
# drop_first=True drops the first surface category, which becomes the
# implicit reference level.
modelLR_df = pd.get_dummies(modelLR_df, columns=["surface"], drop_first=True)

# Create symmetric dataset: every match appears once from each player's
# point of view, so both classes are equally represented.
modelLR_df["target"] = 1  # Original records: target=1 (Player A wins)
df_flipped = modelLR_df.copy()
flip_cols = list(diff_features)
for col in flip_cols:
    df_flipped[col] = -df_flipped[col]
df_flipped["target"] = 0  # Flipped records: target=0 (Player B wins)
df_symmetric = pd.concat([modelLR_df, df_flipped], ignore_index=True)

# Split into train and test by calendar year; rows whose date cannot be
# parsed are dropped.
df_symmetric["tourney_date"] = pd.to_datetime(df_symmetric["tourney_date"], format="%Y%m%d", errors="coerce")
df_symmetric = df_symmetric.dropna(subset=["tourney_date"])
df_symmetric["year"] = df_symmetric["tourney_date"].dt.year

train_data = df_symmetric[df_symmetric["year"] <= 2022]
test_data = df_symmetric[df_symmetric["year"] > 2022]

# Feature selection
feature_cols = [
    "ace_diff", "df_diff", "1stIn_diff", "SvGms_diff", "age_diff",
    "1stWon_diff", "2ndWon_diff", "bpSaved_diff", "bpFaced_diff", "svpt_diff",
    # NOTE(review): the original comment assumed Clay is the dropped
    # reference surface; confirm against the surface values in the CSV.
    "rank_diff", "surface_Grass", "surface_Hard"
]

# Cast to float explicitly: the dummy columns can be boolean, and mixing
# them with float columns would otherwise give an object-dtype matrix.
X_train = train_data[feature_cols].to_numpy(dtype=float)
y_train = train_data["target"].values
X_test = test_data[feature_cols].to_numpy(dtype=float)
y_test = test_data["target"].values

# Train model
rf_model = CustomRandomForest(n_estimators=10, max_depth=10, min_samples_split=2,
                              max_features='sqrt', random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate
train_preds = rf_model.predict(X_train)
test_preds = rf_model.predict(X_test)

acc_train = custom_accuracy_score(y_train, train_preds)
acc_test = custom_accuracy_score(y_test, test_preds)

print("Training Accuracy: {:.2%}".format(acc_train))
print("Test Accuracy: {:.2%}".format(acc_test))

# Generate classification report (names are listed in label order: 0, then 1)
target_names = ["Player B Win", "Player A Win"]
report, overall_accuracy = custom_classification_report(y_test, test_preds, target_names)

print("\nClassification Report (Test Set):")
for label, metrics in report.items():
    print(f"{label}:")
    print("  Precision: {:.2f}".format(metrics["precision"]))
    print("  Recall:    {:.2f}".format(metrics["recall"]))
    print("  F1-Score:  {:.2f}".format(metrics["f1-score"]))
    print("  Support:   {}".format(metrics["support"]))
print("Overall Accuracy: {:.2%}".format(overall_accuracy))