In [14]:
# === CapBot v1a_alt – Smart Filter Tuning (Train Once, Predict on 100%) ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from datetime import datetime
from pathlib import Path
import os

# === Config ===
# Working directory: the folder containing this file when `__file__` is
# defined, otherwise the resolved current working directory.
if "__file__" in globals():
    notebook_dir = Path(__file__).parent
else:
    notebook_dir = Path().resolve()

version = "v1a"
version_dir = notebook_dir
today = f"{datetime.today():%Y%m%d}"  # YYYYMMDD stamp used in the report filename

# Filters to test (Updated for realism)
# Each label maps to the minimum predicted probability ("proba") and the
# minimum odds ("odds") a row must reach to be kept by that filter.
filter_configs = {
    label: {"proba": min_proba, "odds": min_odds}
    for label, min_proba, min_odds in (
        ("conf_60_odds_150", 0.60, 1.50),
        ("conf_65_odds_140", 0.65, 1.40),
        ("conf_70_odds_135", 0.70, 1.35),
        ("conf_75_odds_130", 0.75, 1.30),
    )
}

# Output artefacts are written next to the notebook/script.
summary_xlsx_path = version_dir.joinpath(f"CapBot_{version}_Report_{today}.xlsx")
readme_path = version_dir.joinpath(f"CapBot_{version}_README.md")

# === Load Data ===
# Columns that feed the model; they are forced to numeric below.
numeric_cols = ['rank_A', 'rank_B', 'pts_A', 'pts_B', 'odds_A', 'odds_B']

file_path = version_dir / "../../../data/historical/processed/matches_2015_2025_combined_balanced.csv"
df = pd.read_csv(file_path)

# Unparseable entries become NaN, then every NaN is replaced by its column mean.
df[numeric_cols] = df[numeric_cols].apply(lambda col: pd.to_numeric(col, errors='coerce'))
column_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(value=column_means)

# Discard rows that still lack a feature value or have no winner_code.
df = df.dropna(subset=[*numeric_cols, 'winner_code'])

# === Train Once and Predict on Full Dataset ===
# Fit one logistic regression on a stratified 80% sample of the rows, then
# score every row of the dataset with it.
# NOTE: the scored rows include the 80% used for fitting, so any metric
# derived from these predictions is in-sample for those rows.
X = df[numeric_cols]
y = df['winner_code']
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of model.classes_[1]
# (presumably winner_code == 1 — TODO confirm against the data).
preds_proba = model.predict_proba(X)[:, 1]
preds = (preds_proba > 0.5).astype(int)

# Row-aligned copy of the input enriched with the prediction columns.
full_df = df.copy()
full_df["pred_proba"] = preds_proba
full_df["predicted"] = preds
full_df["correct"] = (preds == y).astype(int)
full_df["bet"] = 200
full_df["target"] = y.values

# full_df is a copy of df, so the metadata columns are already present and
# row-aligned with the predictions. They must NOT be re-assigned from a
# reset-index copy: pandas aligns such an assignment on index labels, so
# whenever df's index is not a contiguous 0..n-1 range (e.g. after rows were
# dropped) the dates, players and odds would land on the wrong rows and the
# tail would become NaN. Only verify that the columns exist.
meta_cols = ['date', 'player_A', 'player_B', 'odds_A']
missing_meta = [col for col in meta_cols if col not in full_df.columns]
if missing_meta:
    raise KeyError(f"Missing metadata columns: {missing_meta}")

# === Apply Filters ===
# For every filter: keep the rows whose predicted probability and odds_A reach
# the thresholds, compute flat-stake profit on those rows, and write a summary
# sheet plus the list of bets to the Excel report.
stake = 200  # flat stake per bet; same amount as the full_df["bet"] column

# Accuracy over all predictions does not depend on the filter, so compute it once.
overall_accuracy = round(accuracy_score(y, preds), 4)

all_summary_dfs = []
with pd.ExcelWriter(summary_xlsx_path, engine="xlsxwriter") as writer:
    for label, config in filter_configs.items():
        print(f"\n🚀 Running Filter: {label} => proba >= {config['proba']}, odds >= {config['odds']}")

        selected = (full_df["pred_proba"] >= config['proba']) & (full_df["odds_A"] >= config['odds'])
        bets = full_df.loc[
            selected,
            ["date", "player_A", "player_B", "odds_A", "pred_proba", "target"],
        ].copy()

        # target == 1 pays stake * (odds_A - 1); any other target loses the stake.
        bets["profit"] = np.where(
            bets["target"] == 1,
            stake * (bets["odds_A"] - 1),
            -stake
        )
        bets["Run"] = 1  # Single run

        total_bets = len(bets)
        total_profit = bets["profit"].sum()
        # Guard the empty selection: no bets means 0 ROI / accuracy, not a division by zero.
        roi = (total_profit / (stake * total_bets)) * 100 if total_bets else 0
        bet_accuracy = (bets['target'] == (bets['pred_proba'] > 0.5).astype(int)).mean() if total_bets else 0

        stats = pd.DataFrame([{
            "Run": "Single",
            "Accuracy": overall_accuracy,
            "Total Bets": total_bets,
            "Profit ($)": round(total_profit, 2),
            "ROI (%)": round(roi, 2),
            "Bet Accuracy": round(bet_accuracy, 4)
        }])

        stats.to_excel(writer, sheet_name=f"{label}_summary", index=False)
        bets.to_excel(writer, sheet_name=f"{label}_bets", index=False)
        all_summary_dfs.append((label, stats))

# === Generate README ===
# Fixed part of the report: title, filter table and model description.
readme_header = f"""\
# 📄 CapBot {version.upper()} – Filter Comparison Report

**Date:** {today}  
**Excel Report:** `{summary_xlsx_path.name}`

---

## 🎯 Filters Evaluated
| Label | pred_proba ≥ | odds_A ≥ | Description |
|-------|--------------|----------|-------------|
| conf_60_odds_150 | 0.60 | 1.50 | ✅ Realistic baseline filter |
| conf_65_odds_140 | 0.65 | 1.40 | ⚖️ Balanced confidence & odds |
| conf_70_odds_135 | 0.70 | 1.35 | 🎯 Slightly aggressive confidence |
| conf_75_odds_130 | 0.75 | 1.30 | 🔒 Very confident, low risk |

---

# 🧠 Model & Evaluation Details
- **Model Type:** Logistic Regression (`sklearn`)
- **Training Data:** 80% of dataset (randomized)
- **Predictions Evaluated On:** Entire dataset (100%)
- **Runs:** Single unified prediction run (no simulation loop)

---
"""

# One markdown section per evaluated filter, in the order they were run.
summary_sections = [
    f"\n## 📊 Summary for `{label}`\n" + stats_df.to_markdown(index=False) + "\n---\n"
    for label, stats_df in all_summary_dfs
]
readme = "".join([readme_header, *summary_sections])

readme_path.write_text(readme, encoding="utf-8")

print(f"📄 README saved to {readme_path.resolve()}")
print(f"📊 XLSX Report saved to {summary_xlsx_path.resolve()}")


  df = pd.read_csv(file_path)



🚀 Running Filter: conf_60_odds_150 => proba >= 0.6, odds >= 1.5

🚀 Running Filter: conf_65_odds_140 => proba >= 0.65, odds >= 1.4

🚀 Running Filter: conf_70_odds_135 => proba >= 0.7, odds >= 1.35

🚀 Running Filter: conf_75_odds_130 => proba >= 0.75, odds >= 1.3
📄 README saved to /Users/boroni_4/Documents/CapBot/capbot/notebooks/versions/v1a/CapBot_v1a_README.md
📊 XLSX Report saved to /Users/boroni_4/Documents/CapBot/capbot/notebooks/versions/v1a/CapBot_v1a_Report_20250516.xlsx
