In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import joblib
import warnings

# NOTE: this suppresses every Python warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

print("Starting Football Match Outcome Predictor with ELO Engine...")
# ==========================================
# 1. LOAD & CLEAN
# ==========================================
df = pd.read_csv("match_data.csv")

# Possession: drop a trailing '%' if present, coerce to numeric, replace
# unparseable values with 50 and rescale from percent to a 0-1 fraction.
for side in ('home', 'away'):
    raw_share = df[f'{side}_team_possession'].astype(str).str.rstrip('%')
    share_pct = pd.to_numeric(raw_share, errors='coerce').fillna(50)
    df[f'{side}_possession'] = share_pct / 100.0

# Attendance: remove thousands separators; unparseable values become 10000.
attendance_text = df['attendance'].astype(str).str.replace(',', '')
df['attendance'] = pd.to_numeric(attendance_text, errors='coerce').fillna(10000)

# Put matches in chronological order when a timestamp column exists;
# otherwise fall back to the file's row order via the index.
has_timestamps = 'date_utc' in df.columns
if not has_timestamps:
    df['match_date'] = df.index
else:
    df['match_date'] = pd.to_datetime(df['date_utc'], utc=True)
    df = df.sort_values('match_date')

# ==========================================
# 2. ELO ENGINE FUNCTIONS
# ==========================================
def calculate_expected_score(rating_a, rating_b):
    """Return the Elo expected score of side A against side B.

    Uses the logistic curve with a 400-point scale: equal ratings give 0.5,
    and the result approaches 1 as ``rating_a`` grows relative to ``rating_b``.
    """
    rating_gap = (rating_b - rating_a) / 400
    return 1 / (1 + 10 ** rating_gap)

def update_elo(rating, expected, actual, k_factor=30):
    """Return the post-match rating.

    The rating moves by ``k_factor`` times the difference between the actual
    result (1 win, 0.5 draw, 0 loss in this notebook) and the expected score.
    """
    surprise = actual - expected
    return rating + k_factor * surprise

# ==========================================
# 3. ADVANCED FEATURE ENGINEERING
# ==========================================
# Per-team running state, keyed by team name; filled lazily by the match loop.
team_stats = {}

# One list per engineered column; the match loop appends exactly one
# pre-match value per row. Key order is the order the columns are added to df.
features = {
    column: []
    for column in (
        # Elo ratings
        'Home_Elo', 'Away_Elo',
        # Venue-specific form (home side at home, away side away)
        'Home_HomeForm', 'Away_AwayForm',
        'Home_HomeXG', 'Away_AwayXG',
        # Venue-independent rolling averages
        'Home_Gen_XG', 'Away_Gen_XG',
        'Home_Gen_Poss', 'Away_Gen_Poss',
    )
}

def get_avg(hist, window=5):
    """Average the most recent ``window`` entries of ``hist``.

    Returns 0 when ``hist`` is empty; with fewer than ``window`` entries the
    average is taken over what is available.
    """
    if hist:
        recent = hist[-window:]
        return sum(recent) / min(len(hist), window)
    return 0

print("Running ELO Engine & Generating Features...")


def _new_team_record():
    """Return fresh per-team state: Elo 1500 and empty rolling histories."""
    return {'elo': 1500, 'pts_home': [], 'pts_away': [], 'pts_all': [],
            'xg_home': [], 'xg_away': [], 'xg_all': [], 'poss_all': []}


# Walk the matches in df order. For every row the features are captured
# BEFORE that match's result is folded into the team state, so a row never
# sees its own outcome.
for _, row in df.iterrows():
    home = row['home_team_name']
    away = row['away_team_name']

    # Initialize teams on first appearance (Elo starts at 1500 for everyone)
    if home not in team_stats:
        team_stats[home] = _new_team_record()
    if away not in team_stats:
        team_stats[away] = _new_team_record()

    h = team_stats[home]
    a = team_stats[away]

    # --- A. CAPTURE PRE-MATCH FEATURES ---
    # Exactly one value per column per row, so the lists stay aligned with df.

    # 1. Elo ratings
    features['Home_Elo'].append(h['elo'])
    features['Away_Elo'].append(a['elo'])

    # 2. Venue-specific form (home side at home, away side away)
    features['Home_HomeForm'].append(get_avg(h['pts_home'], 5))
    features['Away_AwayForm'].append(get_avg(a['pts_away'], 5))

    features['Home_HomeXG'].append(get_avg(h['xg_home'], 5))
    features['Away_AwayXG'].append(get_avg(a['xg_away'], 5))

    # 3. General form (all venues)
    features['Home_Gen_XG'].append(get_avg(h['xg_all'], 5))
    features['Away_Gen_XG'].append(get_avg(a['xg_all'], 5))
    features['Home_Gen_Poss'].append(get_avg(h['poss_all'], 5))
    features['Away_Gen_Poss'].append(get_avg(a['poss_all'], 5))

    # --- B. UPDATE STATS AFTER MATCH ---

    home_goals = row['home_team_score']
    away_goals = row['away_team_score']

    # A row without a final score (e.g. an unplayed fixture) has no result.
    # Comparisons against NaN are all False, so without this guard such a row
    # would be booked as an away win and corrupt Elo and form.
    if pd.isna(home_goals) or pd.isna(away_goals):
        continue

    # League points and the Elo "actual score" for each side
    if home_goals > away_goals:
        h_pts, a_pts = 3, 0
        h_actual, a_actual = 1, 0
    elif home_goals == away_goals:
        h_pts, a_pts = 1, 1
        h_actual, a_actual = 0.5, 0.5
    else:
        h_pts, a_pts = 0, 3
        h_actual, a_actual = 0, 1

    # Both expectations are computed from the pre-match ratings before either
    # rating is changed. No home-advantage offset is applied.
    h_expected = calculate_expected_score(h['elo'], a['elo'])
    a_expected = calculate_expected_score(a['elo'], h['elo'])

    h['elo'] = update_elo(h['elo'], h_expected, h_actual)
    a['elo'] = update_elo(a['elo'], a_expected, a_actual)

    # Rolling histories
    h['pts_all'].append(h_pts)
    h['pts_home'].append(h_pts)
    a['pts_all'].append(a_pts)
    a['pts_away'].append(a_pts)

    # NOTE(review): xG values are appended as-is; a missing xG would propagate
    # NaN into the rolling averages - confirm the data has none.
    h['xg_all'].append(row['home_team_xg'])
    h['xg_home'].append(row['home_team_xg'])
    a['xg_all'].append(row['away_team_xg'])
    a['xg_away'].append(row['away_team_xg'])

    # Read from the row already in hand rather than a label lookup on df,
    # which would misbehave if index labels were ever duplicated.
    h['poss_all'].append(row['home_possession'])
    a['poss_all'].append(row['away_possession'])

# Attach each engineered column to df (lists are in df row order).
for column_name, column_values in features.items():
    df[column_name] = column_values

# Interaction features: home-minus-away gaps in rating and venue-specific form.
df['Elo_Diff'] = df['Home_Elo'].sub(df['Away_Elo'])
df['Specific_Form_Diff'] = df['Home_HomeForm'].sub(df['Away_AwayForm'])

# ==========================================
# 4. TRAINING
# ==========================================
# Encode team names as integer IDs. The IDs are stored on df and the encoder
# is saved with the model, but they are not part of the feature matrix below.
le = LabelEncoder()
all_teams = pd.concat([df['home_team_name'], df['away_team_name']]).unique()
le.fit(all_teams)
df['HomeTeam_ID'] = le.transform(df['home_team_name'])
df['AwayTeam_ID'] = le.transform(df['away_team_name'])

# Target: 2 = home win, 1 = draw, 0 = away win.
conditions = [
    (df['home_team_score'] > df['away_team_score']),
    (df['home_team_score'] == df['away_team_score']),
    (df['home_team_score'] < df['away_team_score'])
]
# Rows with a missing score satisfy none of the conditions. Mark them -1
# explicitly; np.select's implicit default of 0 would label them away wins.
df['match_outcome'] = np.select(conditions, [2, 1, 0], default=-1)

# Only matches with a known result can be used for training or evaluation.
has_result = df['match_outcome'] >= 0

X = df.loc[has_result, [
    'Home_Elo', 'Away_Elo', 'Elo_Diff',
    'Home_HomeForm', 'Away_AwayForm', 'Specific_Form_Diff',
    'Home_Gen_XG', 'Away_Gen_XG',
    'Home_Gen_Poss', 'Away_Gen_Poss'
]]
y = df.loc[has_result, 'match_outcome']

# Chronological hold-out: first 85% of rows train, last 15% test (no shuffle;
# df was sorted by match_date above when a date column was available).
split = int(len(X) * 0.85)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

# Hyper-parameter search with forward-chaining folds, so each validation fold
# lies after the rows it was trained on.
tscv = TimeSeriesSplit(n_splits=4)
param_grid = dict(
    n_estimators=[200, 300],
    max_depth=[3, 4],
    learning_rate=[0.01, 0.02],
    colsample_bytree=[0.8],
    subsample=[0.8],
)

print("Searching for optimal parameters...")
# The estimator itself runs single-threaded; parallelism comes from the grid search.
xgb = XGBClassifier(objective='multi:softmax', num_class=3, random_state=42, n_jobs=1)
grid = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='accuracy', cv=tscv, n_jobs=-1)
grid.fit(X_train, y_train)

# Score the refit best estimator on the held-out chronological tail.
best_model = grid.best_estimator_
probs = best_model.predict_proba(X_test)
preds = best_model.predict(X_test)
acc = accuracy_score(y_test, preds)

banner = "-------------------------------"
print(banner)
print(f"ELO MODEL ACCURACY: {acc:.2%}")
print(banner)

# High Confidence Check: accuracy restricted to test matches whose top class
# probability exceeds each threshold.
results = pd.DataFrame({'Actual': y_test, 'Pred': preds, 'Conf': np.max(probs, axis=1)})
for t in [0.55, 0.60]:
    sub = results[results['Conf'] > t]
    if sub.empty:
        # Accuracy is undefined with no predictions above the threshold;
        # report that instead of scoring an empty sample.
        print(f"Threshold {t}: 0 matches | Accuracy: N/A")
        continue
    print(f"Threshold {t}: {len(sub)} matches | Accuracy: {accuracy_score(sub['Actual'], sub['Pred']):.2%}")

# Save the fitted model with the final per-team state and the team-name encoder.
# NOTE(review): the loading cell further down reads a different file name and
# a 'team_stats' key, whereas this bundle uses 'stats' - confirm which is intended.
joblib.dump({'model': best_model, 'stats': team_stats, 'encoder': le}, 'football_elo_model.pkl')

Starting Football Match Outcome Predictor with ELO Engine...
Running ELO Engine & Generating Features...
Searching for optimal parameters...
-------------------------------
ELO MODEL ACCURACY: 52.60%
-------------------------------
Threshold 0.55: 69 matches | Accuracy: 66.67%
Threshold 0.6: 43 matches | Accuracy: 69.77%


['football_elo_model.pkl']

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Feature importance as reported by the fitted model, highest first.
importance = (
    pd.DataFrame({'Feature': X.columns, 'Score': best_model.feature_importances_})
    .sort_values(by='Score', ascending=False)
)

print("Top 5 Key Predictors:")
print(importance.head(5))

# Per-match confidence = highest class probability on the test set.
# `preds` is reused from the training cell above.
probs = best_model.predict_proba(X_test)
results = pd.DataFrame({
    'Actual': y_test,
    'Pred': preds,
    'Conf': probs.max(axis=1),
})

print(f"{'Threshold':<10} | {'Matches':<8} | {'Accuracy':<10} | {'Status'}")
print("-" * 50)

# Accuracy on the subset of predictions above each confidence threshold.
for threshold in (0.40, 0.45, 0.50, 0.55, 0.60):
    subset = results.loc[results['Conf'] > threshold]

    if subset.empty:
        print(f"{threshold:.2f}       | 0        | N/A        | -")
        continue

    acc = accuracy_score(subset['Actual'], subset['Pred'])

    # Map the accuracy onto a coarse label, highest band first.
    if acc > 0.60:
        status = "EXCELLENT"
    elif acc > 0.55:
        status = "GOOD"
    elif acc > 0.50:
        status = "OK"
    else:
        status = "POOR"

    print(f"{threshold:.2f}       | {len(subset):<8} | {acc:.2%}     | {status}")

Top 5 Key Predictors:
              Feature     Score
5  Specific_Form_Diff  0.215567
3       Home_HomeForm  0.140673
9       Away_Gen_Poss  0.121955
8       Home_Gen_Poss  0.120526
6         Home_Gen_XG  0.079521
Threshold  | Matches  | Accuracy   | Status
--------------------------------------------------
0.40       | 176      | 54.55%     | OK
0.45       | 140      | 57.14%     | GOOD
0.50       | 109      | 61.47%     | EXCELLENT
0.55       | 69       | 66.67%     | EXCELLENT
0.60       | 43       | 69.77%     | EXCELLENT


## Loading and Using the Model

In [None]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize

# ==========================================
# 1. SETUP: YOUR WEEKEND PORTFOLIO
# ==========================================
# Input your model's predictions and the BOOKMAKER'S ODDS here.
# You must look up the real odds for these matches.

portfolio_data = [
    # Each entry is a 4-tuple: (match label, pick, model probability of the pick,
    # bookmaker decimal odds for the pick). solve_portfolio reads the entries
    # by position, so the order of the fields matters.
    
    # HIGH CONFIDENCE
    ("Real Madrid vs Sevilla", "Home", 0.63, 1.60),
    ("Man City vs West Ham",   "Home", 0.75, 1.25),
    
    # MEDIUM CONFIDENCE (good bets, but riskier)
    ("Arsenal vs Everton",     "Home", 0.58, 1.85),
    ("Napoli vs Lazio",        "Home", 0.54, 2.10),
    
    # LOW CONFIDENCE (value bets / longshots)
    # A low model probability can still be a value bet when probability * odds > 1.
    ("Girona vs Atletico",     "Away", 0.45, 2.40),
    ("Spurs vs Liverpool",     "Draw", 0.32, 3.80),
    ("Villa vs Man Utd",       "Away", 0.41, 2.50)
]

# SETTINGS (passed positionally to solve_portfolio below)
BANKROLL = 50        # Total bankroll, in the currency printed on the slip
MAX_TOTAL_RISK = 0.20  # Cap on the summed stake fractions inside the optimizer
KELLY_FRACTION = 0.25  # Multiplier applied to the optimizer's stakes (quarter Kelly)

# ==========================================
# 2. THE OPTIMIZATION ENGINE
# ==========================================

def solve_portfolio(bets, bankroll, max_risk, fraction, min_cash=1.00):
    """Size stakes for several simultaneous bets by maximising expected log growth.

    Parameters
    ----------
    bets : sequence of (match, pick, model_prob, decimal_odds)
        Read by position. Only bets with ``model_prob * decimal_odds > 1``
        (positive expected value) are passed to the optimizer.
    bankroll : float
        Total bankroll; stake fractions are multiplied by it to get cash.
    max_risk : float
        Upper bound on each stake fraction and on their sum inside the solver.
    fraction : float
        Fractional-Kelly multiplier applied to the solver's stakes AFTER the
        ``max_risk`` cap, so the effective total exposure is at most
        ``max_risk * fraction``.
    min_cash : float, default 1.00
        A bet is listed only when its cash stake is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        One row per listed bet with pre-formatted string columns
        (``Model_%``, ``Edge``, ``Stake_%``, ``Cash``). Empty when nothing
        qualifies.

    Notes
    -----
    The objective sums each bet's own log-growth term. That is an
    approximation for independent bets, not the exact joint-outcome
    expectation.
    """
    n = len(bets)

    # Extract vectors (dtype=float keeps an empty bet list well-typed)
    probs = np.array([b[2] for b in bets], dtype=float)
    odds = np.array([b[3] for b in bets], dtype=float)
    names = [b[0] for b in bets]
    teams = [b[1] for b in bets]

    # 1. Drop negative/zero-EV bets: EV per unit staked = prob * odds - 1.
    evs = (probs * odds) - 1
    valid_indices = np.where(evs > 0)[0]

    # Nothing to optimise, or no risk budget to allocate.
    if len(valid_indices) == 0 or max_risk <= 0:
        return pd.DataFrame()

    print(f"Analyzing {n} matches... Found {len(valid_indices)} +EV opportunities.")

    n_active = len(valid_indices)
    active_probs = probs[valid_indices]
    active_odds = odds[valid_indices] - 1  # Net odds (b)

    # Keep every stake strictly below 1 so log(1 - stake) stays finite even
    # if the caller passes max_risk >= 1.
    upper = min(max_risk, 1 - 1e-9)

    def objective(stakes):
        # Negative of the approximate expected log growth (minimised by SLSQP).
        growth = np.sum(
            active_probs * np.log(1 + stakes * active_odds)
            + (1 - active_probs) * np.log(1 - stakes)
        )
        return -growth

    # Constraint: total stake fraction <= max_risk
    cons = ({'type': 'ineq', 'fun': lambda x: max_risk - np.sum(x)})

    # Bounds: 0 <= each stake <= max_risk (and < 1)
    bnds = tuple((0, upper) for _ in range(n_active))

    # Small equal stakes, shrunk if necessary so the starting point already
    # satisfies the total-risk constraint.
    init_guess = np.full(n_active, min(0.01, upper / n_active))

    result = minimize(objective, init_guess, method='SLSQP', bounds=bnds, constraints=cons)
    if not result.success:
        print(f"Warning: optimizer did not converge ({result.message}); stakes may be sub-optimal.")

    # Clip away tiny bound violations from the solver, then apply fractional Kelly.
    optimal_stakes = np.clip(result.x, 0, upper) * fraction

    output = []
    for i, idx in enumerate(valid_indices):
        stake_pct = optimal_stakes[i]
        cash = bankroll * stake_pct
        edge = evs[idx]

        # Only list bets whose cash stake clears the minimum bet size
        if cash > min_cash:
            output.append({
                'Match': names[idx],
                'Pick': teams[idx],
                'Odds': odds[idx],
                'Model_%': f"{probs[idx]:.1%}",
                'Edge': f"{edge:.1%}",
                'Stake_%': f"{stake_pct:.2%}",
                'Cash': f"${cash:.2f}"
            })

    if not output:
        # Make the reason for an empty slip visible: +EV bets existed but
        # every stake was filtered out by the minimum bet size.
        print(f"All {n_active} +EV stakes were at or below the ${min_cash:.2f} minimum bet; nothing to place.")

    return pd.DataFrame(output)

# ==========================================
# 3. RUN OPTIMIZATION
# ==========================================
df_results = solve_portfolio(portfolio_data, BANKROLL, MAX_TOTAL_RISK, KELLY_FRACTION)

if not df_results.empty:
    print("\nOPTIMIZED BETTING SLIP (Weekend Portfolio)")
    print("=" * 85)
    print(df_results.to_string(index=False))
    print("-" * 85)

    # 'Cash' is a formatted string such as "$1.52". Strip the currency sign
    # literally: regex=False makes this independent of the pandas version's
    # default ('$' is an end-of-string anchor when treated as a regex).
    total_cash = df_results['Cash'].str.replace('$', '', regex=False).astype(float).sum()
    print(f"TOTAL INVESTMENT: ${total_cash:.2f}  (Risking {total_cash/BANKROLL:.1%} of Bankroll)")
    print("=" * 85)
else:
    # An empty slip does not necessarily mean there were no +EV picks: stakes
    # below the minimum bet size are dropped as well.
    print("No bets to place this weekend: no +EV picks, or every recommended stake was below the minimum bet size.")

Analyzing 7 matches... Found 6 +EV opportunities.
No profitable bets found. Save your money this weekend.


In [None]:
import pandas as pd
import numpy as np
import joblib

# ==========================================
# 1. LOAD THE SAVED MODEL
# ==========================================
print("Loading model...")
# joblib.load unpickles arbitrary Python objects: only load files you trust.
# NOTE(review): the training cell in this notebook saves 'football_elo_model.pkl'
# with the keys 'model', 'stats' and 'encoder'. This cell reads a different
# file and expects a 'team_stats' key - presumably an artifact produced by
# another notebook/version; confirm it is the intended model.
data = joblib.load('football_model_optimized.pkl')

# These assignments rebind the notebook-level names of the same spelling
# (best_model, team_stats, le) to the loaded objects.
best_model = data['model']
team_stats = data['team_stats']
le = data['encoder']

print("Model Loaded Successfully!")

# ==========================================
# 2. PREDICTION FUNCTION
# ==========================================
def predict_weekend(mananger, matchups, model, team_stats, encoder):
    """Print home/draw/away probabilities and a suggested action per fixture.

    Parameters
    ----------
    mananger : object or None
        Staking helper. The parameter name is a misspelling of "manager",
        kept unchanged so existing callers keep working. When not ``None``,
        ``calculate_bet(odds=..., model_confidence=..., historic_accuracy=...)``
        is called once for every fixture whose teams are both known.
    matchups : iterable of (home_name, away_name)
    model : classifier exposing ``predict_proba``; class order is taken to be
        0 = away win, 1 = draw, 2 = home win.
    team_stats : dict
        Maps team name to a dict with list-valued keys 'pts', 'xg', 'xga', 'poss'.
    encoder : object exposing ``transform`` for team names.

    Returns
    -------
    None. Results are printed; fixtures with an unknown team are reported
    and skipped.
    """
    print(f"\n{'MATCHUP':<40} | {'HOME %':<7} | {'DRAW %':<7} | {'AWAY %':<7} | {'PREDICTION':<12} | {'ACTION'}")
    print("-" * 105)

    def get_avg(hist, window=5):
        """Mean of the last `window` entries of `hist`; 0 when `hist` is empty."""
        if not hist: return 0
        return sum(hist[-window:]) / min(len(hist), window)

    labels = ['Away Win', 'Draw', 'Home Win']

    for home, away in matchups:
        # Both teams need recorded history to build features
        if home not in team_stats or away not in team_stats:
            print(f"{home:<18} vs {away:<18} | ???     | ???     | ???     | N/A          | ❌ Unknown Team")
            continue

        h = team_stats[home]
        a = team_stats[away]

        # Compute each rolling average once and reuse it for the diff columns
        h_pts, a_pts = get_avg(h['pts']), get_avg(a['pts'])
        h_xg, a_xg = get_avg(h['xg']), get_avg(a['xg'])
        h_xga, a_xga = get_avg(h['xga']), get_avg(a['xga'])
        h_poss, a_poss = get_avg(h['poss']), get_avg(a['poss'])

        # Key order defines the column order of the single-row frame given to the model
        features = {
            'HomeTeam_ID': encoder.transform([home])[0],
            'AwayTeam_ID': encoder.transform([away])[0],
            'Home_Form_Pts': h_pts,
            'Away_Form_Pts': a_pts,
            'Form_Diff': h_pts - a_pts,
            'Home_xG_Avg': h_xg,
            'Away_xG_Avg': a_xg,
            'xG_Diff': h_xg - a_xg,
            'Home_xGA_Avg': h_xga,
            'Away_xGA_Avg': a_xga,
            'Def_Diff': h_xga - a_xga,
            'Home_Poss_Avg': h_poss,
            'Away_Poss_Avg': a_poss,
            'Poss_Diff': h_poss - a_poss
        }

        row = pd.DataFrame([features])

        # Predict
        probs = model.predict_proba(row)[0]
        p_away, p_draw, p_home = probs[0], probs[1], probs[2]
        # Model-implied fair decimal odds (1 / probability), not bookmaker odds
        odds_away, odds_draw, odds_home = 1/probs[0], 1/probs[1], 1/probs[2]
        pick = int(np.argmax(probs))
        confidence = np.max(probs)
        pred_label = labels[pick]

        # --- ACTION LOGIC ---
        # Later thresholds override earlier ones. The accuracy figures quoted
        # below are the original author's and are not derived in this function.
        action = "-"
        if confidence > 0.50: action = "Low Bet"
        if confidence > 0.55: action = "✅ BET"         # author's "65% accuracy" zone
        if confidence > 0.60: action = "🔥 SNIPER BET"  # author's "72% accuracy" zone

        if pred_label != 'Draw' and p_draw > 0.32: action += " (⚠️ Risky)"

        # Use the manager that was passed in (not a notebook-level global), and
        # give it the odds of the outcome actually predicted so they match the
        # confidence. Previously the home odds were sent even for draw/away picks.
        if mananger is not None:
            pick_odds = (odds_away, odds_draw, odds_home)[pick]
            mananger.calculate_bet(odds=pick_odds, model_confidence=confidence, historic_accuracy=0.725)

        print(f"{home:<18} vs {away:<18} | {p_home:.1%} {odds_home:.1f}   | {p_draw:.1%} {odds_draw:.1f}   | {p_away:.1%} {odds_away:.1f}   | {pred_label:<12} | {action}")

# ==========================================
# 3. REAL FIXTURES (DEC 20-21, 2025)
# ==========================================
# Update this list with the matches you want to check today
# (home, away) pairs. Names must match the keys of team_stats exactly;
# any team not found there is reported as unknown and skipped.
weekend_games = [
    # Premier League
    ('Tottenham Hotspur', 'Liverpool'),
    ('Manchester City', 'West Ham United'),
    ('Aston Villa', 'Manchester United'),
    ('Everton', 'Arsenal'),
    ('Newcastle United', 'Chelsea'),

    # La Liga
    ('Real Madrid', 'Sevilla'),
    ('Villarreal', 'Barcelona'),
    # Accented name restored from a mis-decoded literal ("Atl√©tico").
    # NOTE(review): confirm this spelling matches the key stored in team_stats.
    ('Girona', 'Atlético Madrid'),

    # Serie A
    ('Juventus', 'Roma'),
    ('Napoli', 'Lazio')
]

# Run
# NOTE(review): `manager` is not defined in any cell visible here; it must
# exist in the kernel before this cell runs - confirm where it is created.
predict_weekend(manager,weekend_games, best_model, team_stats, le)

Loading model...
Model Loaded Successfully!

MATCHUP                                  | HOME %  | DRAW %  | AWAY %  | PREDICTION   | ACTION
---------------------------------------------------------------------------------------------------------
Tottenham Hotspur  vs Liverpool          | 41.6% 2.4   | 25.4% 3.9   | 32.9% 3.0   | Home Win     | -
Manchester City    vs West Ham United    | 60.8% 1.6   | 22.9% 4.4   | 16.3% 6.1   | Home Win     | 🔥 SNIPER BET
Aston Villa        vs Manchester United  | 52.7% 1.9   | 22.0% 4.5   | 25.3% 3.9   | Home Win     | Low Bet
Everton            vs Arsenal            | 35.6% 2.8   | 20.8% 4.8   | 43.6% 2.3   | Away Win     | -
Newcastle United   vs Chelsea            | 37.8% 2.6   | 26.9% 3.7   | 35.3% 2.8   | Home Win     | -
Real Madrid        vs Sevilla            | 40.2% 2.5   | 26.6% 3.8   | 33.2% 3.0   | Home Win     | -
Villarreal         vs Barcelona          | 24.2% 4.1   | 20.7% 4.8   | 55.2% 1.8   | Away Win     | ✅ BET
Girona        

In [None]:
def predict_weekend(matchups, model, team_stats, encoder):
