In [2]:
# Environment sanity check: print the build configuration of the installed
# XGBoost library (the returned mapping includes flags such as USE_CUDA,
# USE_NCCL and USE_OPENMP, plus the path of the loaded libxgboost binary).
import xgboost
print(xgboost.build_info())

{'BUILTIN_PREFETCH_PRESENT': True, 'CUDA_VERSION': [12, 8], 'DEBUG': False, 'GCC_VERSION': [10, 3, 1], 'GLIBC_VERSION': [2, 28], 'MM_PREFETCH_PRESENT': True, 'NCCL_VERSION': [2, 27, 7], 'THRUST_VERSION': [2, 7, 0], 'USE_CUDA': True, 'USE_DLOPEN_NCCL': True, 'USE_FEDERATED': True, 'USE_NCCL': True, 'USE_NVCOMP': False, 'USE_OPENMP': True, 'USE_RMM': False, 'libxgboost': '/usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so'}


In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import time

# 1. LOAD DATA / 2. PREPROCESSING
# Read the match table, parse the kick-off column as timezone-aware UTC
# timestamps and put the rows in chronological order (the feature loop further
# down relies on processing matches oldest-first).
df = (
    pd.read_csv("football_matches_2024_2025.csv")
    .assign(date_utc=lambda frame: pd.to_datetime(frame['date_utc'], utc=True))
    .sort_values('date_utc')
)

# 3. ADVANCED FEATURE ENGINEERING
# Running per-team history, keyed by team name; filled while iterating matches.
team_stats = {}

# One empty list per engineered column; every processed match appends exactly
# one value to each list, so all lists end up as long as `df`.
features = {
    column: []
    for column in (
        # Existing Powerful Features
        'Home_Season_PPG', 'Away_Season_PPG',
        'Home_Attack_Avg', 'Away_Attack_Avg',
        'Home_Defense_Avg', 'Away_Defense_Avg',

        # NEW: Fatigue & Momentum
        'Home_Rest_Days', 'Away_Rest_Days',
        'Home_Weighted_Form', 'Away_Weighted_Form',
        'Home_Win_Streak', 'Away_Win_Streak',
    )
}

def get_weighted_form(history_pts, window=5):
    """Linearly weighted average of the most recent points.

    Only the last `window` entries of `history_pts` are used. The oldest of
    them gets weight 1, the next weight 2, and so on, so the latest result
    counts the most. An empty history yields 0.
    """
    if not history_pts:
        return 0

    recent = history_pts[-window:]
    weighted_total = 0
    weight_total = 0
    for weight, points in enumerate(recent, start=1):
        weighted_total += points * weight
        weight_total += weight
    return weighted_total / weight_total

def get_rolling_avg(history_list, window=5):
    """Plain average over the last `window` entries of `history_list`.

    With fewer than `window` entries the average is taken over what is
    available. An empty history yields 0.
    """
    if not history_list:
        return 0

    tail = history_list[-window:]
    divisor = min(len(history_list), window)
    return sum(tail) / divisor

print("Generating features (Rest Days, Weighted Form, Streaks)...")


def _new_team_record():
    """Return an empty running-history record for a team seen for the first time."""
    return {'pts': [], 'gf': [], 'ga': [], 'last_date': None, 'streak': 0}


# Walk through the matches in row order. For every match the features are read
# from each team's history BEFORE that match's result is appended, so a row
# only ever sees information from earlier rows.
for index, row in df.iterrows():
    home = row['home_team']
    away = row['away_team']
    match_date = row['date_utc']

    # Create the history record the first time a team appears
    h_stats = team_stats.setdefault(home, _new_team_record())
    a_stats = team_stats.setdefault(away, _new_team_record())

    # --- A. CALCULATE INPUT FEATURES (Pre-Match) ---

    # 1. REST DAYS (Fatigue)
    # If first game, assume 7 days rest. "No previous match" is stored as None,
    # so test for None explicitly instead of relying on truthiness.
    h_rest = (match_date - h_stats['last_date']).days if h_stats['last_date'] is not None else 7
    a_rest = (match_date - a_stats['last_date']).days if a_stats['last_date'] is not None else 7
    # Cap at 14 days (long breaks don't add infinite stamina)
    features['Home_Rest_Days'].append(min(h_rest, 14))
    features['Away_Rest_Days'].append(min(a_rest, 14))

    # 2. WEIGHTED FORM (Recent form matters more)
    features['Home_Weighted_Form'].append(get_weighted_form(h_stats['pts']))
    features['Away_Weighted_Form'].append(get_weighted_form(a_stats['pts']))

    # 3. WIN STREAK (Momentum)
    features['Home_Win_Streak'].append(h_stats['streak'])
    features['Away_Win_Streak'].append(a_stats['streak'])

    # 4. Standard Stats (PPG, Attack, Defense)
    # 1.35 is the PPG used for a team with no recorded matches yet.
    features['Home_Season_PPG'].append(np.mean(h_stats['pts']) if h_stats['pts'] else 1.35)
    features['Away_Season_PPG'].append(np.mean(a_stats['pts']) if a_stats['pts'] else 1.35)
    features['Home_Attack_Avg'].append(get_rolling_avg(h_stats['gf']))
    features['Away_Attack_Avg'].append(get_rolling_avg(a_stats['gf']))
    features['Home_Defense_Avg'].append(get_rolling_avg(h_stats['ga']))
    features['Away_Defense_Avg'].append(get_rolling_avg(a_stats['ga']))

    # --- B. UPDATE STATS (Post-Match) ---
    if row['fulltime_home'] > row['fulltime_away']:
        h_pts, a_pts = 3, 0
        h_stats['streak'] += 1  # Increase win streak
        a_stats['streak'] = 0   # Reset streak
    elif row['fulltime_home'] == row['fulltime_away']:
        h_pts, a_pts = 1, 1
        h_stats['streak'] = 0
        a_stats['streak'] = 0
    else:
        h_pts, a_pts = 0, 3
        h_stats['streak'] = 0
        a_stats['streak'] += 1

    h_stats['pts'].append(h_pts)
    a_stats['pts'].append(a_pts)
    h_stats['gf'].append(row['fulltime_home'])
    a_stats['gf'].append(row['fulltime_away'])
    h_stats['ga'].append(row['fulltime_away'])
    a_stats['ga'].append(row['fulltime_home'])

    # Update Last Date Played
    h_stats['last_date'] = match_date
    a_stats['last_date'] = match_date

# Add features to DF (each list has one entry per row, in row order)
for k, v in features.items():
    df[k] = v

# 4. INTERACTION FEATURES (Comparisons)
df['PPG_Diff'] = df['Home_Season_PPG'] - df['Away_Season_PPG']
df['Weighted_Form_Diff'] = df['Home_Weighted_Form'] - df['Away_Weighted_Form']
df['Rest_Diff'] = df['Home_Rest_Days'] - df['Away_Rest_Days']  # Is one team more tired?

# 5. ENCODE
# One encoder shared by both team columns so a team gets the same ID home and away.
le = LabelEncoder()
all_teams = pd.concat([df['home_team'], df['away_team']]).unique()
le.fit(all_teams)
df['HomeTeam_ID'] = le.transform(df['home_team'])
df['AwayTeam_ID'] = le.transform(df['away_team'])

# Keep the target encoder instead of discarding it: its classes_ are the only
# record of which integer stands for which outcome label, and any code that
# turns predicted integers back into text depends on that mapping.
outcome_encoder = LabelEncoder()
y = outcome_encoder.fit_transform(df['match_outcome'])
print("Outcome classes:", {i: str(c) for i, c in enumerate(outcome_encoder.classes_)})

# 6. SELECT FINAL FEATURES
X = df[[
    'HomeTeam_ID', 'AwayTeam_ID',
    'Home_Season_PPG', 'Away_Season_PPG', 'PPG_Diff',
    'Home_Weighted_Form', 'Away_Weighted_Form', 'Weighted_Form_Diff',
    'Home_Rest_Days', 'Away_Rest_Days', 'Rest_Diff',
    'Home_Win_Streak', 'Away_Win_Streak',
    'Home_Attack_Avg', 'Away_Attack_Avg',
    'Home_Defense_Avg', 'Away_Defense_Avg'
]]

# 7. SPLIT DATA
# Use last 20% of the rows as test set (positional split, no shuffling)
split_idx = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# ============================================================
# 8. GRID SEARCH (CPU PARALLELIZED)
# ============================================================
print(f"Starting Grid Search on {len(X_train)} rows...")
start_time = time.time()

# Base model. n_jobs=1 here because parallelism is handled by GridSearchCV below.
xgb = XGBClassifier(objective='multi:softmax', num_class=3, random_state=42, n_jobs=1)

# Optimized Grid (Focused on the range that worked for you before)
param_grid = {
    'n_estimators': [200, 300, 500],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.03],   # Low LR is usually best for sports
    'subsample': [0.7, 0.8],         # Prevent overfitting
    'colsample_bytree': [0.7, 0.8],
    'min_child_weight': [1, 3]
}

# TimeSeriesSplit prevents looking into the future during validation
cv = TimeSeriesSplit(n_splits=5)

grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,  # Uses ALL CPU cores
    verbose=1
)

grid.fit(X_train, y_train)

print(f"Grid Search finished in {time.time() - start_time:.1f} seconds")
print(f"Best Params: {grid.best_params_}")

# 9. FINAL EVALUATION
# Score the best configuration found by the grid search on the held-out rows.
best_model = grid.best_estimator_
preds = best_model.predict(X_test)
acc = accuracy_score(y_test, preds)

print("-------------------------------")
print(f"FINAL ACCURACY: {acc:.2%}")
print("-------------------------------")

# Feature Importance (top 5)
imp = pd.DataFrame({'Feature': X.columns, 'Score': best_model.feature_importances_})
print(imp.sort_values(by='Score', ascending=False).head(5))

Generating features (Rest Days, Weighted Form, Streaks)...
Starting Grid Search on 1552 rows...
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Grid Search finished in 30.9 seconds
Best Params: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200, 'subsample': 0.7}
-------------------------------
FINAL ACCURACY: 52.44%
-------------------------------
               Feature     Score
4             PPG_Diff  0.167583
2      Home_Season_PPG  0.080039
3      Away_Season_PPG  0.066357
7   Weighted_Form_Diff  0.057318
10           Rest_Diff  0.056186


In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

# Class probabilities for the hold-out matches. The largest probability in each
# row is used as the model's confidence in its own prediction.
probs = best_model.predict_proba(X_test)
results = pd.DataFrame({
    'Actual': y_test,
    'Pred': preds,
    'Conf': probs.max(axis=1)  # Max probability (Confidence)
})

print(f"{'Threshold':<10} | {'Matches':<8} | {'Accuracy':<10} | {'Status'}")
print("-" * 50)

# Accuracy restricted to predictions whose confidence exceeds each cut-off.
# NOTE(review): these cut-offs are assessed on the same hold-out rows as the
# headline accuracy, and the highest ones keep very few matches, so treat the
# resulting percentages as noisy.
for threshold in (0.40, 0.45, 0.50, 0.55, 0.60):
    subset = results.loc[results['Conf'] > threshold]

    if subset.empty:
        print(f"{threshold:.2f}       | 0        | N/A        | -")
        continue

    acc = accuracy_score(subset['Actual'], subset['Pred'])

    # Grade the accuracy band this cut-off falls into
    if acc > 0.60:
        status = "EXCELLENT"
    elif acc > 0.55:
        status = "GOOD"
    elif acc > 0.50:
        status = "OK"
    else:
        status = "POOR"

    print(f"{threshold:.2f}       | {len(subset):<8} | {acc:.2%}     | {status}")

Threshold  | Matches  | Accuracy   | Status
--------------------------------------------------
0.40       | 297      | 56.57%     | GOOD
0.45       | 205      | 61.46%     | EXCELLENT
0.50       | 118      | 62.71%     | EXCELLENT
0.55       | 66       | 68.18%     | EXCELLENT
0.60       | 12       | 83.33%     | EXCELLENT


In [7]:
import joblib

# Bundle the trained model together with the objects kept for later predictions.
model_package = dict(
    model=best_model,
    le_team=le,             # Team ID encoder
    team_stats=team_stats,  # The history of every team (Crucial for next match)
    features=features,      # Feature lists
)

# Write the bundle to disk
# NOTE(review): the "83acc" in the file name presumably refers to the 0.60
# confidence row of the threshold table, which covered only 12 matches - confirm.
joblib.dump(model_package, 'football_sniper_model_83acc.pkl')
print("Model saved successfully as 'football_sniper_model_83acc.pkl'")

Model saved successfully as 'football_sniper_model_83acc.pkl'


In [None]:
# =========================================================
#  PREDICT FUTURE MATCHES (PRODUCTION TOOL)
# =========================================================
import pandas as pd
import numpy as np

def predict_upcoming_games(match_list, model, team_stats, encoder, feature_columns=None):
    """
    Predict upcoming fixtures from each team's recorded history and print a summary table.

    match_list: List of dicts [{'Date': '2025-05-12', 'Home': 'Arsenal', 'Away': 'Aston Villa'}, ...]
    model: fitted classifier exposing predict() and predict_proba().
    team_stats: dict mapping team name -> record with keys
        'pts', 'gf', 'ga', 'last_date' and 'streak'.
    encoder: fitted team-name encoder exposing transform().
    feature_columns: optional ordered list of the column names the model was
        trained on. When omitted, model.feature_names_in_ is used, so the
        function does not depend on any notebook-global variable.

    Returns a DataFrame with columns Home, Away, Prediction, Confidence, Status
    containing one row per fixture that could be scored. Fixtures whose teams
    are missing from team_stats or unknown to the encoder are reported and skipped.

    Raises ValueError when the training column order cannot be determined.
    """
    result_columns = ['Home', 'Away', 'Prediction', 'Confidence', 'Status']
    predictions = []

    # Resolve the training column order up front.
    if feature_columns is None:
        feature_columns = getattr(model, 'feature_names_in_', None)
    if feature_columns is None:
        raise ValueError(
            "Cannot determine the training column order: pass feature_columns "
            "or use a model that exposes feature_names_in_"
        )
    feature_columns = list(feature_columns)

    # Map class to string.
    # NOTE(review): assumes the target was label-encoded so that 0/1/2 mean
    # away win / draw / home win - confirm against the training target encoder.
    outcomes = {0: 'Away Win', 1: 'Draw', 2: 'Home Win'}

    # Helper functions (defined here so the function is self-contained)
    def get_weighted_form(history_pts, window=5):
        if not history_pts: return 0
        relevant = history_pts[-window:]
        weights = range(1, len(relevant) + 1)
        return sum(p * w for p, w in zip(relevant, weights)) / sum(weights)

    def get_rolling_avg(history_list, window=5):
        if not history_list: return 0
        return sum(history_list[-window:]) / min(len(history_list), window)

    def get_rest_days(stats, kickoff, team):
        # No recorded match yet: assume 7 days rest.
        if stats['last_date'] is None:
            return 7
        days = (kickoff - stats['last_date']).days
        if days < 0:
            # Fixture is dated before the team's last recorded match. Negative rest
            # is meaningless, so clamp to 0 and tell the user.
            print(f"Warning: {team} has a recorded match after {kickoff.date()}; rest days set to 0")
            return 0
        # Cap at 14 days
        return min(days, 14)

    print(f"{'Matchup':<40} | {'Pred':<10} | {'Conf':<6} | {'Status'}")
    print("-" * 80)

    for match in match_list:
        home = match['Home']
        away = match['Away']
        date = pd.to_datetime(match['Date'], utc=True)

        # 1. CHECK IF TEAMS EXIST
        if home not in team_stats or away not in team_stats:
            print(f"{home} vs {away}: Team not found in training data (Skipping)")
            continue

        h_stats = team_stats[home]
        a_stats = team_stats[away]

        # 2. CALCULATE LIVE FEATURES
        features = {
            'Home_Season_PPG': np.mean(h_stats['pts']) if h_stats['pts'] else 1.35,
            'Away_Season_PPG': np.mean(a_stats['pts']) if a_stats['pts'] else 1.35,
            'Home_Weighted_Form': get_weighted_form(h_stats['pts']),
            'Away_Weighted_Form': get_weighted_form(a_stats['pts']),
            'Home_Rest_Days': get_rest_days(h_stats, date, home),
            'Away_Rest_Days': get_rest_days(a_stats, date, away),
            'Home_Win_Streak': h_stats['streak'],
            'Away_Win_Streak': a_stats['streak'],
            'Home_Attack_Avg': get_rolling_avg(h_stats['gf']),
            'Away_Attack_Avg': get_rolling_avg(a_stats['gf']),
            'Home_Defense_Avg': get_rolling_avg(h_stats['ga']),
            'Away_Defense_Avg': get_rolling_avg(a_stats['ga'])
        }

        # Interactions
        features['PPG_Diff'] = features['Home_Season_PPG'] - features['Away_Season_PPG']
        features['Weighted_Form_Diff'] = features['Home_Weighted_Form'] - features['Away_Weighted_Form']
        features['Rest_Diff'] = features['Home_Rest_Days'] - features['Away_Rest_Days']

        # Encode Teams
        # A name the encoder has never seen is signalled with ValueError; only that
        # case is skipped, any other exception is a real error and propagates.
        try:
            features['HomeTeam_ID'] = encoder.transform([home])[0]
            features['AwayTeam_ID'] = encoder.transform([away])[0]
        except ValueError as exc:
            print(f"Error encoding {home} or {away}: {exc}")
            continue

        # Single-row frame with the columns in the exact order used for training
        row = pd.DataFrame([features])[feature_columns]

        # 3. PREDICT
        pred_class = int(model.predict(row)[0])
        probs = model.predict_proba(row)[0]
        confidence = np.max(probs)
        result_str = outcomes[pred_class]

        # 4. FILTER BY YOUR SWEET SPOT
        status = "IGNORE"
        if confidence > 0.45: status = "BET"      # The 61% Acc Zone
        if confidence > 0.55: status = "HIGH BET" # The 68% Acc Zone

        # Column widths match the header line printed above
        matchup = f"{home} vs {away}"
        print(f"{matchup:<40} | {result_str:<10} | {confidence:<6.2f} | {status}")

        predictions.append({
            'Home': home, 'Away': away, 'Prediction': result_str,
            'Confidence': confidence, 'Status': status
        })

    # Fixed column list so an all-skipped run still returns a well-formed frame
    return pd.DataFrame(predictions, columns=result_columns)

# ==========================================
# EXAMPLE USAGE
# ==========================================

# Upcoming matches to score, written as (home, away) pairs that all share one date.
# IMPORTANT: team names must be spelled exactly as they appear in the CSV;
# fixtures with a name missing from team_stats are reported and skipped.
next_fixtures = [
    {'Date': '2025-02-22', 'Home': home_team, 'Away': away_team}
    for home_team, away_team in (
        ('Manchester City', 'Chelsea'),
        ('Arsenal', 'Everton'),
        ('Liverpool', 'Newcastle United'),
        ('Real Madrid', 'Barcelona'),
        ('Bayern Munich', 'Dortmund'),
    )
]

# Run prediction
# Note: 'best_model', 'team_stats', 'le' must be in memory from your previous training run
print("\nPREDICTING NEXT FIXTURES:\n")
preds_df = predict_upcoming_games(
    match_list=next_fixtures,
    model=best_model,
    team_stats=team_stats,
    encoder=le,
)