In [2]:
import xgboost
# Print the dictionary returned by xgboost.build_info(), i.e. the build
# configuration of the XGBoost library installed in this environment.
print(xgboost.build_info())

{'BUILTIN_PREFETCH_PRESENT': True, 'CUDA_VERSION': [12, 8], 'DEBUG': False, 'GCC_VERSION': [10, 3, 1], 'GLIBC_VERSION': [2, 28], 'MM_PREFETCH_PRESENT': True, 'NCCL_VERSION': [2, 27, 7], 'THRUST_VERSION': [2, 7, 0], 'USE_CUDA': True, 'USE_DLOPEN_NCCL': True, 'USE_FEDERATED': True, 'USE_NCCL': True, 'USE_NVCOMP': False, 'USE_OPENMP': True, 'USE_RMM': False, 'libxgboost': '/usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so'}


In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# ==========================================
# 1. LOAD & CLEAN DATA
# ==========================================
# Point this at your own match file.
df = pd.read_csv("match_data copy.csv")

# --- CLEANING THE NEW COLUMNS ---
# 1. Possession: strip the trailing "%" and rescale, e.g. "70%" -> 0.70.
for side in ('home', 'away'):
    pct_text = df[f'{side}_team_possession'].astype(str).str.rstrip('%')
    df[f'{side}_possession'] = pct_text.astype(float) / 100.0

# 2. Attendance: remove thousands separators ("68,407" -> 68407). Whatever
#    still fails to parse (missing values included) is replaced by 20000.
attendance_text = df['attendance'].astype(str).str.replace(',', '')
df['attendance'] = pd.to_numeric(attendance_text, errors='coerce').fillna(20000)

# 3. Date handling: 'date_utc' wins (parsed as UTC), then 'Date' (parsed
#    as-is), and the frame is sorted chronologically. With neither column
#    present the file order is trusted and the index stands in for the date.
date_source = next((name for name in ('date_utc', 'Date') if name in df.columns), None)
if date_source is None:
    df['match_date'] = df.index
else:
    df['match_date'] = pd.to_datetime(df[date_source], utc=(date_source == 'date_utc'))
    df = df.sort_values('match_date')

# ==========================================
# 2. FEATURE ENGINEERING (GLOBAL LOOP)
# ==========================================
# Running history per team, keyed by team name (filled by the match loop).
team_stats = {}

# One empty list per engineered column. Keys are generated metric by metric
# (form points, xG created, xG allowed, possession), home side before away.
features = {
    f'{side}_{metric}': []
    for metric in ('Form_Pts', 'xG_Avg', 'xGA_Avg', 'Poss_Avg')
    for side in ('Home', 'Away')
}

def get_rolling_avg(history, window=5):
    """Average the most recent `window` entries of `history`.

    An empty history yields 0. When fewer than `window` entries exist, the
    average is taken over the entries that are available.
    """
    if history:
        recent = history[-window:]
        return sum(recent) / min(len(history), window)
    return 0

print("Generating Advanced xG & Possession Features...")

# (feature-column suffix, team-history key) pairs, in the order they are filled.
tracked_metrics = (
    ('Form_Pts', 'pts'),
    ('xG_Avg', 'xg'),
    ('xGA_Avg', 'xga'),
    ('Poss_Avg', 'poss'),
)

for row_label, match in df.iterrows():
    home = match['home_team_name']
    away = match['away_team_name']

    # A team seen for the first time starts with empty histories.
    for club in (home, away):
        if club not in team_stats:
            team_stats[club] = {'pts': [], 'xg': [], 'xga': [], 'poss': []}

    h_stats, a_stats = team_stats[home], team_stats[away]

    # --- A. INPUT FEATURES ---
    # Read the rolling averages BEFORE this match is added to the histories,
    # so each row only sees information from earlier matches.
    for suffix, key in tracked_metrics:
        features[f'Home_{suffix}'].append(get_rolling_avg(h_stats[key]))
        features[f'Away_{suffix}'].append(get_rolling_avg(a_stats[key]))

    # --- B. UPDATE HISTORY ---
    # League points: 3 for a win, 1 each for a draw, 0 for a loss.
    # NOTE(review): a row where neither comparison holds (e.g. a missing
    # score) falls through to the last branch and is recorded as an away
    # win -- confirm the data contains no unplayed fixtures.
    if match['home_team_score'] > match['away_team_score']:
        h_pts, a_pts = 3, 0
    elif match['home_team_score'] == match['away_team_score']:
        h_pts, a_pts = 1, 1
    else:
        h_pts, a_pts = 0, 3

    home_xg, away_xg = match['home_team_xg'], match['away_team_xg']

    # xGA is the opponent's xG: what a side allowed is what the other created.
    for key, home_value, away_value in (
        ('pts', h_pts, a_pts),
        ('xg', home_xg, away_xg),
        ('xga', away_xg, home_xg),
        ('poss', match['home_possession'], match['away_possession']),
    ):
        h_stats[key].append(home_value)
        a_stats[key].append(away_value)

# Attach every collected feature list to the frame as a new column.
for column_name, values in features.items():
    df[column_name] = values

# ==========================================
# 3. INTERACTION FEATURES (Comparisons)
# ==========================================
# Difference columns, each computed as left-hand average minus right-hand average:
#   - a side's attack (xG created) against the opponent's defence (xG allowed)
#   - home possession against away possession
for new_column, left, right in (
    ('Home_xG_vs_Away_Def', 'Home_xG_Avg', 'Away_xGA_Avg'),
    ('Away_xG_vs_Home_Def', 'Away_xG_Avg', 'Home_xGA_Avg'),
    ('Possession_Diff', 'Home_Poss_Avg', 'Away_Poss_Avg'),
):
    df[new_column] = df[left] - df[right]

# ==========================================
# 4. PREPARE TRAINING DATA
# ==========================================
# Target encoding: 2 = home win, 1 = draw, 0 = away win.
# Rows matching none of the three conditions receive np.select's default, 0.
home_score_col = df['home_team_score']
away_score_col = df['away_team_score']
conditions = [
    home_score_col > away_score_col,
    home_score_col == away_score_col,
    home_score_col < away_score_col,
]
df['match_outcome'] = np.select(conditions, [2, 1, 0])

# Team encoding: a single encoder fitted on every name that appears on either
# side, so home and away IDs share one vocabulary.
all_teams = pd.concat([df['home_team_name'], df['away_team_name']]).unique()
le = LabelEncoder().fit(all_teams)
for id_column, name_column in (
    ('HomeTeam_ID', 'home_team_name'),
    ('AwayTeam_ID', 'away_team_name'),
):
    df[id_column] = le.transform(df[name_column])

# Model inputs (the column order here is the order the model is trained on).
X = df.loc[:, [
    'HomeTeam_ID', 'AwayTeam_ID',
    'Home_Form_Pts', 'Away_Form_Pts',
    'Home_xG_Avg', 'Away_xG_Avg',
    'Home_xGA_Avg', 'Away_xGA_Avg',
    'Home_Poss_Avg', 'Away_Poss_Avg',
    'Home_xG_vs_Away_Def', 'Away_xG_vs_Home_Def',
    'Possession_Diff'
]]
y = df['match_outcome']

# Time split: the first 85% of rows train the model, the remainder tests it.
split = int(len(df) * 0.85)
X_train, y_train = X.iloc[:split], y.iloc[:split]
X_test, y_test = X.iloc[split:], y.iloc[split:]

# ==========================================
# 5. TRAIN MODEL (Optimized Params)
# ==========================================
print(f"Training on {len(X_train)} matches...")

# Hyper-parameters gathered in one mapping so they can be read at a glance.
xgb_settings = {
    'n_estimators': 300,
    'max_depth': 4,            # Depth 4 captures the xG interactions well
    'learning_rate': 0.02,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'multi:softmax',
    'num_class': 3,
    'random_state': 42,
    'n_jobs': -1,
}

bst = XGBClassifier(**xgb_settings)
bst.fit(X_train, y_train)

# ==========================================
# 6. EVALUATE
# ==========================================
# Class probabilities and hard predictions for the held-out matches.
probs = bst.predict_proba(X_test)
preds = bst.predict(X_test)
acc = accuracy_score(y_test, preds)

divider = "-------------------------------"
print(divider)
print(f"ACCURACY: {acc:.2%}")
print(divider)

# High Confidence Filter: keep only predictions whose top class probability
# exceeds 0.55 and report accuracy on that subset, if it is non-empty.
results = pd.DataFrame({'Actual': y_test, 'Pred': preds, 'Conf': probs.max(axis=1)})
high_conf = results.loc[results['Conf'] > 0.55]
if not high_conf.empty:
    high_conf_acc = accuracy_score(high_conf['Actual'], high_conf['Pred'])
    print(f"High Confidence Accuracy (>55%): {high_conf_acc:.2%} (on {len(high_conf)} games)")

# ==========================================
# 7. PREDICTION TOOL (NEXT MATCHES)
# ==========================================
def predict_match(home_name, away_name):
    """Predict one fixture from the teams' current rolling statistics.

    Reads the notebook-level `team_stats`, `le` and `bst`. Returns the string
    "Teams not found in history" when either team has no entry in
    `team_stats`; otherwise returns a one-line summary containing the
    predicted outcome and the probability of that outcome.
    """
    if not (home_name in team_stats and away_name in team_stats):
        return "Teams not found in history"

    # Each rolling average is computed once per team and reused below.
    stat_keys = ('pts', 'xg', 'xga', 'poss')
    home_avg = {key: get_rolling_avg(team_stats[home_name][key]) for key in stat_keys}
    away_avg = {key: get_rolling_avg(team_stats[away_name][key]) for key in stat_keys}
    home_id, away_id = le.transform([home_name, away_name])

    # Single-row frame, keys in the same order as the training columns.
    row = pd.DataFrame([{
        'HomeTeam_ID': home_id,
        'AwayTeam_ID': away_id,
        'Home_Form_Pts': home_avg['pts'],
        'Away_Form_Pts': away_avg['pts'],
        'Home_xG_Avg': home_avg['xg'],
        'Away_xG_Avg': away_avg['xg'],
        'Home_xGA_Avg': home_avg['xga'],
        'Away_xGA_Avg': away_avg['xga'],
        'Home_Poss_Avg': home_avg['poss'],
        'Away_Poss_Avg': away_avg['poss'],
        # Interactions
        'Home_xG_vs_Away_Def': home_avg['xg'] - away_avg['xga'],
        'Away_xG_vs_Home_Def': away_avg['xg'] - home_avg['xga'],
        'Possession_Diff': home_avg['poss'] - away_avg['poss'],
    }])

    # The most probable class is the prediction; its probability is the confidence.
    prob = bst.predict_proba(row)[0]
    pred = np.argmax(prob)
    conf = prob.max()

    labels = {0: 'Away Win', 1: 'Draw', 2: 'Home Win'}

    return f"{home_name} vs {away_name}: {labels[pred]} ({conf:.1%} confidence)"

print("\n--- TEST PREDICTIONS ---")
# Smoke test: show up to five known team names and predict a fixture
# between the first two of them.
teams_list = list(team_stats)[:5]
print(f"Available teams example: {teams_list}")

if len(teams_list) > 1:
    print(predict_match(*teams_list[:2]))

Generating features (Rest Days, Weighted Form, Streaks)...
Starting Grid Search on 1552 rows...
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Grid Search finished in 30.9 seconds
Best Params: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200, 'subsample': 0.7}
-------------------------------
FINAL ACCURACY: 52.44%
-------------------------------
               Feature     Score
4             PPG_Diff  0.167583
2      Home_Season_PPG  0.080039
3      Away_Season_PPG  0.066357
7   Weighted_Form_Diff  0.057318
10           Rest_Diff  0.056186


In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

# Score the held-out matches with `best_model`.
# The hard predictions are recomputed here from the same model that supplies
# the probabilities. Reusing a `preds` left in memory by an earlier cell can
# pair 'Pred' from one model with 'Conf' from another.
# NOTE(review): `best_model` is not assigned in the cells shown in this
# notebook; it has to be in memory already (presumably from a grid-search
# run) -- confirm.
probs = best_model.predict_proba(X_test)
preds = best_model.predict(X_test)
results = pd.DataFrame({
    'Actual': y_test,
    'Pred': preds,
    'Conf': np.max(probs, axis=1) # Max probability (Confidence)
})

print(f"{'Threshold':<10} | {'Matches':<8} | {'Accuracy':<10} | {'Status'}")
print("-" * 50)

# For each confidence cut-off, keep only predictions above it and report how
# many matches remain and how accurate the model is on them.
for threshold in [0.40, 0.45, 0.50, 0.55, 0.60]:
    subset = results[results['Conf'] > threshold]

    if len(subset) > 0:
        acc = accuracy_score(subset['Actual'], subset['Pred'])

        # Label the accuracy band, highest first.
        if acc > 0.60:
            status = "EXCELLENT"
        elif acc > 0.55:
            status = "GOOD"
        elif acc > 0.50:
            status = "OK"
        else:
            status = "POOR"

        print(f"{threshold:.2f}       | {len(subset):<8} | {acc:.2%}     | {status}")
    else:
        print(f"{threshold:.2f}       | 0        | N/A        | -")

Threshold  | Matches  | Accuracy   | Status
--------------------------------------------------
0.40       | 297      | 56.57%     | GOOD
0.45       | 205      | 61.46%     | EXCELLENT
0.50       | 118      | 62.71%     | EXCELLENT
0.55       | 66       | 68.18%     | EXCELLENT
0.60       | 12       | 83.33%     | EXCELLENT


In [7]:
import joblib

# Bundle the objects the prediction code reads: the fitted model, the team
# name encoder, the per-team history and the `features` object.
model_package = dict(
    model=best_model,
    le_team=le,             # Team ID encoder
    team_stats=team_stats,  # The history of every team (Crucial for next match)
    features=features,      # Feature lists
)

# Persist the bundle to disk.
# NOTE(review): the "83acc" in the filename appears to echo the 83.33% figure
# that the threshold table reports for only 12 matches above 0.60 confidence;
# it is not the overall test accuracy -- consider a less misleading name.
output_path = 'football_sniper_model_83acc.pkl'
joblib.dump(model_package, output_path)
print(f"Model saved successfully as '{output_path}'")

Model saved successfully as 'football_sniper_model_83acc.pkl'


In [8]:
# =========================================================
#  PREDICT FUTURE MATCHES (PRODUCTION TOOL)
# =========================================================
import pandas as pd
import numpy as np

def predict_upcoming_games(match_list, model, team_stats, encoder):
    """Predict upcoming fixtures, print a summary table and return the rows.

    Args:
        match_list: list of dicts such as
            [{'Date': '2025-05-12', 'Home': 'Arsenal', 'Away': 'Aston Villa'}, ...].
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
            Predicted classes are mapped as 0 = Away Win, 1 = Draw,
            2 = Home Win. When the model exposes ``feature_names_in_`` that
            order is used for the input columns; otherwise the notebook-level
            ``X_train.columns`` is used.
        team_stats: dict of team name -> history dict. This function reads
            the keys 'pts', 'gf', 'ga' (lists), 'streak' and 'last_date'.
        encoder: object whose ``transform([name])`` returns the team ID and
            raises ``ValueError`` for an unknown name (as scikit-learn's
            LabelEncoder does).

    Returns:
        pandas.DataFrame with columns Home, Away, Prediction, Confidence and
        Status, one row per fixture that could be scored. Fixtures with a
        team missing from ``team_stats`` or unknown to ``encoder`` are
        reported on stdout and skipped.
    """
    predictions = []
    outcomes = {0: 'Away Win', 1: 'Draw', 2: 'Home Win'}

    # Helper functions are defined locally so the function does not depend on
    # which notebook cells have been run.
    def get_weighted_form(history_pts, window=5):
        """Weighted mean of the last `window` results, newest weighted most."""
        if not history_pts:
            return 0
        relevant = history_pts[-window:]
        weights = range(1, len(relevant) + 1)
        return sum(p * w for p, w in zip(relevant, weights)) / sum(weights)

    def get_rolling_avg(history_list, window=5):
        """Plain mean of the last `window` values (0 for an empty history)."""
        if not history_list:
            return 0
        return sum(history_list[-window:]) / min(len(history_list), window)

    def get_rest_days(fixture_date, last_date):
        """Whole days since `last_date`; 7 when no previous match is recorded.

        `fixture_date` is timezone-aware (UTC), so a stored `last_date` must
        be timezone-aware as well or the subtraction raises TypeError.
        """
        return (fixture_date - last_date).days if last_date else 7

    print(f"{'Matchup':<40} | {'Pred':<10} | {'Conf':<6} | {'Status'}")
    print("-" * 80)

    for match in match_list:
        home = match['Home']
        away = match['Away']
        date = pd.to_datetime(match['Date'], utc=True)

        # 1. CHECK IF TEAMS EXIST
        if home not in team_stats or away not in team_stats:
            print(f"{home} vs {away}: Team not found in training data (Skipping)")
            continue

        h_stats = team_stats[home]
        a_stats = team_stats[away]

        # 2. CALCULATE LIVE FEATURES
        h_rest = get_rest_days(date, h_stats['last_date'])
        a_rest = get_rest_days(date, a_stats['last_date'])

        feature_row = {
            # 1.35 is the fallback points-per-game for a team without results.
            'Home_Season_PPG': np.mean(h_stats['pts']) if h_stats['pts'] else 1.35,
            'Away_Season_PPG': np.mean(a_stats['pts']) if a_stats['pts'] else 1.35,
            'Home_Weighted_Form': get_weighted_form(h_stats['pts']),
            'Away_Weighted_Form': get_weighted_form(a_stats['pts']),
            # Rest is capped at 14 days.
            'Home_Rest_Days': min(h_rest, 14),
            'Away_Rest_Days': min(a_rest, 14),
            'Home_Win_Streak': h_stats['streak'],
            'Away_Win_Streak': a_stats['streak'],
            'Home_Attack_Avg': get_rolling_avg(h_stats['gf']),
            'Away_Attack_Avg': get_rolling_avg(a_stats['gf']),
            'Home_Defense_Avg': get_rolling_avg(h_stats['ga']),
            'Away_Defense_Avg': get_rolling_avg(a_stats['ga'])
        }

        # Interactions
        feature_row['PPG_Diff'] = feature_row['Home_Season_PPG'] - feature_row['Away_Season_PPG']
        feature_row['Weighted_Form_Diff'] = feature_row['Home_Weighted_Form'] - feature_row['Away_Weighted_Form']
        feature_row['Rest_Diff'] = feature_row['Home_Rest_Days'] - feature_row['Away_Rest_Days']

        # Encode teams. Only the "unknown label" error is treated as a
        # skippable condition; any other failure is a real bug and propagates.
        try:
            feature_row['HomeTeam_ID'] = encoder.transform([home])[0]
            feature_row['AwayTeam_ID'] = encoder.transform([away])[0]
        except ValueError:
            print(f"Error encoding {home} or {away}")
            continue

        # Column order must match training EXACTLY. Prefer the order the
        # model itself recorded at fit time; fall back to the notebook-level
        # X_train only when the model does not expose it.
        columns = getattr(model, 'feature_names_in_', None)
        if columns is None:
            columns = X_train.columns
        row = pd.DataFrame([feature_row])[list(columns)]

        # 3. PREDICT
        pred_class = model.predict(row)[0]
        probs = model.predict_proba(row)[0]
        confidence = np.max(probs)
        result_str = outcomes[int(pred_class)]

        # 4. FILTER BY YOUR SWEET SPOT
        status = "IGNORE"
        if confidence > 0.45: status = "BET"      # The 61% Acc Zone
        if confidence > 0.55: status = "HIGH BET" # The 68% Acc Zone

        # Pad the whole matchup to the header's 40-character column so the
        # table lines up regardless of team-name length.
        matchup = f"{home} vs {away}"
        print(f"{matchup:<40} | {result_str:<10} | {confidence:<6.2f} | {status}")

        predictions.append({
            'Home': home, 'Away': away, 'Prediction': result_str,
            'Confidence': confidence, 'Status': status
        })

    return pd.DataFrame(predictions)

# ==========================================
# EXAMPLE USAGE
# ==========================================

# Define some upcoming matches (Use real names from your dataset!)
# IMPORTANT: Use 'Home' and 'Away' names exactly as they appear in the CSV
# Every example fixture shares one date; only the pairing changes.
next_fixtures = [
    {'Date': '2025-02-22', 'Home': home_side, 'Away': away_side}
    for home_side, away_side in (
        ('Manchester City', 'Chelsea'),
        ('Arsenal', 'Everton'),
        ('Liverpool', 'Newcastle United'),
        ('Real Madrid', 'Barcelona'),
        ('Bayern Munich', 'Dortmund'),
    )
]

# Score the fixtures.
# `best_model`, `team_stats` and `le` have to be in memory from the training run.
print("\nPREDICTING NEXT FIXTURES:\n")
preds_df = predict_upcoming_games(next_fixtures, best_model, team_stats, le)


PREDICTING NEXT FIXTURES:

Matchup                                  | Pred       | Conf   | Status
--------------------------------------------------------------------------------
Manchester City vs Chelsea: Team not found in training data (Skipping)
Arsenal vs Everton: Team not found in training data (Skipping)
Liverpool vs Newcastle United: Team not found in training data (Skipping)
Real Madrid vs Barcelona: Team not found in training data (Skipping)
Bayern Munich vs Dortmund: Team not found in training data (Skipping)
