In [44]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import warnings

warnings.filterwarnings('ignore')

# ==========================================
# 1. LOAD & CLEAN DATA
# ==========================================
# Replace with your actual filename
df = pd.read_csv("match_data.csv") 

# --- CLEANING THE NEW COLUMNS ---
# 1. Possession: "70%" -> 0.70
df['home_possession'] = pd.to_numeric(df['home_team_possession'].astype(str).str.rstrip('%'), errors='coerce') / 100.0
df['away_possession'] = pd.to_numeric(df['away_team_possession'].astype(str).str.rstrip('%'), errors='coerce') / 100.0
# If possession is missing, assume 50% (0.50)
df['home_possession'] = df['home_possession'].fillna(0.50)
df['away_possession'] = df['away_possession'].fillna(0.50)

# 2. Attendance: "68,407" -> 68407 (Handle NaNs)
df['attendance'] = df['attendance'].astype(str).str.replace(',', '')
df['attendance'] = pd.to_numeric(df['attendance'], errors='coerce').fillna(20000)

# 3. Date Handling
# If you have a date column, use it. If not, we assume the file is sorted by time.
if 'date_utc' in df.columns:
    df['match_date'] = pd.to_datetime(df['date_utc'], utc=True)
    df = df.sort_values('match_date')
elif 'Date' in df.columns:
    df['match_date'] = pd.to_datetime(df['Date'])
    df = df.sort_values('match_date')
else:
    # Create a dummy date based on index if missing (Assumes file is sorted)
    df['match_date'] = df.index

# ==========================================
# 2. FEATURE ENGINEERING (GLOBAL LOOP)
# ==========================================
team_stats = {} 

# We will track rolling averages for these metrics
features = {
    'Home_Form_Pts': [], 'Away_Form_Pts': [],
    'Home_xG_Avg': [],   'Away_xG_Avg': [],    # Attack Quality
    'Home_xGA_Avg': [],  'Away_xGA_Avg': [],   # Defense Quality (xG Allowed)
    'Home_Poss_Avg': [], 'Away_Poss_Avg': []   # Control
}

def get_rolling_avg(history, window=5):
    if not history: return 0
    return sum(history[-window:]) / min(len(history), window)

print("Generating Advanced xG & Possession Features...")

for index, row in df.iterrows():
    # Use the new column names
    home = row['home_team_name']
    away = row['away_team_name']
    
    # Initialize team history if new
    if home not in team_stats: 
        team_stats[home] = {'pts': [], 'xg': [], 'xga': [], 'poss': []}
    if away not in team_stats: 
        team_stats[away] = {'pts': [], 'xg': [], 'xga': [], 'poss': []}
    
    h_stats = team_stats[home]
    a_stats = team_stats[away]
    
    # --- A. CALCULATE INPUT FEATURES (Before Match) ---
    features['Home_Form_Pts'].append(get_rolling_avg(h_stats['pts']))
    features['Away_Form_Pts'].append(get_rolling_avg(a_stats['pts']))
    
    features['Home_xG_Avg'].append(get_rolling_avg(h_stats['xg']))
    features['Away_xG_Avg'].append(get_rolling_avg(a_stats['xg']))
    
    features['Home_xGA_Avg'].append(get_rolling_avg(h_stats['xga']))
    features['Away_xGA_Avg'].append(get_rolling_avg(a_stats['xga']))
    
    features['Home_Poss_Avg'].append(get_rolling_avg(h_stats['poss']))
    features['Away_Poss_Avg'].append(get_rolling_avg(a_stats['poss']))
    
    # --- B. UPDATE HISTORY (After Match) ---
    # Points
    if row['home_team_score'] > row['away_team_score']:
        h_pts, a_pts = 3, 0
    elif row['home_team_score'] == row['away_team_score']:
        h_pts, a_pts = 1, 1
    else:
        h_pts, a_pts = 0, 3
        
    # Append Stats
    h_stats['pts'].append(h_pts)
    a_stats['pts'].append(a_pts)
    
    # xG (Expected Goals)
    h_stats['xg'].append(row['home_team_xg'])
    a_stats['xg'].append(row['away_team_xg'])
    
    # xGA (Expected Goals Allowed) - Crucial for Defense!
    h_stats['xga'].append(row['away_team_xg']) # Home allowed what Away created
    a_stats['xga'].append(row['home_team_xg']) # Away allowed what Home created
    
    # Possession
    h_stats['poss'].append(row['home_possession'])
    a_stats['poss'].append(row['away_possession'])

# Add features to DF
for k, v in features.items():
    df[k] = v

# ==========================================
# 3. INTERACTION FEATURES (Comparisons)
# ==========================================
# Compare Home Attack (xG) vs Away Defense (xGA)
df['Home_xG_vs_Away_Def'] = df['Home_xG_Avg'] - df['Away_xGA_Avg']
df['Away_xG_vs_Home_Def'] = df['Away_xG_Avg'] - df['Home_xGA_Avg']

# Compare Possession Styles
df['Possession_Diff'] = df['Home_Poss_Avg'] - df['Away_Poss_Avg']

# ==========================================
# 4. PREPARE TRAINING DATA
# ==========================================
# Target
# 0: Away Win, 1: Draw, 2: Home Win
conditions = [
    (df['home_team_score'] > df['away_team_score']),
    (df['home_team_score'] == df['away_team_score']),
    (df['home_team_score'] < df['away_team_score'])
]
df['match_outcome'] = np.select(conditions, [2, 1, 0])

# Encode Teams
le = LabelEncoder()
all_teams = pd.concat([df['home_team_name'], df['away_team_name']]).unique()
le.fit(all_teams)
df['HomeTeam_ID'] = le.transform(df['home_team_name'])
df['AwayTeam_ID'] = le.transform(df['away_team_name'])

# Select Features for Model
X = df[[
    'HomeTeam_ID', 'AwayTeam_ID',
    'Home_Form_Pts', 'Away_Form_Pts',
    'Home_xG_Avg', 'Away_xG_Avg',
    'Home_xGA_Avg', 'Away_xGA_Avg',
    'Home_Poss_Avg', 'Away_Poss_Avg',
    'Home_xG_vs_Away_Def', 'Away_xG_vs_Home_Def',
    'Possession_Diff'
]]
y = df['match_outcome']

# Time Split
split = int(len(df) * 0.85)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

# ==========================================
# 5. TRAIN MODEL (Optimized Params)
# ==========================================
print(f"Training on {len(X_train)} matches...")

bst = XGBClassifier(
    n_estimators=100,
    max_depth=5,           # Depth 4 captures the xG interactions well
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    num_class=3,
    random_state=42,
    n_jobs=-1
)

bst.fit(X_train, y_train)

# ==========================================
# 6. EVALUATE
# ==========================================
preds = bst.predict(X_test)
probs = bst.predict_proba(X_test)
acc = accuracy_score(y_test, preds)

print(f"-------------------------------")
print(f"ACCURACY: {acc:.2%}")
print(f"-------------------------------")

# High Confidence Filter
results = pd.DataFrame({'Actual': y_test, 'Pred': preds, 'Conf': np.max(probs, axis=1)})
high_conf = results[results['Conf'] > 0.55] # Filter for strong signals
if len(high_conf) > 0:
    print(f"High Confidence Accuracy (>55%): {accuracy_score(high_conf['Actual'], high_conf['Pred']):.2%} (on {len(high_conf)} games)")

# ==========================================
# 7. PREDICTION TOOL (NEXT MATCHES)
# ==========================================
def predict_match(home_name, away_name):
    if home_name not in team_stats or away_name not in team_stats:
        return "Teams not found in history"
    
    h = team_stats[home_name]
    a = team_stats[away_name]
    
    # Build Feature Row based on history
    row = pd.DataFrame([{
        'HomeTeam_ID': le.transform([home_name])[0],
        'AwayTeam_ID': le.transform([away_name])[0],
        'Home_Form_Pts': get_rolling_avg(h['pts']),
        'Away_Form_Pts': get_rolling_avg(a['pts']),
        'Home_xG_Avg': get_rolling_avg(h['xg']),
        'Away_xG_Avg': get_rolling_avg(a['xg']),
        'Home_xGA_Avg': get_rolling_avg(h['xga']),
        'Away_xGA_Avg': get_rolling_avg(a['xga']),
        'Home_Poss_Avg': get_rolling_avg(h['poss']),
        'Away_Poss_Avg': get_rolling_avg(a['poss']),
        # Interactions
        'Home_xG_vs_Away_Def': get_rolling_avg(h['xg']) - get_rolling_avg(a['xga']),
        'Away_xG_vs_Home_Def': get_rolling_avg(a['xg']) - get_rolling_avg(h['xga']),
        'Possession_Diff': get_rolling_avg(h['poss']) - get_rolling_avg(a['poss'])
    }])
    
    # Predict
    prob = bst.predict_proba(row)[0]
    pred = np.argmax(prob)
    conf = np.max(prob)
    
    labels = {0: 'Away Win', 1: 'Draw', 2: 'Home Win'}
    
    return f"{home_name} vs {away_name}: {labels[pred]} ({conf:.1%} confidence)"

print("\n--- TEST PREDICTIONS ---")
# Replace these names with teams that exist in your CSV
# Example check:
teams_list = list(team_stats.keys())
print(f"Available teams example: {teams_list}")

if len(teams_list) >= 2:
    print(predict_match(teams_list[0], teams_list[1]))

Generating Advanced xG & Possession Features...
Training on 668 matches...
-------------------------------
ACCURACY: 61.02%
-------------------------------
High Confidence Accuracy (>55%): 72.46% (on 69 games)

--- TEST PREDICTIONS ---
Available teams example: ['Real Madrid', 'Osasuna', 'Anderlecht', 'Club Brugge', 'Arsenal', 'West Ham United', 'Newcastle United', 'Liverpool', 'Real Betis', 'Mallorca', 'Girona', 'Espanyol', 'RB Leipzig', 'Eintracht Frankfurt', 'Chelsea', 'Crystal Palace', 'Slavia Prague', 'Ban√≠k Ostrava', 'Olympiacos', 'AEK Athens', 'Barcelona', 'Real Sociedad', 'Tottenham Hotspur', 'Villarreal', 'Atalanta', 'Pisa', 'Getafe', 'Manchester City', 'Bayer Leverkusen', 'Fiorentina', 'Napoli', 'Athletic Club', 'Rayo Vallecano', 'Union SG', 'Charleroi', 'Celta Vigo', 'FC Neman Grodno', 'Hoffenheim', 'Internazionale', 'OH Leuven', 'Bod√∏/Glimt', 'Juventus', 'FK Pardubice', 'Cagliari', 'FC Copenhagen', 'AGF', 'Elche', 'Qarabaƒü', 'Shk√´ndija 79', 'Brentford', 'Dortmund', 'Stut

In [45]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Optional: Show what the model actually learned (Feature Importance)
importance = pd.DataFrame({
    'Feature': X.columns,
    'Score': bst.feature_importances_
}).sort_values(by='Score', ascending=False)

print("Top 5 Key Predictors:")
print(importance.head(5))

# Get probabilities
probs = bst.predict_proba(X_test)
results = pd.DataFrame({
    'Actual': y_test,
    'Pred': preds,
    'Conf': np.max(probs, axis=1) # Max probability (Confidence)
})

print(f"{'Threshold':<10} | {'Matches':<8} | {'Accuracy':<10} | {'Status'}")
print("-" * 50)

# Loop through thresholds from 40% to 60%
for threshold in [0.40, 0.45, 0.50, 0.55, 0.60]:
    # Filter bets with confidence > threshold
    subset = results[results['Conf'] > threshold]
    
    if len(subset) > 0:
        acc = accuracy_score(subset['Actual'], subset['Pred'])
        
        # Determine if this is a good strategy
        status = "POOR"
        if acc > 0.50: status = "OK"
        if acc > 0.55: status = "GOOD"
        if acc > 0.60: status = "EXCELLENT"
        
        print(f"{threshold:.2f}       | {len(subset):<8} | {acc:.2%}     | {status}")
    else:
        print(f"{threshold:.2f}       | 0        | N/A        | -")

Top 5 Key Predictors:
            Feature     Score
12  Possession_Diff  0.191911
2     Home_Form_Pts  0.087780
8     Home_Poss_Avg  0.077441
3     Away_Form_Pts  0.073726
4       Home_xG_Avg  0.071277
Threshold  | Matches  | Accuracy   | Status
--------------------------------------------------
0.40       | 116      | 61.21%     | EXCELLENT
0.45       | 98       | 69.39%     | EXCELLENT
0.50       | 85       | 72.94%     | EXCELLENT
0.55       | 69       | 72.46%     | EXCELLENT
0.60       | 47       | 72.34%     | EXCELLENT


In [None]:
import pandas as pd
import numpy as np

def predict_with_probabilities(matchups, model, team_stats, encoder):
    # Header for the results table
    print(f"\n{'MATCHUP':<40} | {'HOME %':<7} | {'DRAW %':<7} | {'AWAY %':<7} | {'PREDICTION':<10} | {'ACTION'}")
    print("-" * 100)
    
    predictions = []
    
    # Helper for rolling averages
    def get_avg(hist, window=5):
        if not hist: return 0
        return sum(hist[-window:]) / min(len(hist), window)

    for home, away in matchups:
        # 1. Check Data Availability
        if home not in team_stats or away not in team_stats:
            print(f"{home:<18} vs {away:<18} | ???     | ???     | ???     | N/A        | ‚ùå Missing Data")
            continue
            
        h = team_stats[home]
        a = team_stats[away]
        
        # 2. Build Feature Row
        features = {
            'HomeTeam_ID': encoder.transform([home])[0],
            'AwayTeam_ID': encoder.transform([away])[0],
            'Home_Form_Pts': get_avg(h['pts']),
            'Away_Form_Pts': get_avg(a['pts']),
            'Home_xG_Avg': get_avg(h['xg']),
            'Away_xG_Avg': get_avg(a['xg']),
            'Home_xGA_Avg': get_avg(h['xga']),
            'Away_xGA_Avg': get_avg(a['xga']),
            'Home_Poss_Avg': get_avg(h['poss']),
            'Away_Poss_Avg': get_avg(a['poss']),
            'Home_xG_vs_Away_Def': get_avg(h['xg']) - get_avg(a['xga']),
            'Away_xG_vs_Home_Def': get_avg(a['xg']) - get_avg(h['xga']),
            'Possession_Diff': get_avg(h['poss']) - get_avg(a['poss'])
        }
        
        row = pd.DataFrame([features])
        
        # 3. Get Probabilities
        # probs returns [Prob_Class_0, Prob_Class_1, Prob_Class_2]
        # Based on our training: 0=Away, 1=Draw, 2=Home
        probs = model.predict_proba(row)[0]
        
        p_away = probs[0]
        p_draw = probs[1]
        p_home = probs[2]
        
        # Determine Prediction
        pred_idx = np.argmax(probs)
        confidence = np.max(probs)
        
        labels = {0: 'Away Win', 1: 'Draw', 2: 'Home Win'}
        prediction = labels[pred_idx]
        
        # 4. Action Logic
        action = "-"
        if confidence > 0.50: action = "Low Bet"
        if confidence > 0.55: action = "‚úÖ BET"
        if confidence > 0.65: action = "üî• BIG BET"
        
        # Highlight risky draws
        if prediction != 'Draw' and p_draw > 0.30:
            action += " (‚ö†Ô∏è High Draw Risk)"

        # Print Row
        print(f"{home:<18} vs {away:<18} | {p_home:.1%}   | {p_draw:.1%}   | {p_away:.1%}   | {prediction:<10} | {action}")
        
        predictions.append({
            'Home': home, 
            'Away': away, 
            'Home_Prob': p_home, 
            'Draw_Prob': p_draw, 
            'Away_Prob': p_away, 
            'Prediction': prediction, 
            'Action': action
        })

    return pd.DataFrame(predictions)

# ==========================================
# INPUT MATCHES HERE
# ==========================================
weekend_games = [
    # --- PREMIER LEAGUE ---
    ('Manchester City', 'West Ham United'),
    ('Tottenham Hotspur', 'Liverpool'),   # Huge Game
    ('Everton', 'Arsenal'),
    ('Newcastle United', 'Chelsea'),
    ('Aston Villa', 'Manchester United'), # Sunday

    # --- LA LIGA ---
    ('Real Madrid', 'Sevilla'),           # Saturday Night
    ('Villarreal', 'Barcelona'),          # Sunday
    ('Girona', 'Atl√©tico Madrid'),        # Sunday
    ('Real Betis', 'Getafe'),

    # --- SERIE A ---
    ('Juventus', 'Roma'),                 # Saturday Big Match
    ('Lazio', 'Cremonese'),
    ('Genoa', 'Atalanta')
]

# Run it
df_probs = predict_with_probabilities(weekend_games, bst, team_stats, le)


MATCHUP                                  | HOME %  | DRAW %  | AWAY %  | PREDICTION | ACTION
----------------------------------------------------------------------------------------------------
Valencia           vs Mallorca           | 55.4%   | 20.8%   | 23.8%   | Home Win   | ‚úÖ BET
Arsenal            vs Chelsea            | 61.6%   | 18.3%   | 20.1%   | Home Win   | ‚úÖ BET
Barcelona          vs Getafe             | 70.9%   | 14.6%   | 14.6%   | Home Win   | üî• BIG BET
Manchester City    vs Liverpool          | 43.0%   | 20.6%   | 36.4%   | Home Win   | -
Bayern Munich      vs Dortmund           | 61.4%   | 20.3%   | 18.3%   | Home Win   | ‚úÖ BET
Girona             vs Espanyol           | 48.0%   | 26.5%   | 25.5%   | Home Win   | -
Juventus           vs Milan              | 62.9%   | 20.0%   | 17.0%   | Home Win   | ‚úÖ BET


## Save the Model

In [47]:
import joblib

# 1. Bundle everything into one file
model_package = {
    'model': bst,              # The trained XGBoost brain
    'team_stats': team_stats,  # The history of every team (Crucial!)
    'encoder': le,             # The name-to-ID converter
    'accuracy': 0.5776         # Metadata
}

# 2. Save
joblib.dump(model_package, 'football_model_v1.pkl')
print("Model saved successfully as 'football_model_v1.pkl'")

Model saved successfully as 'football_model_v1.pkl'


## Load the Model

In [48]:
# CODE TO RUN NEXT TIME (Instead of Training)
import joblib
import pandas as pd
import numpy as np

# Load
data = joblib.load('football_model_v1.pkl')
bst = data['model']
team_stats = data['team_stats']
le = data['encoder']

print("Model Loaded! Ready to predict.")

Model Loaded! Ready to predict.
