In [20]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import warnings

warnings.filterwarnings('ignore')

# ==========================================
# 1. LOAD & CLEAN DATA
# ==========================================
# One row per match; point this at your own export.
df = pd.read_csv("match_data.csv")

# --- Possession ---
# Stored as percentage text such as "70%"; convert to a fraction (0.70).
for side in ('home', 'away'):
    possession_text = df[f'{side}_team_possession'].astype(str).str.rstrip('%')
    df[f'{side}_possession'] = possession_text.astype(float) / 100.0

# --- Attendance ---
# Stored with thousands separators such as "68,407". Anything that still
# fails to parse becomes NaN and is then replaced by a flat 20000.
attendance_text = df['attendance'].astype(str).str.replace(',', '')
df['attendance'] = pd.to_numeric(attendance_text, errors='coerce').fillna(20000)

# --- Match date ---
# Prefer 'date_utc' (parsed as timezone-aware UTC), then 'Date'. With neither
# column present the row index stands in for the date and the file order is
# kept as-is, i.e. the file is assumed to be chronological already.
if 'date_utc' in df.columns:
    parsed_dates = pd.to_datetime(df['date_utc'], utc=True)
elif 'Date' in df.columns:
    parsed_dates = pd.to_datetime(df['Date'])
else:
    parsed_dates = None

if parsed_dates is None:
    df['match_date'] = df.index
else:
    df['match_date'] = parsed_dates
    df = df.sort_values('match_date')

# ==========================================
# 2. FEATURE ENGINEERING (GLOBAL LOOP)
# ==========================================
# Per-team running history, keyed by team name.
team_stats = dict()

# One list per engineered column; each list receives one value per match.
# Form_Pts = recent points, xG = attack quality, xGA = defense quality
# (xG allowed), Poss = control of the ball.
features = {
    column: []
    for column in (
        'Home_Form_Pts', 'Away_Form_Pts',
        'Home_xG_Avg', 'Away_xG_Avg',
        'Home_xGA_Avg', 'Away_xGA_Avg',
        'Home_Poss_Avg', 'Away_Poss_Avg',
    )
}

def get_rolling_avg(history, window=5):
    """Return the mean of the last `window` entries of `history`.

    With fewer than `window` entries the mean is taken over what is
    available; an empty history yields 0.
    """
    if history:
        recent = history[-window:]
        divisor = min(len(history), window)
        return sum(recent) / divisor
    return 0

print("Generating Advanced xG & Possession Features...")

# Rows without a final score (e.g. fixtures not played yet) cannot be scored.
# Left in, the points logic below would fall through to its last branch and
# credit the away side with a win, and a missing score would also be turned
# into an "away win" label when the target is built. Drop them before any
# history is accumulated.
df = df.dropna(subset=['home_team_score', 'away_team_score'])

# (feature-column suffix, team_stats history key) for every rolling feature.
rolling_feature_keys = (
    ('Form_Pts', 'pts'),
    ('xG_Avg', 'xg'),
    ('xGA_Avg', 'xga'),
    ('Poss_Avg', 'poss'),
)

# Walk the matches in DataFrame order. For each match the features are read
# from the history FIRST and the history is updated AFTERWARDS, so a match
# never contributes to its own input features.
for match in df.itertuples(index=False):
    home = match.home_team_name
    away = match.away_team_name

    # Initialize team history if new
    for team in (home, away):
        if team not in team_stats:
            team_stats[team] = {'pts': [], 'xg': [], 'xga': [], 'poss': []}

    h_stats = team_stats[home]
    a_stats = team_stats[away]

    # --- A. CALCULATE INPUT FEATURES (Before Match) ---
    for suffix, key in rolling_feature_keys:
        features[f'Home_{suffix}'].append(get_rolling_avg(h_stats[key]))
        features[f'Away_{suffix}'].append(get_rolling_avg(a_stats[key]))

    # --- B. UPDATE HISTORY (After Match) ---
    # Points: 3 for a win, 1 each for a draw, 0 for a loss
    if match.home_team_score > match.away_team_score:
        h_pts, a_pts = 3, 0
    elif match.home_team_score == match.away_team_score:
        h_pts, a_pts = 1, 1
    else:
        h_pts, a_pts = 0, 3

    h_stats['pts'].append(h_pts)
    a_stats['pts'].append(a_pts)

    # xG (Expected Goals)
    h_stats['xg'].append(match.home_team_xg)
    a_stats['xg'].append(match.away_team_xg)

    # xGA (Expected Goals Allowed): each side allowed what the other created
    h_stats['xga'].append(match.away_team_xg)
    a_stats['xga'].append(match.home_team_xg)

    # Possession
    h_stats['poss'].append(match.home_possession)
    a_stats['poss'].append(match.away_possession)

# Add features to DF (the lists are in the same order as the rows iterated)
for k, v in features.items():
    df[k] = v

# ==========================================
# 3. INTERACTION FEATURES (Comparisons)
# ==========================================
# Attack vs. opposing defense: a side's rolling xG minus the rolling xG the
# opponent has been allowing.
df['Home_xG_vs_Away_Def'] = df['Home_xG_Avg'].sub(df['Away_xGA_Avg'])
df['Away_xG_vs_Home_Def'] = df['Away_xG_Avg'].sub(df['Home_xGA_Avg'])

# Difference in rolling possession share (home minus away).
df['Possession_Diff'] = df['Home_Poss_Avg'].sub(df['Away_Poss_Avg'])

# ==========================================
# 4. PREPARE TRAINING DATA
# ==========================================
# Target classes -> 0: Away Win, 1: Draw, 2: Home Win
home_goals = df['home_team_score']
away_goals = df['away_team_score']
conditions = [
    home_goals > away_goals,
    home_goals == away_goals,
    home_goals < away_goals,
]
df['match_outcome'] = np.select(conditions, [2, 1, 0])

# Encode team names as integer IDs; the encoder is fitted on every name that
# appears on either side so home and away share one ID space.
all_teams = pd.concat([df['home_team_name'], df['away_team_name']]).unique()
le = LabelEncoder().fit(all_teams)
df['HomeTeam_ID'] = le.transform(df['home_team_name'])
df['AwayTeam_ID'] = le.transform(df['away_team_name'])

# Model inputs, in the column order the model is trained on.
feature_columns = [
    'HomeTeam_ID', 'AwayTeam_ID',
    'Home_Form_Pts', 'Away_Form_Pts',
    'Home_xG_Avg', 'Away_xG_Avg',
    'Home_xGA_Avg', 'Away_xGA_Avg',
    'Home_Poss_Avg', 'Away_Poss_Avg',
    'Home_xG_vs_Away_Def', 'Away_xG_vs_Home_Def',
    'Possession_Diff',
]
X = df[feature_columns]
y = df['match_outcome']

# Positional split: the first 85% of rows train the model, the rest are held
# out. Rows are not shuffled, so this is a time split as long as df is in
# chronological order.
split = int(len(df) * 0.85)
X_train, y_train = X.iloc[:split], y.iloc[:split]
X_test, y_test = X.iloc[split:], y.iloc[split:]

# ==========================================
# 5. TRAIN MODEL (Optimized Params)
# ==========================================
print(f"Training on {len(X_train)} matches...")

bst = XGBClassifier(
    n_estimators=100,
    max_depth=5,           # Maximum depth of each tree
    learning_rate=0.02,
    subsample=0.8,         # Row sampling per tree
    colsample_bytree=0.8,  # Column sampling per tree
    # 'multi:softprob' is the multiclass objective that outputs per-class
    # probabilities. The rest of this notebook relies on predict_proba()
    # for its confidence filters, which 'multi:softmax' (labels only) is not
    # meant for.
    objective='multi:softprob',
    num_class=3,
    random_state=42,
    n_jobs=-1
)

bst.fit(X_train, y_train)

# ==========================================
# 6. EVALUATE
# ==========================================
# Score the held-out matches: hard predictions for accuracy, class
# probabilities for the confidence filter below.
preds = bst.predict(X_test)
probs = bst.predict_proba(X_test)
acc = accuracy_score(y_test, preds)

rule = "-------------------------------"
print(rule)
print(f"ACCURACY: {acc:.2%}")
print(rule)

# High Confidence Filter: keep only predictions whose top class probability
# exceeds 0.55 and report accuracy on that subset.
results = pd.DataFrame({'Actual': y_test, 'Pred': preds, 'Conf': probs.max(axis=1)})
high_conf = results.loc[results['Conf'] > 0.55]
if not high_conf.empty:
    high_conf_acc = accuracy_score(high_conf['Actual'], high_conf['Pred'])
    print(f"High Confidence Accuracy (>55%): {high_conf_acc:.2%} (on {len(high_conf)} games)")

# ==========================================
# 7. PREDICTION TOOL (NEXT MATCHES)
# ==========================================
def predict_match(home_name, away_name):
    """Predict a single fixture from the teams' accumulated history.

    Reads the module-level `team_stats`, `le` and `bst`. Returns a
    one-line summary string, or "Teams not found in history" when either
    team has no entry in `team_stats`.
    """
    if not (home_name in team_stats and away_name in team_stats):
        return "Teams not found in history"

    # Rolling averages for every tracked stat, computed once per side.
    stat_keys = ('pts', 'xg', 'xga', 'poss')
    home_avg = {key: get_rolling_avg(team_stats[home_name][key]) for key in stat_keys}
    away_avg = {key: get_rolling_avg(team_stats[away_name][key]) for key in stat_keys}

    # Single-row frame with the same columns, in the same order, as training.
    feature_row = pd.DataFrame([{
        'HomeTeam_ID': le.transform([home_name])[0],
        'AwayTeam_ID': le.transform([away_name])[0],
        'Home_Form_Pts': home_avg['pts'],
        'Away_Form_Pts': away_avg['pts'],
        'Home_xG_Avg': home_avg['xg'],
        'Away_xG_Avg': away_avg['xg'],
        'Home_xGA_Avg': home_avg['xga'],
        'Away_xGA_Avg': away_avg['xga'],
        'Home_Poss_Avg': home_avg['poss'],
        'Away_Poss_Avg': away_avg['poss'],
        # Interactions
        'Home_xG_vs_Away_Def': home_avg['xg'] - away_avg['xga'],
        'Away_xG_vs_Home_Def': away_avg['xg'] - home_avg['xga'],
        'Possession_Diff': home_avg['poss'] - away_avg['poss'],
    }])

    # The most probable class is the pick; its probability is the confidence.
    prob = bst.predict_proba(feature_row)[0]
    best = np.argmax(prob)
    labels = {0: 'Away Win', 1: 'Draw', 2: 'Home Win'}

    return f"{home_name} vs {away_name}: {labels[best]} ({prob[best]:.1%} confidence)"

print("\n--- TEST PREDICTIONS ---")
# Show every team name seen in the data so real fixtures can be typed in
# with matching spelling.
teams_list = list(team_stats)
print(f"Available teams example: {teams_list}")

# Smoke test: predict the first two known teams against each other.
if len(teams_list) > 1:
    first_team, second_team = teams_list[:2]
    print(predict_match(first_team, second_team))

Generating Advanced xG & Possession Features...
Training on 261 matches...
-------------------------------
ACCURACY: 53.19%
-------------------------------
High Confidence Accuracy (>55%): 68.42% (on 19 games)

--- TEST PREDICTIONS ---
Available teams example: ['Real Madrid', 'Osasuna', 'Oviedo', 'Mallorca', 'Real Sociedad', 'Marseille', 'Espanyol', 'Levante', 'Atlético Madrid', 'Qaırat Almaty', 'Villarreal', 'Getafe', 'Juventus', 'Barcelona', 'Valencia', 'Liverpool', 'Elche', 'Olympiacos', 'Girona', 'Athletic Club', 'Rayo Vallecano', 'Newcastle United', 'Paris Saint-Germain', 'Sevilla', 'Club Brugge', 'Chelsea', 'Alavés', 'Celta Vigo', 'Tottenham Hotspur', 'Real Betis', 'Manchester City', 'Ciudad de Lucena', 'Dortmund', 'Eintracht Frankfurt', 'Arsenal', 'Union SG', 'Internazionale', 'Atlètic Lleida', 'Nottingham Forest', 'Ludogorets Razgrad', 'Genk', 'Atlético Palma del Río', 'Utrecht', 'Qarabağ', 'Slavia Prague', 'Stuttgart', 'PAOK', 'Nice', 'Puerto de Vega', 'CD Toledo', 'Int

In [24]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Optional: rank the model's inputs by the importance score XGBoost assigns.
importance = pd.DataFrame({
    'Feature': X.columns,
    'Score': bst.feature_importances_
})
importance = importance.sort_values(by='Score', ascending=False)

print("Top 5 Key Predictors:")
print(importance.head(5))

# Per-match confidence = highest class probability on the held-out set.
probs = bst.predict_proba(X_test)
results = pd.DataFrame({
    'Actual': y_test,
    'Pred': preds,
    'Conf': probs.max(axis=1)
})

print(f"{'Threshold':<10} | {'Matches':<8} | {'Accuracy':<10} | {'Status'}")
print("-" * 50)

# Accuracy on the subset of test matches above each confidence threshold.
for threshold in (0.40, 0.45, 0.50, 0.55, 0.60):
    subset = results[results['Conf'] > threshold]

    if subset.empty:
        print(f"{threshold:.2f}       | 0        | N/A        | -")
        continue

    acc = accuracy_score(subset['Actual'], subset['Pred'])

    # Grade the threshold; later (stricter) cutoffs override earlier ones.
    status = "POOR"
    for cutoff, grade in ((0.50, "OK"), (0.55, "GOOD"), (0.60, "EXCELLENT")):
        if acc > cutoff:
            status = grade

    print(f"{threshold:.2f}       | {len(subset):<8} | {acc:.2%}     | {status}")

Top 5 Key Predictors:
            Feature     Score
12  Possession_Diff  0.100571
4       Home_xG_Avg  0.095953
7      Away_xGA_Avg  0.091298
9     Away_Poss_Avg  0.090031
6      Home_xGA_Avg  0.079881
Threshold  | Matches  | Accuracy   | Status
--------------------------------------------------
0.40       | 42       | 52.38%     | OK
0.45       | 29       | 58.62%     | GOOD
0.50       | 21       | 71.43%     | EXCELLENT
0.55       | 19       | 68.42%     | EXCELLENT
0.60       | 17       | 70.59%     | EXCELLENT


In [None]:
# ==============================================================================
#  PREDICTION TOOL: NEXT WEEK'S GAMES
# ==============================================================================
import pandas as pd
import numpy as np

def predict_next_week(matchups, model, team_stats, encoder):
    """Print a prediction table for upcoming fixtures and return the picks.

    Args:
        matchups: Iterable of ``(home_team_name, away_team_name)`` pairs.
        model: Fitted classifier exposing ``predict_proba``. Class indices
            are read as 0 = away win, 1 = draw, 2 = home win.
        team_stats: Mapping of team name to a dict holding the ``'pts'``,
            ``'xg'``, ``'xga'`` and ``'poss'`` history lists.
        encoder: Fitted label encoder exposing ``transform`` for team names.

    Returns:
        DataFrame with columns ``Home``, ``Away``, ``Pick`` and ``Conf``,
        one row per fixture whose two teams are both in ``team_stats``.
        Fixtures with an unknown team are printed as such but not returned.
        The columns are present even when no fixture could be predicted.
    """
    print(f"\n{'MATCHUP':<45} | {'PREDICTION':<15} | {'CONFIDENCE':<10} | {'ACTION'}")
    print("-" * 85)

    stat_keys = ('pts', 'xg', 'xga', 'poss')
    outcomes = {0: 'Away Win', 1: 'Draw', 2: 'Home Win'}
    result_columns = ['Home', 'Away', 'Pick', 'Conf']
    predictions = []

    # Same rolling average the training features were built with: mean of the
    # last `window` entries, 0 when there is no history.
    def get_avg(hist, window=5):
        if not hist:
            return 0
        return sum(hist[-window:]) / min(len(hist), window)

    # Every table row goes through here so it shares the header's column
    # widths (45 / 15 / 10) instead of relying on hand-counted spaces.
    def print_row(home, away, prediction, confidence_text, action):
        matchup = f"{home:<20} vs {away:<20}"
        print(f"{matchup:<45} | {prediction:<15} | {confidence_text:<10} | {action}")

    for home, away in matchups:
        # 1. Both teams need recorded history.
        if home not in team_stats or away not in team_stats:
            print_row(home, away, '???', 'N/A', 'Unknown Team')
            continue

        # 2. Rolling averages, computed once per side.
        h_avg = {key: get_avg(team_stats[home][key]) for key in stat_keys}
        a_avg = {key: get_avg(team_stats[away][key]) for key in stat_keys}

        # Feature row: names and order must match the training columns.
        features = {
            'HomeTeam_ID': encoder.transform([home])[0],
            'AwayTeam_ID': encoder.transform([away])[0],

            'Home_Form_Pts': h_avg['pts'],
            'Away_Form_Pts': a_avg['pts'],

            'Home_xG_Avg': h_avg['xg'],
            'Away_xG_Avg': a_avg['xg'],

            'Home_xGA_Avg': h_avg['xga'],
            'Away_xGA_Avg': a_avg['xga'],

            'Home_Poss_Avg': h_avg['poss'],
            'Away_Poss_Avg': a_avg['poss'],

            # Interactions
            'Home_xG_vs_Away_Def': h_avg['xg'] - a_avg['xga'],
            'Away_xG_vs_Home_Def': a_avg['xg'] - h_avg['xga'],
            'Possession_Diff': h_avg['poss'] - a_avg['poss'],
        }
        row = pd.DataFrame([features])

        # 3. Predict: most probable class and its probability.
        prob = model.predict_proba(row)[0]
        confidence = np.max(prob)
        prediction = outcomes[np.argmax(prob)]

        # 4. Map confidence to a suggested action.
        # NOTE(review): the 0.45 / 0.50 cut-offs appear to come from the
        # threshold table computed on a small held-out set - confirm before
        # relying on them.
        if confidence > 0.50:
            action = "BET (BIG) 🔥"
        elif confidence > 0.45:
            action = "BET (Small)"
        else:
            action = "PASS"

        print_row(home, away, prediction, f"{confidence:.1%}", action)

        predictions.append({'Home': home, 'Away': away, 'Pick': prediction, 'Conf': confidence})

    return pd.DataFrame(predictions, columns=result_columns)

# ==============================================================================
#  INPUT YOUR UPCOMING MATCHES HERE
# ==============================================================================
# Each entry is (home team, away team). Use the EXACT names from your
# "Available teams" list; a name that is not in team_stats is reported as
# "Unknown Team" and skipped.
next_week_matches = [
    ('Real Madrid', 'Barcelona'),
    ('Barcelona', 'Getafe'),
    ('Manchester City', 'Chelsea'),
    ('Liverpool', 'Arsenal'),
    ('Bayern Munich', 'Dortmund'),  # Assuming these exist in your data
    ('Atlético Madrid', 'Valencia'),
    ('Girona', 'Real Betis'),
    ('Tottenham Hotspur', 'Newcastle United')
]

# Run the predictor with the trained model, accumulated history and encoder
df_preds = predict_next_week(next_week_matches, bst, team_stats, le)


MATCHUP                                       | PREDICTION      | CONFIDENCE | ACTION
-------------------------------------------------------------------------------------
Real Madrid          vs Sevilla              | Home Win        | 76.9%      | BET (BIG) 🔥
Barcelona            vs Getafe               | Home Win        | 70.7%      | BET (BIG) 🔥
Manchester City      vs Chelsea              | Home Win        | 45.7%      | BET (Small)
Liverpool            vs Arsenal              | Home Win        | 65.8%      | BET (BIG) 🔥
Bayern Munich        vs Dortmund             | ???             | N/A        | Unknown Team
Atlético Madrid      vs Valencia             | Home Win        | 74.8%      | BET (BIG) 🔥
Girona               vs Real Betis           | Draw            | 39.4%      | PASS
Tottenham Hotspur    vs Newcastle United     | Home Win        | 39.6%      | PASS
