In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
import re
import warnings
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Silence all warnings for the remainder of the run.
warnings.filterwarnings('ignore')

print("==========================================================")
print("   üöÄ WIN/LOSS SPECIALIST MODEL (No Draw Predictions)    ")
print("==========================================================")

# ==========================================
# 1. LOAD & CLEAN
# ==========================================
# Only the read itself is guarded, so an unrelated failure further down can
# never be misreported as a missing input file.
try:
    df = pd.read_csv("match_data.csv")
except FileNotFoundError:
    print("‚ùå Error: match_data.csv not found.")
    # ``exit()`` is an interactive convenience that is not guaranteed to be
    # defined; raising SystemExit works everywhere and reports a non-zero
    # status so callers can detect the failure.
    raise SystemExit(1) from None

print(f"‚úÖ Loaded {len(df)} matches.")

def extract_date(url):
    """Parse the match date embedded in a match URL.

    The URL is searched for the first ``<Month>-<day>-<year>`` token, e.g.
    ``August-5-2023``; the month must be a full English month name because the
    token is parsed with the ``%B-%d-%Y`` format.

    Args:
        url: Match URL. Any object is accepted; it is converted with ``str``.

    Returns:
        A ``pandas.Timestamp`` for the token, or ``pd.NaT`` when the URL holds
        no such token or the token is not a valid calendar date.
    """
    match = re.search(r'([A-Za-z]+-\d{1,2}-\d{4})', str(url))
    if match is None:
        return pd.NaT
    try:
        # errors='coerce' already turns unparseable tokens into NaT; the narrow
        # except only covers parser errors instead of swallowing everything
        # (a bare ``except`` would also hide KeyboardInterrupt and typos).
        return pd.to_datetime(match.group(1), format='%B-%d-%Y', errors='coerce')
    except (ValueError, OverflowError):
        return pd.NaT

# Derive each match date from its URL, then put the frame in date order with a
# fresh 0..n-1 index.
parsed_dates = df['match_url'].apply(extract_date)
df = df.assign(date=parsed_dates)
df = df.sort_values(by='date')
df = df.reset_index(drop=True)

def get_stat_cols(df):
    """Return the base names of all per-side statistic columns.

    A column counts as a statistic when it starts with ``home_`` or ``away_``
    and is not one of the identifier/label columns listed in ``exclude``. The
    base name is the column name with that leading side prefix removed, so
    ``home_shots`` and ``away_shots`` both contribute ``shots``.

    Args:
        df: DataFrame whose column labels are inspected (values are not read).

    Returns:
        Sorted list of unique base names. Sorting keeps the order, and with it
        the order of every feature derived from it, identical between runs;
        iterating a bare ``set`` of strings does not guarantee that.
    """
    exclude = {'match_url', 'date', 'home_team_name', 'away_team_name',
               'xg_is_estimated', 'match_outcome'}
    base_stats = set()
    for col in df.columns:
        if not isinstance(col, str) or col in exclude:
            continue
        for prefix in ('home_', 'away_'):
            if col.startswith(prefix):
                # Slice off the prefix only. ``str.replace`` would also delete
                # later occurrences (``home_passes_home_half`` ->
                # ``passes_half``), producing a name that matches no column.
                base_stats.add(col[len(prefix):])
                break
    return sorted(base_stats)

all_stats = get_stat_cols(df)

# Coerce every statistic column to numbers and fill the gaps, one side at a time.
for side in ('home', 'away'):
    possession_col = f"{side}_team_possession"
    # Text possession values such as "55%" become fractions; unparseable ones
    # default to an even 50/50 split.
    if possession_col in df.columns and df[possession_col].dtype == 'object':
        without_percent = df[possession_col].astype(str).str.rstrip('%')
        df[possession_col] = pd.to_numeric(without_percent, errors='coerce').fillna(50) / 100.0

    team_name_col = f'{side}_team_name'
    for stat in all_stats:
        # A stat lives either in "<side>_<stat>" or, failing that, in
        # "<side>_team_<stat>"; skip it when neither column exists.
        stat_col = f"{side}_{stat}"
        if stat_col not in df.columns:
            stat_col = f"{side}_team_{stat}"
        if stat_col not in df.columns:
            continue

        df[stat_col] = pd.to_numeric(df[stat_col], errors='coerce')
        # Missing values take the team's median for that column, then 0 when
        # the team has no valid value at all.
        team_median = df.groupby(team_name_col)[stat_col].transform('median')
        df[stat_col] = df[stat_col].fillna(team_median).fillna(0)

# ==========================================
# 2. ENHANCED FEATURE ENGINEERING
# ==========================================
print("\nüîß Building Enhanced Features...")

def calculate_elo_advanced(df, k_factor=22, home_advantage=65, initial_rating=1500):
    """Compute pre-match Elo ratings, recent form and momentum for every match.

    Rows are processed in the order given, so ``df`` must already be sorted
    chronologically. Every value recorded for a match is taken *before* that
    match's result is applied, so it only reflects earlier rows.

    Args:
        df: Matches with ``home_team_name``, ``away_team_name``,
            ``home_team_score`` and ``away_team_score`` columns.
        k_factor: Maximum rating change per match.
        home_advantage: Rating points added to the home side when computing
            the expected result.
        initial_rating: Rating assigned to every team before its first match.

    Returns:
        Tuple ``(elo_h, elo_a, form_h, form_a, momentum_h, momentum_a,
        team_elos)``. The first six are lists aligned with the rows of ``df``:
        pre-match ratings, mean result over the last five matches (0.5 when a
        team has no history) and the mean of the last three results minus the
        mean of the three before (0 with fewer than six matches).
        ``team_elos`` maps each team to its rating after the final row.
    """
    teams = set(df['home_team_name']).union(set(df['away_team_name']))
    team_elos = {team: initial_rating for team in teams}
    team_form = {team: [] for team in teams}

    elo_h, elo_a, form_h, form_a, momentum_h, momentum_a = [], [], [], [], [], []

    def _recent_form(history):
        # Mean result (1 win / 0.5 draw / 0 loss) of the last five matches.
        return np.mean(history[-5:]) if history else 0.5

    def _momentum(history):
        # Last three results versus the three before them.
        if len(history) >= 6:
            return np.mean(history[-3:]) - np.mean(history[-6:-3])
        return 0

    # Zipping the four columns avoids building a Series per row as
    # ``iterrows`` does, while visiting the rows in the same order.
    matches = zip(df['home_team_name'], df['away_team_name'],
                  df['home_team_score'], df['away_team_score'])
    for h, a, goals_h, goals_a in matches:
        rh, ra = team_elos[h], team_elos[a]
        elo_h.append(rh)
        elo_a.append(ra)

        form_h.append(_recent_form(team_form[h]))
        form_a.append(_recent_form(team_form[a]))
        momentum_h.append(_momentum(team_form[h]))
        momentum_a.append(_momentum(team_form[a]))

        if goals_h > goals_a:
            res_h, res_a = 1, 0
        elif goals_h == goals_a:
            res_h, res_a = 0.5, 0.5
        else:
            res_h, res_a = 0, 1

        team_form[h].append(res_h)
        team_form[a].append(res_a)

        # Standard Elo update; the exchange is zero-sum between the two teams.
        dr = (rh + home_advantage) - ra
        e_h = 1 / (1 + 10 ** (-dr / 400))
        change = k_factor * (res_h - e_h)
        team_elos[h] = rh + change
        team_elos[a] = ra - change

    return elo_h, elo_a, form_h, form_a, momentum_h, momentum_a, team_elos

# Attach the per-match Elo/form/momentum lists (one entry per row of df) and
# keep the end-of-data ratings in current_elos for the saved artifact.
df['home_elo'], df['away_elo'], df['home_form'], df['away_form'], df['home_momentum'], df['away_momentum'], current_elos = calculate_elo_advanced(df)
# 65 repeats the home_advantage value used inside calculate_elo_advanced; the
# two must be changed together.
df['diff_elo'] = (df['home_elo'] + 65) - df['away_elo']

# Rolling Stats with multiple windows
# h_d / a_d hold the home and away view of every match under a common 'team'
# column, so each team's matches can be rolled regardless of venue.
cols_to_roll = []
h_d = df[['date', 'match_url', 'home_team_name']].rename(columns={'home_team_name':'team'})
a_d = df[['date', 'match_url', 'away_team_name']].rename(columns={'away_team_name':'team'})

# A stat is rolled only when both its home and away column exist
# ("<side>_<stat>" first, "<side>_team_<stat>" as fallback).
for f in all_stats:
    c_h = f"home_{f}" if f"home_{f}" in df.columns else f"home_team_{f}"
    c_a = f"away_{f}" if f"away_{f}" in df.columns else f"away_team_{f}"
    if c_h in df.columns and c_a in df.columns:
        h_d[f] = df[c_h]
        a_d[f] = df[c_a]
        cols_to_roll.append(f)

# One row per (team, match), ordered by team then date.
stacked = pd.concat([h_d, a_d]).sort_values(['team', 'date'])

# Multiple rolling windows
# shift(1) drops the current match from its own average, so each value is
# built from that team's earlier matches only. span=10 is the slower average,
# span=5 the more reactive one.
for f in cols_to_roll:
    stacked[f'roll_{f}'] = stacked.groupby('team')[f].transform(
        lambda x: x.shift(1).ewm(span=10, min_periods=1).mean()
    )
    stacked[f'roll_recent_{f}'] = stacked.groupby('team')[f].transform(
        lambda x: x.shift(1).ewm(span=5, min_periods=1).mean()
    )

# Join the rolled values back twice: once for the home team (home_ prefix) and
# once for the away team (away_ prefix).
# NOTE(review): the join key is (match_url, team); this presumably relies on
# match_url being unique per match, otherwise rows would multiply — confirm.
roll_cols = [f'roll_{f}' for f in cols_to_roll] + [f'roll_recent_{f}' for f in cols_to_roll]
df = df.merge(stacked[['match_url', 'team'] + roll_cols], 
              left_on=['match_url', 'home_team_name'], right_on=['match_url', 'team'], 
              how='left').drop(columns=['team']).rename(columns={c: f'home_{c}' for c in roll_cols})
df = df.merge(stacked[['match_url', 'team'] + roll_cols], 
              left_on=['match_url', 'away_team_name'], right_on=['match_url', 'team'], 
              how='left').drop(columns=['team']).rename(columns={c: f'away_{c}' for c in roll_cols})

# Build feature set focused on WIN/LOSS discrimination
print("Building Win/Loss Features...")
features = ['diff_elo', 'home_elo', 'away_elo', 'home_form', 'away_form', 
            'home_momentum', 'away_momentum']

# Form differentials
df['form_advantage'] = df['home_form'] - df['away_form']
df['momentum_advantage'] = df['home_momentum'] - df['away_momentum']
features.extend(['form_advantage', 'momentum_advantage'])

# Statistical advantages
# Per stat, four features are kept: both home-minus-away differences plus the
# raw span-10 values of each side. The raw span-5 values are used only inside
# diff_recent_*.
for f in cols_to_roll:
    df[f'diff_{f}'] = df[f'home_roll_{f}'] - df[f'away_roll_{f}']
    df[f'diff_recent_{f}'] = df[f'home_roll_recent_{f}'] - df[f'away_roll_recent_{f}']
    features.extend([f'diff_{f}', f'diff_recent_{f}', 
                     f'home_roll_{f}', f'away_roll_{f}'])

# A team's first match has no earlier data, so its rolled values are NaN and
# become 0 here.
# NOTE(review): fillna(0) applies to every column, so any NaT left in 'date'
# would also be replaced by 0 — confirm that is acceptable.
df = df.fillna(0)

print(f"Total features: {len(features)}")

# ==========================================
# 3. SPLIT DATA
# ==========================================
# Encode the result from the home side's point of view:
# 2 = home win, 1 = draw, 0 = away win.
home_goals = df['home_team_score']
away_goals = df['away_team_score']
conditions = [
    home_goals > away_goals,
    home_goals == away_goals,
    home_goals < away_goals,
]
df['match_outcome'] = np.select(conditions, [2, 1, 0])

# Hold out the last 15% of rows (in their current order) as the test set.
split = int(len(df) * 0.85)
train_df, test_df = df.iloc[:split], df.iloc[split:]

print(f"\nTrain: {len(train_df)} | Test: {len(test_df)}")

# ==========================================
# 4. ENSEMBLE MODEL (FOCUSED ON WIN/LOSS)
# ==========================================
print("\nüéØ Training Ensemble for Win/Loss Prediction...")

target_col = 'match_outcome'
X_train, y_train = train_df[features], train_df[target_col]
X_test, y_test = test_df[features], test_df[target_col]

# Three differently-biased tree ensembles, each fitted on the same split.
print("  ‚Üí XGBoost...")
xgb_params = {
    'n_estimators': 600,
    'learning_rate': 0.01,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'gamma': 0.1,
    'reg_alpha': 0.05,
    'reg_lambda': 1.5,
    'objective': 'multi:softprob',
    'random_state': 42,
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

print("  ‚Üí Random Forest...")
rf_params = {
    'n_estimators': 500,
    'max_depth': 10,
    'min_samples_leaf': 3,
    'max_features': 'sqrt',
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

print("  ‚Üí Gradient Boosting...")
gb_params = {
    'n_estimators': 400,
    'learning_rate': 0.02,
    'max_depth': 5,
    'subsample': 0.85,
    'random_state': 42,
}
gb_model = GradientBoostingClassifier(**gb_params)
gb_model.fit(X_train, y_train)

# Class-probability matrices for the held-out matches, one per model.
xgb_probs, rf_probs, gb_probs = (
    fitted.predict_proba(X_test) for fitted in (xgb_model, rf_model, gb_model)
)

# Fixed blend of the three probability matrices (XGBoost weighted highest).
ensemble_probs = 0.50 * xgb_probs + 0.30 * rf_probs + 0.20 * gb_probs

# ==========================================
# 5. SIMPLE DECISION: NEVER PREDICT DRAW
# ==========================================
print("\nüé≤ Making Predictions (No Draws)...")

# Column 2 is read as P(home win) and column 0 as P(away win); the draw column
# is ignored, so the prediction is always 2 or 0.
home_more_likely = ensemble_probs[:, 2] > ensemble_probs[:, 0]
final_preds = np.where(home_more_likely, 2, 0)

# ==========================================
# 6. EVALUATION
# ==========================================
acc = accuracy_score(y_test, final_preds)

print("\n==================================================")
print(f"   WIN/LOSS SPECIALIST ACCURACY: {acc:.2%}   ")
print("==================================================")

# Pin the label order (0 = away, 1 = draw, 2 = home) so the matrix is always
# 3x3 and the row/column indexing below stays valid even when a class is
# missing from the test set or from the predictions.
outcome_labels = [0, 1, 2]

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, final_preds, labels=outcome_labels)
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, final_preds, labels=outcome_labels,
                            target_names=['Away', 'Draw', 'Home']))


def _pct(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    return 100 * part / whole if whole else 0.0


# Detailed metrics (rows of cm are actual outcomes, columns are predictions)
away_correct = cm[0, 0]
away_total = cm[0].sum()
home_correct = cm[2, 2]
home_total = cm[2].sum()
draws_called_away = cm[1, 0]
draws_called_home = cm[1, 2]
# Number of actual draws in this test set; previously a literal 99 copied from
# one particular run, which is wrong for any other data or split.
draw_total = cm[1].sum()
decisive_correct = away_correct + home_correct
decisive_total = away_total + home_total

print(f"\nüìä Win/Loss Performance:")
print(f"   Away Win Accuracy: {away_correct}/{away_total} = {_pct(away_correct, away_total):.1f}%")
print(f"   Home Win Accuracy: {home_correct}/{home_total} = {_pct(home_correct, home_total):.1f}%")
print(f"   Decisive Match Accuracy: {decisive_correct}/{decisive_total} = {_pct(decisive_correct, decisive_total):.1f}%")

print(f"\nüìä Draw Handling:")
print(f"   Draws called as Away: {draws_called_away}/{draw_total} ({_pct(draws_called_away, draw_total):.1f}%)")
print(f"   Draws called as Home: {draws_called_home}/{draw_total} ({_pct(draws_called_home, draw_total):.1f}%)")
# Draws are never predicted, so every actual draw is a miss and this figure is
# 100% whenever the test set contains at least one draw.
print(f"   Average cost per draw: {_pct(draws_called_away + draws_called_home, draw_total):.1f}%")

print(f"\nü§ñ Individual Model Accuracies:")
print(f"   XGBoost: {accuracy_score(y_test, np.where(xgb_probs[:, 2] > xgb_probs[:, 0], 2, 0)):.2%}")
print(f"   Random Forest: {accuracy_score(y_test, np.where(rf_probs[:, 2] > rf_probs[:, 0], 2, 0)):.2%}")
print(f"   Gradient Boosting: {accuracy_score(y_test, np.where(gb_probs[:, 2] > gb_probs[:, 0], 2, 0)):.2%}")

# Confidence analysis: accuracy restricted to matches where the gap between
# P(home) and P(away) exceeds each threshold.
print(f"\nüìà Confidence Analysis:")
home_win_confidence = ensemble_probs[:, 2] - ensemble_probs[:, 0]
for threshold in [0.1, 0.2, 0.3, 0.4]:
    high_conf = np.abs(home_win_confidence) > threshold
    n_high_conf = int(high_conf.sum())
    if n_high_conf > 0:
        high_conf_acc = accuracy_score(y_test[high_conf], final_preds[high_conf])
        print(f"   Confidence >{threshold:.1f}: {n_high_conf} matches, {high_conf_acc:.2%} accurate")

# Feature importance (XGBoost only), printed from most to least important
importances = xgb_model.feature_importances_
top_15_idx = np.argsort(importances)[-15:]
print(f"\nüîç Top 15 Features:")
for idx in top_15_idx[::-1]:
    print(f"   {features[idx]}: {importances[idx]:.4f}")

# Persist everything needed to score new matches: the three fitted models, the
# feature order they expect, the blend weights, the latest Elo ratings and the
# rolled stats of the most recent 1500 rows.
id_cols = ['date', 'home_team_name', 'away_team_name']
rolled_cols = [name for name in df.columns if 'roll_' in name]
artifact = {
    'xgb_model': xgb_model,
    'rf_model': rf_model,
    'gb_model': gb_model,
    'features': features,
    'elo_dict': current_elos,
    'weights': [0.50, 0.30, 0.20],
    'df_recent': df[id_cols + rolled_cols].tail(1500),
}
joblib.dump(artifact, 'football_model_winloss_specialist.pkl')

print("\n‚úÖ Win/Loss Specialist Model Saved.")
print("\nüí° Strategy: Accept draws as unpredictable, maximize win/loss accuracy.")

   üöÄ WIN/LOSS SPECIALIST MODEL (No Draw Predictions)    
‚úÖ Loaded 2964 matches.

üîß Building Enhanced Features...
Building Win/Loss Features...
Total features: 357

Train: 2519 | Test: 445

üéØ Training Ensemble for Win/Loss Prediction...
  ‚Üí XGBoost...
  ‚Üí Random Forest...
  ‚Üí Gradient Boosting...

üé≤ Making Predictions (No Draws)...

   WIN/LOSS SPECIALIST ACCURACY: 53.48%   

Confusion Matrix:
[[ 86   0  60]
 [ 37   0  62]
 [ 48   0 152]]

Classification Report:
              precision    recall  f1-score   support

        Away       0.50      0.59      0.54       146
        Draw       0.00      0.00      0.00        99
        Home       0.55      0.76      0.64       200

    accuracy                           0.53       445
   macro avg       0.35      0.45      0.39       445
weighted avg       0.41      0.53      0.47       445


üìä Win/Loss Performance:
   Away Win Accuracy: 86/146 = 58.9%
   Home Win Accuracy: 152/200 = 76.0%
   Decisive Match Accuracy: 238/346 = 68.8%

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
import re
import warnings
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, log_loss

# Silence all warnings for the remainder of the run.
warnings.filterwarnings('ignore')

print("==========================================================")
print("   üöÄ GRANDMASTER TRAINER (FULL GRID SEARCH + TUNING)     ")
print("==========================================================")

# ==========================================
# 1. LOAD & CLEAN DATA
# ==========================================
# Only the read itself is guarded, so an unrelated failure further down can
# never be misreported as a missing input file.
try:
    df = pd.read_csv("match_data/match_data_combined.csv")
except FileNotFoundError:
    print("‚ùå Error: match_data_combined.csv not found.")
    # ``exit()`` is an interactive convenience that is not guaranteed to be
    # defined; raising SystemExit works everywhere and reports a non-zero
    # status so callers can detect the failure.
    raise SystemExit(1) from None

print(f"‚úÖ Loaded {len(df)} matches.")

def extract_date(url):
    """Parse the match date embedded in a match URL.

    The URL is searched for the first ``<Month>-<day>-<year>`` token, e.g.
    ``August-5-2023``; the month must be a full English month name because the
    token is parsed with the ``%B-%d-%Y`` format.

    Args:
        url: Match URL. Any object is accepted; it is converted with ``str``.

    Returns:
        A ``pandas.Timestamp`` for the token, or ``pd.NaT`` when the URL holds
        no such token or the token is not a valid calendar date.
    """
    match = re.search(r'([A-Za-z]+-\d{1,2}-\d{4})', str(url))
    if match is None:
        return pd.NaT
    try:
        # errors='coerce' already turns unparseable tokens into NaT; the narrow
        # except only covers parser errors instead of swallowing everything
        # (a bare ``except`` would also hide KeyboardInterrupt and typos).
        return pd.to_datetime(match.group(1), format='%B-%d-%Y', errors='coerce')
    except (ValueError, OverflowError):
        return pd.NaT

print("Parsing dates...")
df['date'] = df['match_url'].apply(extract_date)
df = df.sort_values(by='date').reset_index(drop=True)
# Matches whose URL carries no parseable date cannot be ordered; drop them.
df = df.dropna(subset=['date'])

# --- MARKET DATA IMPUTATION ---
# Convert average bookmaker odds into implied probabilities that sum to 1.
# Rows without a complete, positive set of odds get a uniform 1/3 prior and
# has_odds = 0, so the placeholder is always flagged as such.
odds_cols = ['AvgH', 'AvgD', 'AvgA']
prob_cols = ['market_prob_home', 'market_prob_draw', 'market_prob_away']
uniform_prob = 1 / 3

if all(col in df.columns for col in odds_cols):
    odds = df[odds_cols].apply(pd.to_numeric, errors='coerce')
    # All three prices must be present and positive. A single missing or zero
    # price would otherwise mix a placeholder with real probabilities (or
    # yield inf) while the row is still reported as having odds.
    has_full_odds = (odds > 0).all(axis=1)
    odds.loc[~has_full_odds, :] = np.nan

    implied = 1.0 / odds
    # Normalize: dividing by the row total removes the bookmaker margin.
    implied = implied.div(implied.sum(axis=1), axis=0).fillna(uniform_prob)

    for prob_col, odds_col in zip(prob_cols, odds_cols):
        df[prob_col] = implied[odds_col]
    df['has_odds'] = has_full_odds.astype(int)
else:
    # Same uniform prior as the per-row fallback above (previously 0.33 here,
    # which did not sum to 1 and differed from the normalized fallback).
    for prob_col in prob_cols:
        df[prob_col] = uniform_prob
    df['has_odds'] = 0

# --- DYNAMIC STAT CLEANING ---
def get_stat_cols(df):
    """Return the base names of all per-side statistic columns.

    A column counts as a statistic when it starts with ``home_`` or ``away_``
    and is not one of the identifier, label or market columns listed in
    ``exclude``. The base name is the column name with that leading side
    prefix removed, so ``home_shots`` and ``away_shots`` both contribute
    ``shots``.

    Args:
        df: DataFrame whose column labels are inspected (values are not read).

    Returns:
        Sorted list of unique base names. Sorting keeps the order, and with it
        the order of every feature derived from it, identical between runs;
        iterating a bare ``set`` of strings does not guarantee that.
    """
    exclude = {'match_url', 'date', 'home_team_name', 'away_team_name',
               'xg_is_estimated', 'match_outcome',
               'AvgH', 'AvgD', 'AvgA', 'Avg>2.5', 'Avg<2.5',
               'market_prob_home', 'market_prob_draw', 'market_prob_away',
               'has_odds'}
    base_stats = set()
    for col in df.columns:
        if not isinstance(col, str) or col in exclude:
            continue
        for prefix in ('home_', 'away_'):
            if col.startswith(prefix):
                # Slice off the prefix only. ``str.replace`` would also delete
                # later occurrences (``away_shots_away_box`` -> ``shots_box``),
                # producing a name that matches no column.
                base_stats.add(col[len(prefix):])
                break
    return sorted(base_stats)

all_stats = get_stat_cols(df)

# Coerce every statistic column to numbers and fill the gaps, one side at a time.
for side in ('home', 'away'):
    possession_col = f"{side}_team_possession"
    # Possession is always re-parsed here: a trailing '%' is stripped, the
    # value is divided by 100, and unparseable entries default to 50.
    if possession_col in df.columns:
        as_text = df[possession_col].astype(str).str.rstrip('%')
        df[possession_col] = pd.to_numeric(as_text, errors='coerce').fillna(50) / 100.0

    team_name_col = f'{side}_team_name'
    for stat in all_stats:
        # A stat lives either in "<side>_<stat>" or, failing that, in
        # "<side>_team_<stat>"; skip it when neither column exists.
        stat_col = f"{side}_{stat}"
        if stat_col not in df.columns:
            stat_col = f"{side}_team_{stat}"
        if stat_col not in df.columns:
            continue

        df[stat_col] = pd.to_numeric(df[stat_col], errors='coerce')
        # Missing values take the team's median for that column, then 0 when
        # the team has no valid value at all.
        team_median = df.groupby(team_name_col)[stat_col].transform('median')
        df[stat_col] = df[stat_col].fillna(team_median).fillna(0)

# ==========================================
# 2. FEATURE ENGINEERING
# ==========================================

# --- ELO ---
def calculate_elo(df, k_factor=20, home_advantage=70, initial_rating=1500):
    """Compute pre-match Elo ratings for every match.

    Rows are processed in the order given, so ``df`` must already be sorted
    chronologically. The rating recorded for a match is the one held *before*
    that match's result is applied.

    Args:
        df: Matches with ``home_team_name``, ``away_team_name``,
            ``home_team_score`` and ``away_team_score`` columns.
        k_factor: Maximum rating change per match.
        home_advantage: Rating points added to the home side when computing
            the expected result.
        initial_rating: Rating assigned to every team before its first match.

    Returns:
        Tuple ``(elo_h, elo_a, team_elos)``: two lists of pre-match ratings
        aligned with the rows of ``df`` and a dict mapping each team to its
        rating after the final row.
    """
    teams = set(df['home_team_name']).union(set(df['away_team_name']))
    team_elos = {team: initial_rating for team in teams}
    elo_h, elo_a = [], []

    # Zipping the four columns avoids building a Series per row as
    # ``iterrows`` does, while visiting the rows in the same order.
    matches = zip(df['home_team_name'], df['away_team_name'],
                  df['home_team_score'], df['away_team_score'])
    for h, a, goals_h, goals_a in matches:
        rh, ra = team_elos[h], team_elos[a]
        elo_h.append(rh)
        elo_a.append(ra)

        # Result from the home side's perspective: win 1, draw 0.5, loss 0.
        if goals_h > goals_a:
            res = 1
        elif goals_h == goals_a:
            res = 0.5
        else:
            res = 0

        # Standard Elo update; the exchange is zero-sum between the two teams.
        dr = (rh + home_advantage) - ra
        e_h = 1 / (1 + 10 ** (-dr / 400))
        change = k_factor * (res - e_h)
        team_elos[h] = rh + change
        team_elos[a] = ra - change
    return elo_h, elo_a, team_elos

# Attach the per-match pre-match ratings (one entry per row of df) and keep the
# end-of-data ratings in current_elos for the saved artifact.
df['home_elo'], df['away_elo'], current_elos = calculate_elo(df)
# 70 repeats the home_advantage value used inside calculate_elo; the two must
# be changed together.
df['diff_elo'] = (df['home_elo'] + 70) - df['away_elo']

# --- Rest Days ---
# long_df has one row per (team, match). 'rest' is the number of days since the
# team's previous match: 7 when there is no previous match, capped at 14.
long_df = pd.concat([
    df[['date', 'home_team_name']].rename(columns={'home_team_name':'team'}),
    df[['date', 'away_team_name']].rename(columns={'away_team_name':'team'})
]).sort_values(['team', 'date'])
long_df['rest'] = (long_df['date'] - long_df.groupby('team')['date'].shift(1)).dt.days.fillna(7).clip(upper=14)
# Lookup keyed by (date, team); diff_rest is home rest minus away rest, with 7
# as the default for a key that is not in the map.
rest_map = dict(zip(zip(long_df['date'], long_df['team']), long_df['rest']))
df['diff_rest'] = df.apply(lambda x: rest_map.get((x['date'], x['home_team_name']),7), axis=1) - \
                  df.apply(lambda x: rest_map.get((x['date'], x['away_team_name']),7), axis=1)

# --- Rolling Stats ---
# League points per side (3 win / 1 draw / 0 loss), registered as an extra stat
# so they are rolled like every other column.
df['home_team_points'] = np.select([df['home_team_score']>df['away_team_score'], df['home_team_score']==df['away_team_score']], [3, 1], 0)
df['away_team_points'] = np.select([df['away_team_score']>df['home_team_score'], df['away_team_score']==df['home_team_score']], [3, 1], 0)
if 'team_points' not in all_stats: all_stats.append('team_points')

# h_d / a_d hold the home and away view of every match under a common 'team'
# column, so each team's matches can be rolled regardless of venue.
cols_to_roll = []
h_d = df[['date', 'match_url', 'home_team_name']].rename(columns={'home_team_name':'team'})
a_d = df[['date', 'match_url', 'away_team_name']].rename(columns={'away_team_name':'team'})

# A stat is rolled only when both its home and away column exist
# ("<side>_<stat>" first, "<side>_team_<stat>" as fallback).
for f in all_stats:
    c_h = f"home_{f}" if f"home_{f}" in df.columns else f"home_team_{f}"
    c_a = f"away_{f}" if f"away_{f}" in df.columns else f"away_team_{f}"
    if c_h in df.columns and c_a in df.columns:
        h_d[f] = df[c_h]; a_d[f] = df[c_a]
        cols_to_roll.append(f)

stacked = pd.concat([h_d, a_d]).sort_values(['team', 'date'])
for f in cols_to_roll:
    # Use EWMA 10 for stability
    # shift(1) drops the current match from its own average, so each value is
    # built from that team's earlier matches only.
    stacked[f'roll_{f}'] = stacked.groupby('team')[f].transform(lambda x: x.shift(1).ewm(span=10, min_periods=1).mean())

# Join the rolled values back twice: once for the home team (home_roll_*) and
# once for the away team (away_roll_*).
# NOTE(review): the join key is (match_url, team); this presumably relies on
# match_url being unique per match, otherwise rows would multiply — confirm.
df = df.merge(stacked[['match_url', 'team'] + [f'roll_{f}' for f in cols_to_roll]], left_on=['match_url', 'home_team_name'], right_on=['match_url', 'team'], how='left').drop(columns=['team']).rename(columns={f'roll_{f}': f'home_roll_{f}' for f in cols_to_roll})
df = df.merge(stacked[['match_url', 'team'] + [f'roll_{f}' for f in cols_to_roll]], left_on=['match_url', 'away_team_name'], right_on=['match_url', 'team'], how='left').drop(columns=['team']).rename(columns={f'roll_{f}': f'away_roll_{f}' for f in cols_to_roll})

# Home-minus-away difference of each rolled stat.
for f in cols_to_roll:
    df[f'diff_{f}'] = df[f'home_roll_{f}'] - df[f'away_roll_{f}']

# A team's first match has no earlier data, so its rolled values are NaN and
# become 0 here (fillna applies to every column of df).
df = df.fillna(0)

# ==========================================
# 3. SELECT TOP FEATURES (SelectFromModel, importance-ranked)
# ==========================================
# Candidate features: ratings, rest, market probabilities and, for every rolled
# stat, the home value, the away value and their difference.
features = ['diff_elo', 'home_elo', 'away_elo', 'diff_rest', 'market_prob_home', 'market_prob_draw', 'market_prob_away', 'has_odds']
for f in cols_to_roll:
    features.extend([f"home_roll_{f}", f"away_roll_{f}", f"diff_{f}"])

print(f"üìä Initial Features: {len(features)}")

# Target from the home side's point of view: 2 = home win, 1 = draw, 0 = away win.
conditions = [
    (df['home_team_score'] > df['away_team_score']),
    (df['home_team_score'] == df['away_team_score']),
    (df['home_team_score'] < df['away_team_score'])
]
y = np.select(conditions, [2, 1, 0])
X = df[features].copy()

# Hold out the last 15% of rows (in their current order) as the test set.
split = int(len(df) * 0.85)
X_train = X.iloc[:split]
X_test = X.iloc[split:]
y_train = y[:split]
y_test = y[split:]

# Single source of truth for the selection size. The progress message used to
# hard-code "50" while the selector was configured for 100.
max_selected_features = 100
print(f"\n‚úÇÔ∏è Selecting Top {min(max_selected_features, len(features))} Features...")
# Rank features by the importances of a shallow random forest fitted on the
# training rows only. threshold=-inf disables the importance cut-off so that
# max_features alone decides how many are kept.
selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    max_features=max_selected_features,
    threshold=-np.inf
)
selector.fit(X_train, y_train)
selected_cols = X_train.columns[selector.get_support()]

# Update Data
X_train = X_train[selected_cols]
X_test = X_test[selected_cols]
print(f"‚úÖ Reduced to {len(selected_cols)} Features.")
# print(f"Top Features: {list(selected_cols[:10])}")

# ==========================================
# 4. GRID SEARCH (MAXIMIZE HIGH CONFIDENCE)
# ==========================================
# We use 'neg_log_loss' as scoring because it optimizes for Probability Calibration
# Forward-chaining folds: each validation fold comes after its training fold.
tscv = TimeSeriesSplit(n_splits=3)


def _tune(estimator, param_grid):
    """Run a full grid search on the training split, scored by negative log-loss."""
    search = GridSearchCV(estimator, param_grid, cv=tscv, scoring='neg_log_loss', n_jobs=-1)
    search.fit(X_train, y_train)
    return search


# --- 1. XGBoost ---
print("\nüîç Tuning XGBoost (LogLoss)...")
xgb_grid = dict(
    n_estimators=[200, 300],
    learning_rate=[0.01, 0.03],
    max_depth=[3, 4],
    subsample=[0.7, 0.8],
    colsample_bytree=[0.7, 0.8],
    gamma=[1, 2],  # Regularization
)
xgb_base = xgb.XGBClassifier(objective='multi:softprob', num_class=3, tree_method='hist', random_state=42)
gs_xgb = _tune(xgb_base, xgb_grid)
# best_score_ holds the negated log-loss, so flip the sign for display.
print(f"‚úÖ Best XGB LogLoss: {-gs_xgb.best_score_:.4f}")
print(f"   Best Params: {gs_xgb.best_params_}")
best_xgb = gs_xgb.best_estimator_

# --- 2. Random Forest ---
print("\nüîç Tuning Random Forest (LogLoss)...")
rf_grid = dict(
    n_estimators=[300, 500],
    max_depth=[8, 12, 15],
    min_samples_leaf=[2, 5],
    max_features=['sqrt'],
)
rf_base = RandomForestClassifier(random_state=42, class_weight='balanced')
gs_rf = _tune(rf_base, rf_grid)
print(f"‚úÖ Best RF LogLoss: {-gs_rf.best_score_:.4f}")
print(f"   Best Params: {gs_rf.best_params_}")
best_rf = gs_rf.best_estimator_

# --- 3. Logistic Regression ---
print("\nüîç Tuning Logistic Regression...")
# Scaling lives inside the pipeline so it is re-fitted on each training fold.
lr_pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(class_weight='balanced', multi_class='multinomial', max_iter=2000),
)
lr_grid = dict(
    logisticregression__C=[0.01, 0.1, 1.0],
    logisticregression__solver=['lbfgs'],
)
gs_lr = _tune(lr_pipe, lr_grid)
print(f"‚úÖ Best LR LogLoss: {-gs_lr.best_score_:.4f}")
best_lr = gs_lr.best_estimator_

# ==========================================
# 5. FINAL ENSEMBLE
# ==========================================
print("\nüèóÔ∏è Building Final Ensemble...")

# Soft-voting weights in estimator order (xgb, rf, lr). These are fixed,
# hand-picked values; they are NOT computed from the grid-search log-loss
# scores above.
# NOTE(review): RF carries the largest weight here regardless of which model
# scored best in the search — confirm this is intended.
weights = [1.2, 1.5, 0.8] # Heuristic: favour RF, keep LR as a lower-weight anchor

# Weighted average of the three models' predicted class probabilities.
ensemble = VotingClassifier(
    estimators=[('xgb', best_xgb), ('rf', best_rf), ('lr', best_lr)],
    voting='soft',
    weights=weights
)

# Calibrate (Sigmoid)
# Wraps the voting ensemble in 3-fold sigmoid calibration, fitted on the
# training split only.
calibrated = CalibratedClassifierCV(ensemble, method='sigmoid', cv=3)
calibrated.fit(X_train, y_train)

# ==========================================
# 6. EVALUATION
# ==========================================
print("\nüìä EVALUATING ON TEST SET...")
preds = calibrated.predict(X_test)
probs = calibrated.predict_proba(X_test)
acc = accuracy_score(y_test, preds)

print("==================================================")
print(f"   FINAL ACCURACY: {acc:.2%}   ")
print("==================================================")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, preds))

# --- SNIPER ANALYSIS ---
# How accurate are the predictions when only the most confident ones are kept?
# Confidence is the largest class probability of each prediction.
print("\nüéØ SNIPER ANALYSIS (High Confidence Bets):")
results = pd.DataFrame({
    'Actual': y_test,
    'Pred': preds,
    'Conf': probs.max(axis=1),
})

for cutoff in (0.50, 0.55, 0.60, 0.70):
    confident = results.loc[results['Conf'] > cutoff]
    if confident.empty:
        continue
    hit_rate = accuracy_score(confident['Actual'], confident['Pred'])
    print(f"   > Confidence > {cutoff:.2f}: {len(confident)} bets | Win Rate: {hit_rate:.2%}")

# Save the calibrated model with the selected feature order, the latest Elo
# ratings and the rolled stats of the most recent 1000 rows.
id_cols = ['date', 'home_team_name', 'away_team_name']
rolled_cols = [name for name in df.columns if 'roll_' in name]
artifact = {
    'model': calibrated,
    'features': list(selected_cols),
    'elo_dict': current_elos,
    'df_recent': df[id_cols + rolled_cols].tail(1000),
}
joblib.dump(artifact, 'football_model_final.pkl')

print("‚úÖ Saved.")

   üöÄ GRANDMASTER TRAINER (FULL GRID SEARCH + TUNING)     
‚úÖ Loaded 2286 matches.
Parsing dates...
üìä Initial Features: 272

‚úÇÔ∏è Selecting Top 50 Features...
‚úÖ Reduced to 150 Features.

üîç Tuning XGBoost (LogLoss)...
‚úÖ Best XGB LogLoss: 0.9778
   Best Params: {'colsample_bytree': 0.7, 'gamma': 2, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}

üîç Tuning Random Forest (LogLoss)...
‚úÖ Best RF LogLoss: 0.9809
   Best Params: {'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 300}

üîç Tuning Logistic Regression...
‚úÖ Best LR LogLoss: 1.0117

üèóÔ∏è Building Final Ensemble...

üìä EVALUATING ON TEST SET...
   FINAL ACCURACY: 52.60%   

Confusion Matrix:
[[ 64   1  39]
 [ 37   0  52]
 [ 33   2 118]]

üéØ SNIPER ANALYSIS (High Confidence Bets):
   > Confidence > 0.50: 178 bets | Win Rate: 64.61%
   > Confidence > 0.55: 136 bets | Win Rate: 67.65%
   > Confidence > 0.60: 93 bets | Win Rate: 72.04%
   > Confide

In [7]:
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence all warnings for the remainder of the run.
warnings.filterwarnings('ignore')

print("==================================================")
print("   üîç FEATURE IMPORTANCE (MARKET AWARENESS)       ")
print("==================================================")

try:
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load artifacts produced by a trusted training run.
    artifacts = joblib.load('football_model_final.pkl')
    model = artifacts['model']
    features = artifacts['features']
    print(f"‚úÖ Model Loaded. Features: {len(features)}")

    # VotingClassifier does not expose feature_importances_ itself, so dig
    # into the fitted estimators: Calibrated -> Voting -> Random Forest.

    # 1. Get the Voting Classifier from the first calibration fold. Older
    #    scikit-learn releases expose it as ``base_estimator`` rather than
    #    ``estimator``, so fall back to that name.
    first_fold = model.calibrated_classifiers_[0]
    voting_clf = getattr(first_fold, 'estimator', None)
    if voting_clf is None:
        voting_clf = first_fold.base_estimator

    # 2. Get the Random Forest by the name it was registered under.
    rf_model = voting_clf.named_estimators_.get('rf', None)

    # Compare against None explicitly: the truth value of an ensemble
    # estimator goes through ``__len__`` and is not a presence check.
    if rf_model is not None:
        importances = rf_model.feature_importances_

        # One row per feature, most important first
        fi_df = pd.DataFrame({
            'Feature': features,
            'Importance': importances
        }).sort_values(by='Importance', ascending=False)

        print("\nüèÜ TOP 20 FEATURES:")
        print(fi_df.head(20).to_string(index=False))

        # Check for Market Odds
        print("\nüí∞ MARKET ODDS RANKING:")
        odds_feats = [f for f in features if 'market' in f or 'odds' in f]
        print(fi_df[fi_df['Feature'].isin(odds_feats)])

        # Check for Player Stats
        print("\nüèÉ NEW PLAYER STATS RANKING (TOP 5):")
        player_feats = [f for f in features if 'player' in f]
        print(fi_df[fi_df['Feature'].isin(player_feats)].head(5))

    else:
        print("Could not find Random Forest in ensemble.")

except Exception as e:
    # Top-level boundary of this diagnostic cell: report the problem and carry
    # on instead of aborting the session.
    print(f"Error: {e}")

   üîç FEATURE IMPORTANCE (MARKET AWARENESS)       
‚úÖ Model Loaded. Features: 60

üèÜ TOP 20 FEATURES:
                                    Feature  Importance
                                   diff_elo    0.049892
                           market_prob_away    0.038096
                           market_prob_home    0.037551
                                   home_elo    0.026865
                            diff_players_xA    0.024345
diff_players_touches_attacking_penalty_area    0.022287
                           market_prob_draw    0.021000
       diff_players_touches_attacking_third    0.018309
                           diff_total_shots    0.017935
                         home_roll_offsides    0.017918
                          diff_players_npxG    0.017883
         home_roll_players_defensive_errors    0.017432
    diff_players_progressive_passes_recived    0.017205
                           diff_players_xAG    0.017119
                       diff_goalkeeper_PSxG    0.0170