In [36]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
import re
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Silence library warnings so the console output stays readable.
warnings.filterwarnings('ignore')

print("==================================================")
print("   FOOTBALL MATCH PREDICTOR (PRODUCTION v2.0)     ")
print("==================================================")

# ==========================================
# 1. LOAD & CLEAN DATA
# ==========================================
# The raw match table is read from the current working directory.
try:
    df = pd.read_csv("match_data.csv")
except FileNotFoundError:
    print("Error: match_data.csv not found.")
    # `exit()` is an interactive helper installed by the `site` module and
    # reports success (status 0). Raise SystemExit with a non-zero status so
    # callers and shells can see that loading failed.
    raise SystemExit(1)

# Date Parsing
def extract_date(url):
    """Return the match date embedded in *url*, or ``pd.NaT`` when there is none.

    The date is expected in the form ``<MonthName>-<day>-<year>``
    (for example ``August-12-2023``) somewhere inside the string.

    Args:
        url: Any value; it is converted with ``str()`` before searching, so
            ``None`` / ``NaN`` simply yield ``pd.NaT``.

    Returns:
        A ``pandas.Timestamp`` for the first date-shaped token that parses as
        ``%B-%d-%Y``, otherwise ``pd.NaT``.
    """
    match = re.search(r'([A-Za-z]+-\d{1,2}-\d{4})', str(url))
    if match is None:
        return pd.NaT
    try:
        # errors='coerce' maps tokens that merely look like a date
        # (e.g. "Team-99-2023") to NaT instead of raising.
        return pd.to_datetime(match.group(1), format='%B-%d-%Y', errors='coerce')
    except (ValueError, TypeError, OverflowError):
        # Stay best-effort for anything the parser still rejects, without
        # swallowing unrelated exceptions such as KeyboardInterrupt.
        return pd.NaT

# Derive the match date from the URL and order the table chronologically;
# the Elo and rolling-form features below are computed in row/date order.
df['date'] = df['match_url'].apply(extract_date)
df = df.sort_values(by='date').reset_index(drop=True)

# Numeric Cleaning & Imputation
stats_cols = ["xg", "possession", "shots_onTarget", "corners", "fouls", "team_points"]
for side in ['home', 'away']:
    # Possession arrives as text such as "54%"; convert it to a 0-1 fraction
    # and treat unparseable values as an even 50/50 split.
    p_col = f"{side}_team_possession"
    if p_col in df.columns:
        df[p_col] = pd.to_numeric(df[p_col].astype(str).str.rstrip('%'), errors='coerce').fillna(50) / 100.0

    for s in stats_cols:
        # Only xg and possession need the extra "team_" infix. "team_points"
        # already carries it, so adding it again would produce the
        # non-existent column "<side>_team_team_points".
        col = f"{side}_team_{s}" if s in ('xg', 'possession') else f"{side}_{s}"
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            # Impute with the team's own median first, then with the column
            # mean for teams that have no valid value at all.
            df[col] = df[col].fillna(df.groupby(f'{side}_team_name')[col].transform('median'))
            df[col] = df[col].fillna(df[col].mean())

# ==========================================
# 2. FEATURE ENGINEERING ENGINE
# ==========================================

# --- ELO Calculation ---
def calculate_elo(df, k_factor=20, home_advantage=65, initial_elo=1500):
    """Compute pre-match Elo ratings for every row of *df*, in row order.

    Args:
        df: DataFrame with columns ``home_team_name``, ``away_team_name``,
            ``home_team_score`` and ``away_team_score``. Rows are processed
            top to bottom, so the caller must sort them chronologically.
        k_factor: Maximum rating change per match.
        home_advantage: Rating points added to the home side when computing
            the expected result.
        initial_elo: Rating assigned to every team before its first match.

    Returns:
        ``(elo_h, elo_a, team_elos)``: two lists aligned with the rows of
        *df* holding each side's rating *before* that match, and a dict of
        every team's rating after the last row.
    """
    team_elos = {
        team: initial_elo
        for team in set(df['home_team_name']).union(df['away_team_name'])
    }
    elo_h, elo_a = [], []

    matches = zip(
        df['home_team_name'], df['away_team_name'],
        df['home_team_score'], df['away_team_score'],
    )
    for h, a, h_goals, a_goals in matches:
        rh, ra = team_elos[h], team_elos[a]
        elo_h.append(rh)
        elo_a.append(ra)

        # A match without a recorded score has no result to learn from.
        # Every comparison with NaN is False, so it would otherwise fall
        # through to the "away win" branch and corrupt both ratings.
        if pd.isna(h_goals) or pd.isna(a_goals):
            continue

        if h_goals > a_goals:
            res = 1
        elif h_goals == a_goals:
            res = 0.5
        else:
            res = 0

        # Expected home result from the rating gap plus home advantage.
        dr = (rh + home_advantage) - ra
        e_h = 1 / (1 + 10 ** (-dr / 400))
        team_elos[h] = rh + k_factor * (res - e_h)
        team_elos[a] = ra + k_factor * ((1 - res) - (1 - e_h))

    return elo_h, elo_a, team_elos

# Pre-match Elo for both sides, plus the final ratings kept for prediction.
home_elo_pre, away_elo_pre, current_elos = calculate_elo(df)
df['home_elo'] = home_elo_pre
df['away_elo'] = away_elo_pre
# Rating gap as seen by the home side (65 = home-advantage offset).
df['diff_elo'] = (df['home_elo'] + 65) - df['away_elo']

# --- Rest Days ---
# One row per (team, match date), so each team's fixtures can be ordered
# regardless of whether it played at home or away.
home_dates = df[['date', 'home_team_name']].rename(columns={'home_team_name':'team'})
away_dates = df[['date', 'away_team_name']].rename(columns={'away_team_name':'team'})
long_df = pd.concat([home_dates, away_dates]).sort_values(['team', 'date'])

# Days since the team's previous match: 7 when there is no previous match,
# capped at 14.
previous_match = long_df.groupby('team')['date'].shift(1)
days_since = (long_df['date'] - previous_match).dt.days
long_df['rest'] = days_since.fillna(7).clip(upper=14)
rest_map = dict(zip(zip(long_df['date'], long_df['team']), long_df['rest']))


def _rest_lookup(team_col):
    """Rest days per match for the team named in *team_col* (7 if unknown)."""
    return df.apply(lambda row: rest_map.get((row['date'], row[team_col]),7), axis=1)


df['diff_rest'] = _rest_lookup('home_team_name') - _rest_lookup('away_team_name')

# --- Rolling Stats ---
# Recalc Points
# League points are derived from the score line (3 win / 1 draw / 0 loss),
# overwriting whatever the raw data contained.
df['home_team_points'] = np.select([df['home_team_score']>df['away_team_score'], df['home_team_score']==df['away_team_score']], [3, 1], 0)
df['away_team_points'] = np.select([df['away_team_score']>df['home_team_score'], df['away_team_score']==df['home_team_score']], [3, 1], 0)

roll_feats = ['team_xg', 'team_possession', 'shots_onTarget', 'corners', 'team_points']
h_d = df[['date', 'match_url', 'home_team_name']].rename(columns={'home_team_name':'team'})
a_d = df[['date', 'match_url', 'away_team_name']].rename(columns={'away_team_name':'team'})

# Copy each stat into the per-team frames under a side-neutral name.
# Every entry of roll_feats already matches the column suffix, so the side
# prefix is all that needs to be added.
for f in roll_feats:
    c_h, c_a = f"home_{f}", f"away_{f}"
    if c_h in df.columns: h_d[f] = df[c_h]
    if c_a in df.columns: a_d[f] = df[c_a]

# Rolling mean over each team's previous five matches; shift(1) keeps the
# current match out of its own feature.
stacked = pd.concat([h_d, a_d]).sort_values(['team', 'date'])
for f in roll_feats:
    if f in stacked.columns:
        stacked[f'roll_{f}'] = stacked.groupby('team')[f].transform(lambda x: x.shift(1).rolling(5, min_periods=1).mean())

# Attach the rolling values back to each match, once per side.
roll_cols = [f'roll_{f}' for f in roll_feats]
team_form = stacked[['match_url', 'team'] + roll_cols]
for side in ('home', 'away'):
    df = (
        df.merge(team_form, left_on=['match_url', f'{side}_team_name'],
                 right_on=['match_url', 'team'], how='left')
          .drop(columns=['team'])
          .rename(columns={c: f'{side}_{c}' for c in roll_cols})
    )

for f in roll_feats:
    df[f'diff_{f}'] = df[f'home_roll_{f}'] - df[f'away_roll_{f}']

# Zero-fill numeric gaps only (e.g. a team's first match has no history).
# A blanket fillna(0) would also write 0 into non-numeric columns such as
# `date`, which is sorted on later when looking up a team's latest form.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(0)

# ==========================================
# 3. CONFIGURE MODELS (Hardcoded Winners)
# ==========================================
# Features: Elo ratings, rest difference, rolling-form differentials, and
# each side's rolling xG.
features = (
    ['diff_elo', 'home_elo', 'away_elo']
    + ['diff_rest', 'diff_team_points', 'diff_team_xg']
    + ['diff_shots_onTarget', 'diff_corners']
    + ['home_roll_team_xg', 'away_roll_team_xg']
)

# Target: 2 = home win, 1 = draw, 0 = away win.
home_goals = df['home_team_score']
away_goals = df['away_team_score']
conditions = [
    home_goals > away_goals,
    home_goals == away_goals,
    home_goals < away_goals,
]
y = np.select(conditions, [2, 1, 0])
X = df[features]

print(f"Training on {len(X)} matches using {len(features)} features...")

# --- 1. XGBoost (Tuned) ---
xgb_clf = xgb.XGBClassifier(
    n_estimators=200, learning_rate=0.03, max_depth=3, 
    subsample=0.7, colsample_bytree=0.7, gamma=5,
    objective='multi:softprob', num_class=3, random_state=42
)

# --- 2. Random Forest (Tuned) ---
rf_clf = RandomForestClassifier(
    n_estimators=200, max_depth=5, min_samples_leaf=10, 
    max_features='sqrt', random_state=42
)

# --- 3. Logistic Regression (Tuned & Scaled) ---
# `multi_class` is no longer passed: the argument is deprecated in recent
# scikit-learn releases, and the lbfgs solver already fits a multinomial
# model for a multiclass target.
lr_clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=0.01, solver='lbfgs', max_iter=1000)
)

# --- 4. The Ensemble (Adjusted Weights) ---
# Previous weights [3,1,2] -> New Weights [2, 1, 5] to favor LR
# Soft voting averages the class probabilities of the three models using
# these weights (order: xgb, rf, lr).
ensemble = VotingClassifier(
    estimators=[('xgb', xgb_clf), ('rf', rf_clf), ('lr', lr_clf)],
    voting='soft',
    weights=[2, 1, 5] 
)

# Fit on the full data set; nothing is held out for evaluation here.
ensemble.fit(X, y)
print("‚úÖ Ensemble Model Trained.")

# Save Model & Data Helpers
# Bundle everything the prediction side needs: the fitted ensemble, the
# feature order, the final Elo ratings, and the rolling-form columns of the
# 500 most recent rows.
roll_columns = [c for c in df.columns if 'roll_' in c]
recent_form = df[['date', 'home_team_name', 'away_team_name'] + roll_columns].tail(500)
model_bundle = {
    'model': ensemble,
    'features': features,
    'elo_dict': current_elos,
    'df_recent': recent_form,
}
joblib.dump(model_bundle, 'football_model_final.pkl')

print("‚úÖ Model Saved to 'football_model_final.pkl'")

# ==========================================
# 4. PREDICTION INTERFACE
# ==========================================

def predict_match(home, away):
    """Print the ensemble's home/draw/away probabilities for one fixture.

    Reads the module-level ``current_elos``, ``df``, ``roll_feats``,
    ``features`` and ``ensemble``.

    Args:
        home: Home team name as stored in ``df['home_team_name']`` /
            ``df['away_team_name']``.
        away: Away team name, same convention.

    Returns:
        None. Output is printed; the function returns early when either
        team has no rows in ``df``.
    """
    print(f"\nüîÆ ANALYSIS: {home} vs {away}")
    
    # 1. Get ELOs (teams never seen in training start at 1500)
    h_elo = current_elos.get(home, 1500)
    a_elo = current_elos.get(away, 1500)
    
    # 2. Get Rolling Stats (Latest available)
    def get_latest(team):
        """Return the rolling stats stored on *team*'s most recent row, or None."""
        rows = df[(df['home_team_name']==team) | (df['away_team_name']==team)]
        if len(rows) == 0: return None
        last = rows.sort_values('date').iloc[-1]
        prefix = 'home_' if last['home_team_name']==team else 'away_'
        # NOTE: these values were computed with shift(1), so they describe
        # the matches before that row and do not include the row itself.
        return {f: last[f"{prefix}roll_{f}"] for f in roll_feats}

    h_stats = get_latest(home)
    a_stats = get_latest(away)
    
    # get_latest signals "unknown team" with None; test for that explicitly.
    if h_stats is None or a_stats is None:
        print("‚ùå Team data not found.")
        return

    # 3. Build Input
    input_data = {
        'diff_elo': (h_elo + 65) - a_elo,
        'home_elo': h_elo,
        'away_elo': a_elo,
        'diff_rest': 0, # Assume equal rest for future prediction
        'home_roll_team_xg': h_stats['team_xg'],
        'away_roll_team_xg': a_stats['team_xg']
    }
    
    # Calc diffs
    for f in roll_feats:
        input_data[f"diff_{f}"] = h_stats[f] - a_stats[f]
        
    # Predict (column selection puts the inputs in the training feature order)
    input_df = pd.DataFrame([input_data])[features]
    probs = ensemble.predict_proba(input_df)[0]
    
    # Output: classes are encoded 0 = away win, 1 = draw, 2 = home win.
    print(f"üìä Win Probabilities:")
    print(f"   üè† {home}: {probs[2]:.1%} (Odds: {1/probs[2]:.2f})")
    print(f"   ü§ù Draw:        {probs[1]:.1%} (Odds: {1/probs[1]:.2f})")
    print(f"   ‚úàÔ∏è {away}: {probs[0]:.1%} (Odds: {1/probs[0]:.2f})")
    
    # Confidence is the largest of the three class probabilities.
    conf = max(probs)
    
    if conf > 0.60: print("üíé SNIPER BET (High Confidence)")
    elif conf > 0.50: print("‚úÖ VALUE BET (Good Confidence)")
    elif conf > 0.45: print("‚ö†Ô∏è RISKY BET (Moderate Confidence)")
    else: print("‚ùå NO BET (Coin Flip)")

# TEST
# Quick smoke run of the prediction interface on two sample fixtures.
for home_name, away_name in (("Angola", "Zimbabwe"), ("Liverpool", "Manchester City")):
    predict_match(home_name, away_name)

   FOOTBALL MATCH PREDICTOR (PRODUCTION v2.0)     
Training on 1272 matches using 10 features...
‚úÖ Ensemble Model Trained.
‚úÖ Model Saved to 'football_model_final.pkl'

üîÆ ANALYSIS: Angola vs Zimbabwe
üìä Win Probabilities:
   üè† Angola: 41.7% (Odds: 2.40)
   ü§ù Draw:        24.8% (Odds: 4.03)
   ‚úàÔ∏è Zimbabwe: 33.5% (Odds: 2.98)
‚ùå NO BET (Coin Flip)

üîÆ ANALYSIS: Liverpool vs Manchester City
üìä Win Probabilities:
   üè† Liverpool: 35.3% (Odds: 2.84)
   ü§ù Draw:        18.0% (Odds: 5.55)
   ‚úàÔ∏è Manchester City: 46.7% (Odds: 2.14)
‚ö†Ô∏è RISKY BET (Moderate Confidence)


In [None]:
import pandas as pd
import numpy as np
import joblib
import warnings

# Silence library warnings so the console output stays readable.
warnings.filterwarnings('ignore')

print("==================================================")
print("   ü§ñ PREMIER LEAGUE VALUE BETTING ENGINE üí∞      ")
print("==================================================")

# ==========================================
# 1. LOAD MODEL & DATA
# ==========================================
try:
    # Load the optimized model dictionary
    # NOTE: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load artifacts produced by the training script.
    artifacts = joblib.load('football_model_final.pkl')
except FileNotFoundError:
    print("‚ùå Error: 'football_model_final.pkl' not found. Run the training script first!")
    # `exit()` is an interactive helper and reports success (status 0);
    # raise SystemExit with a non-zero status instead.
    raise SystemExit(1)

model = artifacts['model']
features = artifacts['features']
current_elos = artifacts['elo_dict']
df_history = artifacts['df_recent'] # We use this to look up current form
print("‚úÖ Model & Data Loaded Successfully.")

# ==========================================
# 2. CONFIGURATION
# ==========================================
# Smallest edge worth betting on: 0.05 means a bet is only considered when
# the model calculates a 5% ROI or higher.
EDGE_THRESHOLD = 0.05

# Standard mappings for team names (Screenshot Name -> FBref Name)
team_map = {
    "Manchester Utd": "Manchester United", "Newcastle": "Newcastle United",
    "Nottingham": "Nottingham Forest", "Manchester City": "Manchester City",
    "Arsenal": "Arsenal", "Brighton": "Brighton & Hove Albion",
    "Brentford": "Brentford", "Bournemouth": "Bournemouth",
    "Burnley": "Burnley", "Everton": "Everton",
    "Liverpool": "Liverpool", "Wolves": "Wolverhampton Wanderers",
    "West Ham": "West Ham United", "Fulham": "Fulham",
    "Chelsea": "Chelsea", "Aston Villa": "Aston Villa",
    "Sunderland": "Sunderland", "Leeds": "Leeds United",
    "Crystal Palace": "Crystal Palace", "Tottenham": "Tottenham Hotspur",
}

# ==========================================
# 3. INPUT FIXTURES (From Your Screenshot)
# ==========================================
# Each row: home, away, then the odds for home win (1), draw (X), away win (2).
_FIXTURE_COLUMNS = ("Home", "Away", "Odds_1", "Odds_X", "Odds_2")
_fixture_rows = [
    # Date: 26 Dec
    ("Manchester Utd", "Newcastle", 2.53, 3.61, 2.68),
    # Date: 27 Dec
    ("Nottingham", "Manchester City", 5.29, 4.45, 1.58),
    ("Arsenal", "Brighton", 1.40, 4.92, 7.85),
    ("Brentford", "Bournemouth", 2.29, 3.60, 3.03),
    ("Burnley", "Everton", 4.01, 3.35, 2.01),
    ("Liverpool", "Wolves", 1.24, 6.59, 11.28),
    ("West Ham", "Fulham", 2.68, 3.41, 2.63),
    ("Chelsea", "Aston Villa", 1.85, 3.88, 4.03),
    # Date: 28 Dec
    ("Sunderland", "Leeds", 2.58, 3.23, 2.86),
    ("Crystal Palace", "Tottenham", 2.29, 3.30, 3.25),
]
fixtures = [dict(zip(_FIXTURE_COLUMNS, row)) for row in _fixture_rows]

# ==========================================
# 4. PREDICTION ENGINE
# ==========================================
roll_feats = ['team_xg', 'team_possession', 'shots_onTarget', 'corners', 'team_points','fouls']

def get_latest_stats(team_name):
    """Look up a team's most recent rolling stats and its current Elo.

    Reads the module-level ``team_map``, ``df_history``, ``current_elos``
    and ``roll_feats``.

    Args:
        team_name: Team name as written in the fixture list; it is
            translated through ``team_map`` before the lookup.

    Returns:
        ``(stats, elo)``. ``stats`` maps each entry of ``roll_feats`` to the
        value stored on the team's latest row in ``df_history`` (NaN when
        that rolling column is absent). When the team has no rows at all,
        returns ``(None, 1500)``.
    """
    # Map screenshot name to FBref name
    real_name = team_map.get(team_name, team_name)
    
    # Filter dataset for this team
    rows = df_history[
        (df_history['home_team_name'] == real_name) | 
        (df_history['away_team_name'] == real_name)
    ].sort_values('date')
    
    if len(rows) == 0: return None, 1500 # Default ELO if new
    
    last = rows.iloc[-1]
    
    # Get stats
    prefix = 'home_' if last['home_team_name'] == real_name else 'away_'
    # The saved history holds only the rolling columns the training script
    # created, which need not cover every entry of roll_feats (e.g. 'fouls').
    # Use NaN for absent columns instead of raising KeyError; features the
    # model does not use are dropped later when the input is reindexed.
    stats = {f: last.get(f"{prefix}roll_{f}", np.nan) for f in roll_feats}
    
    # Get ELO
    elo = current_elos.get(real_name, 1500)
    
    return stats, elo

bets = []

print(f"Analyzing {len(fixtures)} matches for value...")

# Score every fixture with the loaded model and print its 1X2 probabilities.
for f in fixtures:
    h_team = f['Home']
    a_team = f['Away']

    # Latest form and Elo for both sides
    h_stats, h_elo = get_latest_stats(h_team)
    a_stats, a_elo = get_latest_stats(a_team)

    # Both sides need form data; otherwise skip the fixture.
    if not (h_stats and a_stats):
        print(f"‚ö†Ô∏è Warning: Missing data for {h_team} or {a_team}. Skipping.")
        continue

    # Model input: Elo terms first, then the rolling-form differentials
    input_data = dict(
        diff_elo=(h_elo + 65) - a_elo,  # +65 Home Adv
        home_elo=h_elo,
        away_elo=a_elo,
        diff_rest=0,  # Assume equal rest for simplicity (or update manually)
        home_roll_team_xg=h_stats['team_xg'],
        away_roll_team_xg=a_stats['team_xg'],
    )
    for feat in roll_feats:
        # Differentials such as 'diff_team_possession'
        input_data[f"diff_{feat}"] = h_stats[feat] - a_stats[feat]

    # Align columns with the training feature order; any feature that was
    # not computed above is filled with 0.
    input_df = pd.DataFrame([input_data]).reindex(columns=features, fill_value=0)

    # Class order: 0 = away win, 1 = draw, 2 = home win
    probs = model.predict_proba(input_df)[0]
    prob_away, prob_draw, prob_home = probs[0], probs[1], probs[2]
    print(f"home:{f['Home']} away: {f['Away']}")
    print(f"prob home{prob_home:.2f} || prob draw {prob_draw:.2f} || prob away{prob_away:.2f} ")
    

   ü§ñ PREMIER LEAGUE VALUE BETTING ENGINE üí∞      
‚úÖ Model & Data Loaded Successfully.
Analyzing 10 matches for value...
home:Manchester Utd away: Newcastle
prob home0.31511755988040513 || prob draw 0.32594837398594295 || prob away0.358934066133652 
home:Nottingham away: Manchester City
prob home0.1903550050274614 || prob draw 0.2505021907708814 || prob away0.5591428116522378 
home:Arsenal away: Brighton
prob home0.4257649524709866 || prob draw 0.2792174825882924 || prob away0.29501756494072107 
home:Brentford away: Bournemouth
prob home0.26337294615860324 || prob draw 0.32620756728005895 || prob away0.4104194940119185 
home:Burnley away: Everton
prob home0.216557938338729 || prob draw 0.3530384023509119 || prob away0.4304036667609397 
home:Liverpool away: Wolves
prob home0.4092276958808812 || prob draw 0.3329873664512732 || prob away0.2577849525690068 
home:West Ham away: Fulham
prob home0.2407357803728536 || prob draw 0.35550794988036005 || prob away0.40375626229620565 
home:Ch