## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from itertools import combinations
from sklearn.preprocessing import LabelEncoder

np.random.seed(42)

## Import prediction models

In [4]:
model = joblib.load("../Models/best_model.pkl")
scaler = joblib.load('../Models/scaler.pkl')
feature_list = joblib.load('../Models/feature_names.pkl')

## Import files for prediction

In [6]:
qualified = pd.read_csv("../DataSets/Qualified.csv")
features_df = pd.read_csv("../DataSets/data_features.csv")
team_stats_df = pd.read_csv("../DataSets/team_stats.csv")

## Standardizing

In [8]:
country_mapping = {
    'ir iran': 'iran',
    'korea republic': 'south korea',
    'usa': 'united states',
    'usmnt': 'united states',
    'st.vincent/grenadines': 'st. vincent / grenadines',
    'st vincent and the grenadines' : 'st. vincent / grenadines',
    'congo dr' : 'dr congo',
    'el salvador': 'elsalvador',
    'united arab emirates' : 'uae',
    'turkiye' : 'turkey',
    'czech republic' : 'czechia',
    'curaçao' : 'curacao'
}

In [9]:
qualified['Team'] = qualified['Team'].str.lower()
qualified['Team'] = qualified['Team'].replace(country_mapping)
qualified.head()

Unnamed: 0,Team,Qualification_Method,Confederation,FIFA_Rank_Nov2025,Status
0,united states,Host,CONCACAF,15,Qualified
1,canada,Host,CONCACAF,33,Qualified
2,mexico,Host,CONCACAF,13,Qualified
3,spain,UEFA Group Winner,UEFA,1,Qualified
4,england,UEFA Group Winner,UEFA,4,Qualified


In [10]:
team_stats_df.columns

Index(['Unnamed: 0', 'total_matches', 'total_wins', 'total_draws',
       'total_losses', 'win_rate', 'draw_rate', 'loss_rate', 'goals_scored',
       'goals_conceded', 'avg_goals_scored', 'avg_goals_conceded',
       'goal_diff_per_match', 'xg_for', 'xg_against', 'avg_xg_for',
       'avg_xg_against', 'xg_diff_per_match', 'xg_overperf', 'yellow_cards',
       'avg_yellow_cards', 'clean_sheets', 'clean_sheet_rate'],
      dtype='object')

In [11]:
team_stats_df = team_stats_df.rename(columns = {'Unnamed: 0' : 'Team'})

In [12]:
team_stats_df.columns

Index(['Team', 'total_matches', 'total_wins', 'total_draws', 'total_losses',
       'win_rate', 'draw_rate', 'loss_rate', 'goals_scored', 'goals_conceded',
       'avg_goals_scored', 'avg_goals_conceded', 'goal_diff_per_match',
       'xg_for', 'xg_against', 'avg_xg_for', 'avg_xg_against',
       'xg_diff_per_match', 'xg_overperf', 'yellow_cards', 'avg_yellow_cards',
       'clean_sheets', 'clean_sheet_rate'],
      dtype='object')

## Merging team statistic to the dataset with qualified teams, so every qualified team would have its own statstics.

In [14]:
qualified_with_stats = pd.merge(qualified, team_stats_df, on='Team', how='left')

### Dropping 4 rows, as 2 of them do not have statistics (NaN). Dropped two more rows, for simulation we must have specific number of teams in each group

In [16]:
qualified_with_stats = qualified_with_stats[qualified_with_stats['Team']!= 'curacao']
qualified_with_stats = qualified_with_stats[qualified_with_stats['Team']!= 'new zealand']
qualified_with_stats = qualified_with_stats[qualified_with_stats['Team']!= 'norway']
qualified_with_stats = qualified_with_stats[qualified_with_stats['Team']!= 'paraguay']

In [17]:
qualified_with_stats = qualified_with_stats.reset_index(drop=True)

In [18]:
sorted(qualified_with_stats["Team"].unique())

['algeria',
 'argentina',
 'australia',
 'austria',
 'belgium',
 'brazil',
 'canada',
 'cape verde',
 'colombia',
 'croatia',
 'denmark',
 'dr congo',
 'ecuador',
 'egypt',
 'england',
 'france',
 'germany',
 'ghana',
 'haiti',
 'iran',
 'iraq',
 'italy',
 'ivory coast',
 'japan',
 'jordan',
 'mexico',
 'morocco',
 'netherlands',
 'panama',
 'poland',
 'portugal',
 'qatar',
 'saudi arabia',
 'scotland',
 'senegal',
 'south africa',
 'south korea',
 'spain',
 'switzerland',
 'tunisia',
 'turkey',
 'united states',
 'uruguay',
 'uzbekistan']

In [19]:
qualified_with_stats.columns
qualified_with_stats = qualified_with_stats.rename(columns = {'FIFA_Rank_Nov2025' : 'elo'})

In [20]:
display(qualified_with_stats)

Unnamed: 0,Team,Qualification_Method,Confederation,elo,Status,total_matches,total_wins,total_draws,total_losses,win_rate,...,xg_for,xg_against,avg_xg_for,avg_xg_against,xg_diff_per_match,xg_overperf,yellow_cards,avg_yellow_cards,clean_sheets,clean_sheet_rate
0,united states,Host,CONCACAF,15,Qualified,13.0,6.0,3.0,4.0,0.461538,...,17.26,13.16,1.327692,1.012308,0.315385,0.133846,22.0,1.692308,5.0,0.384615
1,canada,Host,CONCACAF,33,Qualified,11.0,3.0,4.0,4.0,0.272727,...,14.19,10.75,1.29,0.977273,0.312727,-0.108182,22.0,2.0,5.0,0.454545
2,mexico,Host,CONCACAF,13,Qualified,12.0,7.0,3.0,2.0,0.583333,...,17.79,8.73,1.4825,0.7275,0.755,-0.399167,29.0,2.416667,7.0,0.583333
3,spain,UEFA Group Winner,UEFA,1,Qualified,11.0,8.0,2.0,1.0,0.727273,...,18.93,10.47,1.720909,0.951818,0.769091,0.460909,17.0,1.545455,5.0,0.454545
4,england,UEFA Group Winner,UEFA,4,Qualified,12.0,6.0,4.0,2.0,0.5,...,14.86,11.08,1.238333,0.923333,0.315,0.511667,16.0,1.333333,5.0,0.416667
5,france,UEFA Group Winner,UEFA,3,Qualified,13.0,7.0,4.0,2.0,0.538462,...,22.78,15.11,1.752308,1.162308,0.59,-0.213846,17.0,1.307692,5.0,0.384615
6,portugal,UEFA Group Winner,UEFA,5,Qualified,10.0,5.0,2.0,3.0,0.5,...,17.36,11.04,1.736,1.104,0.632,-0.036,17.0,1.7,4.0,0.4
7,germany,UEFA Group Winner,UEFA,10,Qualified,8.0,4.0,2.0,2.0,0.5,...,19.94,7.79,2.4925,0.97375,1.51875,-0.3675,15.0,1.875,2.0,0.25
8,netherlands,UEFA Group Winner,UEFA,7,Qualified,11.0,6.0,3.0,2.0,0.545455,...,12.92,12.64,1.174545,1.149091,0.025455,0.643636,12.0,1.090909,4.0,0.363636
9,croatia,UEFA Group Winner,UEFA,11,Qualified,10.0,2.0,6.0,2.0,0.2,...,13.55,15.37,1.355,1.537,-0.182,-0.255,24.0,2.4,2.0,0.2


## Much Prediction function

In [22]:
# For now, create a simple encoder. (Because it is much faster for a machine to work with numbers)
team_encoder = LabelEncoder()
team_encoder.fit(qualified_with_stats['Team'].values)

# Current year for WC2026
CURRENT_YEAR = 2026

def predict_match(team1, team2, team_stats_df, model, scaler=None):
    """
    Predicts match outcome: team1 (home) vs team2 (away)
    Returns: [P(away_win), P(draw), P(home_win)]
    """
    
    # Get stats for both teams
    team1_stats = team_stats_df[team_stats_df['Team'] == team1].iloc[0]
    team2_stats = team_stats_df[team_stats_df['Team'] == team2].iloc[0]
    
    # Create feature dictionary matching x_train exactly
    features = {
        'year': CURRENT_YEAR,
        'home_team_elo': team1_stats['elo'],
        'away_team_elo': team2_stats['elo'],
        'elo_diff': team1_stats['elo'] - team2_stats['elo'],
        'home_qualified': 1,  # All teams are qualified
        'away_qualified': 1,
        
        # Home team features
        'home_team_total_matches': team1_stats['total_matches'],
        'home_team_total_wins': team1_stats['total_wins'],
        'home_team_total_draws': team1_stats['total_draws'],
        'home_team_total_losses': team1_stats['total_losses'],
        'home_team_win_rate': team1_stats['win_rate'],
        'home_team_draw_rate': team1_stats['draw_rate'],
        'home_team_loss_rate': team1_stats['loss_rate'],
        'home_team_goals_scored': team1_stats['goals_scored'],
        'home_team_goals_conceded': team1_stats['goals_conceded'],
        'home_team_avg_goals_scored': team1_stats['avg_goals_scored'],
        'home_team_avg_goals_conceded': team1_stats['avg_goals_conceded'],
        'home_team_goal_diff_per_match': team1_stats['goal_diff_per_match'],
        'home_team_xg_for': team1_stats['xg_for'],
        'home_team_xg_against': team1_stats['xg_against'],
        'home_team_avg_xg_for': team1_stats['avg_xg_for'],
        'home_team_avg_xg_against': team1_stats['avg_xg_against'],
        'home_team_xg_diff_per_match': team1_stats['xg_diff_per_match'],
        'home_team_xg_overperf': team1_stats['xg_overperf'],
        'home_team_yellow_cards': team1_stats['yellow_cards'],
        'home_team_avg_yellow_cards': team1_stats['avg_yellow_cards'],
        'home_team_clean_sheets': team1_stats['clean_sheets'],
        'home_team_clean_sheet_rate': team1_stats['clean_sheet_rate'],
        
        # Away team features
        'away_team_total_matches': team2_stats['total_matches'],
        'away_team_total_wins': team2_stats['total_wins'],
        'away_team_total_draws': team2_stats['total_draws'],
        'away_team_total_losses': team2_stats['total_losses'],
        'away_team_win_rate': team2_stats['win_rate'],
        'away_team_draw_rate': team2_stats['draw_rate'],
        'away_team_loss_rate': team2_stats['loss_rate'],
        'away_team_goals_scored': team2_stats['goals_scored'],
        'away_team_goals_conceded': team2_stats['goals_conceded'],
        'away_team_avg_goals_scored': team2_stats['avg_goals_scored'],
        'away_team_avg_goals_conceded': team2_stats['avg_goals_conceded'],
        'away_team_goal_diff_per_match': team2_stats['goal_diff_per_match'],
        'away_team_xg_for': team2_stats['xg_for'],
        'away_team_xg_against': team2_stats['xg_against'],
        'away_team_avg_xg_for': team2_stats['avg_xg_for'],
        'away_team_avg_xg_against': team2_stats['avg_xg_against'],
        'away_team_xg_diff_per_match': team2_stats['xg_diff_per_match'],
        'away_team_xg_overperf': team2_stats['xg_overperf'],
        'away_team_yellow_cards': team2_stats['yellow_cards'],
        'away_team_avg_yellow_cards': team2_stats['avg_yellow_cards'],
        'away_team_clean_sheets': team2_stats['clean_sheets'],
        'away_team_clean_sheet_rate': team2_stats['clean_sheet_rate'],
        
        # Difference features
        'goal_diff': team1_stats['avg_goals_scored'] - team2_stats['avg_goals_scored'],
        'def_diff': team1_stats['avg_goals_conceded'] - team2_stats['avg_goals_conceded'],
        'xg_for_diff': team1_stats['avg_xg_for'] - team2_stats['avg_xg_for'],
        'xg_against_diff': team1_stats['avg_xg_against'] - team2_stats['avg_xg_against'],
        'win_rate_diff': team1_stats['win_rate'] - team2_stats['win_rate'],
        
        # Encoded team names
        'home_team_encoded': team_encoder.transform([team1])[0],
        'away_team_encoded': team_encoder.transform([team2])[0],
    }
    
    # Convert to DataFrame with correct column order
    X = pd.DataFrame([features])
    
    # Ensure column order matches training
    expected_columns = [
        'year', 'home_team_elo', 'away_team_elo', 'elo_diff', 'home_qualified', 'away_qualified',
        'home_team_total_matches', 'home_team_total_wins', 'home_team_total_draws', 
        'home_team_total_losses', 'home_team_win_rate', 'home_team_draw_rate', 'home_team_loss_rate',
        'home_team_goals_scored', 'home_team_goals_conceded', 'home_team_avg_goals_scored',
        'home_team_avg_goals_conceded', 'home_team_goal_diff_per_match', 'home_team_xg_for',
        'home_team_xg_against', 'home_team_avg_xg_for', 'home_team_avg_xg_against',
        'home_team_xg_diff_per_match', 'home_team_xg_overperf', 'home_team_yellow_cards',
        'home_team_avg_yellow_cards', 'home_team_clean_sheets', 'home_team_clean_sheet_rate',
        'away_team_total_matches', 'away_team_total_wins', 'away_team_total_draws',
        'away_team_total_losses', 'away_team_win_rate', 'away_team_draw_rate', 'away_team_loss_rate',
        'away_team_goals_scored', 'away_team_goals_conceded', 'away_team_avg_goals_scored',
        'away_team_avg_goals_conceded', 'away_team_goal_diff_per_match', 'away_team_xg_for',
        'away_team_xg_against', 'away_team_avg_xg_for', 'away_team_avg_xg_against',
        'away_team_xg_diff_per_match', 'away_team_xg_overperf', 'away_team_yellow_cards',
        'away_team_avg_yellow_cards', 'away_team_clean_sheets', 'away_team_clean_sheet_rate',
        'goal_diff', 'def_diff', 'xg_for_diff', 'xg_against_diff', 'win_rate_diff',
        'home_team_encoded', 'away_team_encoded'
    ]
    
    X = X[expected_columns]
    
    # Scale if needed
    if scaler is not None:
        X = pd.DataFrame(scaler.transform(X), columns=X.columns)
    
    # Get probabilities
    probs = model.predict_proba(X)[0]
    
    return probs


## Function to simulate one match

In [24]:
def simulate_match(team1, team2, probabilities):
    """
    Simulates match result based on probabilities
    Returns: winner ('team1', 'team2', or 'draw')
    """
    # Assuming probabilities order: [Away Win, Draw, Home Win]
    outcome = np.random.choice(['team2', 'draw', 'team1'], p=probabilities)
    return outcome

## Simulation of whole group stage

In [26]:
def simulate_group_stage(groups, team_stats_df, model, scaler=None):
    """
    Simulates entire group stage
    Returns: dict with group standings
    """
    group_results = {}
    
    for group_name, teams in groups.items():
        # Initialize standings
        standings = {team: {'points': 0, 'gf': 0, 'ga': 0, 'gd': 0, 'wins': 0} 
                     for team in teams}
        
        # Get all matchups (6 matches per group)
        matchups = list(combinations(teams, 2))
        
        for team1, team2 in matchups:
            # Predict probabilities
            probs = predict_match(team1, team2, team_stats_df, model, scaler)
            
            # Simulate match
            result = simulate_match(team1, team2, probs)
            
            # Simulate goals based on result
            if result == 'team1':
                goals1, goals2 = np.random.randint(1, 4), np.random.randint(0, 2)
                standings[team1]['points'] += 3
                standings[team1]['wins'] += 1
            elif result == 'team2':
                goals1, goals2 = np.random.randint(0, 2), np.random.randint(1, 4)
                standings[team2]['points'] += 3
                standings[team2]['wins'] += 1
            else:  # draw
                goals1 = goals2 = np.random.randint(0, 3)
                standings[team1]['points'] += 1
                standings[team2]['points'] += 1
            
            # Update standings
            standings[team1]['gf'] += goals1
            standings[team1]['ga'] += goals2
            standings[team1]['gd'] = standings[team1]['gf'] - standings[team1]['ga']
            
            standings[team2]['gf'] += goals2
            standings[team2]['ga'] += goals1
            standings[team2]['gd'] = standings[team2]['gf'] - standings[team2]['ga']
        
        # Sort by points, then goal difference
        sorted_standings = sorted(standings.items(), 
                                  key=lambda x: (x[1]['points'], x[1]['gd'], x[1]['gf']), 
                                  reverse=True)
        
        group_results[group_name] = sorted_standings
    
    return group_results

## Find the two winners from the group stage and returning them

In [28]:
def get_knockout_teams(group_results):
    """
    Gets top 2 from each group (22 teams for 11 groups)
    Returns: list of qualified teams
    """
    qualified = []
    
    # Get top 2 from each group
    for group_name, standings in group_results.items():
        qualified.append(standings[0][0])  # 1st place
        qualified.append(standings[1][0])  # 2nd place
    
    return qualified


## Function to simulate a match in the knockout

In [30]:
def simulate_single_knockout_match(team1, team2, team_stats_df, model, scaler=None):
    """
    Simulates single knockout match (no draws, go to penalties if needed)
    """
    probs = predict_match(team1, team2, team_stats_df, model, scaler)
    result = simulate_match(team1, team2, probs)
    
    # If draw, 50/50 penalty shootout
    if result == 'draw':
        result = np.random.choice(['team1', 'team2'])
    
    return team1 if result == 'team1' else team2


## Simulation of the whole knockout stage and finding the winner

In [32]:
def simulate_knockout_round(teams, team_stats_df, model, scaler=None):
    """Simulates one knockout round"""
    winners = []
    for i in range(0, len(teams), 2):
        team1, team2 = teams[i], teams[i+1]
        winner = simulate_single_knockout_match(team1, team2, team_stats_df, model, scaler)
        winners.append(winner)
    return winners


def simulate_knockout_stage(teams, team_stats_df, model, scaler=None):
    """
    Simulates knockout stage
    Returns: (champion, runner_up, third_place)
    """
    current_round = teams.copy()
    
    # Continue knockout rounds until we have 4 teams (semi-finals)
    while len(current_round) > 4:
        current_round = simulate_knockout_round(current_round, team_stats_df, model, scaler)
    
    # Semi-finals
    semi_winners = simulate_knockout_round(current_round, team_stats_df, model, scaler)
    semi_losers = [t for t in current_round if t not in semi_winners]
    
    # Third place match
    third_place = simulate_single_knockout_match(semi_losers[0], semi_losers[1], 
                                                   team_stats_df, model, scaler)
    
    # Final
    champion = simulate_single_knockout_match(semi_winners[0], semi_winners[1], 
                                              team_stats_df, model, scaler)
    runner_up = [t for t in semi_winners if t != champion][0]
    
    return champion, runner_up, third_place


## Full tournament simulation using previous functions

In [34]:
def simulate_tournament(groups, team_stats_df, model, scaler=None):
    """
    Simulates entire WC2026
    Returns: (champion, runner_up, third_place)
    """
    # Group stage
    group_results = simulate_group_stage(groups, team_stats_df, model, scaler)
    
    # Get qualified teams
    knockout_teams = get_knockout_teams(group_results)
    
    # Knockout stage
    champion, runner_up, third_place = simulate_knockout_stage(knockout_teams, team_stats_df, model, scaler)
    
    return champion, runner_up, third_place

## Monte carlo simulation (Monte Carlo simulation is a method that uses repeated random sampling to estimate the probability of different outcomes in a process that involves uncertainty.)

In [36]:
def run_monte_carlo(groups, team_stats_df, model, n_simulations=1, scaler=None):
    """
    Runs tournament simulation N times
    Returns: probability distributions
    """
    results = {
        'champion': {},
        'runner_up': {},
        'third_place': {}
    }
    
    print(f"Running {n_simulations} WC2026 simulations...\n")
    
    for i in range(n_simulations):
        if (i + 1) % 1000 == 0:
            print(f" Completed {i + 1}/{n_simulations}")
        
        champion, runner_up, third = simulate_tournament(groups, team_stats_df, model, scaler)
        
        # Count results
        results['champion'][champion] = results['champion'].get(champion, 0) + 1
        results['runner_up'][runner_up] = results['runner_up'].get(runner_up, 0) + 1
        results['third_place'][third] = results['third_place'].get(third, 0) + 1
    
    # Convert to probabilities
    for category in results:
        for team in results[category]:
            results[category][team] = results[category][team] / n_simulations * 100
    
    return results

## Function to display the results

In [38]:
def display_results(results):
    """Display tournament prediction results"""
    
    print("\n" + "="*60)
    print(" FIFA WORLD CUP 2026 PREDICTIONS ")
    print("="*60)
    
    # Top 15 favorites to win
    champion_probs = sorted(results['champion'].items(), key=lambda x: x[1], reverse=True)
    print("\n TOP 15 FAVORITES TO WIN:")
    print("-" * 60)
    for i, (team, prob) in enumerate(champion_probs[:15], 1):
        bar = "█" * int(prob / 2)
        print(f"{i:2d}. {team.title():20s} {prob:5.2f}% {bar}")
    
    # Runner-up probabilities
    print("\n TOP 10 RUNNER-UP PROBABILITIES:")
    print("-" * 60)
    runner_up_probs = sorted(results['runner_up'].items(), key=lambda x: x[1], reverse=True)
    for i, (team, prob) in enumerate(runner_up_probs[:10], 1):
        print(f"{i:2d}. {team.title():20s} {prob:5.2f}%")
    
    # Third place probabilities
    print("\n TOP 10 THIRD-PLACE PROBABILITIES:")
    print("-" * 60)
    third_probs = sorted(results['third_place'].items(), key=lambda x: x[1], reverse=True)
    for i, (team, prob) in enumerate(third_probs[:10], 1):
        print(f"{i:2d}. {team.title():20s} {prob:5.2f}%")
    
    print("\n" + "="*60)


# Run Simulation

In [40]:
if __name__ == "__main__":

    groups = {
        'A': ['brazil', 'denmark', 'egypt', 'uruguay'],
        'B': ['argentina', 'italy', 'morocco', 'panama'],
        'C': ['france', 'croatia', 'senegal', 'germany'],
        'D': ['spain', 'mexico', 'south korea', 'canada'],
        'E': ['england', 'poland', 'japan', 'qatar'],
        'F': ['belgium', 'colombia', 'ghana', 'jordan'],
        'G': ['netherlands', 'united states', 'austria', 'cape verde'],
        'H': ['portugal', 'turkey', 'south africa', 'uzbekistan']
    }
    
    # Run Monte Carlo simulation 100 times
    results = run_monte_carlo(groups, qualified_with_stats, model, n_simulations=1000, scaler=scaler)
    
    # Display results
    display_results(results)
    
    # Save results
    results_df = pd.DataFrame({
        'Team': list(results['champion'].keys()),
        'Champion_Probability': [results['champion'].get(t, 0) for t in results['champion'].keys()],
        'RunnerUp_Probability': [results['runner_up'].get(t, 0) for t in results['champion'].keys()],
        'Third_Probability': [results['third_place'].get(t, 0) for t in results['champion'].keys()]
    }).sort_values('Champion_Probability', ascending=False)
    
    results_df.to_csv('wc2026_results.csv', index=False)
    print("\n Results saved to 'wc2026_predictions.csv'")

Running 1000 WC2026 simulations...

 Completed 1000/1000

 FIFA WORLD CUP 2026 PREDICTIONS 

 TOP 15 FAVORITES TO WIN:
------------------------------------------------------------
 1. Poland               11.00% █████
 2. South Africa          9.60% ████
 3. Italy                 8.00% ████
 4. Ghana                 7.50% ███
 5. Canada                6.90% ███
 6. Morocco               6.30% ███
 7. Belgium               4.80% ██
 8. Denmark               4.40% ██
 9. Mexico                3.50% █
10. South Korea           3.30% █
11. Egypt                 3.30% █
12. United States         3.10% █
13. Croatia               3.00% █
14. Uruguay               2.40% █
15. France                2.20% █

 TOP 10 RUNNER-UP PROBABILITIES:
------------------------------------------------------------
 1. Canada                8.00%
 2. Italy                 7.80%
 3. Poland                7.50%
 4. Ghana                 6.20%
 5. Croatia               5.00%
 6. South Africa          4.90%
 7. B