# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import os
import itertools

# -------------------------
# Folder for charts
# exist_ok=True makes this a no-op when the folder already exists and removes
# the check-then-create race of a separate os.path.exists() test.
os.makedirs("charts", exist_ok=True)

# -------------------------
# Load historical dataset
# Columns the model is trained on. The same list selects the inputs for every
# later prediction, so its order is part of the model's contract.
features = [
    "home_host_advantage",
    "home_strength",
    "away_strength",
    "home_recent_form",
    "away_recent_form",
    "home_h2h",
    "away_h2h",
    "stage",
]
df = pd.read_csv("data/processed/fifa_wc_features_enhanced.csv")
X = df[features]
y = df["match_outcome"]

# Encode the outcome labels as integer class ids; `le` is kept so that
# predictions can be mapped back to the original labels.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train RF on all historical data (every row is used for fitting)
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    random_state=42,
).fit(X, y_encoded)

# -------------------------
# 32 teams for 2026
# Laid out four per row; the order of this list is significant because the
# ratings below are assigned positionally.
teams = [
    "Brazil", "Argentina", "Germany", "France",
    "Spain", "Portugal", "England", "Belgium",
    "Netherlands", "Italy", "Croatia", "Uruguay",
    "Switzerland", "Denmark", "Mexico", "USA",
    "Senegal", "Japan", "South Korea", "Australia",
    "Morocco", "Tunisia", "Cameroon", "Ghana",
    "Canada", "Poland", "Serbia", "Wales",
    "Costa Rica", "Ecuador", "Iran", "Saudi Arabia",
]

# Ratings are seeded random draws, not measured values. Strength is drawn
# before recent form so the seeded sequence is consumed in a fixed order.
np.random.seed(42)
team_stats = pd.DataFrame({"team": teams}).assign(
    strength=np.random.uniform(0.5, 0.9, len(teams)),
    recent_form=np.random.uniform(0.5, 0.85, len(teams)),
)

# -------------------------
# Generate group stage (8 groups of 4)
groups = [teams[start:start + 4] for start in range(0, 32, 4)]

# Round-robin inside each group: every pair meets once, and the team that
# appears earlier in the group is recorded as the home side.
group_matches = []
for group in groups:
    group_matches.extend(
        {"home_team": home_name, "away_team": away_name, "stage": 1}
        for home_name, away_name in itertools.combinations(group, 2)
    )

future_matches = pd.DataFrame(group_matches)

# Merge team stats: attach the ratings once for the home side, then once for
# the away side, prefixing the rating columns accordingly.
for side in ("home", "away"):
    side_stats = team_stats.rename(columns={
        "team": f"{side}_team",
        "strength": f"{side}_strength",
        "recent_form": f"{side}_recent_form",
    })
    future_matches = future_matches.merge(side_stats, on=f"{side}_team")

# Remaining model inputs are fixed constants for every simulated match.
future_matches = future_matches.assign(
    home_host_advantage=0,
    home_h2h=1,
    away_h2h=0,
)
future_matches["match"] = future_matches["home_team"] + " vs " + future_matches["away_team"]

# -------------------------
# Predict match outcomes
X_future = future_matches[features]
y_pred = rf.predict(X_future)
y_proba = rf.predict_proba(X_future)

# Convert the predicted class ids back to the original outcome labels.
future_matches["predicted_outcome"] = le.inverse_transform(y_pred)

# NOTE(review): probability columns 2 / 1 / 0 are read as home win / draw /
# away win. predict_proba orders its columns like rf.classes_, so this
# presumably relies on the encoded labels sorting as away < draw < home
# (e.g. "A" < "D" < "H") - confirm against le.classes_.
for prob_column, class_position in (("prob_H", 2), ("prob_D", 1), ("prob_A", 0)):
    future_matches[prob_column] = y_proba[:, class_position].astype(float)

# -------------------------
# Pie charts per match
# One PNG per group-stage match, named after the two teams.
pie_labels = ["Home Win", "Draw", "Away Win"]
pie_colors = ["#4daf4a", "#377eb8", "#e41a1c"]
for fixture in future_matches.itertuples(index=False):
    shares = np.array([fixture.prob_H, fixture.prob_D, fixture.prob_A], dtype=float)
    shares /= shares.sum()  # renormalise so the three slices sum to exactly 1
    fig, ax = plt.subplots(figsize=(4, 4))
    ax.set_title(fixture.match)
    ax.pie(shares, labels=pie_labels, autopct='%1.1f%%', colors=pie_colors)
    fig.savefig(f"charts/pie_{fixture.home_team}_vs_{fixture.away_team}.png")
    plt.close(fig)  # release the figure before creating the next one

# -------------------------
# Bar chart group stage
# Stacked bars: each layer starts where the running total of the previous
# layers ends, so every bar shows home/draw/away shares for one match.
fig, ax = plt.subplots(figsize=(12, 5))
stack_base = 0
for prob_column, layer_label, layer_color in (
    ("prob_H", "Home Win", "#4daf4a"),
    ("prob_D", "Draw", "#377eb8"),
    ("prob_A", "Away Win", "#e41a1c"),
):
    ax.bar(
        future_matches["match"],
        future_matches[prob_column],
        bottom=stack_base,
        label=layer_label,
        color=layer_color,
    )
    stack_base = stack_base + future_matches[prob_column]

ax.set_ylabel("Probability")
ax.set_title("Group Stage Match Outcome Probabilities")
plt.setp(ax.get_xticklabels(), rotation=90)
ax.legend()
fig.tight_layout()
fig.savefig("charts/bar_group_stage_probabilities.png")
plt.close(fig)

# -------------------------
# Simulate group stage results: 2 teams advance per group
# group_winners ends up as [winner, runner-up] for each group, in group order.
group_winners = []
for group in groups:
    involves_group = (
        future_matches["home_team"].isin(group)
        | future_matches["away_team"].isin(group)
    )
    group_df = future_matches[involves_group].copy()
    points = dict.fromkeys(group, 0)
    for fixture in group_df.itertuples(index=False):
        probs = np.array([fixture.prob_H, fixture.prob_D, fixture.prob_A], dtype=float)
        probs /= probs.sum()
        # Sample one result per match from the predicted distribution.
        outcome = np.random.choice(["H", "D", "A"], p=probs)
        if outcome == "D":
            # Draw: one point each.
            points[fixture.home_team] += 1
            points[fixture.away_team] += 1
        else:
            # Win: three points to whichever side the sample picked.
            victor = fixture.home_team if outcome == "H" else fixture.away_team
            points[victor] += 3
    # Rank on points alone; the sort is stable, so teams level on points keep
    # their original order within the group.
    ranking = sorted(points, key=points.get, reverse=True)
    group_winners.extend(ranking[:2])

# -------------------------
# Knockout stage simulation (Round of 16 → Final)
def simulate_knockout(teams_list, stage_name):
    """Simulate one single-elimination round and return the teams that advance.

    Teams are paired in list order (entries 0 and 1, 2 and 3, ...); the first
    team of each pair is treated as the home side. Outcome probabilities come
    from the module-level ``rf`` model, using ratings looked up in the
    module-level ``team_stats`` table, and one outcome per match is sampled
    with ``np.random.choice``. A sampled draw is resolved in favour of the
    side with the higher ``strength`` (the home side if both are equal).

    Args:
        teams_list: Team names in bracket order. Must contain an even number
            of entries, all present in ``team_stats["team"]``.
        stage_name: Value stored in the ``stage`` feature of every match in
            this round.

    Returns:
        A ``(winners, matches)`` tuple. ``winners`` lists the advancing team
        of each pair, in pairing order. ``matches`` holds one dict per match
        with both team names, the model features and ``predicted_winner``.
        Both lists are empty when ``teams_list`` is empty.

    Raises:
        ValueError: If ``teams_list`` has an odd length or names a team that
            is missing from ``team_stats``.
    """
    teams_list = list(teams_list)
    if not teams_list:
        return [], []
    # Validate up front so a bad bracket fails before any random draws are
    # consumed, instead of crashing half-way through the round.
    if len(teams_list) % 2:
        raise ValueError(
            f"simulate_knockout needs an even number of teams, got {len(teams_list)}"
        )

    # Index the ratings once rather than scanning the frame for every team;
    # keeping the first row per name mirrors a first-match lookup.
    stats_by_team = team_stats.drop_duplicates("team").set_index("team")
    unknown = [name for name in teams_list if name not in stats_by_team.index]
    if unknown:
        raise ValueError(f"No team_stats entry for: {unknown}")

    matches = []
    for home, away in zip(teams_list[::2], teams_list[1::2]):
        home_stats = stats_by_team.loc[home]
        away_stats = stats_by_team.loc[away]
        matches.append({
            "home_team": home, "away_team": away, "stage": stage_name,
            "home_strength": home_stats["strength"],
            "away_strength": away_stats["strength"],
            "home_recent_form": home_stats["recent_form"],
            "away_recent_form": away_stats["recent_form"],
            "home_host_advantage": 0, "home_h2h": 1, "away_h2h": 0,
        })

    # One model call for the whole round instead of one per match.
    round_proba = rf.predict_proba(pd.DataFrame(matches)[features])

    winners = []
    for match, class_proba in zip(matches, round_proba):
        # NOTE(review): columns 2 / 1 / 0 are read as home win / draw / away
        # win; this presumably relies on the encoded labels sorting as
        # away < draw < home - confirm against the fitted classes.
        probs = np.array([class_proba[2], class_proba[1], class_proba[0]], dtype=float)
        probs /= probs.sum()
        outcome = np.random.choice(["H", "D", "A"], p=probs)
        if outcome == "H":
            winner = match["home_team"]
        elif outcome == "A":
            winner = match["away_team"]
        elif match["home_strength"] >= match["away_strength"]:
            # A knockout tie needs a winner: a sampled draw goes to the
            # stronger side, and to the home side when strengths are equal.
            winner = match["home_team"]
        else:
            winner = match["away_team"]
        winners.append(winner)
        match["predicted_winner"] = winner
    return winners, matches

# Round of 16
# group_winners holds [winner, runner-up] for each group in group order, and
# simulate_knockout pairs consecutive entries. Feeding the list in unchanged
# would make every group winner replay its own runner-up, so adjacent groups
# are cross-paired instead (1A v 2B, 1B v 2A). The two ties of each group pair
# go into opposite halves of the bracket, so teams from the same group can
# only meet again in the final.
top_half = []
bottom_half = []
for start in range(0, len(group_winners), 4):
    first_a, second_a, first_b, second_b = group_winners[start:start + 4]
    top_half += [first_a, second_b]
    bottom_half += [first_b, second_a]
r16_teams = top_half + bottom_half
r16_winners, r16_matches = simulate_knockout(r16_teams, stage_name=2)

# Quarterfinals
qf_winners, qf_matches = simulate_knockout(r16_winners, stage_name=3)

# Semifinals
sf_winners, sf_matches = simulate_knockout(qf_winners, stage_name=4)

# Final
# simulate_knockout returns lists, so the champion is the only entry of the
# final's winners list.
final_winner, final_match = simulate_knockout(sf_winners, stage_name=5)
champion = final_winner[0]

# -------------------------
# Report champion (printed only; nothing is written to disk here)
print(f"Predicted 2026 FIFA World Cup Champion: {champion}")

# -------------------------
# Save tournament winner distribution chart
# One entry per knockout match won, so a team is counted once for every
# round it got through.
all_winners = [*r16_winners, *qf_winners, *sf_winners, *final_winner]
fig, ax = plt.subplots(figsize=(12, 4))
sns.countplot(x=all_winners, palette="Set3", ax=ax)
ax.set_title("Knockout Stage Winners Distribution")
ax.set_ylabel("Count")
plt.setp(ax.get_xticklabels(), rotation=90)
fig.tight_layout()
fig.savefig("charts/knockout_winners_distribution.png")
plt.close(fig)

print("All charts saved in 'charts/' folder")
