In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import os
import itertools

# -------------------------
# Ensure charts folder exists
if not os.path.exists("charts"):
    os.makedirs("charts")

# -------------------------
# Load enhanced dataset
df = pd.read_csv("data/processed/fifa_wc_features_enhanced.csv")

features = ["home_host_advantage","home_strength","away_strength",
            "home_recent_form","away_recent_form","home_h2h","away_h2h","stage"]
X = df[features]
y = df["match_outcome"]
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=300, max_depth=8, random_state=42)
rf.fit(X, y_encoded)

# -------------------------
# Deployment-ready: Define 32 teams
teams = [
    "Brazil","Argentina","Germany","France","Spain","Portugal","England","Belgium",
    "Netherlands","Italy","Croatia","Uruguay","Switzerland","Denmark","Mexico","USA",
    "Senegal","Japan","South Korea","Australia","Morocco","Tunisia","Cameroon","Ghana",
    "Canada","Poland","Serbia","Wales","Costa Rica","Ecuador","Iran","Saudi Arabia"
]

# Assign random strengths and recent form for simulation
np.random.seed(42)
team_stats = pd.DataFrame({
    "team": teams,
    "strength": np.random.uniform(0.5,0.9,len(teams)),
    "recent_form": np.random.uniform(0.5,0.85,len(teams))
})

# -------------------------
# Generate group stage matches (groups of 4)
groups = [teams[i:i+4] for i in range(0,32,4)]
group_matches = []
for group in groups:
    for home, away in itertools.combinations(group, 2):
        group_matches.append({"home_team": home, "away_team": away, "stage": 1})

future_matches = pd.DataFrame(group_matches)

# Merge strength and recent form
future_matches = future_matches.merge(team_stats.rename(columns={"team":"home_team","strength":"home_strength","recent_form":"home_recent_form"}), on="home_team")
future_matches = future_matches.merge(team_stats.rename(columns={"team":"away_team","strength":"away_strength","recent_form":"away_recent_form"}), on="away_team")

# Set dummy values for home_host_advantage and h2h
future_matches["home_host_advantage"] = 0
future_matches["home_h2h"] = 1
future_matches["away_h2h"] = 0

# -------------------------
# Predict outcomes
X_future = future_matches[features]
y_pred = rf.predict(X_future)
y_proba = rf.predict_proba(X_future)

future_matches["predicted_outcome"] = le.inverse_transform(y_pred)
future_matches["prob_H"] = y_proba[:,2].astype(float)
future_matches["prob_D"] = y_proba[:,1].astype(float)
future_matches["prob_A"] = y_proba[:,0].astype(float)
future_matches["match"] = future_matches["home_team"] + " vs " + future_matches["away_team"]

# -------------------------
# Pie charts for each match
for idx, row in future_matches.iterrows():
    plt.figure(figsize=(4,4))
    plt.title(row["match"])
    probs = np.array([row["prob_H"], row["prob_D"], row["prob_A"]], dtype=float)
    probs /= probs.sum()
    plt.pie(probs, labels=["Home Win","Draw","Away Win"], autopct='%1.1f%%',
            colors=["#4daf4a","#377eb8","#e41a1c"])
    plt.savefig(f"charts/pie_{row['home_team']}_vs_{row['away_team']}.png")
    plt.close()

# -------------------------
# Bar chart of probabilities
plt.figure(figsize=(12,5))
plt.bar(future_matches["match"], future_matches["prob_H"], label="Home Win", color="#4daf4a")
plt.bar(future_matches["match"], future_matches["prob_D"], bottom=future_matches["prob_H"], label="Draw", color="#377eb8")
plt.bar(future_matches["match"], future_matches["prob_A"], 
        bottom=future_matches["prob_H"]+future_matches["prob_D"], label="Away Win", color="#e41a1c")
plt.ylabel("Probability")
plt.title("Group Stage Match Outcome Probabilities")
plt.xticks(rotation=90)
plt.legend()
plt.tight_layout()
plt.savefig("charts/bar_group_stage_probabilities.png")
plt.close()

# -------------------------
# Simulate winners (simple example: top team per match)
winners = []
for idx, row in future_matches.iterrows():
    probs = np.array([row["prob_H"], row["prob_D"], row["prob_A"]], dtype=float)
    probs /= probs.sum()
    outcome = np.random.choice(["H","D","A"], p=probs)
    if outcome=="H":
        winners.append(row["home_team"])
    elif outcome=="A":
        winners.append(row["away_team"])
    else:
        winner = row["home_team"] if row["home_strength"]>=row["away_strength"] else row["away_team"]
        winners.append(winner)

plt.figure(figsize=(10,4))
sns.countplot(x=winners, palette="Set2")
plt.title("Simulated Group Stage Winners")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig("charts/simulated_group_winners.png")
plt.close()

print("All group stage predictions and charts saved in 'charts/' folder")


All group stage predictions and charts saved in 'charts/' folder



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x=winners, palette="Set2")
