In [None]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load clean dataset
df = pd.read_csv("data/processed/fifa_wc_features.csv")

# The rolling features computed further down (recent form, head-to-head) walk
# the rows top to bottom and treat every earlier row as "the past", so make
# sure tournaments are in chronological order before anything else runs.
# The sort is stable: the file's existing match order inside each tournament
# is preserved, and if the CSV is already ordered by year this is a no-op.
df = df.sort_values("world_cup_year", kind="stable").reset_index(drop=True)

# -------------------------
# Step 1: Match stage based on goal difference
# Every match is stage 1 unless its absolute goal difference exceeds 3, in
# which case it is stage 2 (rough knockout approximation).
# NOTE(review): goal_diff presumably is this match's own final margin. If so,
# "stage" is only known after the match and leaks the result when used as a
# predictor; confirm before relying on it.
is_wide_margin = df["goal_diff"].abs().gt(3)
df["stage"] = is_wide_margin.astype("int64") + 1

# -------------------------
# Step 2: Recent form (last 3 matches)
# A team's form is the mean of its last three results (win = 1, draw = 0.5,
# loss = 0). Teams that have not been seen yet start from three neutral 0.5
# results. Each row receives the form as it stood *before* that match; the
# match's own result is pushed into the window only afterwards.
NEUTRAL_WINDOW = [0.5, 0.5, 0.5]  # never mutated: windows are rebuilt with `+`
POINTS_BY_OUTCOME = {"H": (1, 0), "A": (0, 1)}  # (home, away); anything else is a draw

team_form = {}
home_form_values, away_form_values = [], []

for home, away, outcome in zip(df["home_team"], df["away_team"], df["match_outcome"]):
    home_window = team_form.get(home, NEUTRAL_WINDOW)
    away_window = team_form.get(away, NEUTRAL_WINDOW)
    home_form_values.append(sum(home_window) / len(home_window))
    away_form_values.append(sum(away_window) / len(away_window))

    home_points, away_points = POINTS_BY_OUTCOME.get(outcome, (0.5, 0.5))
    # Keep only the three most recent results per team.
    team_form[home] = (home_window + [home_points])[-3:]
    team_form[away] = (away_window + [away_points])[-3:]

df["home_recent_form"] = pd.Series(home_form_values, index=df.index, dtype="float64")
df["away_recent_form"] = pd.Series(away_form_values, index=df.index, dtype="float64")

# -------------------------
# Step 3: Head-to-head
# For every fixture, record how many earlier meetings between the same two
# teams each side has won. Wins are tallied per *team name*, not per home/away
# slot: the pair key is order-independent, so a role-based tally would hand a
# team its opponent's wins whenever the fixture is replayed with roles swapped.
h2h = {}  # sorted (team, team) tuple -> {team_name: wins against the other team}
home_h2h, away_h2h = [], []

for idx, row in df.iterrows():
    home, away = row["home_team"], row["away_team"]
    pair = tuple(sorted([home, away]))
    wins = h2h.setdefault(pair, {home: 0, away: 0})
    # Snapshot the tallies before counting this match, so a row never sees
    # its own result.
    home_h2h.append(wins[home])
    away_h2h.append(wins[away])
    if row["match_outcome"]=="H":
        wins[home] += 1
    elif row["match_outcome"]=="A":
        wins[away] += 1

df["home_h2h"] = home_h2h
df["away_h2h"] = away_h2h

# -------------------------
# Step 4: Features and target
# "stage" is deliberately NOT a predictor: it is computed earlier in this cell
# from goal_diff, which (assuming goal_diff is the final margin of the same
# match; TODO confirm) is only known once the match is over. A stage of 2
# would tell the model the match was not a draw, i.e. leak the target.
# The column is still kept in df and in the saved CSV for reference.
features = ["home_host_advantage","home_strength","away_strength",
            "home_recent_form","away_recent_form","home_h2h","away_h2h"]
X = df[features]
y = df["match_outcome"]

# Fit the encoder on every outcome in the dataset so the train and test
# targets share a single label -> integer mapping.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train-test split by tournament: up to 2014 for training, 2018 onwards for
# testing, so the model is always evaluated on tournaments it has not seen.
train_df = df[df["world_cup_year"] <= 2014]
test_df = df[df["world_cup_year"] >= 2018]

X_train = train_df[features]
y_train = le.transform(train_df["match_outcome"])
X_test = test_df[features]
y_test = le.transform(test_df["match_outcome"])

# -------------------------
# Step 5: Random Forest
rf = RandomForestClassifier(n_estimators=300, max_depth=8, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Pin the label set to every class the encoder knows about. Without it,
# classification_report raises a ValueError when some class appears in neither
# y_test nor the predictions (its target_names would no longer line up), and
# the confusion matrix would silently drop that class's row and column.
class_ids = list(range(len(le.classes_)))

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf, labels=class_ids))
print(classification_report(y_test, y_pred_rf, labels=class_ids, target_names=le.classes_))

# -------------------------
# Save enhanced dataset for reference
# Writes df, including the columns engineered in this cell, without the
# DataFrame index.
enhanced_csv_path = "data/processed/fifa_wc_features_enhanced.csv"
df.to_csv(enhanced_csv_path, index=False)
print("Enhanced ML-ready dataset saved as fifa_wc_features_enhanced.csv")


Random Forest Accuracy: 0.46875
Confusion Matrix:
 [[15  4 28]
 [ 9  2 17]
 [ 8  2 43]]
              precision    recall  f1-score   support

           A       0.47      0.32      0.38        47
           D       0.25      0.07      0.11        28
           H       0.49      0.81      0.61        53

    accuracy                           0.47       128
   macro avg       0.40      0.40      0.37       128
weighted avg       0.43      0.47      0.42       128

Enhanced ML-ready dataset saved as fifa_wc_features_enhanced.csv
