# Football Match Predictor & API
This notebook contains the workflow to train a Machine Learning model for football match prediction and serve it via a FastAPI backend.

## Structure:
1. **Data Loading & Preprocessing**: Load historical match data.
2. **Feature Engineering**: Create features like `strength`, `form`, `h2h`.
3. **Model Training**: Train a `RandomForestClassifier`.
4. **Model Saving**: Save the trained model and feature maps as pickle files.
5. **API Serving**: Run a FastAPI server to serve predictions.

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import nest_asyncio
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware

# Apply nest_asyncio to allow running uvicorn in a notebook
nest_asyncio.apply()

In [2]:
# 2) Daten laden
# Adjust path if necessary for your environment
path = "C:/Users/taaacya1/ML-datasets/archive/results.csv" 

if not os.path.exists(path):
    print(f"⚠️ Warning: File not found at {path}. Please check the path.")
    # Create a dummy dataframe if file is missing to prevent errors in next cells (for demo purposes)
    # In production, you should handle this properly.
    data = {
        'date': pd.date_range(start='2020-01-01', periods=100),
        'home_team': np.random.choice(['Switzerland', 'Germany', 'France', 'Italy'], 100),
        'away_team': np.random.choice(['Switzerland', 'Germany', 'France', 'Italy'], 100),
        'home_score': np.random.randint(0, 5, 100),
        'away_score': np.random.randint(0, 5, 100)
    }
    df = pd.DataFrame(data)
else:
    df = pd.read_csv(path)

# 3) Ergebnis-Spalte berechnen
df["result"] = np.where(
    df["home_score"] > df["away_score"], "home_win",
    np.where(df["home_score"] < df["away_score"], "away_win", "draw")
)

# 4) Datum umwandeln und nach Zeit sortieren
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df = df.sort_values("date").reset_index(drop=True)

# 5) Split in Trainings- und Validierungsdaten (zeitbasiert)
split_index = int(len(df) * 0.8)
train_df = df.iloc[:split_index].copy()
val_df   = df.iloc[split_index:].copy()

In [3]:
# 6) Teamstärke berechnen (Siegquote pro Team)
home_wins_tr = train_df[train_df["result"] == "home_win"].groupby("home_team").size()
away_wins_tr = train_df[train_df["result"] == "away_win"].groupby("away_team").size()
total_games_tr = train_df["home_team"].value_counts() + train_df["away_team"].value_counts()
win_rate_tr = (home_wins_tr.add(away_wins_tr, fill_value=0) / total_games_tr).fillna(0)
mean_wr_tr = float(win_rate_tr.mean())

# 7) Head-to-Head berechnen
matchups = train_df.groupby(["home_team", "away_team"])["result"].value_counts().unstack(fill_value=0)
# Handle case where 'home_win' column might be missing if no home wins in subset
if "home_win" in matchups.columns:
    matchups["home_win_rate"] = matchups["home_win"] / matchups.sum(axis=1)
else:
    matchups["home_win_rate"] = 0.0

h2h_map = matchups["home_win_rate"]
mean_h2h_tr = float(h2h_map.mean())

# 8) Funktion für bidirektionales Head-to-Head (wird für API benötigt)
def _get_h2h(home, away):
    if (home, away) in h2h_map.index:
        return float(h2h_map.loc[(home, away)])
    elif (away, home) in h2h_map.index:
        return float(1.0 - h2h_map.loc[(away, home)])
    else:
        return mean_h2h_tr

# 9) Form der letzten 5 Spiele berechnen
tmp_home = train_df[["date", "home_team", "result"]].copy()
tmp_home["win_flag"] = np.where(tmp_home["result"] == "home_win", 1,
                                np.where(tmp_home["result"] == "draw", 0.5, 0))
tmp_home = tmp_home.sort_values(["home_team", "date"])
form_home_series = (
    tmp_home.groupby("home_team")["win_flag"]
    .rolling(window=5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)
train_df = train_df.sort_values("date").reset_index(drop=True)
tmp_home = tmp_home.sort_values("date").reset_index(drop=True)
train_df["form_home"] = form_home_series.values

tmp_away = train_df[["date", "away_team", "result"]].copy()
tmp_away["win_flag"] = np.where(tmp_away["result"] == "away_win", 1,
                                np.where(tmp_away["result"] == "draw", 0.5, 0))
tmp_away = tmp_away.sort_values(["away_team", "date"])
form_away_series = (
    tmp_away.groupby("away_team")["win_flag"]
    .rolling(window=5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)
tmp_away = tmp_away.sort_values("date").reset_index(drop=True)
train_df["form_away"] = form_away_series.values

form_home_map = train_df.groupby("home_team")["form_home"].mean().fillna(0.5)
form_away_map = train_df.groupby("away_team")["form_away"].mean().fillna(0.5)

# 10) Features auf gesamtes DF anwenden
df["home_strength"] = df["home_team"].map(win_rate_tr).fillna(mean_wr_tr)
df["away_strength"] = df["away_team"].map(win_rate_tr).fillna(mean_wr_tr)

# OPTIMIZED: Vectorized H2H calculation (replaces slow apply)
h2h_lookup = h2h_map.reset_index(name="h2h_val")
# Merge direct (Home vs Away)
df = df.merge(h2h_lookup, on=["home_team", "away_team"], how="left").rename(columns={"h2h_val": "h2h_direct"})
# Merge reverse (Away vs Home)
h2h_lookup_rev = h2h_lookup.rename(columns={"home_team": "away_team", "away_team": "home_team", "h2h_val": "h2h_rev"})
df = df.merge(h2h_lookup_rev, on=["home_team", "away_team"], how="left")
# Combine logic
df["h2h_strength"] = df["h2h_direct"]
mask_nan = df["h2h_strength"].isna()
df.loc[mask_nan, "h2h_strength"] = 1.0 - df.loc[mask_nan, "h2h_rev"]
df["h2h_strength"] = df["h2h_strength"].fillna(mean_h2h_tr)
# Cleanup
df.drop(columns=["h2h_direct", "h2h_rev"], inplace=True)

df["form_home"] = df["home_team"].map(form_home_map).fillna(0.5)
df["form_away"] = df["away_team"].map(form_away_map).fillna(0.5)

# 11) Feature-Matrix X und Zielvariable y erstellen
y = df["result"]
X = pd.get_dummies(df[["home_team", "away_team"]], drop_first=False)
X["home_strength"] = df["home_strength"]
X["away_strength"] = df["away_strength"]
X["h2h_strength"]  = df["h2h_strength"]
X["form_home"]     = df["form_home"]
X["form_away"]     = df["form_away"]
X["strength_diff"] = X["home_strength"] - X["away_strength"]
X["form_diff"]     = X["form_home"] - X["form_away"]

# 12) Split in Train/Test
train_X, val_X = X.iloc[:split_index], X.iloc[split_index:]
train_y, val_y = y.iloc[:split_index], y.iloc[split_index:]

In [4]:
from sklearn.ensemble import RandomForestRegressor

# 13) Modell trainieren
df_model = RandomForestClassifier(random_state=1)
df_model.fit(train_X, train_y)

# 14) Bewertung
val_pred = df_model.predict(val_X)
acc = accuracy_score(val_y, val_pred)
cm = confusion_matrix(val_y, val_pred, labels=["home_win", "draw", "away_win"])

print("Validation Accuracy for Random Forest Model:", round(acc, 3))
print("Confusion matrix (rows=true, cols=pred):")
print(cm)

Validation Accuracy for Random Forest Model: 0.515
Confusion matrix (rows=true, cols=pred):
[[3638  499  546]
 [1416  310  512]
 [1368  394 1087]]


In [5]:
# ==========================================
# 4. SAVE MODEL & ARTIFACTS (Optional)
# ==========================================
# Since we are running the API in the same notebook, we can use the variables directly.
# If you want to persist them:
# import pickle
# with open("model.pkl", "wb") as f: pickle.dump(df_model, f)
# with open("win_rate_tr.pkl", "wb") as f: pickle.dump(win_rate_tr, f)
# ...

In [6]:
# ==========================================
# 5. FASTAPI APP & PREDICTION LOGIC
# ==========================================
import requests
from datetime import datetime, timedelta
import random

app = FastAPI(title="Match Predictor API")

# Enable CORS for Frontend
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

class MatchInput(BaseModel):
    home_team: str
    away_team: str

# 15) Prediction-Funktion (Adapted for API)
X_columns = X.columns 

def _make_row(home, away):
    row = pd.DataFrame(0, index=[0], columns=X_columns)
    h_col = f"home_team_{home}"
    a_col = f"away_team_{away}"

    if home == away:
        raise ValueError("Home- und Auswärtsteam dürfen nicht identisch sein.")

    if h_col in row.columns:
        row.loc[0, h_col] = 1
    # else: warning handled in logs

    if a_col in row.columns:
        row.loc[0, a_col] = 1
    # else: warning handled in logs

    row["home_strength"] = win_rate_tr.get(home, mean_wr_tr)
    row["away_strength"] = win_rate_tr.get(away, mean_wr_tr)
    row["h2h_strength"]  = _get_h2h(home, away)
    row["form_home"]     = form_home_map.get(home, 0.5)
    row["form_away"]     = form_away_map.get(away, 0.5)
    row["strength_diff"] = row["home_strength"] - row["away_strength"]
    row["form_diff"]     = row["form_home"] - row["form_away"]

    return row

@app.get("/teams")
def get_teams():
    # Return sorted list of all unique teams found in training data
    # We can get this from win_rate_tr index which contains all teams
    teams = sorted(list(win_rate_tr.index))
    return {"teams": teams}

@app.get("/upcoming")
def get_upcoming_matches():
    # Generate random "International Friendlies"
    # This ensures we always have matches to predict with known teams
    teams = sorted(list(win_rate_tr.index))
    upcoming = []
    
    today = datetime.now()
    
    # Generate 6 random matches
    for i in range(6):
        # Pick two distinct random teams
        home, away = random.sample(teams, 2)
        
        # Random date within next 14 days
        days_offset = random.randint(1, 14)
        hours_offset = random.randint(12, 21) # Match time between 12:00 and 21:00
        match_date = today + timedelta(days=days_offset)
        match_date = match_date.replace(hour=hours_offset, minute=0, second=0)
        
        upcoming.append({
            "id": i,
            "date": match_date.strftime("%Y-%m-%d %H:%M"),
            "home_team": home,
            "away_team": away,
            "competition": "International Friendly"
        })
        
    # Sort by date
    upcoming.sort(key=lambda x: x["date"])
    
    return {"matches": upcoming}

@app.post("/predict")
def predict_match_endpoint(match: MatchInput):
    try:
        row = _make_row(match.home_team, match.away_team)
        
        # Classification (Win/Draw/Loss)
        proba = df_model.predict_proba(row)[0]
        classes = df_model.classes_
        pred_idx = int(np.argmax(proba))
        pred = classes[pred_idx]
        confidence = float(proba[pred_idx])
        probs = {cls: float(p) for cls, p in zip(classes, proba)}
        
        return {
            "prediction": pred,
            "confidence": round(confidence, 3),
            "probs": probs
        }
    except ValueError as ve:
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

print("FastAPI App Initialized")

FastAPI App Initialized


In [7]:
# ==========================================
# 6. RUN SERVER
# ==========================================
if __name__ == "__main__":
    print("Starting Server at http://localhost:8000")
    # uvicorn.run() fails in notebooks because the event loop is already running.
    # We use the Server object directly and await it.
    config = uvicorn.Config(app, host="0.0.0.0", port=8000)
    server = uvicorn.Server(config)
    await server.serve()

INFO:     Started server process [52144]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


Starting Server at http://localhost:8000
INFO:     127.0.0.1:63855 - "GET /teams HTTP/1.1" 200 OK
INFO:     127.0.0.1:63855 - "GET /teams HTTP/1.1" 200 OK
INFO:     127.0.0.1:63855 - "GET /teams HTTP/1.1" 200 OK
INFO:     127.0.0.1:63855 - "GET /teams HTTP/1.1" 200 OK
INFO:     127.0.0.1:63855 - "GET /upcoming HTTP/1.1" 200 OK
INFO:     127.0.0.1:63855 - "GET /upcoming HTTP/1.1" 200 OK
INFO:     127.0.0.1:63855 - "GET /upcoming HTTP/1.1" 200 OK
INFO:     127.0.0.1:63855 - "GET /upcoming HTTP/1.1" 200 OK
INFO:     127.0.0.1:52464 - "OPTIONS /predict HTTP/1.1" 200 OK
INFO:     127.0.0.1:52464 - "OPTIONS /predict HTTP/1.1" 200 OK
INFO:     127.0.0.1:52464 - "POST /predict HTTP/1.1" 200 OK
INFO:     127.0.0.1:52464 - "POST /predict HTTP/1.1" 200 OK
INFO:     127.0.0.1:54461 - "GET /teams HTTP/1.1" 200 OK
INFO:     127.0.0.1:54461 - "GET /teams HTTP/1.1" 200 OK
INFO:     127.0.0.1:54461 - "GET /upcoming HTTP/1.1" 200 OK
INFO:     127.0.0.1:54461 - "GET /teams HTTP/1.1" 200 OK
INFO:     127.

INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [52144]
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [52144]
