In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os
import re
import numpy as np

# Location of the raw match data and the folder that receives every figure.
DATA_PATH = "data/raw/UEFA Champions League 2004-2021.csv"
GRAPH_DIR = "graphs/"
os.makedirs(GRAPH_DIR, exist_ok=True)

# Global plot styling: seaborn grid theme on a white figure background.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.facecolor': 'white'})

# Load the matches and strip stray whitespace from the column headers.
df = pd.read_csv(DATA_PATH)
df.columns = list(map(str.strip, df.columns))

def clean_score(x):
    """Extract the first run of digits from a raw score cell as an int.

    Missing values (as judged by ``pd.isna``) and cells that contain no
    digits both yield 0.
    """
    if not pd.isna(x):
        digits = re.search(r'(\d+)', str(x))
        if digits:
            return int(digits.group(1))
    return 0

# Normalise both score columns to plain integers.
df['homeScore'] = df['homeScore'].apply(clean_score)
df['awayscore'] = df['awayscore'].apply(clean_score)

# Label each match by outcome. Done column-wise instead of with a row-wise
# ``df.apply(axis=1)``: it is much faster and also works on an empty frame.
df['Winner'] = np.where(
    df['homeScore'] > df['awayscore'],
    'Home',
    np.where(df['homeScore'] < df['awayscore'], 'Away', 'Draw'),
)

# Fit one encoder on every club seen as home or away, so a club gets the
# same integer id in both encoded columns.
all_teams = pd.concat([df['homeTeam'], df['awayteam']]).unique()
le = LabelEncoder().fit(all_teams)
for source_col, encoded_col in (('homeTeam', 'h_enc'), ('awayteam', 'a_enc')):
    df[encoded_col] = le.transform(df[source_col])

# Binary task: draws are dropped; the target is 1 for an away win and 0 for
# a home win.
df_bin = df[df['Winner'] != 'Draw'].copy()
X = df_bin[['h_enc', 'a_enc']]
y = (df_bin['Winner'] == 'Away').astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# CategoricalNB sizes its per-feature tables from the largest category id in
# the training split. A club that only appears in the test split (or in a
# later prediction) would then index out of range, so reserve room for every
# id the encoder can produce.
model = CategoricalNB(alpha=0.1, min_categories=len(le.classes_))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy: {acc:.1f}%")
print(classification_report(y_test, y_pred, target_names=['Home Win', 'Away Win']))

# Bar chart of the five clubs with the most wins.
plt.figure(figsize=(8,5))
# 'Winner' only holds 'Home'/'Away'/'Draw', so counting it directly ranks
# outcomes, not clubs. Map each decided match to the winning club's name
# first, then count.
decided = df[df['Winner'] != 'Draw']
winning_club = pd.Series(
    np.where(decided['Winner'] == 'Home', decided['homeTeam'], decided['awayteam']),
    index=decided.index,
)
top5 = winning_club.value_counts().head(5)
colors = sns.color_palette("rocket_r", len(top5))
bars = plt.bar(range(len(top5)), top5.values, color=colors)
plt.xticks(range(len(top5)), top5.index, rotation=20, ha='right')
plt.title('Top 5 Clubs by Victories', fontsize=13, fontweight='bold')
plt.ylabel('Wins')
# Print the win count just above each bar.
for bar, wins in zip(bars, top5.values):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
             str(int(wins)), ha='center', va='bottom', fontweight='bold')
plt.tight_layout()
plt.savefig(f"{GRAPH_DIR}top5_clubs.png", dpi=150)
plt.close()

# Heatmap of the confusion matrix for the held-out test split.
fig, ax = plt.subplots(figsize=(6,5))
cm = confusion_matrix(y_test, y_pred)
outcome_labels = ['Home', 'Away']
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='YlOrRd',
    xticklabels=outcome_labels,
    yticklabels=outcome_labels,
    cbar_kws={'shrink': 0.8},
    ax=ax,
)
ax.set_title('Prediction Matrix', fontsize=12, fontweight='bold')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.tight_layout()
fig.savefig(f"{GRAPH_DIR}confusion_matrix.png", dpi=150)
plt.close(fig)

# Pie chart of how all matches (draws included) were decided.
h = (df['Winner']=='Home').sum()
a = (df['Winner']=='Away').sum()
d = (df['Winner']=='Draw').sum()
fig, ax = plt.subplots(figsize=(7,7))
wedges, texts, autotexts = ax.pie(
    [h,a,d],
    labels=['Home Wins','Away Wins','Draws'],
    autopct='%1.1f%%',
    startangle=90,
    colors=['#2ecc71', '#e74c3c', '#f1c40f'],
    explode=(0.02,0.02,0.02),
)
# Make the percentage labels readable on the coloured wedges.
plt.setp(autotexts, color='white', fontweight='bold')
ax.set_title('Match Outcome Distribution', fontsize=13, fontweight='bold')
fig.savefig(f"{GRAPH_DIR}outcomes.png", dpi=150)
plt.close(fig)

def predict_match(home, away):
    """Return the name of the club the model predicts to win.

    Both names are encoded with the module-level ``le`` and passed to the
    module-level ``model``. A prediction of 1 means the away side wins;
    anything else means the home side wins.

    Args:
        home: Name of the home club.
        away: Name of the away club.

    Returns:
        ``away`` if the model predicts class 1, otherwise ``home``. If either
        club cannot be scored by the model, ``home`` is returned as a
        best-effort fallback.
    """
    try:
        h, a = le.transform([home, away])
        # Reuse the column names the model was fitted with (if any), so
        # sklearn does not warn about missing feature names.
        features = pd.DataFrame(
            [[h, a]], columns=getattr(model, "feature_names_in_", None)
        )
        pred = model.predict(features)[0]
    except (ValueError, IndexError):
        # ValueError: the encoder has never seen one of the names.
        # IndexError: the id is beyond the categories CategoricalNB saw
        # during fitting. Both fall back to the home side.
        return home
    return away if pred == 1 else home

def simulate_knockout(teams):
    """Play a single-elimination bracket and return the last club standing.

    The entrants are copied into a list and shuffled with
    ``np.random.shuffle``. Each round pairs neighbours in order and asks
    ``predict_match`` for the winner. With an odd number of entrants the
    last one advances without playing.
    """
    remaining = list(teams)
    np.random.shuffle(remaining)
    while len(remaining) > 1:
        fixtures = zip(remaining[0::2], remaining[1::2])
        winners = [predict_match(first, second) for first, second in fixtures]
        if len(remaining) % 2:
            # Odd field: the unpaired last entrant gets a bye.
            winners.append(remaining[-1])
        remaining = winners
    return remaining[0]

# Simulate a knockout between the most successful clubs.
top_clubs = top5.index.tolist()
if len(top_clubs) >= 4:
    # Use the largest power-of-two bracket the pool can fill, drawn without
    # replacement. Sampling 8 entrants with replacement from at most 5 clubs
    # always produced duplicates, so a club could be drawn against itself.
    bracket_size = 1 << (len(top_clubs).bit_length() - 1)
    bracket = np.random.choice(top_clubs, size=bracket_size, replace=False).tolist()
    champ = simulate_knockout(bracket)
    print(f"Simulated Champion: {champ}")

Model Accuracy: 53.16%
              precision    recall  f1-score   support

           2       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         7
           7       0.40      0.44      0.42         9
          10       0.40      0.50      0.44         4
          11       0.00      0.00      0.00         1
          12       0.65      0.65      0.65        23
          13       1.00      0.50      0.67         2
          14       0.65      0.85      0.74        20
          16       0.00      0.00      0.00         2
          17       0.00      0.00      0.00         2
          18       1.00      0.33      0.50         6
          19       0.00      0.00      0.00         2
          20       0.00      0.00      0.00         2
          21       0.00      0.00      0.00         0
          22       0.00      0.00      0.00         2
          23       0.64      0.56      0.60        16
          25       0.44      0.92      0.60        74
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Predicted UEFA Champions League Winner: Bayern München



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_teams.index, y=top_teams.values, palette='viridis')
