# Euro 2024

Datasets and info:
- Matches taken from [International football results from 1872 to 2024](https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017)
- ELO data taken from [World Football Elo Ratings](https://www.eloratings.net/)
- [A Mathematician's Guide to the World Cup](https://www.youtube.com/watch?v=KjISuZ5o06Q)

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [11]:
teams_list = ["Albania","Andorra","Armenia","Austria","Azerbaijan","Belarus","Belgium","Bosnia and Herzegovina","Bulgaria","Croatia","Cyprus","Czechia","Denmark","England","Estonia","Faroe Islands","Finland","France","Georgia","Germany","Gibraltar","Greece","Hungary","Iceland","Israel","Italy","Kazakhstan","Kosovo","Latvia","Liechtenstein","Lithuania","Luxembourg","Malta","Moldova","Monaco","Montenegro","Netherlands","North Macedonia","Northern Ireland","Norway","Poland","Portugal","Ireland","Romania","Russia","San Marino","Scotland","Serbia","Slovakia","Slovenia","Spain","Sweden","Switzerland","Turkey","Ukraine","Wales"]

In [12]:
date_from = '2020-01-01'
date_to = '2024-06-12'

elo_ratings = pd.read_csv("data/elo.csv")
match_results = pd.read_csv("data/results.csv")
team_stats = {}

# Preprocess ELO ratings for quick access
elo_ratings_dict = {(row['team'], row['year']): row['rating'] for idx, row in elo_ratings.iterrows()}

def update_team_stats(team, opponent_score, team_score, elo_diff):
    if team not in team_stats:
        team_stats[team] = {'total_score': team_score, 'games': 1, 'scores': [team_score], 
                            'opponent_scores': [opponent_score], 'elo_diffs': [elo_diff]}
    else:
        stats = team_stats[team]
        stats['total_score'] += team_score
        stats['games'] += 1
        stats['scores'].append(team_score)
        stats['opponent_scores'].append(opponent_score)
        stats['elo_diffs'].append(elo_diff)

# Matches within the date range
filtered_matches = match_results.loc[(match_results['date'] > date_from) & (match_results['date'] <= date_to)]

for idx, match in filtered_matches.iterrows():
    home_team, away_team = match["home_team"], match["away_team"]
    match_year = int(match['date'][:4])
    
    home_elo = elo_ratings_dict.get((home_team, match_year))
    away_elo = elo_ratings_dict.get((away_team, match_year))

    # Skip the match if ELO ratings are missing
    if home_elo is None or away_elo is None:
        continue

    update_team_stats(home_team, match["away_score"], match["home_score"], home_elo - away_elo)
    update_team_stats(away_team, match["home_score"], match["away_score"], away_elo - home_elo)

In [13]:
xG = {team: stats['total_score'] / stats['games'] for team, stats in team_stats.items()}

X, y = [], []
for team, stats in team_stats.items():
    X.extend(stats['elo_diffs'])
    y.extend((np.array(stats['scores']) - xG[team]).tolist())

X, y = np.array(X).reshape(-1, 1), np.array(y).reshape(-1, 1)

reg = LinearRegression().fit(X, y)

elo_dict = {i: j for i, j in zip(teams_list, [(elo_ratings[(elo_ratings['year'] == int(date_to.split('-')[0])) & (elo_ratings['team'] == team)]['rating']).to_list()[0] for team in teams_list])}

In [14]:
def match(home, away, allow_draw=True):
    home_advantage = int(reg.predict(np.array(elo_dict[home] - elo_dict[away]).reshape(-1, 1)))
    away_advantage = int(reg.predict(np.array(elo_dict[away] - elo_dict[home]).reshape(-1, 1)))
    home_score = np.random.poisson(xG[home], 10000) + home_advantage
    away_score = np.random.poisson(xG[away], 10000) + away_advantage

    # Calculate win_team for score matrix dimension
    win_team = max(home_score.max(), away_score.max())
    
    scores, counts = np.unique(np.vstack((home_score, away_score)).T, axis=0, return_counts=True)
    score_matrix = np.zeros((win_team+1, win_team+1))
    score_matrix[scores[:,0], scores[:,1]] = counts

    # Calculate win, lose, draw counts
    home_win_count = np.sum(np.tril(score_matrix, -1))
    away_win_count = np.sum(np.triu(score_matrix, 1))
    draw_count = np.sum(np.diag(score_matrix)) if allow_draw else 0

    # Determine most probable outcome
    outcomes = np.array([home_win_count, away_win_count, draw_count])
    most_probable_outcome_index = outcomes.argmax()

    # Find the most frequent score for the most probable outcome
    if most_probable_outcome_index == 0:  # Home win
        score = np.unravel_index(np.argmax(np.tril(score_matrix, -1)), score_matrix.shape)
    elif most_probable_outcome_index == 1:  # Away win
        score = np.unravel_index(np.argmax(np.triu(score_matrix, 1)), score_matrix.shape)
    else:  # Draw
        score = np.argmax(np.diag(score_matrix)), np.argmax(np.diag(score_matrix))

    # Calculate percentages
    total_simulations = outcomes.sum() if allow_draw else home_win_count + away_win_count
    home_win = (home_win_count / total_simulations) * 100
    away_win = (away_win_count / total_simulations) * 100
    draw = (draw_count / total_simulations) * 100
    
    return home_win, away_win, draw, score

In [20]:
home_win, away_win, draw, score = match("San Marino","France", False)

# Print the results
print(f"Home Win Percentage: {home_win:.2f}%")
print(f"Away Win Percentage: {away_win:.2f}%")
print(f"Draw Percentage: {draw:.2f}%")
print(f"Most Probable Outcome: {score}")

Home Win Percentage: 98.25%
Away Win Percentage: 1.75%
Draw Percentage: 0.00%
Most Probable Outcome: (11, 4)


In [10]:
import json

matches_predictions = {}

# Generate all possible match combinations
for groups in [True, False]:
    for home_team in teams_list:
        for away_team in teams_list:
            if home_team != away_team:  # Ensure we don't match a team with itself
                # Call the match function
                home_win, away_win, draw, score = match(home_team, away_team, groups)
                
                # Format the key as "HomeTeam_vs_AwayTeam"
                match_key = f"{home_team}_{away_team}_{'0' if not groups else '1'}"
                
                # Store the results in the dictionary
                matches_predictions[match_key] = {
                    "predictions": [round(home_win, 2), round(away_win, 2), round(draw, 2)],
                    "scorePrediction": [int(score[0]), int(score[1])]
                }

# Convert the dictionary to a JSON string
matches_predictions_json = json.dumps(matches_predictions, indent=4)

# To save the JSON to a file
with open('matches_predictions.json', 'w+') as file:
    file.write(matches_predictions_json)

It's not coming home 😭🏴󠁧󠁢󠁥󠁮󠁧󠁿