# Euro 2024

Datasets and info:
- Matches taken from [International football results from 1872 to 2024](https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017)
- Elo ratings taken from [World Football Elo Ratings](https://www.eloratings.net/)
- [A Mathematician's Guide to the World Cup](https://www.youtube.com/watch?v=KjISuZ5o06Q)

## Preprocess Data

In [1]:
import os
import csv

In [2]:
# Directory that is scanned for the match CSV files.
folder_path = 'euro'

# Output columns, in the order they are written.
required_attributes = [
    'home_team', 'away_team',
    'home_score', 'away_score',
    'home_penalty', 'away_penalty',
    'home_score_total', 'away_score_total',
    'date', 'stage', 'stadium', 'city',
]

# Raw ``round`` codes -> display name of the knockout stage.
stage_mapping = {
    'ROUND_OF_16': 'Round of 16',
    'SEMIFINAL': 'Semi-finals',
    'FINAL': 'Final',
    'THIRD_PLAY_OFF': 'Third-place play-off',
    'QUARTER_FINALS': 'Quarter-finals',
}

# Team renames applied to the home/away columns (e.g. 'USSR' -> 'Russia').
countries = {
    'Republic of Ireland': 'Ireland',
    'USSR': 'Russia',
    'West Germany': 'Germany',
    'Commonwealth of Independent States': 'Russia',
    'Türkiye': 'Turkey',
    'Czechoslovakia': 'Czechia',
    'Yugoslavia': 'Serbia',
}


def _stage_for(row):
    """Return the stripped group name, or the mapped round when it is empty."""
    group = row.get('group_name', '').strip()
    if group:
        return group
    return stage_mapping.get(row.get('round', ''))


def _team_resolver(column):
    """Build a resolver that reads ``column`` and applies the ``countries`` renames."""
    def resolve(row):
        name = row.get(column, '')
        return countries.get(name, name)
    return resolve


# Output column -> source column name (plain rename) or a callable taking the
# raw row and returning the output value.
special_attr_mapping = {
    'date': 'date_time',
    'stadium': 'stadium_name',
    'city': 'stadium_city',
    'stage': _stage_for,
    'home_team': _team_resolver('home_team'),
    'away_team': _team_resolver('away_team'),
}

def adjust_score_total(score, penalty, score_total):
    """Return the larger of ``score + penalty`` and ``score_total`` as a string.

    The recorded total is kept unless the regular score plus the penalty
    count exceeds it.
    """
    # ``score_total`` goes first so that it is the value returned on a tie.
    return str(max(score_total, score + penalty))

# Normalise every CSV in ``folder_path`` in place: keep only
# ``required_attributes``, rename/derive columns through
# ``special_attr_mapping``, coerce score columns to integer strings and make
# sure each total is at least regular score + penalties.
#
# The files are overwritten, so a later run sees the *output* schema instead
# of the raw one.  Mapped columns therefore fall back to the value already
# stored under the output name; without that fallback a second run would blank
# out date, stadium, city and stage.
score_attributes = ('home_score', 'away_score', 'home_penalty', 'away_penalty',
                    'home_score_total', 'away_score_total')

for file_name in os.listdir(folder_path):
    if not file_name.endswith('.csv'):
        continue

    source_file_path = os.path.join(folder_path, file_name)
    destination_file_path = os.path.join(folder_path, file_name)

    # Read the whole file before writing: source and destination are the same path.
    # newline='' lets the csv module handle line endings itself.
    with open(source_file_path, mode='r', encoding='utf-8', newline='') as source_file:
        filtered_rows = []

        for row in csv.DictReader(source_file):
            filtered_row = {}
            for attr in required_attributes:
                if attr in score_attributes:
                    # Scores may be stored as '2.0'; normalise to '2', keep blanks blank.
                    value = row.get(attr, '')
                    filtered_row[attr] = str(int(float(value))) if value else ''
                elif attr in special_attr_mapping:
                    mapping = special_attr_mapping[attr]
                    if callable(mapping):
                        # Derived value; use the existing column when nothing could be derived.
                        filtered_row[attr] = mapping(row) or row.get(attr, '')
                    elif mapping in row:
                        filtered_row[attr] = row[mapping]
                    else:
                        # Raw column is gone (file already normalised): keep the current value.
                        filtered_row[attr] = row.get(attr, '')
                else:
                    filtered_row[attr] = row.get(attr, '')

            for side in ('home', 'away'):
                # Blank cells count as 0 for the comparison.
                filtered_row[f'{side}_score_total'] = adjust_score_total(
                    int(filtered_row.get(f'{side}_score', '0') or 0),
                    int(filtered_row.get(f'{side}_penalty', '0') or 0),
                    int(filtered_row.get(f'{side}_score_total', '0') or 0),
                )

            filtered_rows.append(filtered_row)

    with open(destination_file_path, mode='w', newline='', encoding='utf-8') as destination_file:
        csv_writer = csv.DictWriter(destination_file, fieldnames=required_attributes)
        csv_writer.writeheader()
        csv_writer.writerows(filtered_rows)

In [15]:
# Changes to the team pool, keyed by year.  A plain name adds the team; a name
# prefixed with '-' removes it.  get_uefa_countries() applies every entry whose
# year is not later than the requested one.
uefa_changes = {
    1960: [
        'Albania', 'Austria', 'Belgium', 'Bulgaria', 'Czechia', 'Denmark',
        'East Germany', 'England', 'Finland', 'France', 'Germany', 'Greece',
        'Hungary', 'Iceland', 'Ireland', 'Italy', 'Luxembourg', 'Netherlands',
        'Northern Ireland', 'Norway', 'Poland', 'Portugal', 'Romania',
        'Russia', 'Scotland', 'Serbia', 'Spain', 'Sweden', 'Switzerland',
        'Turkey', 'Wales',
    ],
    1964: ['Cyprus', 'Malta'],
    1984: ['Liechtenstein'],
    1992: ['Faroe Islands', 'San Marino', '-East Germany'],
    1996: [
        'Armenia', 'Azerbaijan', 'Belarus', 'Croatia', 'Estonia', 'Georgia',
        'Israel', 'Latvia', 'Lithuania', 'Moldova', 'North Macedonia',
        'Slovakia', 'Slovenia', 'Ukraine',
    ],
    2000: ['Andorra', 'Bosnia and Herzegovina'],
    2004: ['Kazakhstan'],
    2008: ['Montenegro'],
    2016: ['Gibraltar'],
    2020: ['Kosovo'],
}

def get_uefa_countries(year):
    """Return the sorted list of teams in the pool as of ``year``.

    Every entry of ``uefa_changes`` whose key is ``<= year`` is applied in
    chronological order: plain names are added, names prefixed with ``'-'``
    are removed.

    Args:
        year: Year (int) up to and including which changes are applied.

    Returns:
        Alphabetically sorted list of team names; empty if ``year`` precedes
        the earliest key.

    Raises:
        KeyError: If a ``'-Team'`` entry removes a team that is not in the pool.
    """
    members = set()
    # Sort the years explicitly so the result does not depend on the order in
    # which the dict literal happens to be written (a removal must be applied
    # after the addition it undoes).
    for change_year in sorted(uefa_changes):
        if change_year > year:
            break
        for entry in uefa_changes[change_year]:
            if entry.startswith('-'):
                members.remove(entry[1:])
            else:
                members.add(entry)
    return sorted(members)

## Prediction Model

In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [16]:
def create_predictions(date_to, teams_list, elo_path="data/elo.csv", results_path="data/results.csv"):
    """Fit the goal-adjustment regression on the matches leading up to ``date_to``.

    Matches dated after 1 January four years before ``date_to``'s year and up
    to ``date_to`` (inclusive) are used.  For each team the mean goals scored
    per match is computed; a linear regression is then fitted that predicts a
    team's deviation from its own mean from the Elo difference to its opponent.

    Args:
        date_to: Upper bound of the window as a ``'YYYY-MM-DD'`` string.
        teams_list: Teams whose rating for ``date_to``'s year is returned.
        elo_path: CSV with ``team``, ``year`` and ``rating`` columns.
        results_path: CSV with ``date``, ``home_team``, ``away_team``,
            ``home_score`` and ``away_score`` columns.

    Returns:
        Tuple ``(elo_dict, reg, xG)``: rating per team in ``teams_list``, the
        fitted ``LinearRegression``, and mean goals per match for every team
        that appears in the window.

    Raises:
        IndexError: If a team in ``teams_list`` has no rating for the year of
            ``date_to``.
    """
    target_year = int(date_to.split('-')[0])
    date_from = f"{target_year - 4}-01-01"

    elo_ratings = pd.read_csv(elo_path)
    match_results = pd.read_csv(results_path)

    # (team, year) -> rating.  Built from whole columns rather than iterrows();
    # as before, a later duplicate row overrides an earlier one.
    elo_ratings_dict = dict(zip(zip(elo_ratings['team'], elo_ratings['year']), elo_ratings['rating']))

    # Dates are compared as strings, which orders ISO 'YYYY-MM-DD' values correctly.
    in_window = (match_results['date'] > date_from) & (match_results['date'] <= date_to)
    filtered_matches = match_results.loc[in_window]

    # team -> goals scored per match and the matching Elo differences.
    team_stats = {}

    def update_team_stats(team, team_score, elo_diff):
        stats = team_stats.setdefault(team, {'scores': [], 'elo_diffs': []})
        stats['scores'].append(team_score)
        stats['elo_diffs'].append(elo_diff)

    match_columns = ('date', 'home_team', 'away_team', 'home_score', 'away_score')
    for date, home_team, away_team, home_score, away_score in zip(
            *(filtered_matches[column] for column in match_columns)):
        match_year = int(date[:4])

        home_elo = elo_ratings_dict.get((home_team, match_year))
        away_elo = elo_ratings_dict.get((away_team, match_year))

        # Skip the match if ELO ratings are missing
        if home_elo is None or away_elo is None:
            continue
        # A match without a recorded score would put NaN into the regression
        # targets, which LinearRegression.fit rejects.
        if pd.isna(home_score) or pd.isna(away_score):
            continue

        update_team_stats(home_team, home_score, home_elo - away_elo)
        update_team_stats(away_team, away_score, away_elo - home_elo)

    # Mean goals scored per match for each team.
    xG = {team: sum(stats['scores']) / len(stats['scores']) for team, stats in team_stats.items()}

    X, y = [], []
    for team, stats in team_stats.items():
        X.extend(stats['elo_diffs'])
        y.extend((np.array(stats['scores']) - xG[team]).tolist())

    X, y = np.array(X).reshape(-1, 1), np.array(y).reshape(-1, 1)

    reg = LinearRegression().fit(X, y)

    # Ratings for the target year; the first row per team wins, matching the
    # previous ``.to_list()[0]`` lookup.
    year_rows = elo_ratings.loc[elo_ratings['year'] == target_year]
    rating_for_year = {}
    for team, rating in zip(year_rows['team'], year_rows['rating']):
        rating_for_year.setdefault(team, rating)

    missing = [team for team in teams_list if team not in rating_for_year]
    if missing:
        # IndexError is what the earlier list-index lookup raised; keep the
        # type but say which teams are affected.
        raise IndexError(f"no Elo rating for year {target_year}: {', '.join(missing)}")

    elo_dict = {team: rating_for_year[team] for team in teams_list}
    return elo_dict, reg, xG

In [6]:
def match(elo_dict, reg, xG, home, away, allow_draw=True, n_simulations=10000):
    """Simulate ``home`` vs ``away`` and summarise the outcome distribution.

    Each side's goals are drawn from a Poisson distribution with mean
    ``xG[team]`` and shifted by the regression's prediction for the Elo
    difference, truncated towards zero to a whole number of goals.  Simulated
    scores are clamped at 0.

    Args:
        elo_dict: Team -> Elo rating.
        reg: Fitted model whose ``predict`` accepts an ``(n, 1)`` array of
            Elo differences.
        xG: Team -> mean goals per match.
        home: Name of the first team.
        away: Name of the second team.
        allow_draw: When False, draws are ignored and the percentages are
            taken over decisive simulations only.
        n_simulations: Number of simulated matches.

    Returns:
        Tuple ``(home_win, away_win, draw, score)``: the three outcome
        percentages and the most frequent ``(home_goals, away_goals)`` pair
        within the most probable outcome.

    Raises:
        KeyError: If either team is missing from ``elo_dict`` or ``xG``.
    """
    elo_gap = elo_dict[home] - elo_dict[away]
    # One predict call for both directions; ravel() yields scalars regardless
    # of whether the model returns shape (2,) or (2, 1), so int() is never
    # applied to a multi-dimensional array.
    shifts = np.ravel(reg.predict(np.array([[elo_gap], [-elo_gap]])))
    home_advantage = int(shifts[0])
    away_advantage = int(shifts[1])

    # A negative shift can push a simulated score below zero.  Negative values
    # would index the score matrix from its far end and be counted as a
    # different result, so clamp at 0 goals.
    home_score = np.maximum(np.random.poisson(xG[home], n_simulations) + home_advantage, 0)
    away_score = np.maximum(np.random.poisson(xG[away], n_simulations) + away_advantage, 0)

    # score_matrix[h, a] = number of simulations that ended h:a.
    size = int(max(home_score.max(), away_score.max())) + 1
    scores, counts = np.unique(np.vstack((home_score, away_score)).T, axis=0, return_counts=True)
    score_matrix = np.zeros((size, size))
    score_matrix[scores[:, 0], scores[:, 1]] = counts

    # Rows are home goals, columns away goals: below the diagonal the home
    # side scored more, above it the away side did.
    home_wins = np.tril(score_matrix, -1)
    away_wins = np.triu(score_matrix, 1)
    draws = np.diag(score_matrix)

    home_win_count = np.sum(home_wins)
    away_win_count = np.sum(away_wins)
    draw_count = np.sum(draws) if allow_draw else 0

    # Determine most probable outcome
    outcomes = np.array([home_win_count, away_win_count, draw_count])
    most_probable_outcome_index = outcomes.argmax()

    # Find the most frequent score for the most probable outcome
    if most_probable_outcome_index == 0:  # Home win
        score = np.unravel_index(np.argmax(home_wins), score_matrix.shape)
    elif most_probable_outcome_index == 1:  # Away win
        score = np.unravel_index(np.argmax(away_wins), score_matrix.shape)
    else:  # Draw
        level = np.argmax(draws)
        score = level, level

    # With draws disabled draw_count is 0, so the sum covers decisive results only.
    total_simulations = outcomes.sum()
    home_win = (home_win_count / total_simulations) * 100
    away_win = (away_win_count / total_simulations) * 100
    draw = (draw_count / total_simulations) * 100

    return home_win, away_win, draw, score

## Create Prediction CSV

In [7]:
import json
from datetime import datetime, timedelta

In [8]:
def create_preds_csv(date_to, file_name):
    """Simulate every pairing of the team pool and write the results to a CSV.

    The pool is ``get_uefa_countries`` for the year of ``date_to``.  The file
    has one row per home team and, for every opponent, two columns:
    ``<opponent>_1`` (draws allowed) and ``<opponent>_0`` (draws ignored).
    Each cell is a JSON object with ``predictions`` (home win, away win and
    draw percentages rounded to two decimals) and ``scorePrediction``; the
    cells pairing a team with itself are left empty.

    Args:
        date_to: ``'YYYY-MM-DD'`` date passed on to ``create_predictions``.
        file_name: Path of the CSV file to create or overwrite.
    """
    teams_list = get_uefa_countries(int(date_to.split('-')[0]))
    elo_dict, reg, xG = create_predictions(date_to, teams_list)

    # Column layout: all opponents with draws allowed, then all without.
    columns = [(away_team, groups) for groups in (True, False) for away_team in teams_list]
    header = ['home_team'] + [f"{away_team}_{'1' if groups else '0'}" for away_team, groups in columns]

    # Run all simulations before touching the output file so that a failure
    # does not leave a truncated CSV behind.
    rows = []
    for home_team in teams_list:
        row = [home_team]
        for away_team, groups in columns:
            if away_team == home_team:
                row.append('')
                continue
            home_win, away_win, draw, score = match(elo_dict, reg, xG, home_team, away_team, groups)
            row.append(json.dumps({
                "predictions": [round(home_win, 2), round(away_win, 2), round(draw, 2)],
                "scorePrediction": [int(score[0]), int(score[1])]
            }))
        rows.append(row)

    # Explicit UTF-8, like the other CSV files handled in this notebook.
    with open(file_name, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(rows)

In [17]:
# For every CSV in ``folder_path`` write a prediction file of the same name
# into ``preds_folder_path``, using data up to the day before the earliest
# match listed in that CSV.
preds_folder_path = 'predictions'

os.makedirs(preds_folder_path, exist_ok=True)

for file_name in os.listdir(folder_path):
    if not file_name.endswith('.csv'):
        continue

    # Only the dates are needed; close the source file before the long-running
    # prediction step starts.
    with open(os.path.join(folder_path, file_name), mode='r', encoding='utf-8', newline='') as source_file:
        dates = [datetime.strptime(row['date'], '%Y-%m-%dT%H:%M:%SZ') for row in csv.DictReader(source_file)]

    if not dates:
        # A file without match rows has no start date to predict for.
        continue

    formatted_date = (min(dates) - timedelta(days=1)).strftime('%Y-%m-%d')
    create_preds_csv(formatted_date, os.path.join(preds_folder_path, file_name))

It's not coming home 😭🏴󠁧󠁢󠁥󠁮󠁧󠁿