In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import random
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Placeholder that is handed to the simulation functions further down;
# no scaling is fitted or applied anywhere in this cell.
scaler = 0

# Load the team statistics, replace missing values with 0, and discard the
# per-tournament "90s played" columns.
df = (
    pd.read_csv("euro2024_data (1).csv")
    .fillna(0)
    .drop(columns=[
        'played90s_nl2022',
        'played90s_wc2022',
        'played90s_euro2016',
        'played90s_wc2018',
        'played90s_euro2021',
    ])
)

# Every remaining column except the team name is used as a model input.
# NOTE(review): this still includes 'total_90s_played', which is also the
# target below, so the model is given the answer as a feature and the metrics
# printed at the end of this cell are optimistic. Removing it here would also
# require the simulate_* functions to pass the same column subset to predict().
feature_columns = df.columns.drop('squad')

# Regression target.
target = df['total_90s_played']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    target,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest with a fixed seed.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Group-stage draw: maps each group label to its four squad names.
# The simulation functions look these names up in the 'squad' column of df.
EURO_groups = {
    'Group A': ['Germany', 'Scotland', 'Hungary', 'Switzerland'],
    'Group B': ['Spain', 'Italy', 'Croatia', 'Albania'],
    'Group C': ['Slovenia', 'Denmark', 'Serbia', 'England'],
    'Group D': ['Poland', 'Netherlands', 'Austria', 'France'],
    'Group E': ['Belgium', 'Slovakia', 'Romania', 'Ukraine'],
    'Group F': ['Türkiye', 'Portugal', 'Georgia', 'Czech Republic'],
}

def simulate_match(df, team1, team2, scaler, rf_model):
    """Predict the outcome of one match in which a draw is possible.

    Each team's rows are selected from ``df`` by its ``squad`` value, every
    column except ``squad`` is passed to ``rf_model.predict``, and the first
    prediction for each team is compared.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``squad`` column plus the columns the model predicts from.
    team1, team2 : str
        Squad names to look up.
    scaler : object
        Not used by this function; kept so existing callers keep working.
    rf_model : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    str
        ``team1`` or ``team2`` (whichever has the higher prediction),
        ``"Draw"`` when the predictions are equal, or the sentinel
        ``"Error: Team statistics not found"`` when either squad has no row
        in ``df``. Callers must check for the sentinel themselves.
    """
    # Keep the selected rows as DataFrames instead of bare arrays: an
    # estimator that was fitted on a DataFrame then receives matching column
    # names and scikit-learn does not emit its feature-name warning per call.
    team1_stats = df.loc[df['squad'] == team1].drop(columns=['squad'])
    team2_stats = df.loc[df['squad'] == team2].drop(columns=['squad'])

    # No matching row for at least one squad: report it via the sentinel.
    if len(team1_stats) == 0 or len(team2_stats) == 0:
        return "Error: Team statistics not found"

    # Only the first matching row of each team is scored.
    team1_performance = rf_model.predict(team1_stats)[0]
    team2_performance = rf_model.predict(team2_stats)[0]

    if team1_performance > team2_performance:
        return team1
    if team1_performance < team2_performance:
        return team2
    return "Draw"

def simulate_knockout_match(df, team1, team2, scaler, rf_model):
    """Predict the winner of one knockout match (no draws).

    Each team's rows are selected from ``df`` by its ``squad`` value, every
    column except ``squad`` is passed to ``rf_model.predict``, and the first
    prediction for each team is compared. Equal predictions go to ``team1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``squad`` column plus the columns the model predicts from.
    team1, team2 : str
        Squad names to look up.
    scaler : object
        Not used by this function; kept so existing callers keep working.
    rf_model : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    str
        ``team1`` when its prediction is greater than or equal to
        ``team2``'s, otherwise ``team2``; or the sentinel
        ``"Error: Team statistics not found"`` when either squad has no row
        in ``df``. Callers must check for the sentinel themselves.
    """
    # Keep the selected rows as DataFrames instead of bare arrays: an
    # estimator that was fitted on a DataFrame then receives matching column
    # names and scikit-learn does not emit its feature-name warning per call.
    team1_stats = df.loc[df['squad'] == team1].drop(columns=['squad'])
    team2_stats = df.loc[df['squad'] == team2].drop(columns=['squad'])

    # No matching row for at least one squad: report it via the sentinel.
    if len(team1_stats) == 0 or len(team2_stats) == 0:
        return "Error: Team statistics not found"

    # Only the first matching row of each team is scored.
    team1_performance = rf_model.predict(team1_stats)[0]
    team2_performance = rf_model.predict(team2_stats)[0]

    # Two-way decision so the function always returns a team name; ties are
    # resolved in favour of team1.
    return team1 if team1_performance >= team2_performance else team2

# Simulates a four-team round robin and returns the teams ranked by points
def simulate_group(df, t1, t2, t3, t4, rf_model, scaler=None):
    """Play all six matches of a four-team group and rank the teams.

    Every pairing is decided by ``simulate_match``: a win is worth 3 points,
    a draw 1 point each. Teams are ranked by points, highest first; because
    the sort is stable, teams level on points keep the order in which they
    were passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Forwarded to ``simulate_match``.
    t1, t2, t3, t4 : str
        Squad names of the four teams in the group.
    rf_model : object
        Fitted estimator, forwarded to ``simulate_match``.
    scaler : object, optional
        Forwarded to ``simulate_match``. Taken as a parameter so the function
        no longer depends on a notebook-level ``scaler`` variable existing.

    Returns
    -------
    list of str
        The four team names ordered from first to last place. The ranking is
        also printed.

    Raises
    ------
    ValueError
        If ``simulate_match`` returns anything other than one of the two
        teams or ``"Draw"`` (for example its "statistics not found" message).
        Previously such a result was silently scored as a draw.
    """
    group = {t1: 0, t2: 0, t3: 0, t4: 0}
    matchups = [(t1, t2), (t1, t3), (t1, t4), (t2, t3), (t2, t4), (t3, t4)]

    for team1, team2 in matchups:
        result = simulate_match(df, team1, team2, scaler, rf_model)
        if result == team1:
            group[team1] += 3
        elif result == team2:
            group[team2] += 3
        elif result == "Draw":
            group[team1] += 1
            group[team2] += 1
        else:
            # A lookup failure must not hand out points to either side.
            raise ValueError(f"Cannot simulate {team1} vs {team2}: {result}")

    # Sort once and reuse the result for both the printout and the return.
    ranking = sorted(group, key=group.get, reverse=True)
    print(ranking)
    return ranking

def simulate_groupstage(df, rf_model, groups=None):
    """Simulate every group and collect the final rankings.

    Parameters
    ----------
    df : pandas.DataFrame
        Forwarded to ``simulate_group``.
    rf_model : object
        Fitted estimator, forwarded to ``simulate_group``.
    groups : dict, optional
        Mapping of group label to a sequence of four squad names. Defaults to
        the notebook-level ``EURO_groups``.

    Returns
    -------
    dict
        Maps each group label to the list returned by ``simulate_group``
        (teams ordered first to last). The third-placed team of a group is
        therefore ``result[label][2]``; no separate third-place table is kept.
    """
    if groups is None:
        groups = EURO_groups

    # The previous version also indexed an undefined `third_place_teams`
    # name here, which raises NameError on a fresh kernel; that bookkeeping
    # was never returned, so it has been removed.
    return {
        group_name: simulate_group(df, *teams, rf_model)
        for group_name, teams in groups.items()
    }

def simulate_knockout_stage(df,groups,scaler,rf_model):
    """Build the knockout bracket from the group rankings and play it through.

    Parameters
    ----------
    df : pandas.DataFrame
        Forwarded to ``simulate_knockout_match``.
    groups : dict
        Maps 'Group A' .. 'Group F' to a list of teams ordered first to last
        (index 0 = winner, 1 = runner-up, 2 = third place).
    scaler : object
        Forwarded to ``simulate_knockout_match``.
    rf_model : object
        Fitted estimator, forwarded to ``simulate_knockout_match``.

    Returns
    -------
    dict
        Insertion-ordered. The keys 'round of 16', 'quarter-finals',
        'semi-final' and 'final' map to '' and act as section headers;
        'match37' .. 'match51' map to a ``(team, team)`` tuple with the two
        sides of that match. 'match51' holds the two finalists; the final
        itself is not played here.

    Notes
    -----
    Third-placed teams are not ranked against each other. Instead, four
    distinct groups are drawn with ``random.choice`` and their third-placed
    teams fill the four open round-of-16 slots, so results vary between runs
    unless ``random`` is seeded by the caller.
    """
    def pick_group(pool, taken):
        # Draw one group from `pool` that has not been used for an earlier slot.
        return random.choice([name for name in pool if name not in taken])

    def winner_of(pairing):
        # Play one knockout tie given as a (team, team) tuple.
        return simulate_knockout_match(df, pairing[0], pairing[1], scaler, rf_model)

    # Which group's third-placed team fills each open slot. Each pool lists
    # the groups allowed for that slot; a group already drawn is excluded.
    choice39 = pick_group(['Group A', 'Group D', 'Group E', 'Group F'], ())
    choice40 = pick_group(['Group D', 'Group E', 'Group F'], (choice39,))
    # Group A is allowed here; the old pool was B/C only, with a dead removal branch.
    choice41 = pick_group(['Group A', 'Group B', 'Group C'], (choice39, choice40))
    # Pool is A-D: the Group E winner must not be drawn against the
    # third-placed team of its own group (the old pool was B, C, D, E).
    choice43 = pick_group(
        ['Group A', 'Group B', 'Group C', 'Group D'],
        (choice39, choice40, choice41),
    )

    # NOTE(review): the 'match37'/'match38' labels look swapped relative to
    # the official UEFA schedule, which affects who meets whom in
    # 'match45'/'match48' - confirm before relying on the bracket halves.
    r16 = {
        'round of 16': '',
        'match37': (groups['Group A'][1], groups['Group B'][1]),  # runner-up A vs runner-up B
        'match38': (groups['Group A'][0], groups['Group C'][1]),  # winner A vs runner-up C
        'match39': (groups['Group B'][0], groups[choice39][2]),   # winner B vs third place of A/D/E/F
        'match40': (groups['Group C'][0], groups[choice40][2]),   # winner C vs third place of D/E/F
        'match41': (groups['Group F'][0], groups[choice41][2]),   # winner F vs third place of A/B/C
        'match42': (groups['Group D'][1], groups['Group E'][1]),  # runner-up D vs runner-up E
        'match43': (groups['Group E'][0], groups[choice43][2]),   # winner E vs third place of A/B/C/D
        'match44': (groups['Group D'][0], groups['Group F'][1]),  # winner D vs runner-up F
    }

    # Each later-round pairing is the pair of winners of two earlier matches.
    qf = {
        'quarter-finals': '',
        'match45': (winner_of(r16['match39']), winner_of(r16['match37'])),
        'match46': (winner_of(r16['match41']), winner_of(r16['match42'])),
        'match47': (winner_of(r16['match43']), winner_of(r16['match44'])),
        'match48': (winner_of(r16['match40']), winner_of(r16['match38'])),
    }

    sf = {
        'semi-final': '',
        'match49': (winner_of(qf['match45']), winner_of(qf['match46'])),
        'match50': (winner_of(qf['match47']), winner_of(qf['match48'])),
    }

    final = {
        'final': '',
        'match51': (winner_of(sf['match49']), winner_of(sf['match50'])),
    }

    # Merge the rounds in playing order so the dict reads top to bottom.
    return {**r16, **qf, **sf, **final}

    

def simulate_tournament(df, tournament_groups, scaler, rf_model):
    """Run the group stage, the knockout bracket and the final.

    Prints the champion and the runner-up, then returns the bracket.

    Parameters
    ----------
    df : pandas.DataFrame
        Team statistics, forwarded to the stage simulators.
    tournament_groups : dict
        Accepted but not read by this function; ``simulate_groupstage`` is
        called without it.
    scaler : object
        Forwarded to the knockout simulators.
    rf_model : object
        Fitted estimator, forwarded to the stage simulators.

    Returns
    -------
    dict
        The dictionary produced by ``simulate_knockout_stage``.
    """
    standings = simulate_groupstage(df, rf_model)
    knockout_results = simulate_knockout_stage(df, standings, scaler, rf_model)

    # 'match51' holds the two finalists; play the final between them.
    final_pairing = knockout_results['match51']
    finalist_a, finalist_b = final_pairing[0], final_pairing[1]
    winner = simulate_knockout_match(df, finalist_a, finalist_b, scaler, rf_model)

    # The runner-up is whichever finalist did not win.
    second = finalist_b if winner == finalist_a else finalist_a

    print(winner + " Are the Champions!", '\n', second + " Came in Second Place :(")

    return knockout_results
    



# Run the whole simulation. As the last expression of the cell, the returned
# bracket dictionary is displayed below the printed output. `random` is not
# seeded in this notebook, so the third-place pairings can differ between runs.
simulate_tournament(df, EURO_groups, scaler, rf_model)

Mean Absolute Error: 1.6666000000000025
Mean Squared Error: 3.528233400000012
R-squared: 0.8860036251550865
['Switzerland', 'Germany', 'Hungary', 'Scotland']




['Croatia', 'Spain', 'Italy', 'Albania']
['England', 'Denmark', 'Serbia', 'Slovenia']




['France', 'Poland', 'Netherlands', 'Austria']
['Belgium', 'Ukraine', 'Slovakia', 'Romania']




['Portugal', 'Czech Republic', 'Türkiye', 'Georgia']




France Are the Champions! 
 Croatia Came in Second Place :(




{'round of 16': '',
 'match37': ('Germany', 'Spain'),
 'match38': ('Switzerland', 'Denmark'),
 'match39': ('Croatia', 'Hungary'),
 'match40': ('England', 'Slovakia'),
 'match41': ('Portugal', 'Italy'),
 'match42': ('Poland', 'Ukraine'),
 'match43': ('Belgium', 'Serbia'),
 'match44': ('France', 'Czech Republic'),
 'quarter-finals': '',
 'match45': ('Croatia', 'Spain'),
 'match46': ('Portugal', 'Poland'),
 'match47': ('Belgium', 'France'),
 'match48': ('England', 'Switzerland'),
 'semi-final': '',
 'match49': ('Croatia', 'Portugal'),
 'match50': ('France', 'England'),
 'final': '',
 'match51': ('Croatia', 'France')}

In [None]:
# Sanity checks: the feature list and both splits should have the same width.
print("Number of feature columns:", len(feature_columns))
print("Shape of X_train:", X_train.shape)

print(X_train.shape[1], X_test.shape[1])

# Column labels that occur more than once in the cleaned frame
# (every occurrence after the first is reported).
duplicate_columns = df.columns[df.columns.duplicated(keep='first')]
print("Duplicate columns:", duplicate_columns)



In [138]:
# Display the cleaned statistics table (NaNs filled with 0, played90s_* columns dropped).
df

Unnamed: 0,id,squad,avgpossession_wc2022,goalsx90_wc2022,assistsx90_wc2022,expgoalsx90_wc2022,diffgoalx90_wc2022,avgpossession_nl2022,goalsx90_nl2022,assistsx90_nl2022,...,diffgoalx90_euro2021,avgpossession_wc2018,goalsx90_wc2018,assistsx90_wc2018,expgoalsx90_wc2018,diffgoalx90_wc2018,avgpossession_euro2016,goalsx90_euro2016,assistsx90_euro2016,total_90s_played
0,1,Albania,0.0,0.0,0.0,0.0,0.0,54.5,1.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,40.3,0.33,0.33,7.0
1,2,Austria,0.0,0.0,0.0,0.0,0.0,43.5,1.0,1.0,...,-0.02,0.0,0.0,0.0,0.0,0.0,54.0,0.33,0.33,13.3
2,3,Belgium,57.0,0.33,0.33,1.57,-1.24,62.3,1.83,1.83,...,0.42,52.9,2.14,1.71,1.9,0.24,52.4,1.8,1.6,26.0
3,4,Croatia,54.3,1.04,1.04,0.91,0.13,51.3,1.38,0.69,...,0.11,56.7,1.62,1.0,1.4,0.22,48.8,1.15,0.92,33.0
4,5,Czech Republic,0.0,0.0,0.0,0.0,0.0,35.0,0.67,0.5,...,-0.03,0.0,0.0,0.0,0.0,0.0,42.3,0.67,0.33,14.0
5,6,Denmark,60.0,0.33,0.33,0.9,-0.57,47.7,1.5,1.33,...,0.32,43.0,0.69,0.69,0.68,0.01,0.0,0.0,0.0,19.6
6,7,England,62.8,2.6,2.2,1.72,0.88,53.3,0.67,0.33,...,-0.14,54.1,1.57,0.78,1.46,0.11,62.0,1.0,0.25,30.4
7,8,France,51.3,2.18,1.64,1.88,0.3,57.7,0.83,0.67,...,-0.34,48.3,1.71,0.86,1.3,0.41,54.4,1.77,1.5,31.9
8,9,Georgia,0.0,0.0,0.0,0.0,0.0,52.2,2.33,1.67,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
9,10,Germany,59.3,2.0,1.67,3.35,-1.35,65.7,1.83,1.17,...,-0.77,71.7,0.67,0.67,1.85,-1.18,66.8,1.11,0.95,22.3
