In [132]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import random
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Read in csv
df = pd.read_csv(r"C:\Users\varad\Downloads\euro2024_data (1).csv")
df.fillna(0, inplace=True)
df.drop(columns=['played90s_nl2022', 'played90s_wc2022', 'played90s_euro2016', 'played90s_wc2018', 'played90s_euro2021'], inplace=True)

# Define feature columns excluding squad
feature_columns = df.drop(columns=['squad']).columns

# Define target variable
target = df['total_90s_played']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[feature_columns], target, test_size=0.2, random_state=42)

# Import and initialize Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Prediction and error testing
y_pred = rf_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Dictionary containing all the groups for group stage
EURO_groups = {
    'Group A': ['Germany', 'Scotland', 'Hungary', 'Switzerland'],
    'Group B': ['Spain', 'Italy', 'Croatia', 'Albania'],
    'Group C': ['Slovenia', 'Denmark', 'Serbia', 'England'],
    'Group D': ['Poland', 'Netherlands', 'Austria', 'France'],
    'Group E': ['Belgium', 'Slovakia', 'Romania', 'Ukraine'],
    'Group F': ['Türkiye', 'Portugal', 'Georgia', 'Czech Republic'],
}

def simulate_match(df, team1, team2, scaler, rf_model):
    match_stats = np.concatenate([df[df['squad'] == team1].drop(columns=['squad']).values.flatten(), 
                                  df[df['squad'] == team2].drop(columns=['squad']).values.flatten()])
    match_outcome = rf_model.predict([match_stats])
    if match_outcome > 0:
        return team1
    elif match_outcome < 0:
        return team2
    else:
        return "Draw"

# Simulates results for a group
def simulate_group(df, t1, t2, t3, t4, scaler, rfmodel):
    # Initialize new dict to track points
    group = {t1: 0, t2: 0, t3: 0, t4: 0}

    # Interate to sim matches
    for team, i in group:
        for opp, j in group:
            if (team != opp):
                result = simulate_match(df, team, opp, scaler, rf_model)
                if result == team:
                    group[team] += 3
                elif result == opp:
                    group[opp] += 3
                else:
                    group[team] += 1
                    group[opp] += 1

    # Sort group by points, returns decending list of tuples in the form [(team, points), ...]
    group_new = sorted(group.item(), key = lambda x: x[1], reverse=True)

    # Returns list of just teams in rank order
    teams_ranked = []
    for tuple in group_new:
        teams_ranked.append(tuple[0])

    return teams_ranked

def simulate_groupstage():
    
    groups = EURO_groups
    for group in groups:
        group_ranked = []
        group_ranked = simulate_group(group[1][0], group[1][1], group[1][2], group[1][3])
        groups.update({group: group_ranked})
    
    return groups

def knockout_stage(df,groups,scaler,rf_model):
    
    list39 = ['Group A','Group D','Group E','Group F']
    choice39 = random.choice(list39)
    
    list40 = list39.remove('Group A') #contains D,E,F
    if choice39 in list40:
        list40.remove(choice39)
    choice40 = random.choice(list40)
    
    list41 = ['Group A','Group B','Group C']
    if choice39 in list41:
        list41.remove(choice39)
    choice41 = random.choice(list41)
    
    list43 = ['Group A','Group B','Group C','Group D']
    if choice39 in list43:
        list43.remove(choice39)
    if choice40 in list43:
        list43.remove(choice40)
    if choice41 in list43:
        list43.remove(choice41)
    choice43 = random.choice(list43)
    
    r16 = {
        'match37': (groups['Group A'][0], groups['Group C'][1]),
        'match38': (groups['Group A'][1], groups['Group B'][1]),
        'match39': (groups['Group B'][0], groups[choice39][2]),
        'match40': (groups['Group C'][0], groups[choice40][2]),
        'match41': (groups['Group F'][0], groups[choice41][2]),
        'match42': (groups['Group D'][1], groups['Group E'][1]),
        'match43': (groups['Group E'][0], groups[choice43][1]),
        'match44': (groups['Group D'][0], groups['Group F'][1]),
    }
    
    qf = {
        'match45': (simulate_match(df,r16['match39'][0],r16['match39'][1],scaler,rf_model),simulate_match(df,r16['match37'][0],r16_['match37'][1],scaler,rf_model)),
        'match46': (simulate_match(df,r16['match41'][0],r16['match41'][1],scaler,rf_model),simulate_match(df,r16['match42'][0],r16['match42'][1],scaler,rf_model)),
        'match47': (simulate_match(df,r16['match43'][0],r16['match43'][1],scaler,rf_model),simulate_match(df,r16['match44'][0],r16['match44'][1],scaler,rf_model)),
        'match48': (simulate_match(df,r16['match40'][0],r16['match40'][1],scaler,rf_model),simulate_match(df,r16['match38'][0],r16['match38'][1],scaler,rf_model))
    }
    
    sf = {

    'match49': (simulate_match(df, qf['match45'][0], qf['match45'][1], scaler, rf_model), simulate_match(df, qf['match46'][0], qf['match46'][1], scaler, rf_model)),
    'match50': (simulate_match(df, qf['match47'][0], qf['match47'][1], scaler, rf_model), simulate_match(df, qf['match48'][0], qf['match48'][1], scaler, rf_model)),

    }

    final = {
        
    'match51': (simulate_match(df, qf['match49'][0], qf['match49'][1], scaler, rf_model), simulate_match(df, qf['match50'][0], qf['match50'][1], scaler, rf_model))
    
    }



simulate_match(df, 'France', 'Georgia', 0, rf_model)

Mean Absolute Error: 1.6666000000000025
Mean Squared Error: 3.528233400000012
R-squared: 0.8860036251550865




ValueError: X has 50 features, but RandomForestRegressor is expecting 25 features as input.

In [None]:
print("Number of feature columns:", len(feature_columns))
print("Shape of X_train:", X_train.shape)

print(X_train.shape[1],
X_test.shape[1])

duplicate_columns = df.columns[df.columns.duplicated()]
print("Duplicate columns:", duplicate_columns)

