In [116]:
import random
import pandas as pd
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np


numtrials = 1000

# ELO + LOCATION INITIALISATION

In [117]:
countries = [
    "Brazil",
    "Spain",
    "France",
    "Argentina",
    "Uruguay",
    "Colombia",
    "United Kingdom",
    "Paraguay",
    "Germany",
    "Ecuador",
    "Portugal",
    "Italy",
    "Morocco",
    "Egypt",
    "South Korea",
    "Japan",
    "Mexico",
    "Costa Rica",
    "New Zealand",
    "Australia",
    "Turkey",
    "Switzerland", 
    "Norway",
    "Netherlands"
]

pools = {
    1: ['Mexico', 'Costa Rica', 'Ecuador', 'Colombia'],
    2: ['Brazil', 'Paraguay', 'Uruguay', "Argentina"],
    3: ['South Korea', 'Japan', 'New Zealand', 'Australia'],
    4: ['United Kingdom', 'Germany', 'France', 'Italy'],
    5: ['Spain', 'Portugal', 'Morocco', 'Egypt']
}


elo_ratings = [
    1994,
    2150,
    2031,
    2140,
    1922,
    1953,
    2012,
    1799,
    1988,
    1911,
    1988,
    1914,
    1807,
    1668,
    1745,
    1875,
    1817,
    1653,
    1596,
    1736,
    1837,
    1812,
    1828,
    1967
]

def get_elo(name):
    return countriesratings[name]

countriesratings = {country: elo_ratings[countries.index(country)] for country in countries}
countries_ranked = sorted(countries, key=get_elo, reverse=True)

In [118]:
import folium
from IPython.display import display

locations = [
    (-14.2350, -51.9253),  # Brazil
    (40.4637, -3.7492),    # Spain
    (46.6034, 1.8883),     # France
    (-38.4161, -63.6167),  # Argentina
    (-32.5228, -55.7659),  # Uruguay
    (4.5709, -74.2973),    # Colombia
    (55.3781, -3.4360),    # United Kingdom
    (-23.4420, -58.4438),  # Paraguay
    (51.1657, 10.4515),    # Germany
    (-1.8312, -78.1834),   # Ecuador
    (39.3999, -8.2245),    # Portugal
    (41.8719, 12.5674),    # Italy
    (31.7915, -7.0926),    # Morocco
    (26.8206, 30.8025),    # Egypt
    (35.9078, 127.7669),   # South Korea
    (36.2048, 138.2529),   # Japan
    (23.6345, -102.5528),  # Mexico
    (9.7489, -83.7534),    # Costa Rica
    (-40.9006, 174.8860),  # New Zealand
    (-25.2744, 133.7751),  # Australia
    (38.9637, 35.2433),    # Turkey
    (46.8182, 8.2275),     # Switzerland
    (60.4720, 8.4689),     # Norway
    (52.1326, 5.2913)      # Netherlands
]

locationdict = {country: locations[countries.index(country)] for country in countries}

games_played_country = {country: 0 for country in countries}
airtime_by_country = {country: 0 for country in countries}

# DISTANCE CALCULATION

In [119]:
from math import radians, sin, cos, sqrt, atan2

# DISTANCE CALCULATIONS
def haversine(pos1, pos2):
    lat1, lon1 = pos1
    lat2, lon2 = pos2
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    # Haversine formula
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    R = 6371.0 # Radius of Earth in kilometers
    return R * c # Distance in kilometers


def estimate_flight_time(pos1, pos2, speed_kmh=900):
    distance = haversine(pos1, pos2)
    time_hours = distance / speed_kmh
    return time_hours

# ELO SYSTEM + EVALUATION

In [120]:
# Elo system
def winrate(country1, country2):
    return (1/(1 + 10**((countriesratings[country2]-countriesratings[country1])/400)))

# Evaluation
def evaluate_ranking(ranking, true_ranking):
    """Determines how good a ranking is."""

    loss = 0

    # For each pair of countries in the ranking
    for i, A in enumerate(countries):
        for j, B in enumerate(countries):
            if i >= j:
                continue
            rank_A = ranking.index(A) + 1
            rank_B = ranking.index(B) + 1
            true_A = true_ranking.index(A) + 1
            true_B = true_ranking.index(B) + 1
            if (rank_A < rank_B) == (true_A < true_B):
                continue

            W = (
                1 / true_A
                + 1 / true_B
                + 1 / rank_A
                + 1 / rank_B
            )
            loss += W * (abs(0.5 - winrate(A, B)))
    return loss

# MATCH SIMULATION

In [121]:

def simulate_match(country1, country2):
    
    global totaltime
    totaltime += estimate_flight_time(locationdict[country1], locationdict[country2])
    
    global total_games_played
    total_games_played += 1
    
    games_played_country[country1] += 1
    games_played_country[country2] += 1
    
    result_country = random.choice([country1, country2])
    airtime_by_country[result_country] += 2*estimate_flight_time(locationdict[country1], locationdict[country2])
    
    percent1 = winrate(country1, country2)
    
    if random.random() < percent1:
        result = 'win'
    else:
        result = 'loss'
    
    if result == 'win':
        return country1, 3, country2, 0  # country1 wins, gets 3 points, country2 gets 0
    elif result == 'loss':
        return country1, 0, country2, 3  # country2 wins, gets 3 points, country1 gets 0
    else:
        return country1, 1, country2, 1  # Draw, both get 1 point each

In [122]:
def init_standings(teams):
    return {team: {'wins': 0, 'losses': 0, 'draws': 0, 'points': 0} for team in teams}

def sort_standings(standings):
    df = pd.DataFrame(standings).T
    df = df.sort_values(by=['points', 'wins'], ascending=False)
    return df

# TOURNAMENT SIMULATIONS

In [123]:
def round_robin(teams):
    standings = init_standings(teams)
    for team1, team2 in combinations(teams, 2):
        t1, p1, t2, p2 = simulate_match(team1, team2)
        standings[t1]['points'] += p1
        standings[t2]['points'] += p2
        if p1 == 3:
            standings[t1]['wins'] += 1
            standings[t2]['losses'] += 1
        elif p2 == 3:
            standings[t2]['wins'] += 1
            standings[t1]['losses'] += 1
        else:
            standings[t1]['draws'] += 1
            standings[t2]['draws'] += 1
    return standings

In [124]:
def simulate_knockout(teams):
    seeded = []
    
    for country in teams:
        seeded.append((country, countriesratings[country]))
    
    seeded.sort(key=lambda x: x[1], reverse=True)
    quarter_finalists = [seeded[0][0], seeded[7][0], seeded[3][0], seeded[4][0], seeded[1][0], seeded[6][0], seeded[2][0], seeded[5][0]]
    semi_finalists = []
    bottom4 = []
    thirdfourth = []
    

    # Quarterfinals
    for i in range(0, 8, 2):
        team1, team2 = quarter_finalists[i], quarter_finalists[i+1]
        winner = simulate_match(team1, team2)[0 if random.random() < winrate(team1, team2) else 2]
        semi_finalists.append(winner)
        if winner != team1:
            bottom4.append(team1)
        else:
            bottom4.append(team2)
    
    # Sort bottom4 based on the order they appear in the list seeded
    bottom4.sort(key=lambda x: next(i for i, v in enumerate(seeded) if v[0] == x))

    # Semifinals
    finalists = []
    for i in range(0, 4, 2):
        team1, team2 = semi_finalists[i], semi_finalists[i+1]
        winner = simulate_match(team1, team2)[0 if random.random() < winrate(team1, team2) else 2]
        finalists.append(winner)
        if winner != team1:
            thirdfourth.append(team1)
        else:
            thirdfourth.append(team2)
    thirdfourth.sort(key=lambda x: next(i for i, v in enumerate(seeded) if v[0] == x))
    

    # Final
    final_winner = simulate_match(finalists[0], finalists[1])[0 if random.random() < winrate(team1, team2) else 2]
    if final_winner != finalists[0]:
        second = finalists[0]
    else:
        second = finalists[1]
    
    return {
        bottom4[0]: 5,
        bottom4[1]: 6,
        bottom4[2]: 7,
        bottom4[3]: 8,
        thirdfourth[0]: 3,
        thirdfourth[1]: 4,
        second: 2,
        final_winner: 1
    }

# PLOTTING

In [125]:
# ---------------------------- PLOTTING ---------------------------------
# Calculate rankings based on countriesratings
def plotgraph(avgstandings, title):
    ratings_ranking = {country: rank for rank, country in enumerate(sorted(countriesratings, key=countriesratings.get, reverse=True), start=1)}


    # Prepare data for the bar graph
    avg_standings_values = [avgstandings[country] for country in countries]

    # Perform max-min normalization for ratings
    min_avg_standing = min(avg_standings_values)
    max_avg_standing = max(avg_standings_values)

    normalized_ratings = {
        country: max_avg_standing - ((max_avg_standing - min_avg_standing) * (countriesratings[country] - min(countriesratings.values())) / 
                                    (max(countriesratings.values()) - min(countriesratings.values())))
        for country in countries
    }

    ratings_ranking_values = [normalized_ratings[country] for country in countries]



    x = np.arange(len(countries))  # the label locations
    width = 0.35  # the width of the bars

    fig, ax = plt.subplots(figsize=(12, 6))
    rects1 = ax.bar(x - width/2, avg_standings_values, width, label='Average Standing')
    rects2 = ax.bar(x + width/2, ratings_ranking_values, width, label='Ratings Ranking')

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_xlabel('Countries')
    ax.set_ylabel('Rankings')
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(countries, rotation=45, ha='right')
    ax.legend()
    fig.tight_layout()

    ax.yaxis.set_major_locator(plt.MaxNLocator(integer=True))
    plt.show()

# K-MEANS ASSIGNMENT

In [126]:
def calculate_centroid(cluster):
    if not cluster:
        return (0, 0)
    latitudes = [point[0] for point in cluster]
    longitudes = [point[1] for point in cluster]
    return (sum(latitudes) / len(latitudes), sum(longitudes) / len(longitudes))

def k_cluster(k, data):
    # Points of latitude and longitude
    
    # Forgy Method
    random_indices = list(np.random.choice(len(data), size=k, replace=False))
    centroids = [data[indx] for indx in random_indices]
    clusters = {i: [] for i in range(k)}
    
    # Random Partition
    # random_indices = np.random.randint(0, k, len(data))
    # clusters = {i: [] for i in range(k)}
    # for idx, point in zip(random_indices, data):
    #     clusters[idx].append(point)
    # centroids = [calculate_centroid(cluster) for cluster in clusters.values()]
    
    
    converged = False
    while not converged: 
        clusters = {i: [] for i in range(k)}
        
        for point in data:
            distance_to_centroids = [haversine(point, centroid) for centroid in centroids]
            idx = distance_to_centroids.index(min(distance_to_centroids))
            count = 1
            while len(clusters[idx]) >= len(data)/k:
                idx = sorted(range(len(distance_to_centroids)), key=lambda x: distance_to_centroids[x])[count]
                count += 1
            
            clusters[idx].append(point)
        
        new_centroids = [calculate_centroid(cluster) for cluster in clusters.values()]
        
        converged = (new_centroids == centroids)
        centroids = new_centroids
        
        
        if converged:
            return clusters

# SINGLE ROUND ROBIN MODEL

In [127]:
import statistics

poolsize_index = {}

for size in range(2, 9):
    pools = k_cluster(size, locations)
    pools = {pool_name: [countries[locations.index(location)] for location in teams] for pool_name, teams in pools.items()}
    print(pools)

    fits = {}
    avgstandings = {country: 0 for country in countries}

    loss_total = 0
    totaltime = 0
    total_games_played = 0
    games_played_country = {country: 0 for country in countries}
    total_std = 0
    
    if size == 6:
        to_plot = pools

    for _ in range(numtrials):
        airtime_by_country = {country: 0 for country in countries}
        pool_results = {}
        direct_to_quarters = []
        qualifier_candidates = []
        bottom4 = []
        ninth16 = []

        # ---------- Phase 1: Group Roundrobin ----------
        for pool_name, teams in pools.items():
            standings = round_robin(teams)
            df = sort_standings(standings)
            pool_results[pool_name] = df
            direct_to_quarters.append(df.index[0])      # Top 1 to quarters
            
            short = False
            if size != 2:
                if len(df.index[1:]) >= 2:
                    qualifier_candidates.extend(df.index[1:3])
                else:
                    qualifier_candidates.extend(df.index[1:])
                    short = True
            else:
                qualifier_candidates.extend(df.index[1:5])

            if len(list(df.index[3:])) == 1:
                bottom4.append(df.index[3])
            else:
                if size != 2:
                    if not short:
                        bottom4.extend(df.index[3:])
                else:
                    bottom4.extend(df.index[5:])
                

        # ---------- Phase 2: Qualifiers: Next 10 teams run in qualifiers ----------
        qualifier_results = round_robin(qualifier_candidates)
        qualifier_df = sort_standings(qualifier_results)
        qualifier_top4 = list(qualifier_df.index[:8-size])
        ninth16.extend(qualifier_df.index[8-size:])

        # ---------- Phase 3: Knockout ----------
        knockout_teams = direct_to_quarters + qualifier_top4
        knockout_results = simulate_knockout(knockout_teams)
        
        # ---------- Phase 4: Placement Round ----------
        if len(bottom4) > 0:
            placement_results = round_robin(bottom4)
            placement_df = sort_standings(placement_results)
        
        
        sorted_placement = sorted(placement_results.keys(), key=lambda x: placement_results[x]['points'], reverse=True)
        standings = init_standings(countries)


        sorted_knockout = sorted(knockout_results.items(), key=lambda x: x[1])
        for country, position in sorted_knockout:
            standings[country] = position
            
        rank = 9
        for country in ninth16:
            standings[country] = rank
            rank += 1
        
        for country in sorted_placement:
            standings[country] = rank
            rank += 1
            
        sorted_countries = sorted(standings.keys(), key=lambda x: standings[x])
        
        loss_total += evaluate_ranking(sorted_countries, countries_ranked)
            
        for country, position in standings.items():
            avgstandings[country] += position
            
        total_std += statistics.stdev(airtime_by_country.values())
            
    for country, position in avgstandings.items():
        avgstandings[country] = position/numtrials
    
    airtime_std = total_std/numtrials

    for country, num in games_played_country.items():
        games_played_country[country] = num/numtrials

    std_dev = statistics.stdev(games_played_country.values())
    poolsize_index[size] = {'games': total_games_played/numtrials, 'time': totaltime/numtrials, 'deviation': std_dev, 'loss': loss_total/numtrials, 'airtimedev': airtime_std/(totaltime/numtrials)}
    # fits['singlerobinbalanced'] = {'games': total_games_played/numtrials, 'time': totaltime/numtrials, 'deviation': std_dev, 'loss': loss_total/numtrials}
print(poolsize_index)

{0: ['Brazil', 'Argentina', 'Uruguay', 'Colombia', 'Paraguay', 'Ecuador', 'Morocco', 'Mexico', 'Costa Rica', 'New Zealand', 'Norway', 'Netherlands'], 1: ['Spain', 'France', 'United Kingdom', 'Germany', 'Portugal', 'Italy', 'Egypt', 'South Korea', 'Japan', 'Australia', 'Turkey', 'Switzerland']}
{0: ['South Korea', 'Japan', 'New Zealand', 'Australia', 'Turkey', 'Switzerland', 'Norway', 'Netherlands'], 1: ['Brazil', 'Argentina', 'Uruguay', 'Colombia', 'Paraguay', 'Ecuador', 'Mexico', 'Costa Rica'], 2: ['Spain', 'France', 'United Kingdom', 'Germany', 'Portugal', 'Italy', 'Morocco', 'Egypt']}
{0: ['Morocco', 'Egypt', 'Mexico', 'Costa Rica', 'Turkey', 'Switzerland'], 1: ['Spain', 'France', 'United Kingdom', 'Germany', 'Portugal', 'Italy'], 2: ['Brazil', 'Argentina', 'Uruguay', 'Colombia', 'Paraguay', 'Ecuador'], 3: ['South Korea', 'Japan', 'New Zealand', 'Australia', 'Norway', 'Netherlands']}
{0: ['Italy', 'Morocco', 'Egypt', 'Turkey', 'Switzerland'], 1: ['Brazil', 'Argentina', 'Uruguay', 'C

# ANALYSIS

In [128]:
# Min-max normalization for each metric in the fits dictionary
normalized_fits = {}
metrics = ['games', 'time', 'deviation', 'loss', 'airtimedev']

# Extract min and max for each metric
min_max = {metric: (min(fit[metric] for fit in poolsize_index.values()), max(fit[metric] for fit in poolsize_index.values())) for metric in metrics}

# Normalize each value
for key, values in poolsize_index.items():
    normalized_fits[key] = {
        metric: (values[metric] - min_max[metric][0]) / (min_max[metric][1] - min_max[metric][0])
        for metric in metrics
    }
    
print(normalized_fits)

# weights = [0.1968708483,	0.2797103175,	0.0807229401,	0.442695894]
# weights = [0.2214610339,	0.3101115125,	0.07330141242,	0.3951260413]

weights = [0.2104155797,	0.3404226382,	0.07273615894,	0.2821103089,	0.0943153142]

# Calculate the overall index for each round format
overall_index = {}
for key, values in normalized_fits.items():
    overall_index[key] = 1 - sum(values[metric] * weight for metric, weight in zip(metrics, weights))
    
for mode, index in overall_index.items():
    print(mode, index)


{2: {'games': 1.0, 'time': 1.0, 'deviation': 0.7943998605263822, 'loss': 0.0, 'airtimedev': 0.0}, 3: {'games': 0.6713286713286714, 'time': 0.6304953840817419, 'deviation': 1.0, 'loss': 0.31202743222153073, 'airtimedev': 0.3184408523553056}, 4: {'games': 0.32167832167832167, 'time': 0.34891646618481553, 'deviation': 0.2705908391958348, 'loss': 0.7268293069097105, 'airtimedev': 0.265414879175958}, 5: {'games': 0.13286713286713286, 'time': 0.13121564347752965, 'deviation': 0.0, 'loss': 0.866483147014235, 'airtimedev': 0.33318952387769557}, 6: {'games': 0.06293706293706294, 'time': 0.07783425254535209, 'deviation': 0.0366045250729469, 'loss': 0.44210908149092365, 'airtimedev': 0.7490309015082354}, 7: {'games': 0.0, 'time': 0.0, 'deviation': 0.4234396766795488, 'loss': 0.8296078599546081, 'airtimedev': 1.0}, 8: {'games': 0.2517482517482518, 'time': 0.35962313868000795, 'deviation': 0.9914855003660934, 'loss': 1.0, 'airtimedev': 0.7715011654492615}}
2 0.3913801875828392
3 0.45331092315782173

In [129]:
import folium

# Create a folium map centered at an average location
map_center = (sum(lat for lat, lon in locations) / len(locations), sum(lon for lat, lon in locations) / len(locations))
folium_map = folium.Map(location=map_center, zoom_start=2)

# Define colors for each group
colors = ['red', 'blue', 'green', 'purple', 'orange', 'pink']

# Get the 6 groupings
six_groupings = to_plot

# Add markers for each country, color-coded by group
for group_idx, group_countries in six_groupings.items():
    for country in group_countries:
        lat, lon = locationdict[country]
        folium.Marker(
            location=(lat, lon),
            popup=f"{country} (Group {group_idx + 1})",
            icon=folium.Icon(color=colors[group_idx % len(colors)])
        ).add_to(folium_map)

# Display the map
display(folium_map)