In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from itertools import combinations
from copy import deepcopy

In [2]:
games = pd.read_csv("data/international_matches.csv", usecols=[
    "date",
    "home_team",
    "away_team",
    "home_team_score",
    "away_team_score",
    "tournament",
    "home_team_continent",
    "away_team_continent"])
games['year'] = pd.DatetimeIndex(games['date']).year
del games['date']

qual_data = games[(games["tournament"] == "FIFA World Cup qualification") & (games["year"] > 2018)]
del qual_data["tournament"], qual_data["year"]

wc_data = games[(games["tournament"] == "FIFA World Cup")  & (games["year"] > 2000)]
del wc_data["home_team"], wc_data["away_team"], wc_data["tournament"], wc_data["year"]

del games

In [3]:
teams_qualified = np.array(["Qatar", "Ecuador", "Senegal", "Netherlands",
             "England", "Iran", "United States", "Wales",
             "Argentina", "Saudi Arabia", "Mexico", "Poland",
             "France", "Australia", "Denmark", "Tunisia",
             "Spain", "Costa Rica", "Germany", "Japan",
             "Belgium", "Canada", "Morocco", "Croatia",
             "Brazil", "Serbia", "Switzerland", "Cameroon",
             "Portugal", "Ghana", "Uruguay", "South Korea"])
continents = np.array(["Asia", "South America", "Africa", "Europe",
             "Europe", "Asia", "North America", "Europe",
             "South America", "Asia", "North America", "Europe",
             "Europe", "Asia", "Europe", "Africa",
             "Europe", "South America", "Europe", "Asia",
             "Europe", "North America", "Africa", "Europe",
             "South America", "Europe", "Europe", "Africa",
             "Europe", "Africa", "South America", "Asia"])

In [4]:
avg_goals = np.zeros(len(teams_qualified))
avg_conceded = np.zeros(len(teams_qualified))

for idx, team in enumerate(teams_qualified):
    home_mask = qual_data["home_team"] == team
    away_mask = qual_data["away_team"] == team
    continent_mask = qual_data["home_team_continent"] == continents[idx]
    avg_goals[idx] = np.mean(list(qual_data[home_mask]["home_team_score"]) + list(qual_data[away_mask]["away_team_score"]))
    avg_continent = np.mean(list(qual_data[continent_mask]["home_team_score"]) + list(qual_data[continent_mask]["away_team_score"]))
    avg_conceded[idx] = np.mean(list(qual_data[home_mask]["away_team_score"]) + list(qual_data[away_mask]["home_team_score"]))/avg_continent

In [5]:
def get_continent_mods(home_cont, away_cont):
    if home_cont == away_cont:
        return 1.0, 1.0
    
    home_mask = (wc_data["home_team_continent"] == home_cont) & (wc_data["away_team_continent"] == away_cont)
    away_mask = (wc_data["away_team_continent"] == home_cont) & (wc_data["home_team_continent"] == away_cont)
    home_goals = np.mean(list(wc_data[home_mask]["home_team_score"]) + list(wc_data[away_mask]["away_team_score"]))
    
    home_mask = (wc_data["home_team_continent"] == away_cont) & (wc_data["away_team_continent"] == home_cont)
    away_mask = (wc_data["away_team_continent"] == away_cont) & (wc_data["home_team_continent"] == home_cont)
    away_goals = np.mean(list(wc_data[home_mask]["home_team_score"]) + list(wc_data[away_mask]["away_team_score"]))
    
    return np.sqrt(home_goals/away_goals), np.sqrt(away_goals/home_goals)
    

def play_game(home_team, away_team, no_draw=False, echo=False):
    home_idx = np.where(teams_qualified == home_team)[0][0]
    away_idx = np.where(teams_qualified == away_team)[0][0]
    
    home_c_mod, away_c_mod = get_continent_mods(continents[home_idx], continents[away_idx])
    
    home_goals = avg_goals[home_idx]*home_c_mod*avg_conceded[away_idx]
    away_goals = avg_goals[away_idx]*away_c_mod*avg_conceded[home_idx]
    
    if echo:
        print(f"{home_goals} = {avg_goals[home_idx]}*{home_c_mod}*{avg_conceded[away_idx]}")
        print(f"{away_goals} = {avg_goals[away_idx]}*{away_c_mod}*{avg_conceded[home_idx]}")
    
    return np.random.poisson(lam=home_goals) + no_draw*np.random.rand(), np.random.poisson(lam=away_goals) + no_draw*np.random.rand()


def evaluate_group(teams):
    points = np.zeros(len(teams))
    goals = np.zeros(len(teams))
    conceded = np.zeros(len(teams))
    for home_team, away_team in combinations(teams, 2):
        h_goals, a_goals = play_game(home_team, away_team)
        points[teams == home_team] = points[teams == home_team] + 3*(h_goals>a_goals) + (h_goals == a_goals)
        goals[teams == home_team] = goals[teams == home_team] + h_goals
        conceded[teams == home_team] = conceded[teams == home_team] + a_goals
        points[teams == away_team] = points[teams == away_team] + 3*(a_goals>h_goals) + (a_goals == h_goals)
        goals[teams == away_team] = goals[teams == away_team] + a_goals
        conceded[teams == away_team] = conceded[teams == away_team] + h_goals
    return points, goals, conceded

def rank_group(points, goals, conceded):
    fake_points = points * 10000 + 5000
    fake_points = fake_points + (goals-conceded)*100
    fake_points = fake_points + goals
    fake_points = fake_points + 0.01*np.random.rand(len(points))
    ranks = np.zeros(len(points))
    ranks[np.argsort(fake_points)] = np.arange(4, 0, -1)
    return ranks

def pair_up(ranks):
    return np.array([
        np.where(ranks[0] == 1)[0][0],
        np.where(ranks[1] == 2)[0][0] + 4,
        np.where(ranks[2] == 1)[0][0] + 8,
        np.where(ranks[3] == 2)[0][0] + 12,
        np.where(ranks[4] == 1)[0][0] + 16,
        np.where(ranks[5] == 2)[0][0] + 20,
        np.where(ranks[6] == 1)[0][0] + 24,
        np.where(ranks[7] == 2)[0][0] + 28,
        np.where(ranks[0] == 2)[0][0],
        np.where(ranks[1] == 1)[0][0] + 4,
        np.where(ranks[2] == 2)[0][0] + 8,
        np.where(ranks[3] == 1)[0][0] + 12,
        np.where(ranks[4] == 2)[0][0] + 16,
        np.where(ranks[5] == 1)[0][0] + 20,
        np.where(ranks[6] == 2)[0][0] + 24,
        np.where(ranks[7] == 1)[0][0] + 28,
    ])

def elimination(ids):
    _ids = deepcopy(ids)
    winners = []
    _ids.shape = (len(_ids) // 2, 2)
    for home_idx, away_idx in _ids:
        result = play_game(teams_qualified[home_idx], teams_qualified[away_idx], no_draw=True)
        if result[0] > result[1]:
            winners = winners + [home_idx]
        if result[0] < result[1]:
            winners = winners + [away_idx]
    return np.array(winners)

In [6]:
def run_tournament():
    group_points = [evaluate_group(teams_qualified[s:s+4]) for s in range(0, 32, 4)]
    group_ranks = [rank_group(*g) for g in group_points]
    winners = pair_up(group_ranks)
    while len(winners) > 1:
        winners = elimination(winners)
    return winners

In [7]:
winners = np.array([run_tournament() for _ in range(1000)])

In [8]:
unique_winners, win_counts = np.unique(winners, return_counts=True)
sort_order = np.argsort(win_counts)[::-1]

for sort_idx in sort_order:
    print(f"{teams_qualified[unique_winners[sort_idx]]:15s} {win_counts[sort_idx]:>3d}/{len(winners)}")

England         221/1000
Denmark         129/1000
Germany         127/1000
Switzerland     115/1000
Brazil           83/1000
Qatar            68/1000
France           61/1000
Croatia          31/1000
South Korea      29/1000
Netherlands      28/1000
Poland           16/1000
Belgium          15/1000
Canada           15/1000
Portugal         15/1000
Argentina        10/1000
Morocco           9/1000
Japan             7/1000
Spain             7/1000
Tunisia           6/1000
Iran              4/1000
Uruguay           1/1000
Serbia            1/1000
Ecuador           1/1000
Costa Rica        1/1000
