In [1]:
import re
import numpy as np
import pandas as pd
import random as rnd
import statistics
from itertools import combinations
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [2]:
YEAR = 2018  # year for the model

In [3]:
# https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017
matches = pd.read_csv("results.csv") 
matches['date'] = pd.to_datetime(matches["date"])
matches["year"] = [date.year for date in matches["date"]]  # [(i.year, i.month, i.day) for i in matches["date"]]
matches

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,1872
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,1873
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,1874
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,1875
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,1876
...,...,...,...,...,...,...,...,...,...,...
43747,2022-06-14,Moldova,Andorra,2,1,UEFA Nations League,Chișinău,Moldova,False,2022
43748,2022-06-14,Liechtenstein,Latvia,0,2,UEFA Nations League,Vaduz,Liechtenstein,False,2022
43749,2022-06-14,Chile,Ghana,0,0,Kirin Cup,Suita,Japan,True,2022
43750,2022-06-14,Japan,Tunisia,0,3,Kirin Cup,Suita,Japan,False,2022


In [4]:
def duplicate_matches(matches):
    """duplicate the dataset so every game is represented twice (home/away -> team/opponent)""" 
    duplic = matches.copy()
    duplic1, duplic2 = duplic.copy(), duplic.copy()

    duplic1 = duplic1.rename(columns={"home_team": "team", "away_team": "opponent", "home_score": "team_score", "away_score": "opponent_score"})
    duplic2 = duplic2.rename(columns={"away_team": "team", "home_team": "opponent", "away_score": "team_score", "home_score": "opponent_score"})
    duplic = pd.concat([duplic1, duplic2])
    return duplic

duplic = duplicate_matches(matches)
duplic.shape

(87504, 10)

In [5]:
wc_start = {2022: "2022-11-21", 2018: "2018-06-14", 2014: "2014-06-12", 2010: "2010-06-11"}

def get_elo_rankings(start, end):
    """Get ELO rankings and scores for teams from a given time frame
        ex: start_date = '2010-1-1', end_date='2010-12-31'."""
    start, end = pd.to_datetime(start), pd.to_datetime(end)
    s_day, s_mon, s_yr = str(start.day).zfill(2), str(start.month).zfill(2), start.year
    e_day, e_mon, e_yr = str(end.day).zfill(2), str(end.month).zfill(2), end.year

    link = f"https://www.international-football.net/average-elo-ratings?init=1&start-year={s_yr}&start-month={s_mon}&start-day={s_day}&end-year={e_yr}&end-month={e_mon}&end-day={e_day}&type=day"
    df = pd.read_html(link)
    df = pd.concat(df)
    
    df["Rank"] = df[0]
    df["Team"] = [re.split('\d', row)[0] for row in df[2]]
    df["Score"] = df[3]
    df["Year"] = e_yr
    df = df[["Rank", "Team", "Score", "Year"]]
    return df

def wc_rankings(start, end):
    """Get all rankings from start date to end date"""
    start, end = pd.to_datetime(start), pd.to_datetime(end)
    s_day, s_mon, s_yr = str(start.day).zfill(2), str(start.month).zfill(2), start.year
    e_day, e_mon, e_yr = str(end.day).zfill(2), str(end.month).zfill(2), end.year
    
    years = []
    years.append(get_elo_rankings(f"{s_yr}-{s_mon}-{s_day}", f"{s_yr}-12-31"))
    years += [get_elo_rankings(f"{yr}-01-01", f"{yr}-12-31") for yr in range(s_yr+1, e_yr)]
    years.append(get_elo_rankings(f"{e_yr}-01-01", f"{e_yr}-{e_mon}-{e_day}"))    
    return pd.concat(years)


In [6]:
def get_data(year):
    """Get the data up to {year}"""
    rankings = wc_rankings("1998-01-01", wc_start[year])
    
    data = duplicate_matches(matches)
    data = data.merge(rankings, left_on=["team", "year"], right_on=["Team", "Year"]).rename(columns={"Rank": "team_rank", "Score": "team_points"}).drop(columns=["Team", "Year"])
    data = data.merge(rankings, left_on=["opponent", "year"], right_on=["Team", "Year"]).rename(columns={"Rank": "opponent_rank", "Score": "opponent_points"}).drop(columns=["Team", "Year"])

    data = data[data['tournament'] != "Friendly"]
    data['rank_diff'] = data['team_rank'] - data['opponent_rank']
    data['avg_rank'] = (data['team_rank'] + data['opponent_rank']) / 2
    data['score_diff'] = data['team_score'] - data['opponent_score']
    data['team_won'] = data['score_diff'] > 0
    data['result'] = np.sign(data['team_score'] - data['opponent_score']) # -1 if home lost, 1 if home win, and 0 if tie
    data['comp'] = data['tournament'] == "FIFA World Cup"
    return rankings, data

In [7]:
def get_model(year, report=False):
    """Get the model trained on data before {year}"""
    rankings, match_data = get_data(year)
    
    X, y = match_data[[ "rank_diff", "avg_rank", "neutral", "team_points", "opponent_points"]], match_data["team_won"]
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=6, stratify = y)

    logreg = LogisticRegression() #LogisticRegressionCV(cv=5, random_state=0).fit(X, y)
    logreg.fit(X_train, y_train)
    
    if report:
        scores = cross_val_score(logreg, X_train, y_train, cv=5)
        print(f"Mean Validation accuracy: {scores.mean()}")

        y_pred = logreg.predict(X_test)
        print(f"Test data model accuracy: {logreg.score(X_test, y_test)}")
        print("\n", classification_report(y_test, y_pred))

        t2 = X_test['rank_diff'] < 0  # t2 = -1 * np.sign(X_test['rank_diff'])
        print("Baseline {predict higher ranked team}:\n", classification_report(y_test, t2))
        
    return logreg, rankings, match_data

logreg, rankings, match_data = get_model(YEAR, report=True)
# print(match_data.groupby("result")["result"].count())

Mean Validation accuracy: 0.7466066164815621
Test data model accuracy: 0.754321554229769

               precision    recall  f1-score   support

       False       0.78      0.84      0.81      3980
        True       0.71      0.62      0.67      2557

    accuracy                           0.75      6537
   macro avg       0.74      0.73      0.74      6537
weighted avg       0.75      0.75      0.75      6537

Baseline {predict higher ranked team}:
               precision    recall  f1-score   support

       False       0.84      0.69      0.76      3980
        True       0.62      0.79      0.70      2557

    accuracy                           0.73      6537
   macro avg       0.73      0.74      0.73      6537
weighted avg       0.75      0.73      0.73      6537



In [8]:
# 2010 World Cup Groups
group_a10 = ["Uruguay", "Mexico", "South Africa", "France"]
group_b10 = ["Argentina", "South Korea", "Greece", "Nigeria"]
group_c10 = ["United States", "England", "Slovenia", "Algeria"]
group_d10 = ["Germany", "Ghana", "Australia", "Serbia"]
group_e10 = ["Netherlands", "Japan", "Denmark", "Cameroon"]
group_f10 = ["Paraguay", "Slovakia", "New Zealand", "Italy"]
group_g10 = ["Brazil", "Portugal", "Ivory Coast", "North Korea"]
group_h10 = ["Spain", "Chile", "Switzerland", "Honduras"]

# 2014 World Cup Groups
group_a14 = ["Brazil", "Mexico", "Croatia", "Cameroon"]
group_b14 = ["Netherlands", "Chile", "Spain", "Australia"]
group_c14 = ["Colombia", "Greece", "Ivory Coast", "Japan"]
group_d14 = ["Costa Rica", "Uruguay", "Italy", "England"]
group_e14 = ["France", "Switzerland", "Ecuador", "Honduras"]
group_f14 = ["Argentina", "Nigeria", "Bosnia and Herzegovina", "Iran"]
group_g14 = ["Germany", "United States", "Portugal", "Ghana"]
group_h14 = ["Belgium", "Algeria", "Russia", "South Korea"]

# 2018 World Cup Groups
group_a18 = ["Uruguay", "Russia", "Saudi Arabia", "Egypt"]
group_b18 = ["Spain", "Portugal", "Iran", "Morocco"]
group_c18 = ["France", "Denmark", "Peru", "Australia"]
group_d18 = ["Croatia", "Argentina", "Nigeria", "Iceland"]
group_e18 = ["Brazil", "Switzerland", "Serbia", "Costa Rica"]
group_f18 = ["Sweden", "Mexico", "South Korea", "Germany"]
group_g18 = ["Belgium", "England", "Tunisia", "Panama"]
group_h18 = ["Colombia", "Japan", "Senegal", "Poland"]

# 2022 World Cup Groups
group_a22 = ["Qatar", "Ecuador", "Senegal", "Netherlands"]
group_b22 = ["England", "Iran", "United States", "Wales"]
group_c22 = ["Argentina", "Saudi Arabia", "Mexico", "Poland"]
group_d22 = ["France", "Australia", "Denmark", "Tunisia"]
group_e22 = ["Spain", "Costa Rica", "Germany", "Japan"]
group_f22 = ["Belgium", "Canada", "Morocco", "Croatia"]
group_g22 = ["Brazil", "Serbia", "Switzerland", "Cameroon"]
group_h22 = ["Portugal", "Ghana", "Uruguay", "South Korea"]

groups_list10 = [group_a10, group_b10, group_c10, group_d10, group_e10, group_f10, group_g10, group_h10]
all_teams10 = group_a10 + group_b10 + group_c10 + group_d10 + group_e10 + group_f10 + group_g10 + group_h10
groups_list14 = [group_a14, group_b14, group_c14, group_d14, group_e14, group_f14, group_g14, group_h14]
all_teams14 = group_a14 + group_b14 + group_c14 + group_d14 + group_e14 + group_f14 + group_g14 + group_h14
groups_list18 = [group_a18, group_b18, group_c18, group_d18, group_e18, group_f18, group_g18, group_h18]
all_teams18 = group_a18 + group_b18 + group_c18 + group_d18 + group_e18 + group_f18 + group_g18 + group_h18
groups_list22 = [group_a22, group_b22, group_c22, group_d22, group_e22, group_f22, group_g22, group_h22]
all_teams22 = group_a22 + group_b22 + group_c22 + group_d22 + group_e22 + group_f22 + group_g22 + group_h22

groups_list = {2022: groups_list22, 2018: groups_list18, 2014: groups_list14, 2010: groups_list10}
all_teams = {2022: all_teams22, 2018: all_teams18, 2014: all_teams14, 2010: all_teams10}

In [9]:
#for i in range(len(all_teams18)):
#   team = all_teams18[i]
#    print(i, team, list(rankings[rankings["Team"] == team]["Year"])[-1])

#for i in range(len(all_teams22)):
#    team = all_teams22[i]
#   print(i, team, matches[(matches["home_team"] == team) | (matches["away_team"] == team)].shape[0])

In [10]:
def get_probabs(team1, team2):
    """Get the probabilities"""
    rank1 = list(rankings[rankings["Team"] == team1]["Rank"])[-1]
    rank2 = list(rankings[rankings["Team"] == team2]["Rank"])[-1]
    points1 = list(rankings[rankings["Team"] == team1]["Score"])[-1]
    points2 = list(rankings[rankings["Team"] == team2]["Score"])[-1]
    
    rank_diff = rank1 - rank2
    avg_rank = (rank1 + rank2) / 2

    cols = ['rank_diff', 'avg_rank', 'neutral', 'team_points', 'opponent_points']
    row = pd.DataFrame(np.array([[rank_diff, avg_rank, True, points1, points2]]), columns=cols)
    team1_prob = round(logreg.predict_proba(row)[:,1][0], 2)   # probablity team1 wins

    row = pd.DataFrame(np.array([[-1 * rank_diff, avg_rank, True, points2, points1]]), columns=cols)
    team2_prob = round(logreg.predict_proba(row)[:,1][0], 2)   # probablity team2 wins

    tie_prob = 1 - team1_prob - team2_prob # probability of a tie
    return [team1_prob, tie_prob, team2_prob]
    
def simulate_group_match(team1, team2, pr=False):
    """Simulate a group stage match between two teams. Returns result (-1, 0, or 1)"""
    trials = 1
    team1_prob, tie_prob, team2_prob = get_probabs(team1, team2)
    #result = Counter(np.random.choice([1, 0, -1], trials, p =[team1_prob, tie_prob, team2_prob]))
    #result = result.most_common()[0][0]  # either -1, 0, or 1
    result = np.random.choice([1, 0, -1], p = [team1_prob, tie_prob, team2_prob]) # either -1, 0, or 1 (1 trial)
    return result, team1_prob, tie_prob, team2_prob

def simulate_group(group, pr=True):
    """Simulate a group or a matchup"""
    table = {team: 0 for team in group}
    draws = 0
    
    for team1, team2 in combinations(group, 2):  # each match
        result, team1_prob, tie_prob, team2_prob = simulate_group_match(team1, team2)
        
        if result == 1: # team1 wins
            table[team1] += 3
            result_str = f"{team1} beats {team2}:"

        elif result == -1: # team2 wins
            table[team2] += 3
            result_str = f"{team1} loses to {team2}"

        else: # result == 0 (tie)
            draws += 1
            table[team1] += 1
            table[team2] += 1
            result_str = f"{team1} draws {team2}"
            
        if pr:
            print(result_str, [round(team1_prob, 2), round(tie_prob, 2), round(team2_prob, 2)],  "-> ", result)

    standings = sorted(table.items(), key=lambda x: x[1], reverse=True)
    
    if pr:
        print(standings, "draws:", draws)
    return standings


In [11]:
def get_knockout_round(groups_list, pr=True):
    """Get teams advancing to the knockout round"""
    gw = []  # group winners
    for group in groups_list:
        standings = [i[0] for i in simulate_group(group, pr=pr)]
        gw.append(standings[:2])
        
        if pr:
            print("")
    return gw

def simulate_knockout_match(team1, team2, pr=False):
    """Simulate a match between two teams
    Returns the winning team"""
    trials = 1 #5
    team1_prob, tie_prob, team2_prob = get_probabs(team1, team2)

    result = Counter(np.random.choice([1, 0, -1], trials, p =[team1_prob, tie_prob, team2_prob]))
    result = result.most_common()[0][0]
    
    tie_str = ""
    if result == 0:  # no ties for knockout round
        total = team1_prob + team2_prob
        t1, t2 = team1_prob / total, team2_prob / total
        result = Counter(np.random.choice([1, -1], trials, p =[t1, t2])) 
        result = result.most_common()[0][0] # either 1 or -1
        tie_str = " in OT"
        
    if pr:
        if result == 1: # team1 wins
            print(f"{team1} beats {team2}{tie_str}: ".ljust(40), f"[{team1_prob} vs {team2_prob}]")
            
        else: # result == -1: # team2 wins
            print(f"{team1} loses to {team2}{tie_str}: ".ljust(40), f"[{team1_prob} vs {team2_prob}]")

    if result == 1:
        return team1
    else:
        return team2

In [12]:
def simulate_knockout_round(gw, pr=False):
    """Simulate the knockout stages, determine top 4 teams"""
    r16 = [gw[0][0], gw[1][1], gw[2][0], gw[3][1], gw[4][0], gw[5][1], gw[6][0], gw[7][1],
           gw[1][0], gw[0][1], gw[3][0], gw[2][1], gw[5][0], gw[4][1], gw[7][0], gw[6][1]]

    if pr:
        print("\n", r16)

    qf = [simulate_knockout_match(r16[2*i], r16[2*i+1], pr=pr) for i in range(8)]

    if pr:
        print("\nQF: ", qf)

    sf = [simulate_knockout_match(qf[2*i], qf[2*i+1], pr=pr) for i in range(4)]
    
    if pr:
        print("\nSF: ", sf)

    final = [simulate_knockout_match(sf[2*i], sf[2*i+1], pr=pr) for i in range(2)]
    bronze = [t for t in sf if t not in final]
    
    if pr:
        print("\nB :", bronze)
        print("F :", final)
        
    third = simulate_knockout_match(bronze[0], bronze[1], pr=pr)
    fourth = [t for t in bronze if t != third][0]
    champ = simulate_knockout_match(final[0], final[1], pr=pr)
    second = [t for t in final if t != champ][0]
    return champ, second, third, fourth

In [16]:
#import time
#s = time.time()
simulate_knockout_round(get_knockout_round(groups_list[YEAR], False), pr=True)
#time.time() - s


 ['Egypt', 'Portugal', 'France', 'Nigeria', 'Brazil', 'Mexico', 'Belgium', 'Colombia', 'Spain', 'Russia', 'Argentina', 'Australia', 'Germany', 'Switzerland', 'Poland', 'England']
Egypt loses to Portugal:                 [0.07 vs 0.77]
France beats Nigeria in OT:              [0.72 vs 0.09]
Brazil beats Mexico:                     [0.7 vs 0.09]
Belgium beats Colombia in OT:            [0.33 vs 0.33]
Spain beats Russia:                      [0.78 vs 0.06]
Argentina beats Australia in OT:         [0.69 vs 0.1]
Germany loses to Switzerland:            [0.65 vs 0.11]
Poland loses to England in OT:           [0.22 vs 0.47]

QF:  ['Portugal', 'France', 'Brazil', 'Belgium', 'Spain', 'Argentina', 'Switzerland', 'England']
Portugal loses to France:                [0.32 vs 0.33]
Brazil beats Belgium:                    [0.6 vs 0.13]
Spain beats Argentina:                   [0.4 vs 0.25]
Switzerland loses to England in OT:      [0.25 vs 0.42]

SF:  ['France', 'Brazil', 'Spain', 'England']
France 

('Brazil', 'England', 'Spain', 'France')

In [17]:
iterations = 100
winners_dict = {}
finalists_dict = {}
for i in range(iterations):
    gw = get_knockout_round(groups_list[YEAR], False)
    final_four = simulate_knockout_round(gw, pr=False)
    final_four_set = frozenset(final_four)
    if final_four_set in finalists_dict.keys():
        finalists_dict[final_four_set] += 1
    else:
        finalists_dict[final_four_set] = 1
        
    winner = final_four[0]
    if winner in winners_dict.keys():
        winners_dict[winner] += 1
    else:
        winners_dict[winner] = 1

print(f"Results for {YEAR} World Cup...")

print("Most common winners")        
for team, num in sorted(winners_dict.items(), key=lambda x:x[1], reverse=True)[:5]:  # top 5 most common winners
    print(f"{team}: {round((num / iterations) * 100, 1)}%")

print("\nMost common final fours")
for teams, num in sorted(finalists_dict.items(), key=lambda x:x[1], reverse=True)[:5]: # top 5 most commmon final fours
    print(f"{list(teams)}: {round((num / iterations) * 100, 1)}%")


Results for 2018 World Cup...
Most common winners
Brazil: 30.0%
Germany: 24.0%
Spain: 17.0%
Portugal: 7.0%
Colombia: 7.0%

Most common final fours
['Germany', 'France', 'Spain', 'Brazil']: 10.0%
['Germany', 'Spain', 'Brazil', 'Portugal']: 10.0%
['Colombia', 'Spain', 'Brazil', 'Portugal']: 6.0%
['Germany', 'France', 'Argentina', 'Brazil']: 4.0%
['Argentina', 'Germany', 'Iran', 'Brazil']: 3.0%


In [18]:
# 199/900 world cup games end in draw  # 0.2211111111111111
# 10084/43752 of all games end in draw. # 0.2304808923020662