In [1]:
import numpy as np
import pandas as pd
import random as rnd
# import statistics

from itertools import combinations
from collections import Counter
from get_match_data import *

from group_stages import *
from prediction_model import get_model

In [2]:
# World Cup edition to simulate. Used below to load data (get_data), to pick
# the groups (groups_list[YEAR]) and to filter the rankings table.
YEAR = 2018

In [3]:
# Load the rankings table and the raw match data for the chosen year.
rankings, data = get_data(YEAR)
# Fit the classifier; report=True prints the validation/test accuracy and the
# classification reports shown below. `match_data` is the per-team match table
# used by the simulation cells further down.
logreg, match_data = get_model(data, report=True)

Mean Validation accuracy: 0.7466066164815621
Test data model accuracy: 0.754321554229769

               precision    recall  f1-score   support

       False       0.78      0.84      0.81      3980
        True       0.71      0.62      0.67      2557

    accuracy                           0.75      6537
   macro avg       0.74      0.73      0.74      6537
weighted avg       0.75      0.75      0.75      6537

Baseline {predict higher ranked team}:
               precision    recall  f1-score   support

       False       0.84      0.69      0.76      3980
        True       0.62      0.79      0.70      2557

    accuracy                           0.73      6537
   macro avg       0.73      0.74      0.73      6537
weighted avg       0.75      0.73      0.73      6537



In [4]:
# Inspect the match table returned by get_model. Judging by the output, each
# match appears twice, once from each team's perspective.
match_data

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,year,team_rank,team_points,opponent_rank,opponent_points,rank_diff,avg_rank,score_diff,team_won,result,comp
589,1998-01-10,Zambia,Malawi,1,0,COSAFA Cup qualification,Blantyre,Malawi,False,1998,54,1616.70,119,1333.92,-65,86.5,1,True,1,False
10,1998-01-10,Malawi,Zambia,0,1,COSAFA Cup qualification,Blantyre,Malawi,False,1998,119,1333.92,54,1616.70,65,86.5,-1,False,-1,False
4,1998-01-18,Lesotho,Zimbabwe,0,2,COSAFA Cup qualification,Maseru,Lesotho,False,1998,152,1159.80,86,1471.12,66,119.0,-2,False,-1,False
681,1998-01-18,Zimbabwe,Lesotho,2,0,COSAFA Cup qualification,Maseru,Lesotho,False,1998,86,1471.12,152,1159.80,-66,119.0,2,True,1,False
393,1998-01-24,Namibia,South Africa,3,2,COSAFA Cup qualification,Windhoek,Namibia,False,1998,112,1364.70,46,1657.97,66,79.0,1,True,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33723,2018-12-06,Philippines,Vietnam,1,2,AFF Championship,Hanoi,Vietnam,False,2018,149,1172.39,128,1277.79,21,138.5,-1,False,-1,False
33718,2018-12-11,Malaysia,Vietnam,2,2,AFF Championship,Kuala Lumpur,Malaysia,False,2018,160,1057.30,128,1277.79,32,144.0,0,False,0,False
34477,2018-12-11,Vietnam,Malaysia,2,2,AFF Championship,Kuala Lumpur,Malaysia,False,2018,128,1277.79,160,1057.30,-32,144.0,0,False,0,False
33720,2018-12-15,Malaysia,Vietnam,0,1,AFF Championship,Hanoi,Vietnam,False,2018,160,1057.30,128,1277.79,32,144.0,-1,False,-1,False


In [5]:
def get_ranks_and_scores(year):
    """Collect the most recent rank and rating points of every team in a World Cup.

    For each team listed in ``groups_list[year]`` the last row of ``match_data``
    whose ``team`` column equals that team is used, so the result reflects the
    row order of ``match_data`` (presumably chronological -- verify upstream).

    Parameters
    ----------
    year : key into the notebook-level ``groups_list`` mapping.

    Returns
    -------
    (team_ranks, team_scores) : tuple of dict
        ``team_ranks[team]`` is the ``team_rank`` value and ``team_scores[team]``
        the ``team_points`` value of that team's last row.

    Raises
    ------
    IndexError
        If a team has no rows in ``match_data`` (raised by ``.iloc[-1]``).
    """
    team_ranks = {}
    team_scores = {}

    for group in groups_list[year]:
        for team in group:
            # Filter the frame once per team and reuse the row for both lookups
            # instead of scanning the whole table twice.
            latest_row = match_data[match_data["team"] == team].iloc[-1]
            team_ranks[team] = latest_row["team_rank"]
            team_scores[team] = latest_row["team_points"]

    return team_ranks, team_scores

# Module-level lookup tables read by get_probabs() for every simulated match.
team_ranks, team_scores = get_ranks_and_scores(YEAR)

In [6]:
def get_probabs(team1, team2):
    """Estimate win/draw/loss probabilities for ``team1`` against ``team2``.

    The fitted ``logreg`` model is queried twice, once from each team's
    perspective (always with ``neutral=True``); column 1 of ``predict_proba``
    is taken as that side's win probability and rounded to two decimals.
    Whatever probability mass is left over is treated as the draw probability.

    Parameters
    ----------
    team1, team2 : keys of the notebook-level ``team_ranks`` / ``team_scores``.

    Returns
    -------
    list
        ``[team1_prob, tie_prob, team2_prob]`` -- non-negative, two-decimal
        values that sum to 1 and can be passed directly as ``p`` to
        ``np.random.choice``.

    Raises
    ------
    KeyError
        If either team is missing from ``team_ranks`` / ``team_scores``.
    """
    rank1, rank2 = team_ranks[team1], team_ranks[team2]
    points1, points2 = team_scores[team1], team_scores[team2]

    rank_diff = rank1 - rank2
    avg_rank = (rank1 + rank2) / 2

    cols = ['rank_diff', 'avg_rank', 'neutral', 'team_points', 'opponent_points']
    # Row 0 is the match seen from team1's side, row 1 the mirrored view for
    # team2; both are scored in a single predict_proba call.
    rows = pd.DataFrame(
        np.array([
            [rank_diff, avg_rank, True, points1, points2],
            [-1 * rank_diff, avg_rank, True, points2, points1],
        ]),
        columns=cols,
    )
    win_probs = logreg.predict_proba(rows)[:, 1]
    team1_prob = round(win_probs[0], 2)   # probability team1 wins
    team2_prob = round(win_probs[1], 2)   # probability team2 wins

    # Round the remainder as well: plain float subtraction can yield values
    # such as -5e-17, which np.random.choice rejects as a negative probability.
    tie_prob = round(1 - team1_prob - team2_prob, 2)

    if tie_prob <= 0:
        # The two predictions are made independently and may sum to 1 or more.
        # In that case no mass is left for a draw, so rescale the win
        # probabilities to sum to exactly 1.
        total = team1_prob + team2_prob
        team1_prob = round(team1_prob / total, 2)
        team2_prob = round(1 - team1_prob, 2)
        tie_prob = 0.0

    return [team1_prob, tie_prob, team2_prob]
    
def simulate_group_match(team1, team2, pr=False):
    """Play one group-stage match by drawing a single random outcome.

    Returns a 4-tuple ``(result, team1_prob, tie_prob, team2_prob)`` where
    ``result`` is 1 if team1 wins, 0 for a draw and -1 if team2 wins, and the
    remaining entries are the probabilities reported by ``get_probabs``.
    ``pr`` is accepted for signature compatibility and is not used here.
    """
    win1, draw, win2 = get_probabs(team1, team2)
    outcome = np.random.choice([1, 0, -1], p=[win1, draw, win2])  # one trial
    return outcome, win1, draw, win2

def simulate_group(group, pr=True):
    """Play a full round robin among ``group`` and return the points table.

    Every pair of teams meets once. A win is worth 3 points and a draw 1 point
    to each side. When ``pr`` is true, each result and the final table are
    printed.

    Returns a list of ``(team, points)`` tuples ordered by points, highest
    first. The sort is stable and looks at points only, so teams level on
    points keep the order in which they appear in ``group``.
    """
    points = dict.fromkeys(group, 0)
    n_draws = 0

    for home, away in combinations(group, 2):
        outcome, p_home, p_draw, p_away = simulate_group_match(home, away)

        if outcome == 1:        # first team wins
            points[home] += 3
            summary = f"{home} beats {away}:"
        elif outcome == -1:     # second team wins
            points[away] += 3
            summary = f"{home} loses to {away}"
        else:                   # draw: both sides take a point
            n_draws += 1
            points[home] += 1
            points[away] += 1
            summary = f"{home} draws {away}"

        if pr:
            shown_probs = [round(p_home, 2), round(p_draw, 2), round(p_away, 2)]
            print(summary, shown_probs, "-> ", outcome)

    standings = sorted(points.items(), key=lambda entry: entry[1], reverse=True)

    if pr:
        print(standings, "draws:", n_draws)
    return standings


In [7]:
def get_knockout_round(groups_list, pr=True):
    """Simulate every group and return the two teams that advance from each.

    ``groups_list`` is an iterable of groups (each an iterable of team names).
    The result has one ``[first, second]`` list per group, in the same order as
    the input. ``pr`` is forwarded to ``simulate_group``; when true, a blank
    line is printed after each group.
    """
    advancing = []
    for group in groups_list:
        ranked = simulate_group(group, pr=pr)
        advancing.append([team for team, _ in ranked[:2]])

        if pr:
            print("")
    return advancing

def simulate_knockout_match(team1, team2, pr=False, trials=1):
    """Simulate a knockout match between two teams and return the winner.

    The outcome is the most common result of ``trials`` random draws from the
    win/draw/loss probabilities given by ``get_probabs``. Knockout matches
    cannot end level, so a drawn outcome is resolved by a second draw using
    only the two win probabilities, rescaled to sum to 1 (reported as
    " in OT").

    Parameters
    ----------
    team1, team2 : team names understood by ``get_probabs``.
    pr : bool
        When true, print a one-line summary of the result.
    trials : int
        Number of draws used for the majority vote (default 1, i.e. a single
        random draw, which is what the notebook used previously).

    Returns
    -------
    The name of the winning team (``team1`` or ``team2``).
    """
    team1_prob, tie_prob, team2_prob = get_probabs(team1, team2)

    draws = np.random.choice([1, 0, -1], trials, p=[team1_prob, tie_prob, team2_prob])
    result = Counter(draws).most_common()[0][0]

    tie_str = ""
    if result == 0:  # no ties in the knockout round
        total = team1_prob + team2_prob
        if total > 0:
            t1, t2 = team1_prob / total, team2_prob / total
        else:
            # Both rounded win probabilities are 0: dividing would produce NaN
            # and make np.random.choice fail, so fall back to a coin flip.
            t1, t2 = 0.5, 0.5
        overtime = np.random.choice([1, -1], trials, p=[t1, t2])
        result = Counter(overtime).most_common()[0][0]  # either 1 or -1
        tie_str = " in OT"

    team1_won = result == 1

    if pr:
        verb = "beats" if team1_won else "loses to"
        print(f"{team1} {verb} {team2}{tie_str}: ".ljust(40), f"[{team1_prob} vs {team2_prob}]")

    return team1 if team1_won else team2

In [8]:
def simulate_knockout_round(gw, pr=False):
    """Play the knockout bracket and return the top four teams.

    ``gw`` must hold eight ``[winner, runner_up]`` pairs, one per group. In the
    round of 16 each group winner meets the runner-up of the neighbouring
    group (groups 0/1, 2/3, 4/5, 6/7). The third-place match is simulated
    before the final.

    Returns ``(champ, second, third, fourth)``.
    """
    # First half of the bracket: winner of the even group vs runner-up of the
    # odd group; second half: winner of the odd group vs runner-up of the even.
    first_half, second_half = [], []
    for k in range(0, 8, 2):
        first_half += [gw[k][0], gw[k + 1][1]]
        second_half += [gw[k + 1][0], gw[k][1]]
    r16 = first_half + second_half

    def play_round(teams):
        """Pair up adjacent teams and return the winners in bracket order."""
        return [simulate_knockout_match(a, b, pr=pr)
                for a, b in zip(teams[0::2], teams[1::2])]

    def loser_of(pair, winner):
        """Return the team in ``pair`` that is not ``winner``."""
        return [t for t in pair if t != winner][0]

    if pr:
        print("Group Winners:", gw)
        print("\nR16: ", r16)

    qf = play_round(r16)    # quarter-finalists
    if pr:
        print("\nQF : ", qf)

    sf = play_round(qf)     # semi-finalists
    if pr:
        print("\nSF : ", sf)

    final = play_round(sf)  # finalists
    bronze = [t for t in sf if t not in final]  # losing semi-finalists

    if pr:
        print("\nB  :", bronze)
        print("F  :", final)

    third = simulate_knockout_match(bronze[0], bronze[1], pr=pr)
    fourth = loser_of(bronze, third)
    champ = simulate_knockout_match(final[0], final[1], pr=pr)
    second = loser_of(final, champ)
    return champ, second, third, fourth

In [13]:
# Run one full tournament: group stage silent (pr=False), knockout stage
# printed. Uncomment the three timing lines to measure a single simulation.
#import time
#s = time.time()
simulate_knockout_round(get_knockout_round(groups_list[YEAR], False), pr=True)
#time.time() - s

Group Winners: [['Uruguay', 'Russia'], ['Spain', 'Portugal'], ['France', 'Peru'], ['Argentina', 'Croatia'], ['Brazil', 'Switzerland'], ['Sweden', 'Mexico'], ['Tunisia', 'Belgium'], ['Colombia', 'Poland']]

R16:  ['Uruguay', 'Portugal', 'France', 'Croatia', 'Brazil', 'Mexico', 'Tunisia', 'Poland', 'Spain', 'Russia', 'Argentina', 'Peru', 'Sweden', 'Switzerland', 'Colombia', 'Belgium']
Uruguay loses to Portugal:               [0.21 vs 0.48]
France beats Croatia:                    [0.51 vs 0.18]
Brazil beats Mexico:                     [0.7 vs 0.09]
Tunisia loses to Poland:                 [0.14 vs 0.61]
Spain beats Russia:                      [0.78 vs 0.06]
Argentina loses to Peru in OT:           [0.45 vs 0.22]
Sweden loses to Switzerland:             [0.25 vs 0.42]
Colombia loses to Belgium in OT:         [0.33 vs 0.33]

QF :  ['Portugal', 'France', 'Brazil', 'Poland', 'Spain', 'Peru', 'Switzerland', 'Belgium']
Portugal loses to France:                [0.32 vs 0.33]
Brazil beats Polan

('France', 'Peru', 'Belgium', 'Brazil')

In [14]:
def evaluate_tournament(iterations):
    """Simulate the whole tournament ``iterations`` times and print a summary.

    Each run simulates the groups for ``YEAR`` and then the knockout bracket.
    The champion and the (unordered) set of top-four teams are tallied, and
    the ten most frequent entries of each tally are printed as percentages of
    ``iterations``.
    """
    champion_counts = Counter()
    final_four_counts = Counter()

    for _ in range(iterations):
        advancing = get_knockout_round(groups_list[YEAR], False)
        top_four = simulate_knockout_round(advancing, pr=False)
        final_four_counts[frozenset(top_four)] += 1  # order within the four is ignored
        champion_counts[top_four[0]] += 1            # first entry is the champion

    def most_frequent(counts):
        """Return (key, count) pairs sorted by count, highest first."""
        return sorted(counts.items(), key=lambda entry: entry[1], reverse=True)

    print(f"Results for {YEAR} World Cup...")

    print("Most common winners")
    for team, num in most_frequent(champion_counts)[:10]:  # top 10 winners
        print(f"{team}: {round((num / iterations) * 100, 1)}%")

    print("\nMost common final fours")
    for teams, num in most_frequent(final_four_counts)[:10]:  # top 10 final fours
        print(f"{list(teams)}: {round((num / iterations) * 100, 1)}%")

# 100 simulated tournaments; results vary between runs because no random seed is set.
evaluate_tournament(100)

Results for 2018 World Cup...
Most common winners
Brazil: 37.0%
Germany: 28.0%
Spain: 12.0%
France: 5.0%
Portugal: 3.0%
England: 3.0%
Argentina: 3.0%
Mexico: 3.0%
Belgium: 2.0%
Iran: 1.0%

Most common final fours
['Brazil', 'Germany', 'Portugal', 'Spain']: 9.0%
['Brazil', 'Germany', 'France', 'Portugal']: 5.0%
['Brazil', 'Colombia', 'France', 'Spain']: 4.0%
['Brazil', 'Germany', 'France', 'Argentina']: 3.0%
['Brazil', 'Germany', 'France', 'Iran']: 2.0%
['Spain', 'Brazil', 'Uruguay', 'Switzerland']: 2.0%
['Brazil', 'Germany', 'Uruguay', 'Portugal']: 2.0%
['Spain', 'Brazil', 'Colombia', 'Portugal']: 2.0%
['England', 'Germany', 'France', 'Spain']: 2.0%
['Brazil', 'Argentina', 'Iceland', 'Germany']: 2.0%


In [15]:
def evaluate_groups(group, iterations):
    """Simulate a single group ``iterations`` times and print a summary.

    For every run the group winner and the (unordered) pair of advancing teams
    are tallied; both tallies are printed as percentages of ``iterations``,
    most frequent first.

    Parameters
    ----------
    group : iterable of team names forming one group.
    iterations : int
        Number of simulated group stages.
    """
    winner_counts = Counter()
    pair_counts = Counter()

    for _ in range(iterations):
        # get_knockout_round expects a list of groups; wrap the single group
        # and take its only result, a [first, second] list.
        top_two = get_knockout_round([group], False)[0]
        winner_counts[top_two[0]] += 1
        pair_counts[frozenset(top_two)] += 1  # order within the pair is ignored

    print(f"Results for {YEAR} World Cup...")

    print("Most common group winners")
    for team, num in sorted(winner_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"{team}: {round((num / iterations) * 100, 1)}%")

    # These are the two teams advancing from the group, not a final four.
    print("\nMost common advancing pairs")
    for teams, num in sorted(pair_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"{list(teams)}: {round((num / iterations) * 100, 1)}%")

#for group in groups_list[YEAR]:
#    evaluate_groups(group, 500)
#    print('\n\n')

In [16]:
# Reference draw rates noted for comparison with the simulated tie frequency:
#   World Cup games: 199 / 900     = 0.2211
#   all games:       10084 / 43752 = 0.2305
# Show up to the first 15 ranking rows for the selected year.
rankings[rankings["Year"] == YEAR][:15]

Unnamed: 0,Rank,Team,Score,Year
0,1,Brazil,2121.91,2018
1,2,Germany,2099.42,2018
2,3,Spain,2039.0,2018
3,4,France,1986.47,2018
4,5,Argentina,1984.39,2018
5,6,Portugal,1983.57,2018
6,7,England,1937.57,2018
7,8,Colombia,1929.62,2018
8,9,Belgium,1928.18,2018
9,10,Italy,1901.35,2018
