In [1]:
import numpy as np
import pandas as pd
import random as rnd
# import statistics

from itertools import combinations
from collections import Counter
from get_match_data import *

from group_stages import *
from prediction_model import get_model
from simulation import Simulation

In [2]:
YEAR = 2018  # tournament year: passed to get_data/Simulation, indexes groups_list, and filters the rankings table

In [3]:
# Fetch the rankings table and the model input data for YEAR, then build the
# predictor. With report=True the call prints the accuracy figures and the
# classification reports (model and "higher ranked team" baseline) seen in
# this cell's output.
rankings, data = get_data(YEAR)
logreg, match_data = get_model(data, report=True)

Mean Validation accuracy: 0.7466066164815621
Test data model accuracy: 0.754321554229769

               precision    recall  f1-score   support

       False       0.78      0.84      0.81      3980
        True       0.71      0.62      0.67      2557

    accuracy                           0.75      6537
   macro avg       0.74      0.73      0.74      6537
weighted avg       0.75      0.75      0.75      6537

Baseline {predict higher ranked team}:
               precision    recall  f1-score   support

       False       0.84      0.69      0.76      3980
        True       0.62      0.79      0.70      2557

    accuracy                           0.73      6537
   macro avg       0.73      0.74      0.73      6537
weighted avg       0.75      0.73      0.73      6537



In [4]:
# Display the match table returned by get_model (pandas truncates the middle rows).
match_data

Unnamed: 0,date,team,opponent,team_score,opponent_score,tournament,city,country,neutral,year,team_rank,team_points,opponent_rank,opponent_points,rank_diff,avg_rank,score_diff,team_won,result,comp
589,1998-01-10,Zambia,Malawi,1,0,COSAFA Cup qualification,Blantyre,Malawi,False,1998,54,1616.70,119,1333.92,-65,86.5,1,True,1,False
10,1998-01-10,Malawi,Zambia,0,1,COSAFA Cup qualification,Blantyre,Malawi,False,1998,119,1333.92,54,1616.70,65,86.5,-1,False,-1,False
4,1998-01-18,Lesotho,Zimbabwe,0,2,COSAFA Cup qualification,Maseru,Lesotho,False,1998,152,1159.80,86,1471.12,66,119.0,-2,False,-1,False
681,1998-01-18,Zimbabwe,Lesotho,2,0,COSAFA Cup qualification,Maseru,Lesotho,False,1998,86,1471.12,152,1159.80,-66,119.0,2,True,1,False
393,1998-01-24,Namibia,South Africa,3,2,COSAFA Cup qualification,Windhoek,Namibia,False,1998,112,1364.70,46,1657.97,66,79.0,1,True,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33723,2018-12-06,Philippines,Vietnam,1,2,AFF Championship,Hanoi,Vietnam,False,2018,149,1172.39,128,1277.79,21,138.5,-1,False,-1,False
33718,2018-12-11,Malaysia,Vietnam,2,2,AFF Championship,Kuala Lumpur,Malaysia,False,2018,160,1057.30,128,1277.79,32,144.0,0,False,0,False
34477,2018-12-11,Vietnam,Malaysia,2,2,AFF Championship,Kuala Lumpur,Malaysia,False,2018,128,1277.79,160,1057.30,-32,144.0,0,False,0,False
33720,2018-12-15,Malaysia,Vietnam,0,1,AFF Championship,Hanoi,Vietnam,False,2018,160,1057.30,128,1277.79,32,144.0,-1,False,-1,False


In [5]:
# Build the simulator for YEAR from that year's entry in groups_list, the
# fitted model and the match table. `s` is read as a notebook-level global by
# evaluate_tournament() and evaluate_groups() further down.
s = Simulation(YEAR, groups_list[YEAR], logreg, match_data)

In [6]:
import time  # NOTE(review): consider moving this to the imports cell at the top of the notebook

# Time one verbose knockout simulation. perf_counter() is the clock intended
# for measuring short intervals; unlike time.time() it is not affected by
# system clock adjustments.
tm = time.perf_counter()
s.simulate_knockout_round(s.get_knockout_round(False), pr=True)
time.perf_counter() - tm  # elapsed seconds, shown as the cell result

Group Winners: [['Russia', 'Uruguay'], ['Spain', 'Portugal'], ['Peru', 'France'], ['Argentina', 'Croatia'], ['Brazil', 'Serbia'], ['Germany', 'Sweden'], ['Belgium', 'England'], ['Colombia', 'Senegal']]

R16:  ['Russia', 'Portugal', 'Peru', 'Croatia', 'Brazil', 'Sweden', 'Belgium', 'Senegal', 'Spain', 'Uruguay', 'Argentina', 'France', 'Germany', 'Serbia', 'Colombia', 'England']
Russia loses to Portugal in OT:          [0.09 vs 0.72]
Peru loses to Croatia:                   [0.39 vs 0.28]
Brazil beats Sweden:                     [0.76 vs 0.07]
Belgium beats Senegal:                   [0.59 vs 0.15]
Spain beats Uruguay:                     [0.56 vs 0.16]
Argentina beats France in OT:            [0.32 vs 0.33]
Germany beats Serbia:                    [0.78 vs 0.06]
Colombia loses to England in OT:         [0.32 vs 0.34]

QF :  ['Portugal', 'Croatia', 'Brazil', 'Belgium', 'Spain', 'Argentina', 'Germany', 'England']
Portugal beats Croatia:                  [0.51 vs 0.19]
Brazil beats Belgium

0.16873812675476074

In [7]:
def evaluate_tournament(iterations, top_n=10):
    """Simulate the knockout stage repeatedly and print the most frequent outcomes.

    Every iteration asks the notebook-level ``Simulation`` instance ``s`` for a
    knockout bracket and plays it out silently (``pr=False``). Two tallies are
    kept over all iterations: how often each team is the first entry of the
    list returned by ``simulate_knockout_round`` and how often each unordered
    set of returned teams occurs.

    Relies on the notebook-level globals ``s`` and ``YEAR``.

    Parameters
    ----------
    iterations : int
        Number of tournaments to simulate; printed percentages are relative
        to this number.
    top_n : int, optional
        How many of the most frequent winners and final fours to print.
        Defaults to 10, the number printed before this became a parameter.

    Returns
    -------
    None
        The summary is printed, nothing is returned.
    """
    winner_counts = Counter()
    final_four_counts = Counter()
    for _ in range(iterations):
        gw = s.get_knockout_round(False)
        final_four = s.simulate_knockout_round(gw, pr=False)
        # NOTE(review): treats the first returned team as the champion --
        # confirm against Simulation.simulate_knockout_round.
        winner_counts[final_four[0]] += 1
        # frozenset makes the tally independent of the order of the teams.
        final_four_counts[frozenset(final_four)] += 1

    print(f"Results for {YEAR} World Cup...")

    # most_common() orders by count and keeps first-seen order for ties.
    print("Most common winners")
    for team, num in winner_counts.most_common(top_n):
        print(f"{team}: {round((num / iterations) * 100, 1)}%")

    print("\nMost common final fours")
    for teams, num in final_four_counts.most_common(top_n):
        # A frozenset iterates in hash order, which varies between kernels;
        # sort the names so the same final four always prints the same way.
        print(f"{sorted(teams)}: {round((num / iterations) * 100, 1)}%")

# Run 100 simulated tournaments; with 100 runs every printed percentage is a whole number of percent.
# NOTE(review): no random seed is set in the visible cells, so these figures presumably change between runs.
evaluate_tournament(100)

Results for 2018 World Cup...
Most common winners
Brazil: 37.0%
Germany: 27.0%
Spain: 17.0%
France: 6.0%
Portugal: 5.0%
Argentina: 2.0%
Colombia: 2.0%
Peru: 1.0%
Mexico: 1.0%
Serbia: 1.0%

Most common final fours
['Brazil', 'Spain', 'Portugal', 'Germany']: 7.0%
['Brazil', 'Germany', 'Argentina', 'Spain']: 4.0%
['England', 'Germany', 'Spain', 'Portugal']: 2.0%
['Brazil', 'Germany', 'Argentina', 'Iran']: 2.0%
['Brazil', 'Spain', 'Portugal', 'Colombia']: 2.0%
['France', 'Brazil', 'Germany', 'Spain']: 2.0%
['France', 'Peru', 'Brazil', 'Germany']: 2.0%
['Brazil', 'Spain', 'Mexico', 'Portugal']: 2.0%
['Brazil', 'France', 'Spain', 'Colombia']: 2.0%
['Colombia', 'Spain', 'England', 'Portugal']: 2.0%


In [8]:
def evaluate_groups(group, iterations):
    """Simulate a single group repeatedly and print how often each outcome occurs.

    Every iteration calls ``s.get_knockout_round_group(group, False)`` on the
    notebook-level ``Simulation`` instance ``s``. Two tallies are kept: how
    often each team is the first entry of the returned list (reported as the
    group winner) and how often each unordered set of returned teams occurs.

    Relies on the notebook-level globals ``s`` and ``YEAR``.

    Parameters
    ----------
    group : object
        A group as accepted by ``Simulation.get_knockout_round_group``
        (presumably one entry of ``groups_list[YEAR]`` -- confirm).
    iterations : int
        Number of simulations; printed percentages are relative to this.

    Returns
    -------
    None
        The summary is printed, nothing is returned.
    """
    winner_counts = Counter()
    qualifier_counts = Counter()
    for _ in range(iterations):
        gw = s.get_knockout_round_group(group, False)
        winner_counts[gw[0]] += 1
        # frozenset makes the tally independent of the order of the teams.
        qualifier_counts[frozenset(gw)] += 1

    print(f"Results for {YEAR} World Cup...")

    # most_common() orders by count and keeps first-seen order for ties.
    print("Most common group winners")
    for team, num in winner_counts.most_common():
        print(f"{team}: {round((num / iterations) * 100, 1)}%")

    # This header used to read "final fours", copied from
    # evaluate_tournament; the tally is over the teams coming out of one group.
    print("\nMost common advancing teams")
    for teams, num in qualifier_counts.most_common():
        # A frozenset iterates in hash order, which varies between kernels;
        # sort the names so the same set always prints the same way.
        print(f"{sorted(teams)}: {round((num / iterations) * 100, 1)}%")

#for group in groups_list[YEAR]:
#    print(group)
#    evaluate_groups(group, 100)
#    print('\n\n')

In [9]:
# Show the first 15 ranking rows for YEAR.
rankings[rankings["Year"] == YEAR][:15]
# NOTE: 199/900 World Cup games end in a draw (0.2211111111111111).
# NOTE: 10084/43752 of all games end in a draw (0.2304808923020662).

Unnamed: 0,Rank,Team,Score,Year
0,1,Brazil,2121.91,2018
1,2,Germany,2099.42,2018
2,3,Spain,2039.0,2018
3,4,France,1986.47,2018
4,5,Argentina,1984.39,2018
5,6,Portugal,1983.57,2018
6,7,England,1937.57,2018
7,8,Colombia,1929.62,2018
8,9,Belgium,1928.18,2018
9,10,Italy,1901.35,2018
