In [1]:
import pandas as pd
import pickle
from scipy.stats import poisson
import warnings
warnings.filterwarnings("ignore")

In [2]:
dict_tables = pickle.load(open("dict_tables", "rb"))
historical_data_df = pd.read_csv("clean_fifa_worldcup_matches.csv")

In [3]:
dict_tables.keys()

dict_keys(['Group A', 'Group B', 'Group C', 'Group D', 'Group E', 'Group F', 'Group G', 'Group H'])

In [4]:
historical_data_df

Unnamed: 0,HomeTeam,AwayTeam,Year,HomeGoals,AwayGoals,TotalGoals
0,France,Mexico,1930,4,1,5
1,Uruguay,Argentina,1930,4,2,6
2,Uruguay,Yugoslavia,1930,6,1,7
3,Argentina,United States,1930,6,1,7
4,Paraguay,Belgium,1930,1,0,1
...,...,...,...,...,...,...
799,Brazil,Costa Rica,2018,2,0,2
800,Serbia,Switzerland,2018,1,2,3
801,Serbia,Brazil,2018,0,2,2
802,Germany,Mexico,2018,0,1,1


In [5]:
df_fixture = pd.read_csv('clean_fifa_worldcup_fixture.csv')

In [6]:
dict_tables["Group B"]

Unnamed: 0,Pos,Team,Pld,W,D,L,GF,GA,GD,Pts
0,1,England,0,0,0,0,0,0,0,0
1,2,Iran,0,0,0,0,0,0,0,0
2,3,United States,0,0,0,0,0,0,0,0
3,4,Wales,0,0,0,0,0,0,0,0


# Calculate Team Strength

In [7]:
df_home = historical_data_df[["HomeTeam", "HomeGoals", "AwayGoals"]]
df_away = historical_data_df[["AwayTeam", "HomeGoals", "AwayGoals"]]

df_home

Unnamed: 0,HomeTeam,HomeGoals,AwayGoals
0,France,4,1
1,Uruguay,4,2
2,Uruguay,6,1
3,Argentina,6,1
4,Paraguay,1,0
...,...,...,...
799,Brazil,2,0
800,Serbia,1,2
801,Serbia,0,2
802,Germany,0,1


In [8]:
df_home.rename(columns = {"HomeTeam":"Team", "HomeGoals":"GoalsScored", "AwayGoals":"GoalsConceded"}, inplace = True)
df_home

Unnamed: 0,Team,GoalsScored,GoalsConceded
0,France,4,1
1,Uruguay,4,2
2,Uruguay,6,1
3,Argentina,6,1
4,Paraguay,1,0
...,...,...,...
799,Brazil,2,0
800,Serbia,1,2
801,Serbia,0,2
802,Germany,0,1


In [9]:
df_away = df_home.rename(columns = {"AwayTeam":"Team", "HomeGoals":"GoalsConceded", "AwayGoals":"GoalsScored"})
df_away

Unnamed: 0,Team,GoalsScored,GoalsConceded
0,France,4,1
1,Uruguay,4,2
2,Uruguay,6,1
3,Argentina,6,1
4,Paraguay,1,0
...,...,...,...
799,Brazil,2,0
800,Serbia,1,2
801,Serbia,0,2
802,Germany,0,1


In [10]:
# Comcat home and away team
df_team_strength = pd.concat([df_home, df_away], ignore_index = True).groupby("Team").mean()
df_team_strength

Unnamed: 0_level_0,GoalsScored,GoalsConceded
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,0.833333,1.666667
Argentina,1.888889,0.888889
Australia,0.666667,2.000000
Austria,2.285714,1.571429
Belgium,1.761905,1.000000
...,...,...
Uruguay,2.093750,1.093750
Wales,2.000000,1.000000
West Germany,2.500000,0.894737
Yugoslavia,2.733333,0.600000


## Function Predict Points
* Using Poisson Distribution
  - The Poisson distribution is a discrete probability distribution that describes the number of events occurring     in a fixed time interval or region of opportunity.
  - If we think of a goal as an event that might happen in the 90 minutes of a football match, we could calculate     the probability of the number of goals that could be scored in a match by Team A and Team B. We still need to     meet the assumptions of Poisson Distribution;
  
i. The number of events can be counted (a match can have 1, 2, 3 or more goals)
          
ii. The occurrence of events is independent(the occurrence of one goal should not affect the                           probability of another goal)
         
iii. The rate at which events occur is constant (the probability of a goal occurring in a certain time                   interval should be exactly the same for every other time interval of the same length)
        
iv. Two events cannot occur at exactly the same instant in time (two goals can’t occur at the same time)




$$
P(X=x)=\dfrac{\lambda_\mathrm{e}^{x-\lambda}}{x!}
$$

where;

$\lambda$: expected number of events per time interval (median of goals in 90 minutes (Team A and B))

$x$: the number of goals in a match that could be scored by Team A or Team B

In [11]:
def predict_points(home, away):
    if home in df_team_strength.index and away in df_team_strength.index:
        # goals_scored * goals_conceded
        lamb_home = df_team_strength.at[home,'GoalsScored'] * df_team_strength.at[away,'GoalsConceded']
        lamb_away = df_team_strength.at[away,'GoalsScored'] * df_team_strength.at[home,'GoalsConceded']
        prob_home, prob_away, prob_draw = 0, 0, 0
        for x in range(0,11): #number of goals home team
            for y in range(0, 11): #number of goals away team
                p = poisson.pmf(x, lamb_home) * poisson.pmf(y, lamb_away)
                if x == y:
                    prob_draw += p
                elif x > y:
                    prob_home += p
                else:
                    prob_away += p
        
        points_home = 3 * prob_home + prob_draw
        points_away = 3 * prob_away + prob_draw
        return (points_home, points_away)
    else:
        return (0, 0)

#### Test the function

In [12]:
print(predict_points('England', 'United States'))
print(predict_points('Argentina', 'Mexico'))
print(predict_points('Qatar (H)', 'Ecuador')) # Qatar vs Team X -> 0 points to both

(2.1752780278149184, 0.6158770557276528)
(1.7010539637975128, 1.037857088261646)
(0, 0)


In [13]:
df_fixture_group_48 = df_fixture[:48].copy()
df_fixture_knockout = df_fixture[48:56].copy()
df_fixture_quarter = df_fixture[56:60].copy()
df_fixture_semi = df_fixture[60:62].copy()
df_fixture_final = df_fixture[62:].copy()

# Group Stage

for group in dict_tables:
    teams_in_group = dict_tables[group]['Team'].values
    df_fixture_group_6 = df_fixture_group_48[df_fixture_group_48['home'].isin(teams_in_group)]
    for index, row in df_fixture_group_6.iterrows():
        home, away = row['home'], row['away']
        points_home, points_away = predict_points(home, away)
        dict_tables[group].loc[dict_tables[group]['Team'] == home, 'Pts'] += points_home
        dict_tables[group].loc[dict_tables[group]['Team'] == away, 'Pts'] += points_away

    dict_tables[group] = dict_tables[group].sort_values('Pts', ascending=False).reset_index()
    dict_tables[group] = dict_tables[group][['Team', 'Pts']]
    dict_tables[group] = dict_tables[group].round(0)

dict_tables['Group C']

Unnamed: 0,Team,Pts
0,Argentina,6.0
1,Mexico,4.0
2,Poland,4.0
3,Saudi Arabia,2.0


In [14]:
# Knockout
df_fixture_knockout

Unnamed: 0,home,score,away,year
48,Winners Group A,Match 49,Runners-up Group B,2022
49,Winners Group C,Match 50,Runners-up Group D,2022
50,Winners Group D,Match 52,Runners-up Group C,2022
51,Winners Group B,Match 51,Runners-up Group A,2022
52,Winners Group E,Match 53,Runners-up Group F,2022
53,Winners Group G,Match 54,Runners-up Group H,2022
54,Winners Group F,Match 55,Runners-up Group E,2022
55,Winners Group H,Match 56,Runners-up Group G,2022


In [15]:
for group in dict_tables:
    group_winner = dict_tables[group].loc[0, 'Team']
    runners_up = dict_tables[group].loc[1, 'Team']
    df_fixture_knockout.replace({f'Winners {group}':group_winner,
                                 f'Runners-up {group}':runners_up}, inplace=True)

df_fixture_knockout['winner'] = '?'
df_fixture_knockout

Unnamed: 0,home,score,away,year,winner
48,Netherlands,Match 49,Wales,2022,?
49,Argentina,Match 50,Tunisia,2022,?
50,France,Match 52,Mexico,2022,?
51,England,Match 51,Ecuador,2022,?
52,Germany,Match 53,Croatia,2022,?
53,Brazil,Match 54,Uruguay,2022,?
54,Belgium,Match 55,Spain,2022,?
55,Portugal,Match 56,Switzerland,2022,?


In [16]:
def get_winner(df_fixture_updated):
    for index, row in df_fixture_updated.iterrows():
        home, away = row['home'], row['away']
        points_home, points_away = predict_points(home, away)
        if points_home > points_away:
            winner = home
        else:
            winner = away
        df_fixture_updated.loc[index, 'winner'] = winner
    return df_fixture_updated

get_winner(df_fixture_knockout)

Unnamed: 0,home,score,away,year,winner
48,Netherlands,Match 49,Wales,2022,Netherlands
49,Argentina,Match 50,Tunisia,2022,Argentina
50,France,Match 52,Mexico,2022,France
51,England,Match 51,Ecuador,2022,Ecuador
52,Germany,Match 53,Croatia,2022,Germany
53,Brazil,Match 54,Uruguay,2022,Brazil
54,Belgium,Match 55,Spain,2022,Belgium
55,Portugal,Match 56,Switzerland,2022,Portugal


In [17]:
#Quarter Finals

def update_table(df_fixture_round_1, df_fixture_round_2):
    for index, row in df_fixture_round_1.iterrows():
        winner = df_fixture_round_1.loc[index, 'winner']
        match = df_fixture_round_1.loc[index, 'score']
        df_fixture_round_2.replace({f'Winners {match}':winner}, inplace=True)
    df_fixture_round_2['winner'] = '?'
    return df_fixture_round_2

update_table(df_fixture_knockout, df_fixture_quarter)

Unnamed: 0,home,score,away,year,winner
56,Germany,Match 58,Brazil,2022,?
57,Netherlands,Match 57,Argentina,2022,?
58,Belgium,Match 60,Portugal,2022,?
59,Ecuador,Match 59,France,2022,?


In [18]:
get_winner(df_fixture_quarter)

Unnamed: 0,home,score,away,year,winner
56,Germany,Match 58,Brazil,2022,Brazil
57,Netherlands,Match 57,Argentina,2022,Netherlands
58,Belgium,Match 60,Portugal,2022,Portugal
59,Ecuador,Match 59,France,2022,France


In [19]:
# Semi-final
update_table(df_fixture_quarter, df_fixture_semi)

Unnamed: 0,home,score,away,year,winner
60,Netherlands,Match 61,Brazil,2022,?
61,France,Match 62,Portugal,2022,?


In [20]:
get_winner(df_fixture_semi)

Unnamed: 0,home,score,away,year,winner
60,Netherlands,Match 61,Brazil,2022,Brazil
61,France,Match 62,Portugal,2022,Portugal


In [21]:
# Final
update_table(df_fixture_semi, df_fixture_final)

Unnamed: 0,home,score,away,year,winner
62,Losers Match 61,Match 63,Losers Match 62,2022,?
63,Brazil,Match 64,Portugal,2022,?


In [22]:
get_winner(df_fixture_final)

Unnamed: 0,home,score,away,year,winner
62,Losers Match 61,Match 63,Losers Match 62,2022,Losers Match 62
63,Brazil,Match 64,Portugal,2022,Brazil
