In [None]:
import pandas as pd
from google.colab import files

In [None]:
# Upload results_df_with_wins_todate.csv (the match-results table that the
# next cell loads with pd.read_csv). The return value `u` is not used.
u = files.upload()

Saving results_df_with_wins_todate.csv to results_df_with_wins_todate.csv


In [None]:
df = pd.read_csv("results_df_with_wins_todate.csv")

In [None]:
# Normalise the 'winner' column one row at a time:
#   * home_score > away_score               -> winner is the home team
#   * home_score < away_score               -> winner is the away team
#   * otherwise, if 'winner' is missing     -> winner is the string 'Draw'
# Rows with level scores that already name a winner are left untouched;
# match_outcome later scores those as penalty wins/losses.
# NOTE: df.loc[i] is label-based, so this relies on the default 0..n-1 index
# that pd.read_csv produced for df.
for i in range(len(df)):
  row = df.loc[i]
  if row['home_score'] > row['away_score']:
    df.at[i, 'winner'] = row['home_team']
  elif row['home_score'] < row['away_score']:
    df.at[i, 'winner'] = row['away_team']
  elif type(row['winner']) == float:
    # A missing value (NaN) is a float, whereas a recorded winner is a string.
    df.at[i, 'winner'] = 'Draw'

In [None]:
# Two-level dictionary: h2h_dict[country1][country2] is country1's
# head-to-head record against country2.
#
# Each record is a dictionary of counters:
# 'wins', 'losses', 'draws': matches country1 won / lost / drew vs country2
# 'games_played': every match between the two, draws included
# 'home_games', 'away_games': decided (won or lost) matches played in /
#                             outside country1; draws are not counted here
# '(home|away)_(wins|losses)': decided matches split by venue and result
# '(GF|GA)_(home|away)_(wins|losses)': goals scored (GF) and conceded (GA) by
#                                      country1 in those matches

h2h_dict = {}

def insert_into_h2h_dict(country1, country2, winner, goals1, goals2, ground):
  """Add one match to country1's running record against country2.

  goals1 and goals2 are the goals scored by country1 and country2, and
  ground is the country the match was played in. Any ground other than
  country1 itself counts as 'away' for country1. A winner that is neither
  of the two countries is treated as a draw, which only increments 'draws'
  and 'games_played'.
  """
  opponents = h2h_dict.get(country1)
  if not opponents:
    opponents = h2h_dict[country1] = {}

  record = opponents.get(country2)
  if not record:
    # First meeting seen from country1's side: start every counter at zero.
    record = opponents[country2] = dict.fromkeys((
        'wins', 'draws', 'losses', 'games_played',
        'home_games', 'away_games',
        'home_wins', 'home_losses', 'away_wins', 'away_losses',
        'GF_home_wins', 'GA_home_wins', 'GF_home_losses', 'GA_home_losses',
        'GF_away_wins', 'GA_away_wins', 'GF_away_losses', 'GA_away_losses',
    ), 0)

  record['games_played'] += 1

  if winner == country1:
    result = 'wins'
  elif winner == country2:
    result = 'losses'
  else:
    record['draws'] += 1
    return

  # Decided match: update the counters for this venue/result combination.
  venue = 'home' if ground == country1 else 'away'
  record[result] += 1
  record[venue + '_games'] += 1
  record[venue + '_' + result] += 1
  record['GF_' + venue + '_' + result] += goals1
  record['GA_' + venue + '_' + result] += goals2


# Record every match twice, once from each side's point of view, so that
# h2h_dict[a][b] and h2h_dict[b][a] are both populated.
for row_label in range(len(df)):
  fixture = df.loc[row_label]
  hosts, visitors = fixture['home_team'], fixture['away_team']
  hosts_goals, visitors_goals = fixture['home_score'], fixture['away_score']
  victor, venue = fixture['winner'], fixture['country']
  insert_into_h2h_dict(hosts, visitors, victor, hosts_goals, visitors_goals, venue)
  insert_into_h2h_dict(visitors, hosts, victor, visitors_goals, hosts_goals, venue)

In [None]:
# Example
h2h_dict['Brazil']['Germany']

In [None]:
# Upload Countries and Regions FIFA.xlsx and Soccer_Country_World_Rankings.xlsx
u = files.upload()
u = files.upload()

Saving Countries and Regions FIFA.xlsx to Countries and Regions FIFA.xlsx


Saving Soccer_Country_World_Rankings.xlsx to Soccer_Country_World_Rankings.xlsx


In [None]:
country_regions = pd.read_excel('Countries and Regions FIFA.xlsx')
country_rankings = pd.read_excel('Soccer_Country_World_Rankings.xlsx')

In [None]:
import re

# Build lookup tables keyed by country name. Names in both spreadsheets are
# stripped of trailing spaces before being used as keys.

# country -> confederation/region
country_to_region = {}
for idx in range(len(country_regions)):
  entry = country_regions.loc[idx]
  country_to_region[re.sub(' +$', '', entry['Country'])] = entry['Region']

# country -> world ranking points, and country -> world rank
country_to_points, country_to_rank = {}, {}
for idx in range(len(country_rankings)):
  entry = country_rankings.loc[idx]
  name = re.sub(' +$', '', entry['Country'])
  country_to_points[name] = entry['Total Points']
  country_to_rank[name] = entry['Rank']

In [None]:
# Examples
print(country_to_region['Republic of Ireland'])
print(country_to_points['Australia'])
print(country_to_rank['Brazil'])

In [None]:
# Constants

# Outcome factor returned by match_outcome.
WIN = 3
LOSS = 0
DRAW = 1
# Used when the scores are level but the recorded winner is not 'Draw'
# (match_outcome treats these as decided on penalties).
PENALTY_WIN = 2
PENALTY_LOSS = 1

# Importance factor returned by match_importance, by tournament type.
FRIENDLY = 1
QUALIFICATION = 2.5
CONFEDERATION_LEVEL = 3
FIFA_WORLD_CUP = 4


def get_opponent(country, match):
  """Return the other team in `match`, a match played by `country`.

  If `country` is the home team the away team is returned; in every other
  case the home team is returned.
  """
  return match['away_team'] if match['home_team'] == country else match['home_team']


def match_outcome(country, match):
  """Return the outcome factor for `country` in `match`, as per FIFA's world
  ranking point system.

  Unequal scores give WIN or LOSS. Level scores give DRAW when the recorded
  winner is 'Draw', otherwise PENALTY_WIN or PENALTY_LOSS depending on
  whether `country` is the recorded winner.
  """
  won = match['winner'] == country

  if match['home_score'] != match['away_score']:
    return WIN if won else LOSS

  # Level score: either a genuine draw or a match settled on penalties.
  if match['winner'] == 'Draw':
    return DRAW
  return PENALTY_WIN if won else PENALTY_LOSS


def match_importance(match):
  """Return the importance factor of `match`, as per FIFA's world ranking
  points system, based on its (case-insensitive) tournament name.

  'friendly' -> FRIENDLY; any name containing 'qualification' ->
  QUALIFICATION; 'fifa world cup' -> FIFA_WORLD_CUP; anything else is
  treated as a confederation-level tournament.
  """
  name = match['tournament'].lower()

  if name == 'friendly':
    return FRIENDLY
  # Checked before the World Cup test so that World Cup qualifiers count as
  # qualification matches.
  if 'qualification' in name:
    return QUALIFICATION
  if name == 'fifa world cup':
    return FIFA_WORLD_CUP
  return CONFEDERATION_LEVEL


def match_opponent_strength(country, match):
  """Return the opponent-strength factor for `match` played by `country`,
  as per FIFA's world ranking points system.

  The factor is 200 minus the opponent's world rank, with a floor of 50.
  Opponents without a rank in country_to_rank get the floor value.
  """
  opponent_rank = country_to_rank.get(get_opponent(country, match))
  strength = 200 - opponent_rank if opponent_rank else 50
  return max(50, strength)


def match_opponent_weight(country, match):
  """Return the weight of the opponent's confederation for `match` played
  by `country`.

  Opponents with no entry in country_to_region are weighted as 'Other'.
  """
  confederation_weights = {
      'South America': 1,
      'Europe': 0.99,
      'Asia': 0.85,
      'Africa': 0.85,
      'Oceania': 0.85,
      'North America, Central America': 0.85,
      'Other': 0.5
  }

  region = country_to_region.get(get_opponent(country, match)) or 'Other'
  return confederation_weights[region]


def match_score(country, match):
  """Return the points `country` gains from `match`, using the FIFA world
  rankings points formula (2017):

      outcome * importance * opponent strength * opponent confederation weight
  """
  return (match_outcome(country, match)
          * match_importance(match)
          * match_opponent_strength(country, match)
          * match_opponent_weight(country, match))
  

def get_recent_performance(country, lookback_matches):
  """Return `country`'s average score over its last `lookback_matches` matches.

  Each match is scored with match_score (FIFA world rankings points formula,
  2017):
  https://www.goal.com/en-us/news/fifa-world-ranking-how-it-is-calculated-what-it-is-used-for/16w60sntgv7x61a6q08b7ooi0p

  "Last" means the final rows of df in which the country appears as home or
  away team (assumes df is in chronological order -- TODO confirm). If the
  country has fewer matches than requested, all of them are used.

  Returns the mean score rounded to 2 decimals, or 0 if the country has no
  matches in df.
  """
  # Only the games this country took part in.
  played = (df['home_team'] == country) | (df['away_team'] == country)
  country_games = df.loc[played]

  lookback = min(len(country_games), lookback_matches)
  if lookback == 0:
    # No recent matches
    return 0

  recent_games = country_games.iloc[-lookback:]
  total_score = sum(match_score(country, match_info)
                    for _, match_info in recent_games.iterrows())

  return round(total_score / lookback, 2)


In [None]:
def calculate_recent_performance_points(lookback_matches):
  """Map every country in country_to_region to its recent performance, i.e.
  get_recent_performance over its last `lookback_matches` matches.
  """
  return {
      country: get_recent_performance(country, lookback_matches)
      for country in country_to_region
  }

recent_performance_points = calculate_recent_performance_points(5)

In [None]:
# Example
recent_performance_points['Mexico']

In [None]:
def get_recent_performance_points(country):
  """Return the precomputed recent-performance points for `country`, or 0
  (the lowest score) when none are recorded."""
  return recent_performance_points.get(country) or 0


def get_h2h(country1, country2):
  """Return country1's head-to-head record against country2 from h2h_dict,
  or None when the two have no recorded matches."""
  return (h2h_dict.get(country1) or {}).get(country2) or None


def get_overall_points(country):
  """Return the world ranking points of `country`, or 0 (the lowest score)
  when the country has no entry in country_to_points."""
  return country_to_points.get(country) or 0

In [None]:
from functools import cmp_to_key

# Tunable parameters read by get_match_prediction.
# Ratio of overall ranking points used to call a winner on ranking alone.
POINT_THRESHOLD = 1.2
# Ratio of recent-performance points used to call a winner on form.
PERFORMANCE_THRESHOLD = 1.2
# Fixed scorelines given to the predicted winner, the loser, and both sides
# of a draw.
WIN_DEFAULT_GOALS = 3
LOSE_DEFAULT_GOALS = 0
DRAW_DEFAULT_GOALS = 1
# Head-to-head home win rate at or above which the home side is picked; at or
# below 1 - WINRATE_THRESHOLD the away side is picked.
WINRATE_THRESHOLD = 0.55

def get_match_prediction(home, away, can_draw=True):
  """Predict the result of a single match between `home` and `away`.

  The signals below are tried in order; the first decisive one picks the
  winner:
    1. Overall ranking points: a side wins if it has at least
       POINT_THRESHOLD times the other side's points (home checked first).
    2. Recent performance: same test using PERFORMANCE_THRESHOLD.
    3. Head-to-head: the share of `home`'s decided home games against
       `away` that it won (0.5 if there are none). At or above
       WINRATE_THRESHOLD `home` wins; at or below 1 - WINRATE_THRESHOLD
       `away` wins.
  If nothing is decisive the match is a 'Draw'. When can_draw is False
  (knockout matches) it instead goes to the side with more overall points,
  and to `away` if the points are level.

  Returns a dict {'winner', 'home_goals', 'away_goals'}. 'winner' is `home`,
  `away` or 'Draw', and the goals are the fixed default scorelines.
  """
  home_recent_performance = get_recent_performance_points(home)
  away_recent_performance = get_recent_performance_points(away)
  home_points = get_overall_points(home)
  away_points = get_overall_points(away)

  # Result used whenever no signal is decisive.
  if can_draw:
    undecided = 'Draw'
  else:
    undecided = home if home_points > away_points else away

  if home_points >= POINT_THRESHOLD * away_points:
    winner = home
  elif away_points >= POINT_THRESHOLD * home_points:
    # Bug fix: this used to compare away_points with POINT_THRESHOLD times
    # itself, so a much stronger away side was never picked on ranking.
    winner = away
  elif home_recent_performance >= PERFORMANCE_THRESHOLD * away_recent_performance:
    winner = home
  elif away_recent_performance >= PERFORMANCE_THRESHOLD * home_recent_performance:
    winner = away
  else:
    h2h_stats = get_h2h(home, away)
    if h2h_stats is None:
      winner = undecided
    else:
      if h2h_stats['home_games'] != 0:
        home_winrate = h2h_stats['home_wins'] / h2h_stats['home_games']
      else:
        # No decided home games on record: treat as an even match-up.
        home_winrate = 0.5

      if home_winrate >= WINRATE_THRESHOLD:
        winner = home
      elif home_winrate <= 1 - WINRATE_THRESHOLD:
        winner = away
      else:
        winner = undecided

  if winner == home:
    home_goals = WIN_DEFAULT_GOALS
    away_goals = LOSE_DEFAULT_GOALS
  elif winner == away:
    home_goals = LOSE_DEFAULT_GOALS
    away_goals = WIN_DEFAULT_GOALS
  else:
    home_goals = away_goals = DRAW_DEFAULT_GOALS

  return {'winner': winner, 'home_goals': home_goals, 'away_goals': away_goals}


def simulate_group(countries, games=2):
  """Simulate a round-robin group and return the countries ordered from
  first to last place.

  `games` is the number of times each pair of countries meets. In every
  meeting the country listed earlier in `countries` is passed to
  get_match_prediction as the home side. A win is worth 3 points and a draw
  1 point each. The table is ordered by points, then by goal difference;
  countries still level keep their order from `countries`.
  """
  # country -> running totals
  group_table = {}
  for country in countries:
    group_table[country] = {'points': 0, 'goals_for': 0, 'goals_against': 0}

  fixtures = [(countries[i], countries[j])
              for i in range(len(countries))
              for j in range(i + 1, len(countries))]

  for _ in range(games):
    for home, away in fixtures:
      result = get_match_prediction(home, away)
      home_row, away_row = group_table[home], group_table[away]

      home_row['goals_for'] += result['home_goals']
      home_row['goals_against'] += result['away_goals']
      away_row['goals_for'] += result['away_goals']
      away_row['goals_against'] += result['home_goals']

      if result['winner'] == 'Draw':
        home_row['points'] += 1
        away_row['points'] += 1
      else:
        group_table[result['winner']]['points'] += 3

  def standing(country):
    # Negated so that an ascending, stable sort puts the best team first.
    row = group_table[country]
    return (-row['points'], row['goals_against'] - row['goals_for'])

  return sorted(countries, key=standing)


def simulate_knockout(path):
  """Predict the winner of a single-elimination bracket.

  `path` is a pair [homes, aways] of equally long sequences; first-round
  match i is homes[i] (home) against aways[i] (away). In each following
  round the winner of match 2k+1 hosts the winner of match 2k. Matches
  cannot end in a draw.

  Raises ValueError if the two sides differ in length or the number of
  first-round matches is not a power of two (1, 2, 4, ...). Previously such
  input recursed without end or silently dropped teams.
  """
  homes, aways = list(path[0]), list(path[1])

  size = len(homes)
  if size == 0 or size != len(aways) or size & (size - 1):
    raise ValueError(
        'simulate_knockout needs equally long home/away lists whose length '
        'is a power of two, got %d and %d' % (len(homes), len(aways)))

  while True:
    winners = [get_match_prediction(home, away, can_draw=False)['winner']
               for home, away in zip(homes, aways)]
    if len(winners) == 1:
      return winners[0]

    # Pair up neighbouring winners for the next round.
    homes, aways = winners[1::2], winners[0::2]
  

In [None]:
# Example
simulate_group('England, Poland, Hungary, Albania, Andorra, San Marino'.split(', '))

['England', 'Hungary', 'Poland', 'Albania', 'Andorra', 'San Marino']

In [None]:
def df_to_groups(df):
  """Convert a dataframe of group assignments into a dictionary.

  `df` needs a 'Country' and a 'Group' column. Returns a dict mapping each
  group to the list of its countries, in the order the rows appear.

  Rows are read by position, so the frame may have any index (for example
  after filtering), not just the default 0..n-1.
  """
  groups = {}
  for country, group in zip(df['Country'], df['Group']):
    groups.setdefault(group, []).append(country)

  return groups


Europe Simulation

In [None]:
# Upload Europe FIFA Qualifier Groups.xlsx
u = files.upload()

In [None]:
# Get groups in Europe in the form of a dictionary mapping group to 
# list of countries that are part of that group
europe = pd.read_excel('Europe FIFA Qualifier Groups.xlsx')
europe_groups = df_to_groups(europe)

In [None]:
from itertools import permutations

# For Europe, winners in each group become qualifiers.
europe_qualifiers = []
europe_runners_up = []
europe_not_top2 = []
for group in europe_groups:
  result = simulate_group(europe_groups[group])

  # result is ordered best-first: result[0] is the group winner, result[1]
  # the runner-up, and everything from index 2 on finished outside the top 2.
  europe_qualifiers.append(result[0])
  europe_runners_up.append(result[1])

  for i in range(2, len(result)):
    europe_not_top2.append(result[i])

# Next, the runners up of each group are joined by two other countries that
# finished outside the top 2 in their group. These two countries will be
# selected in the preference order France, Belgium, Italy, Spain, Wales, 
# Austria, Czech Republic, Hungary, Slovenia, Montenegro, Albania, Armenia, 
# Gibraltar, Faroe Islands.
preferences = ['France', 'Belgium', 'Italy', 'Spain', 'Wales', 'Austria', 
               'Czech Republic', 'Hungary', 'Slovenia', 'Montenegro', 'Albania',
               'Armenia', 'Gibraltar', 'Faroe Islands']

# Promote two non-top 2 countries to runner up status: walk the preference
# list in order and take the first two that finished outside the top 2.
selected = 0
for country in preferences:
  if selected == 2:
    break

  if country in europe_not_top2:
    selected += 1 
    europe_runners_up.append(country)


# Next, find the 6 best ranked runner ups and seed them in order, as a list
# from highest rank to lowest. The sort is ascending on the rank number in
# country_to_rank; a country missing from that dictionary raises KeyError.
europe_runners_up.sort(key=lambda x: country_to_rank[x])
europe_runners_up_top6 = europe_runners_up[:6]
europe_runners_up_bot6 = europe_runners_up[6:]

# Each of the 6 seeded countries are matched up with an unseeded country, but
# this is chosen randomly. The 12 countries in total play 3 semifinal paths
# where each path produces a winner which will become qualifiers. So, for
# all possible paths, predict the 3 winners. Then calculate the probability of
# each country becoming a qualifier and select the 3 most likely out of these.
# 
# For example, if the top 6 and bottom 6 are represented as lists:
# top 6: [A, B, C, D, E, F],
# bot 6: [G, H, I, J, K, L],
# then this represents three paths, each represented by a list of 2 lists: 
# [[A, B], [G, H]], [[C, D], [I, J]], [[E, F], [K, L]].
# In each path, the first of the lists is the seeded countries in that path,
# and the second of the lists is the unseeded countries in that path.
# The games are played in the following way - e.g. in the first path,
# A vs G, B vs H, and the winners of each match play again for the qualifying
# spot for that path.
#
# NOTE: every ordering of both halves is enumerated, so with 6 countries in
# each half the inner body runs 720 * 720 times; expect this cell to be slow.

# path_winner_probabilities maps countries to the number of enumerated draws
# in which they won their path. The counts are not divided by
# permutation_count (which is only incremented, never read in this cell);
# only their relative order is used below.
path_winner_probabilities = {c: 0 for c in europe_runners_up}
permutation_count = 0
for p_bot6 in permutations(europe_runners_up_bot6):
  for p_top6 in permutations(europe_runners_up_top6):
    permutation_count += 1

    # Slice both orderings into three paths of two seeded + two unseeded teams.
    paths = [[p_top6[i:i+2], p_bot6[i:i+2]] for i in range(0, 6, 2)]
    path_winners = [simulate_knockout(path) for path in paths]

    for country in path_winners:
      path_winner_probabilities[country] += 1

# Add the three most probable new qualifiers to the europe qualifiers list
europe_runners_up.sort(key=lambda c: path_winner_probabilities[c], reverse=True)
europe_qualifiers += europe_runners_up[:3]

print(europe_qualifiers)

['Portugal', 'Spain', 'Italy', 'France', 'Belgium', 'Denmark', 'Netherlands', 'Croatia', 'England', 'Germany', 'Switzerland', 'Sweden', 'Ukraine']


Africa Simulation

In [None]:
# Upload Africa FIFA Qualifier Groups.xlsx
u = files.upload()

Saving Africa FIFA Qualifier Groups.xlsx to Africa FIFA Qualifier Groups.xlsx


In [None]:
africa = pd.read_excel('Africa FIFA Qualifier Groups.xlsx')
africa_groups = df_to_groups(africa)

In [None]:
from itertools import permutations

# 10 groups of 4 countries in the African region play one another twice, and
# only the group winners proceed to the next round.
africa_winners = [simulate_group(africa_groups[group])[0]
                  for group in africa_groups]


# In the next round the group winners are drawn into 5 two-legged ties. The
# draw is unknown, so every ordered (home, away) pairing of group winners is
# simulated, and the 5 countries that win the most pairings become qualifiers.
africa_qualifier_probabilities = dict.fromkeys(africa_winners, 0)
for home, away in permutations(africa_winners, 2):
  if home == away:
    continue

  tie_winner = get_match_prediction(home, away, can_draw=False)['winner']
  africa_qualifier_probabilities[tie_winner] += 1

africa_winners.sort(key=africa_qualifier_probabilities.get, reverse=True)
africa_qualifiers = africa_winners[:5]

print(africa_qualifiers)

['South Africa', 'Senegal', 'Algeria', 'Tunisia', 'Ivory Coast']


Asia Simulation

In [None]:
# Upload Asia FIFA Qualifier Groups.xlsx
u = files.upload()

Saving Asia FIFA Qualifier Groups.xlsx to Asia FIFA Qualifier Groups.xlsx


In [None]:
asia = pd.read_excel('Asia FIFA Qualifier Groups.xlsx')
asia_groups = df_to_groups(asia)

# Asia currently has 2 groups in which every country plays every other one
# twice. The top 2 of each group qualify, and the two 3rd-placed countries
# play each other for a place in the intercontinental playoffs.
asia_qualifiers = []
asia_thirds = []
for group_countries in asia_groups.values():
  standings = simulate_group(group_countries)
  asia_qualifiers.extend(standings[:2])
  asia_thirds.append(standings[2])

print(asia_qualifiers)

# Playoff between the 3rd-placed countries; the first group's side is home.
home_third, away_third = asia_thirds[0], asia_thirds[1]
asia_intercontinental = get_match_prediction(
    home_third, away_third, can_draw=False)['winner']

print(asia_intercontinental)

['Iran', 'South Korea', 'Australia', 'Japan']
United Arab Emirates


CONCACAF Simulation

In [None]:
# Upload North and Central America FIFA Qualifier Groups.xlsx
u = files.upload()

In [None]:
concacaf = pd.read_excel('North and Central America FIFA Qualifier Groups.xlsx')
concacaf_groups = df_to_groups(concacaf)

# CONCACAF currently has 8 countries remaining. They play one another twice,
# for a total of 14 matches per country. The top 3 qualify and the 4th goes
# to the intercontinental playoffs.
concacaf_group = concacaf_groups['A']
concacaf_standings = simulate_group(concacaf_group)
concacaf_qualifiers, concacaf_intercontinental = (
    concacaf_standings[:3], concacaf_standings[3])

print(concacaf_qualifiers, concacaf_intercontinental, sep='\n')

['United States', 'Mexico', 'Canada']
Costa Rica


South America Simulation

In [None]:
# upload South America FIFA Qualifier Groups.xlsx
u = files.upload()

Saving South America FIFA Qualifier Groups.xlsx to South America FIFA Qualifier Groups.xlsx


In [None]:
sa = pd.read_excel('South America FIFA Qualifier Groups.xlsx')
sa_groups = df_to_groups(sa)

# South America plays as a single group in which every country meets every
# other one twice. The top 4 qualify and the 5th goes to the intercontinental
# playoffs.
sa_group = sa_groups['A']
sa_standings = simulate_group(sa_group)
sa_qualifiers, sa_intercontinental = sa_standings[:4], sa_standings[4]

print(sa_qualifiers, sa_intercontinental, sep='\n')

['Argentina', 'Brazil', 'Uruguay', 'Colombia']
Peru


Oceania Simulation

In [None]:
# upload Oceania FIFA Qualifier Groups.xlsx
u = files.upload()

Saving Oceania FIFA Qualifier Groups.xlsx to Oceania FIFA Qualifier Groups.xlsx


In [None]:
ofc = pd.read_excel('Oceania FIFA Qualifier Groups.xlsx')
ofc_groups = df_to_groups(ofc)

# In OFC the 11 teams are to be split into 2 groups based on FIFA ranking,
# but the groups have not been finalised yet. So no matches are simulated:
# the best-ranked country simply advances to the intercontinental playoffs.
# Countries without a rank are sorted last (treated as rank 1000).
UNRANKED = 1000

ofc_group = ofc_groups['A']
ofc_group.sort(key=lambda c: country_to_rank.get(c) or UNRANKED)
ofc_intercontinental = ofc_group[0]

print(ofc_intercontinental)


New Zealand


Intercontinental Playoffs Simulation

In [None]:
# The 4 teams that reached the intercontinental playoffs are drawn into 2
# pairs, and the winner of each pair qualifies. The draw is unknown, so every
# ordered (home, away) pairing is simulated and the 2 countries that win the
# most pairings are taken as qualifiers.
intercontinental_playoffs = [concacaf_intercontinental, asia_intercontinental,
                             sa_intercontinental, ofc_intercontinental]

pairings = [(c1, c2)
            for c1 in intercontinental_playoffs
            for c2 in intercontinental_playoffs
            if c1 != c2]

qualifying_probabilities = dict.fromkeys(intercontinental_playoffs, 0)
for c1, c2 in pairings:
  playoff_winner = get_match_prediction(c1, c2, can_draw=False)['winner']
  qualifying_probabilities[playoff_winner] += 1

intercontinental_playoffs.sort(key=qualifying_probabilities.get, reverse=True)
intercontinental_qualifiers = intercontinental_playoffs[:2]

print(intercontinental_qualifiers)

['Peru', 'Costa Rica']


Final Prediction

In [None]:
# Combine the qualifiers of every confederation and the intercontinental
# playoffs. Qatar is added last: as hosts they qualify automatically.
qualifiers = [
    *europe_qualifiers, *africa_qualifiers, *concacaf_qualifiers,
    *sa_qualifiers, *asia_qualifiers, *intercontinental_qualifiers,
    'Qatar',
]

# One country per line.
print(*qualifiers, sep='\n')

Portugal
Spain
Italy
France
Belgium
Denmark
Netherlands
Croatia
England
Germany
Switzerland
Sweden
Ukraine
South Africa
Senegal
Algeria
Tunisia
Ivory Coast
United States
Mexico
Canada
Argentina
Brazil
Uruguay
Colombia
Iran
South Korea
Australia
Japan
Peru
Costa Rica
Qatar
