# World cup 2022 forcast
Inspired by 
https://www.kaggle.com/code/agostontorok/soccer-world-cup-2018-winner/notebook  
https://www.kaggle.com/code/scottsuk0306/fifa-world-cup-2022-group-stage-prediction  

Dataset:  
https://www.kaggle.com/datasets/piterfm/fifa-football-world-cup  
https://www.kaggle.com/datasets/cashncarry/fifaworldranking  
https://www.kaggle.com/datasets/brenda89/fifa-world-cup-2022  
https://www.kaggle.com/datasets/amineteffal/qatar2022worldcupschudule  


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns

# https://www.kaggle.com/datasets/brenda89/fifa-world-cup-2022
matches = pd.read_csv('./data/international_matches.csv')
matches.head(5)


Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_score,...,shoot_out,home_team_result,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score
0,1993-08-08,Bolivia,Uruguay,South America,South America,59,22,0,0,3,...,No,Win,,,,,,,,
1,1993-08-08,Brazil,Mexico,South America,North America,8,14,0,0,1,...,No,Draw,,,,,,,,
2,1993-08-08,Ecuador,Venezuela,South America,South America,35,94,0,0,5,...,No,Win,,,,,,,,
3,1993-08-08,Guinea,Sierra Leone,Africa,Africa,65,86,0,0,1,...,No,Win,,,,,,,,
4,1993-08-08,Paraguay,Argentina,South America,South America,67,5,0,0,1,...,No,Lose,,,,,,,,


In [7]:
schedule = pd.read_csv('./data/matchs-schudule.csv', sep=";")
teams = pd.read_csv('./data/Qatar2022-teams.csv', sep=";")
teams

schedule.head(100)


schedule[schedule["date"]=='01/12/2022']


Unnamed: 0,match,date,country1,coutry2,phase
40,41,01/12/2022,Croatia,Belgium,group matches
41,42,01/12/2022,Canada,Morocco,group matches
42,43,01/12/2022,Japan,Spain,group matches
43,44,01/12/2022,Costa Rica,Germany,group matches


In [3]:
#https://www.kaggle.com/datasets/cashncarry/fifaworldranking  
ranking = pd.read_csv('./data/fifa_ranking-2022-10-06.csv')
ranking.head(5)


matches = matches.replace({"IR Iran": "Iran", "Korea Republic" : "South Korea"})
ranking = ranking.replace({"IR Iran": "Iran", "Korea Republic" : "South Korea"})

# feature extraction
matches['rank_difference'] = matches['home_team_fifa_rank'] - matches['away_team_fifa_rank']
matches['average_rank'] = (matches['home_team_fifa_rank'] + matches['away_team_fifa_rank'])/2
matches['point_difference'] = matches['home_team_total_fifa_points'] - matches['away_team_total_fifa_points']
matches['score_difference'] = matches['home_team_score'] - matches['away_team_score']
matches['is_stake'] = matches['tournament'] != 'Friendly'
matches['is_won'] = matches['score_difference'] > 0 # take draw as lost


from sklearn import linear_model
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# machine learning
from sklearn.linear_model import LogisticRegression

X = matches.loc[:,['rank_difference', 'average_rank', 'point_difference', 'is_stake']]
y = matches['is_won']

X

Unnamed: 0,rank_difference,average_rank,point_difference,is_stake
0,37,40.5,0,True
1,-6,11.0,0,False
2,-59,64.5,0,True
3,-21,75.5,0,False
4,62,36.0,0,True
...,...,...,...,...
23916,27,166.5,-108,True
23917,57,163.5,-210,True
23918,-32,44.0,139,True
23919,-12,29.0,54,True


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

logreg.predict(X_test)
accuracy = logreg.score(X_test, y_test)
accuracy

# let's define a small margin when we safer to predict draw then win
margin = 0.05

# let's define the rankings at the time of the World Cup
ranking = ranking.loc[(ranking['rank_date'] == ranking['rank_date'].max()) & (ranking['country_full'].isin(teams['Team'].unique()))]

worldcup_ranking = ranking.set_index(['country_full'])
worldcup_ranking



Unnamed: 0_level_0,rank,country_abrv,total_points,previous_points,rank_change,confederation,rank_date
country_full,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Brazil,1,BRA,1841.3,1837.56,0,CONMEBOL,2022-10-06
South Korea,28,KOR,1530.3,1526.02,0,AFC,2022-10-06
Tunisia,30,TUN,1507.54,1507.86,0,CAF,2022-10-06
Costa Rica,31,CRC,1503.59,1500.06,-3,CONCACAF,2022-10-06
Australia,38,AUS,1488.72,1483.73,-1,AFC,2022-10-06
Canada,41,CAN,1475.0,1473.82,-2,CONCACAF,2022-10-06
Cameroon,43,CMR,1471.44,1484.95,5,CAF,2022-10-06
Ecuador,44,ECU,1464.39,1463.74,0,CONMEBOL,2022-10-06
Poland,26,POL,1548.59,1546.18,0,UEFA,2022-10-06
Japan,24,JPN,1559.54,1554.69,0,AFC,2022-10-06


In [12]:
row = pd.DataFrame(np.array([[np.nan, np.nan, np.nan, True]]), 
                   columns=X_train.columns)
# Australia 	Denmark 	
row['average_rank'] = (38 + 10) / 2
row['rank_difference'] = 38 - 10
row['point_difference'] = 1488.72-1666.57
logreg.predict_proba(row)[:,1][0]

# Tunisia vs France
Tunisia = ranking[ranking["country_full"]=='Tunisia'] 
France = ranking[ranking["country_full"]=='France']
row['average_rank'] = (Tunisia.iloc[0]["rank"] + France.iloc[0]["rank"]) / 2
row['rank_difference'] = Tunisia.iloc[0]["rank"] - France.iloc[0]["rank"]
row['point_difference'] = Tunisia.iloc[0]["total_points"]-France.iloc[0]["total_points"]
logreg.predict_proba(row)[:,1][0]

# Poland 	Argentina
Poland = ranking[ranking["country_full"]=='Poland'] 
Argentina = ranking[ranking["country_full"]=='Argentina']
row['average_rank'] = (Poland.iloc[0]["rank"] + Argentina.iloc[0]["rank"]) / 2
row['rank_difference'] = Poland.iloc[0]["rank"] - Argentina.iloc[0]["rank"]
row['point_difference'] = Poland.iloc[0]["total_points"]-Argentina.iloc[0]["total_points"]
logreg.predict_proba(row)[:,1][0]

#Saudi Arabia 	Mexico
home = ranking[ranking["country_full"]=='Saudi Arabia'] 
away = ranking[ranking["country_full"]=='Mexico']
row['average_rank'] = (home.iloc[0]["rank"] + away.iloc[0]["rank"]) / 2
row['rank_difference'] = home.iloc[0]["rank"] - away.iloc[0]["rank"]
row['point_difference'] = home.iloc[0]["total_points"]-away.iloc[0]["total_points"]
logreg.predict_proba(row)[:,1][0]

#Saudi Arabia 	Mexico
home = ranking[ranking["country_full"]=='Belgium'] 
away = ranking[ranking["country_full"]=='Iran']
row['average_rank'] = (home.iloc[0]["rank"] + away.iloc[0]["rank"]) / 2
row['rank_difference'] = home.iloc[0]["rank"] - away.iloc[0]["rank"]
row['point_difference'] = home.iloc[0]["total_points"]-away.iloc[0]["total_points"]
logreg.predict_proba(row)[:,1][0]



0.5448482419344355

In [None]:
y

In [None]:
china_ranking = ranking[ranking['country_abrv']=='CHN'].sort_values(by=['rank_date'])
china_ranking.tail(20)
#china_ranking["rank"].idxmin()
#ranking.iloc[china_ranking["rank"].idxmin()]
sns.lineplot(data=china_ranking, x="rank_date", y="rank")
pl

In [None]:
china_ranking
sns.lineplot(data=flights_wide["May"])