# Capstone: Predicting the winner in the world cup


##### by: Awab Idris
---

## 1) Importing the data and libraries:

In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
np.random.seed(42)
%matplotlib inline

In [2]:
# Loading the datasets
wc = pd.read_csv("./datasets/WorldCups.csv")
matches = pd.read_csv("./datasets/WorldCupMatches.csv")
matches_2018 = pd.read_csv("./datasets/fifa-world-cup-2018-matches.csv")

## 2) Basic EDA and Data Cleaning:

In [3]:
# Checking the data
matches

Unnamed: 0,Year,Datetime,Stage,Stadium,City,Home Team Name,Home Team Goals,Away Team Goals,Away Team Name,Win conditions,Attendance,Half-time Home Goals,Half-time Away Goals,Referee,Assistant 1,Assistant 2,RoundID,MatchID,Home Team Initials,Away Team Initials
0,1930.0,13 Jul 1930 - 15:00,Group 1,Pocitos,Montevideo,France,4.0,1.0,Mexico,,4444.0,3.0,0.0,LOMBARDI Domingo (URU),CRISTOPHE Henry (BEL),REGO Gilberto (BRA),201.0,1096.0,FRA,MEX
1,1930.0,13 Jul 1930 - 15:00,Group 4,Parque Central,Montevideo,USA,3.0,0.0,Belgium,,18346.0,2.0,0.0,MACIAS Jose (ARG),MATEUCCI Francisco (URU),WARNKEN Alberto (CHI),201.0,1090.0,USA,BEL
2,1930.0,14 Jul 1930 - 12:45,Group 2,Parque Central,Montevideo,Yugoslavia,2.0,1.0,Brazil,,24059.0,2.0,0.0,TEJADA Anibal (URU),VALLARINO Ricardo (URU),BALWAY Thomas (FRA),201.0,1093.0,YUG,BRA
3,1930.0,14 Jul 1930 - 14:50,Group 3,Pocitos,Montevideo,Romania,3.0,1.0,Peru,,2549.0,1.0,0.0,WARNKEN Alberto (CHI),LANGENUS Jean (BEL),MATEUCCI Francisco (URU),201.0,1098.0,ROU,PER
4,1930.0,15 Jul 1930 - 16:00,Group 1,Parque Central,Montevideo,Argentina,1.0,0.0,France,,23409.0,0.0,0.0,REGO Gilberto (BRA),SAUCEDO Ulises (BOL),RADULESCU Constantin (ROU),201.0,1085.0,ARG,FRA
5,1930.0,16 Jul 1930 - 14:45,Group 1,Parque Central,Montevideo,Chile,3.0,0.0,Mexico,,9249.0,1.0,0.0,CRISTOPHE Henry (BEL),APHESTEGUY Martin (URU),LANGENUS Jean (BEL),201.0,1095.0,CHI,MEX
6,1930.0,17 Jul 1930 - 12:45,Group 2,Parque Central,Montevideo,Yugoslavia,4.0,0.0,Bolivia,,18306.0,0.0,0.0,MATEUCCI Francisco (URU),LOMBARDI Domingo (URU),WARNKEN Alberto (CHI),201.0,1092.0,YUG,BOL
7,1930.0,17 Jul 1930 - 14:45,Group 4,Parque Central,Montevideo,USA,3.0,0.0,Paraguay,,18306.0,2.0,0.0,MACIAS Jose (ARG),APHESTEGUY Martin (URU),TEJADA Anibal (URU),201.0,1097.0,USA,PAR
8,1930.0,18 Jul 1930 - 14:30,Group 3,Estadio Centenario,Montevideo,Uruguay,1.0,0.0,Peru,,57735.0,0.0,0.0,LANGENUS Jean (BEL),BALWAY Thomas (FRA),CRISTOPHE Henry (BEL),201.0,1099.0,URU,PER
9,1930.0,19 Jul 1930 - 12:50,Group 1,Estadio Centenario,Montevideo,Chile,1.0,0.0,France,,2000.0,0.0,0.0,TEJADA Anibal (URU),LOMBARDI Domingo (URU),REGO Gilberto (BRA),201.0,1094.0,CHI,FRA


In [4]:
# Checking null values
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 20 columns):
Year                    852 non-null float64
Datetime                852 non-null object
Stage                   852 non-null object
Stadium                 852 non-null object
City                    852 non-null object
Home Team Name          852 non-null object
Home Team Goals         852 non-null float64
Away Team Goals         852 non-null float64
Away Team Name          852 non-null object
Win conditions          852 non-null object
Attendance              850 non-null float64
Half-time Home Goals    852 non-null float64
Half-time Away Goals    852 non-null float64
Referee                 852 non-null object
Assistant 1             852 non-null object
Assistant 2             852 non-null object
RoundID                 852 non-null float64
MatchID                 852 non-null float64
Home Team Initials      852 non-null object
Away Team Initials      852 non-null object
dtype

In [5]:
# Until 2014 there are only 836  world cup games, the null values are empty rows
matches=matches.dropna()

In [6]:
# Checking duplicates
matches.duplicated().value_counts()

False    835
True      15
dtype: int64

In [7]:
# Dropping duplicates
matches.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [8]:
# Checking the number of rows
len(matches)

835

In [9]:
# Fixing teams names
matches = matches.replace({"Germany FR":"Germany", "IR Iran": "Iran", "German DR":"Germany",
                           "C�te d'Ivoire": "Ivory Coast", 'rn">United Arab Emirates':"United Arab Emirates",
                          'rn">Bosnia and Herzegovina':"Bosnia and Herzegovina",
                           'rn">Serbia and Montenegro':"Serbia and Montenegro",
                           'rn">Trinidad and Tobago':"Trinidad and Tobago",
                          'rn">Republic of Ireland': "Ireland", "China PR": "China",
                          'Korea DPR': "Korea Republic", "Soviet Union": "Russia", "Costarica":"Costa Rica"})

In [10]:
# Adding "winning_team" to a an empty array
# This function works until row 822
# The function add "H" to winning team array if the home team wins by goal difference
# The function add "A" to winning team array if the away team wins by goal difference
# The function add "D" as Draw if both teams scored the same amount of goals
winning_team = []
for x in range(823):
    if matches["Home Team Goals"][x] > matches["Away Team Goals"][x]:
        winning_team.append("H")
    elif matches["Home Team Goals"][x] < matches["Away Team Goals"][x]:
        winning_team.append("A")
    else:
        winning_team.append("D")
        

In [12]:
# "winning_team" length is 823 rows
len(winning_team)

823

In [13]:
# Checking the last 12 rows
matches.tail(12)

Unnamed: 0,Year,Datetime,Stage,Stadium,City,Home Team Name,Home Team Goals,Away Team Goals,Away Team Name,Win conditions,Attendance,Half-time Home Goals,Half-time Away Goals,Referee,Assistant 1,Assistant 2,RoundID,MatchID,Home Team Initials,Away Team Initials
824,2014.0,04 Jul 2014 - 17:00,Quarter-finals,Estadio Castelao,Fortaleza,Brazil,2.0,1.0,Colombia,,60342.0,1.0,0.0,Carlos VELASCO CARBALLO (ESP),ALONSO FERNANDEZ Roberto (ESP),YUSTE Juan (ESP),255953.0,300186461.0,BRA,COL
825,2014.0,04 Jul 2014 - 13:00,Quarter-finals,Estadio do Maracana,Rio De Janeiro,France,0.0,1.0,Germany,,74240.0,0.0,1.0,PITANA Nestor (ARG),MAIDANA Hernan (ARG),BELATTI Juan Pablo (ARG),255953.0,300186485.0,FRA,GER
826,2014.0,08 Jul 2014 - 17:00,Semi-finals,Estadio Mineirao,Belo Horizonte,Brazil,1.0,7.0,Germany,,58141.0,0.0,5.0,RODRIGUEZ Marco (MEX),TORRENTERA Marvin (MEX),QUINTERO Marcos (MEX),255955.0,300186474.0,BRA,GER
827,2014.0,12 Jul 2014 - 17:00,Play-off for third place,Estadio Nacional,Brasilia,Brazil,0.0,3.0,Netherlands,,68034.0,0.0,2.0,HAIMOUDI Djamel (ALG),ACHIK Redouane (MAR),ETCHIALI Abdelhak (ALG),255957.0,300186502.0,BRA,NED
828,2014.0,13 Jul 2014 - 16:00,Final,Estadio do Maracana,Rio De Janeiro,Germany,1.0,0.0,Argentina,Germany win after extra time,74738.0,0.0,0.0,Nicola RIZZOLI (ITA),Renato FAVERANI (ITA),Andrea STEFANI (ITA),255959.0,300186501.0,GER,ARG
829,2014.0,09 Jul 2014 - 17:00,Semi-finals,Arena de Sao Paulo,Sao Paulo,Netherlands,0.0,0.0,Argentina,Argentina win on penalties (2 - 4),63267.0,0.0,0.0,C�neyt �AKIR (TUR),DURAN Bahattin (TUR),ONGUN Tarik (TUR),255955.0,300186490.0,NED,ARG
830,2014.0,05 Jul 2014 - 17:00,Quarter-finals,Arena Fonte Nova,Salvador,Netherlands,0.0,0.0,Costa Rica,Netherlands win on penalties (4 - 3),51179.0,0.0,0.0,Ravshan IRMATOV (UZB),RASULOV Abduxamidullo (UZB),KOCHKAROV Bakhadyr (KGZ),255953.0,300186488.0,NED,CRC
831,2014.0,05 Jul 2014 - 13:00,Quarter-finals,Estadio Nacional,Brasilia,Argentina,1.0,0.0,Belgium,,68551.0,1.0,0.0,Nicola RIZZOLI (ITA),Renato FAVERANI (ITA),Andrea STEFANI (ITA),255953.0,300186504.0,ARG,BEL
832,2014.0,29 Jun 2014 - 13:00,Round of 16,Estadio Castelao,Fortaleza,Netherlands,2.0,1.0,Mexico,,58817.0,0.0,0.0,PROENCA Pedro (POR),MIRANDA Bertino (POR),TRIGO Jose (POR),255951.0,300186508.0,NED,MEX
833,2014.0,29 Jun 2014 - 17:00,Round of 16,Arena Pernambuco,Recife,Costa Rica,1.0,1.0,Greece,Costa Rica win on penalties (5 - 3),41242.0,0.0,0.0,Ben WILLIAMS (AUS),CREAM Matthew (AUS),ANAZ Hakan (AUS),255951.0,300186459.0,CRC,GRE


In [14]:
# Appending the last 12 winning team to the array
winning_team.append("H")
winning_team.append("A")
winning_team.append("A")
winning_team.append("A")
winning_team.append("H")
winning_team.append("D")
winning_team.append("D")
winning_team.append("H")
winning_team.append("H")
winning_team.append("D")
winning_team.append("H")
winning_team.append("H")

In [16]:
# Adding "Winning Team" column to the dataframe because that's gonna be our target column
matches["Winning Team"] = winning_team

In [17]:
# Checking columns names
matches.columns

Index(['Year', 'Datetime', 'Stage', 'Stadium', 'City', 'Home Team Name',
       'Home Team Goals', 'Away Team Goals', 'Away Team Name',
       'Win conditions', 'Attendance', 'Half-time Home Goals',
       'Half-time Away Goals', 'Referee', 'Assistant 1', 'Assistant 2',
       'RoundID', 'MatchID', 'Home Team Initials', 'Away Team Initials',
       'Winning Team'],
      dtype='object')

In [18]:
# Dropping all columns that won't be included in the model
# Most of the columns are results from the matches or irrelevant
matches.drop(columns=['Win conditions', 'Attendance', 'Half-time Home Goals',
       'Half-time Away Goals', 'Referee', 'Assistant 1', 'Assistant 2',
       'RoundID', 'MatchID', 'Home Team Initials', 'Away Team Initials',
        'Year','Datetime', 'Stage', 'Stadium','Home Team Goals',
       'Away Team Goals',"City"], inplace=True)

In [19]:
# One-Hot-Encoding for the remaining columns
matches = pd.get_dummies(matches, columns=["Home Team Name", "Away Team Name"])

In [20]:
# Specifying our features data "X" and our target data "y" 
X = matches.drop(columns=["Winning Team"])
y = matches["Winning Team"]

# Splitting our data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

## 3) Modeling and Evaluation

In [21]:
# LogisticRegression
lr = LogisticRegression(random_state=(42))

# Cross validation score for our training data
cross_val_score(lr, X_train, y_train, cv=5).mean()

0.582408062973656

In [22]:
# Cross validation score for our testing data
cross_val_score(lr, X_test, y_test, cv=5).mean()

0.5513368983957219

In [23]:
# RandomForestClassifier
rf = RandomForestClassifier(random_state=(42))

# Cross validation score for our training data
cross_val_score(rf, X_train, y_train, cv=5).mean()

0.5015948132051824

In [24]:
# Cross validation score for our testing data
cross_val_score(rf, X_test, y_test, cv=5).mean()

0.5212121212121212

In [25]:
# ExtraTreesClassifier
etc = ExtraTreesClassifier(random_state=(42))

# Cross validation score for our training data
cross_val_score(etc, X_train, y_train, cv=5).mean()

0.47755508173418626

In [26]:
# Cross validation score for our testing data
cross_val_score(etc, X_test, y_test, cv=5).mean()

0.45561497326203204

In [27]:
# BaggingClassifier
bc = BaggingClassifier(random_state=(42))

# Cross validation score for our training data
cross_val_score(bc, X_train, y_train, cv=5).mean()

0.5059933528983018

In [28]:
# Cross validation score for our testing data
cross_val_score(bc, X_test, y_test, cv=5).mean()

0.4971479500891266

In [29]:
abc = AdaBoostClassifier(random_state=(42))

# Cross validation score for our training data
cross_val_score(abc, X_train, y_train, cv=5).mean()

0.539065857769707

In [30]:
# Cross validation score for our testing data
cross_val_score(abc, X_test, y_test, cv=5).mean()

0.5572192513368983

In [31]:
gbc = GradientBoostingClassifier(random_state=(42))

# Cross validation score for our training data
cross_val_score(gbc, X_train, y_train, cv=5).mean()

0.5344991341613494

In [32]:
# Cross validation score for our testing data
cross_val_score(gbc, X_test, y_test, cv=5).mean()

0.5215686274509804

In [33]:
xgbc = XGBClassifier(random_state=(42))

# Cross validation score for our training data
cross_val_score(xgbc, X_train, y_train, cv=5).mean()

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.5508721487118974

In [34]:
# Cross validation score for our testing data
cross_val_score(xgbc, X_test, y_test, cv=5).mean()

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.5274509803921569

In [35]:
# Predicting the group stages results

In [36]:
# Taking only the group stages games for predictions
matches_2018 = matches_2018.head(48)

In [37]:
# Checking the columns
matches_2018.columns

Index(['Round Number', 'Date', 'Location', 'Home Team', 'Away Team', 'Group',
       'Result'],
      dtype='object')

In [38]:
# Dropping unwanted columns
matches_2018.drop(columns=['Round Number', 'Date', 'Location', 'Group', 'Result'], inplace=True)

In [39]:
# Renaming the columns
matches_2018.rename(columns={"Home Team":"Home Team Name", "Away Team": "Away Team Name"}, inplace=True)

In [40]:
# One-Hot-Encoding for the remaining columns and assign it to test
test = pd.get_dummies(matches_2018)

In [41]:
# I chose AdaBoostClassiefier because it had the best in the test data and it's not overfitted

# Fitting AdaBoostClassiefier with the whole data to get better results 
abc.fit(X, y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=42)

In [42]:
# Fixing the diffrence of shapes between "X" and "test"  
cols = [col for col in X.columns if col not in test.columns]
    

In [43]:
# Making new columns in "test" and fill it with zeros to match the shape of "X"
for x in cols:
    test[x] = 0

In [44]:
# Taking only the columns that "X" has
test = test[X.columns]

In [45]:
# Checking the difference of shapes between "X" and "test"
test.columns.all() == X.columns.all()

True

In [46]:
# Predicting the results of all matches in the group stages
preds = abc.predict(test)

In [47]:
# Adding our predictions to the dataframe
matches_2018["Winning Team"] = preds

In [48]:
# The group stages predictions
matches_2018

Unnamed: 0,Home Team Name,Away Team Name,Winning Team
0,Russia,Saudi Arabia,H
1,Egypt,Uruguay,H
2,Morocco,Iran,H
3,Portugal,Spain,A
4,France,Australia,H
5,Argentina,Iceland,H
6,Peru,Denmark,A
7,Croatia,Nigeria,A
8,Costa Rica,Serbia,A
9,Germany,Mexico,H


In [49]:
# Round of 16
# The qualifying 16 teams based on the predictions from the group stages
# For team that had the same points:
# I assumed some teams will qualify by the goal difference for each team.

In [50]:
round_16 = pd.read_csv("./datasets/fifa-world-cup-2018-matches.csv")

In [51]:
# Getting round of 16 structure 
round_16 =round_16.iloc[48:56, :]

In [52]:
# Dropping unwanted columns 
round_16.drop(columns=['Round Number', 'Date', 'Location', 'Group', 'Result'], inplace=True)

In [53]:
# Renaming the columns
round_16.rename(columns={"Home Team":"Home Team Name", "Away Team": "Away Team Name"}, inplace=True)

In [54]:
# Checking the structure
round_16

Unnamed: 0,Home Team Name,Away Team Name
48,Winner Group C,Runner-up Group D
49,Winner Group A,Runner-up Group B\t
50,Winner Group B,Runner-up Group A\t
51,Winner Group D,Runner-up Group C
52,Winner Group E,Runner-up Group F
53,Winner Group G,Runner-up Group H
54,Winner Group F,Runner-up Group E
55,Winner Group H,Runner-up Group G


In [55]:
# Adding the qualifed teams
round_16["Home Team Name"] = ["France","Uruguay","Spain","Nigeria","Brazil","Belgium", "Germany", "Senegal"]
round_16["Away Team Name"] = ["Argentina","Portugal","Russia","Denmark","Mexico","Poland","Serbia","England"]

In [56]:
# Now we have our round of 16 matches
round_16

Unnamed: 0,Home Team Name,Away Team Name
48,France,Argentina
49,Uruguay,Portugal
50,Spain,Russia
51,Nigeria,Denmark
52,Brazil,Mexico
53,Belgium,Poland
54,Germany,Serbia
55,Senegal,England


In [57]:
# One-Hot-Encoding for the remaining columns and assign it to test
test2 = pd.get_dummies(round_16)

In [58]:
# Fixing the diffrence of shapes between "X" and "test"
cols = [col for col in X.columns if col not in test2.columns]

In [59]:
# Making new columns in "test" and fill it with zeros to match the shape of "X"
for x in cols:
    test2[x] = 0

In [60]:
# Taking only the columns that "X" has
test2 = test2[X.columns]

In [61]:
# Checking the difference of shapes between "X" and "test"
test2.columns.all() == X.columns.all()

True

In [62]:
# Predicting for round of 16
preds_16 = abc.predict(test2)

In [63]:
# Adding our predictions to the dataframe
round_16["Winning Team"] = preds_16

In [64]:
# Predictions for round of 16
round_16

Unnamed: 0,Home Team Name,Away Team Name,Winning Team
48,France,Argentina,H
49,Uruguay,Portugal,H
50,Spain,Russia,H
51,Nigeria,Denmark,A
52,Brazil,Mexico,H
53,Belgium,Poland,H
54,Germany,Serbia,H
55,Senegal,England,H


In [65]:
# Quarter Final

# Making a dataframe for the quarter final
Q_final = {"Home Team Name":["France","Spain","Brazil","Germany"],
           "Away Team Name":["Uruguay","Denmark","Belgium","Senegal"]}
Q_final = pd.DataFrame(Q_final)

In [66]:
# Putting the columns in the right order for predictions
Q_final = Q_final[["Home Team Name","Away Team Name"]]

In [67]:
# Our quarter final matches
Q_final

Unnamed: 0,Home Team Name,Away Team Name
0,France,Uruguay
1,Spain,Denmark
2,Brazil,Belgium
3,Germany,Senegal


In [68]:
# One-Hot-Encoding for the remaining columns and assign it to test
test3 = pd.get_dummies(Q_final)

In [69]:
# Fixing the diffrence of shapes between "X" and "test"
cols = [col for col in X.columns if col not in test3.columns]

In [70]:
# Making new columns in "test" and fill it with zeros to match the shape of "X"
for x in cols:
    test3[x] = 0

In [71]:
# Taking only the columns that "X" has
test3 = test3[X.columns]

In [72]:
# Checking the difference of shapes between "X" and "test"
test3.columns.all() == X.columns.all()

True

In [73]:
# Predicting for quarter final
preds_Q = abc.predict(test3)

In [74]:
# Adding our predictions to the dataframe
Q_final["Winning Team"] = preds_Q

In [75]:
# Predictions for quarter final
Q_final

Unnamed: 0,Home Team Name,Away Team Name,Winning Team
0,France,Uruguay,H
1,Spain,Denmark,A
2,Brazil,Belgium,H
3,Germany,Senegal,D


In [76]:
# Checking The Probability for "Germany" to win
abc.predict_proba(test3)[3]

array([0.39676998, 0.39976924, 0.20346078])

In [77]:
# "Germany" has 39% probability of winning which is more than "Senegal" probability of winning 20%
# so we'll consider "Germany" the winner

In [78]:
# Semi Final

# Making a dataframe for the semi final
S_final = {"Home Team Name":["France","Denmark"],
           "Away Team Name":["Brazil","Germany"]} 
S_final = pd.DataFrame(S_final)

In [79]:
# Putting the columns in the right order for predictions
S_final = S_final[["Home Team Name", "Away Team Name"]]

In [80]:
# Our semi final matches
S_final

Unnamed: 0,Home Team Name,Away Team Name
0,France,Brazil
1,Denmark,Germany


In [81]:
# One-Hot-Encoding for the remaining columns and assign it to test
test4 = pd.get_dummies(S_final)

In [82]:
# Fixing the diffrence of shapes between "X" and "test"
cols = [col for col in X.columns if col not in test4.columns]

In [83]:
# Making new columns in "test" and fill it with zeros to match the shape of "X"
for x in cols:
    test4[x] = 0

In [84]:
# Taking only the columns that "X" has
test4 = test4[X.columns]

In [85]:
# Checking the difference of shapes between "X" and "test"
test4.columns.all() == X.columns.all()

True

In [86]:
# Predictions for semi final
preds_S = abc.predict(test4)

In [87]:
# Adding our predictions to the dataframe
S_final["Winning Team"] = preds_S

In [88]:
# Predictions for Semi final
S_final

Unnamed: 0,Home Team Name,Away Team Name,Winning Team
0,France,Brazil,A
1,Denmark,Germany,A


In [89]:
# Determining  Third and Fourth place

# Making a dataframe for third place match
Third_place = {"Home Team Name":["France"],
           "Away Team Name":["Denmark"]} 
Third_place = pd.DataFrame(Third_place)

In [90]:
# Putting the columns in the right order for predictions
Third_place = Third_place[["Home Team Name", "Away Team Name"]]

In [91]:
# Our third place match
Third_place

Unnamed: 0,Home Team Name,Away Team Name
0,France,Denmark


In [92]:
# One-Hot-Encoding for the remaining columns and assign it to test
test5 = pd.get_dummies(Third_place)

In [93]:
# Fixing the diffrence of shapes between "X" and "test"
cols = [col for col in X.columns if col not in test5.columns]

In [94]:
# Making new columns in "test" and fill it with zeros to match the shape of "X"
for x in cols:
    test5[x] = 0

In [95]:
# Taking only the columns that "X" has
test5 = test5[X.columns]

In [96]:
# Checking the difference of shapes between "X" and "test"
test5.columns.all() == X.columns.all()

True

In [97]:
# Predictions for third place match
preds_third = abc.predict(test5)

In [98]:
# Adding our predictions to the dataframe
Third_place["Winning Team"] = preds_third

In [99]:
# Predictions for Third place match
Third_place

Unnamed: 0,Home Team Name,Away Team Name,Winning Team
0,France,Denmark,A


In [100]:
# "Denmark" takes the Third place
# "France takes the Fourth place

In [101]:
# Final

# Making a dataframe for the semi final
Final = {"Home Team Name":["Brazil"],
           "Away Team Name":["Germany"]} 
Final = pd.DataFrame(Final)

In [102]:
# Putting the columns in the right order for predictions
Final = Final[["Home Team Name", "Away Team Name"]]

In [103]:
# Our final match
Final

Unnamed: 0,Home Team Name,Away Team Name
0,Brazil,Germany


In [104]:
# One-Hot-Encoding for the remaining columns and assign it to test
test6 = pd.get_dummies(Final)

In [105]:
# Fixing the diffrence of shapes between "X" and "test"
cols = [col for col in X.columns if col not in test6.columns]

In [106]:
# Making new columns in "test" and fill it with zeros to match the shape of "X"
for x in cols:
    test6[x] = 0

In [107]:
# Taking only the columns that "X" has
test6 = test6[X.columns]

In [108]:
# Checking the difference of shapes between "X" and "test"
test6.columns.all() == X.columns.all()

True

In [109]:
# Predictions for the final match
preds_final = abc.predict(test6)

In [110]:
# Adding our predictions to the dataframe
Final["Winning Team"] = preds_final

In [111]:
# Predictions for the Final
Final

Unnamed: 0,Home Team Name,Away Team Name,Winning Team
0,Brazil,Germany,H


In [112]:
# The winner is "Brazil"
# "Germany" takes the second place