In [None]:
# given data about the womens football tournamant, we will try to predict 
# weather the home team will win or not against the away team.

In [1]:
# getting started 
import numpy as np 
import pandas as pd 
import plotly.express as px


from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split 

import tensorflow as tf 

In [47]:
data = pd.read_csv("../input/womens-international-football-results/results.csv")

In [48]:
data.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1969-11-01,Italy,France,1,0,Euro,Novara,Italy,False
1,1969-11-01,Denmark,England,4,3,Euro,Aosta,Italy,True
2,1969-11-02,England,France,2,0,Euro,Turin,Italy,True
3,1969-11-02,Italy,Denmark,3,1,Euro,Turin,Italy,False
4,1970-07-06,England,West Germany,5,1,World Cup,Genova,Italy,True


In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5659 entries, 0 to 5658
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        5659 non-null   object
 1   home_team   5659 non-null   object
 2   away_team   5659 non-null   object
 3   home_score  5659 non-null   int64 
 4   away_score  5659 non-null   int64 
 5   tournament  5659 non-null   object
 6   city        5659 non-null   object
 7   country     5659 non-null   object
 8   neutral     5659 non-null   bool  
dtypes: bool(1), int64(2), object(6)
memory usage: 359.3+ KB


In [50]:
# feature engineering + target creation 
data['date']

0       1969-11-01
1       1969-11-01
2       1969-11-02
3       1969-11-02
4       1970-07-06
           ...    
5654    2023-08-12
5655    2023-08-15
5656    2023-08-16
5657    2023-08-19
5658    2023-08-20
Name: date, Length: 5659, dtype: object

In [51]:
data['date'].apply(lambda x : x[0:4]) # year 

0       1969
1       1969
2       1969
3       1969
4       1970
        ... 
5654    2023
5655    2023
5656    2023
5657    2023
5658    2023
Name: date, Length: 5659, dtype: object

In [52]:
data['date'].apply(lambda x : x[5:7]) # month 

0       11
1       11
2       11
3       11
4       07
        ..
5654    08
5655    08
5656    08
5657    08
5658    08
Name: date, Length: 5659, dtype: object

In [53]:
data['date'].apply(lambda x : x[8:]) # date 

0       01
1       01
2       02
3       02
4       06
        ..
5654    12
5655    15
5656    16
5657    19
5658    20
Name: date, Length: 5659, dtype: object

In [54]:
data['year'] = data['date'].apply(lambda x : x[0:4])
data['month'] = data['date'].apply(lambda x : x[5:7])
data['date'] = data['date'].apply(lambda x : x[8:])
data = data.drop('date',axis = 1)

In [55]:
data.head()

Unnamed: 0,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month
0,Italy,France,1,0,Euro,Novara,Italy,False,1969,11
1,Denmark,England,4,3,Euro,Aosta,Italy,True,1969,11
2,England,France,2,0,Euro,Turin,Italy,True,1969,11
3,Italy,Denmark,3,1,Euro,Turin,Italy,False,1969,11
4,England,West Germany,5,1,World Cup,Genova,Italy,True,1970,7


In [56]:
sum(data['home_score'] >  data['away_score']) # wins

2976

In [57]:
sum(data['home_score'] < data['away_score']) # loss

1889

In [58]:
sum(data['home_score'] == data['away_score']) # tie

794

In [59]:
data['home_victory'] = (data['home_score'] > data['away_score']).astype('int')

In [60]:
data

Unnamed: 0,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,home_victory
0,Italy,France,1,0,Euro,Novara,Italy,False,1969,11,1
1,Denmark,England,4,3,Euro,Aosta,Italy,True,1969,11,1
2,England,France,2,0,Euro,Turin,Italy,True,1969,11,1
3,Italy,Denmark,3,1,Euro,Turin,Italy,False,1969,11,1
4,England,West Germany,5,1,World Cup,Genova,Italy,True,1970,07,1
...,...,...,...,...,...,...,...,...,...,...,...
5654,England,Colombia,2,1,FIFA World Cup,Sydney,Australia,True,2023,08,1
5655,Spain,Sweden,2,1,FIFA World Cup,Auckland,New Zealand,True,2023,08,1
5656,Australia,England,1,3,FIFA World Cup,Sydney,Australia,False,2023,08,0
5657,Australia,Sweden,0,2,FIFA World Cup,Brisbane,Australia,False,2023,08,0


In [61]:
data = data.drop(['home_score' , 'away_score'],axis = 1)

In [62]:
data.head()

Unnamed: 0,home_team,away_team,tournament,city,country,neutral,year,month,home_victory
0,Italy,France,Euro,Novara,Italy,False,1969,11,1
1,Denmark,England,Euro,Aosta,Italy,True,1969,11,1
2,England,France,Euro,Turin,Italy,True,1969,11,1
3,Italy,Denmark,Euro,Turin,Italy,False,1969,11,1
4,England,West Germany,World Cup,Genova,Italy,True,1970,7,1


In [63]:
data['neutral'] = data['neutral'].astype('int')

In [64]:
data.head()

Unnamed: 0,home_team,away_team,tournament,city,country,neutral,year,month,home_victory
0,Italy,France,Euro,Novara,Italy,0,1969,11,1
1,Denmark,England,Euro,Aosta,Italy,1,1969,11,1
2,England,France,Euro,Turin,Italy,1,1969,11,1
3,Italy,Denmark,Euro,Turin,Italy,0,1969,11,1
4,England,West Germany,World Cup,Genova,Italy,1,1970,7,1


In [65]:
# encoding the  values 
def onehot_encode(df , columns , prefixes):
    df = df.copy()
    for column,prefix in zip(columns , prefixes) :
        dummies_df = pd.get_dummies(df[column] , prefix = prefix)
        df = pd.concat([df , dummies_df], axis = 1)
        df = df.drop(column ,axis=1)
    return df   

In [26]:
# data

In [66]:
data = onehot_encode(data, 
                     ['home_team' , 'away_team' ,'tournament' ,'city' ,'country'],
                     ['home','away','tourn','city','country'])

In [69]:
data.head()

Unnamed: 0,neutral,year,month,home_victory,home_Albania,home_Algeria,home_American Samoa,home_Andorra,home_Angola,home_Anguilla,...,country_Turkey,country_Uganda,country_Ukraine,country_United States,country_United States Virgin Islands,country_Uzbekistan,country_Vietnam,country_Wales,country_Zambia,country_Zimbabwe
0,0,1969,11,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1969,11,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1969,11,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1969,11,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1970,7,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
data.isnull().sum()

neutral               0
year                  0
month                 0
home_victory          0
home_Albania          0
                     ..
country_Uzbekistan    0
country_Vietnam       0
country_Wales         0
country_Zambia        0
country_Zimbabwe      0
Length: 1763, dtype: int64

In [71]:
# split the data 
X = data.drop('home_victory' , axis = 1)
y = data.loc[: ,'home_victory']

In [72]:
X

Unnamed: 0,neutral,year,month,home_Albania,home_Algeria,home_American Samoa,home_Andorra,home_Angola,home_Anguilla,home_Antigua and Barbuda,...,country_Turkey,country_Uganda,country_Ukraine,country_United States,country_United States Virgin Islands,country_Uzbekistan,country_Vietnam,country_Wales,country_Zambia,country_Zimbabwe
0,0,1969,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1969,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1969,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1969,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1970,07,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5654,1,2023,08,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5655,1,2023,08,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5656,0,2023,08,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5657,0,2023,08,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [73]:
y

0       1
1       1
2       1
3       1
4       1
       ..
5654    1
5655    1
5656    0
5657    0
5658    1
Name: home_victory, Length: 5659, dtype: int64

In [74]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X) , columns = X.columns)

In [75]:
X

Unnamed: 0,neutral,year,month,home_Albania,home_Algeria,home_American Samoa,home_Andorra,home_Angola,home_Anguilla,home_Antigua and Barbuda,...,country_Turkey,country_Uganda,country_Ukraine,country_United States,country_United States Virgin Islands,country_Uzbekistan,country_Vietnam,country_Wales,country_Zambia,country_Zimbabwe
0,-0.905649,-3.578354,1.388152,-0.059554,-0.054892,-0.013294,-0.013294,-0.035192,-0.023031,-0.042074,...,-0.131365,-0.029738,-0.084372,-0.240738,-0.013294,-0.039911,-0.111101,-0.086471,-0.047985,-0.047985
1,1.104181,-3.578354,1.388152,-0.059554,-0.054892,-0.013294,-0.013294,-0.035192,-0.023031,-0.042074,...,-0.131365,-0.029738,-0.084372,-0.240738,-0.013294,-0.039911,-0.111101,-0.086471,-0.047985,-0.047985
2,1.104181,-3.578354,1.388152,-0.059554,-0.054892,-0.013294,-0.013294,-0.035192,-0.023031,-0.042074,...,-0.131365,-0.029738,-0.084372,-0.240738,-0.013294,-0.039911,-0.111101,-0.086471,-0.047985,-0.047985
3,-0.905649,-3.578354,1.388152,-0.059554,-0.054892,-0.013294,-0.013294,-0.035192,-0.023031,-0.042074,...,-0.131365,-0.029738,-0.084372,-0.240738,-0.013294,-0.039911,-0.111101,-0.086471,-0.047985,-0.047985
4,1.104181,-3.487723,0.140942,-0.059554,-0.054892,-0.013294,-0.013294,-0.035192,-0.023031,-0.042074,...,-0.131365,-0.029738,-0.084372,-0.240738,-0.013294,-0.039911,-0.111101,-0.086471,-0.047985,-0.047985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5654,1.104181,1.315708,0.452744,-0.059554,-0.054892,-0.013294,-0.013294,-0.035192,-0.023031,-0.042074,...,-0.131365,-0.029738,-0.084372,-0.240738,-0.013294,-0.039911,-0.111101,-0.086471,-0.047985,-0.047985
5655,1.104181,1.315708,0.452744,-0.059554,-0.054892,-0.013294,-0.013294,-0.035192,-0.023031,-0.042074,...,-0.131365,-0.029738,-0.084372,-0.240738,-0.013294,-0.039911,-0.111101,-0.086471,-0.047985,-0.047985
5656,-0.905649,1.315708,0.452744,-0.059554,-0.054892,-0.013294,-0.013294,-0.035192,-0.023031,-0.042074,...,-0.131365,-0.029738,-0.084372,-0.240738,-0.013294,-0.039911,-0.111101,-0.086471,-0.047985,-0.047985
5657,-0.905649,1.315708,0.452744,-0.059554,-0.054892,-0.013294,-0.013294,-0.035192,-0.023031,-0.042074,...,-0.131365,-0.029738,-0.084372,-0.240738,-0.013294,-0.039911,-0.111101,-0.086471,-0.047985,-0.047985


In [76]:
X_train , X_test , y_train , y_test = train_test_split(X , y ,train_size = 0.70 , random_state = 67)

In [77]:
X.shape , y.mean()

((5659, 1762), 0.5258879660717442)

In [79]:
# model const.

inputs = tf.keras.Input(shape = (1762 , ))
x = tf.keras.layers.Dense(64 , 'relu')(inputs)
x = tf.keras.layers.Dense(64 , 'relu')(x)
outputs = tf.keras.layers.Dense(1 , 'sigmoid')(x)

model = tf.keras.Model(inputs = inputs , outputs = outputs)


model.compile(loss = 'binary_crossentropy',
             optimizer = 'adam',
             metrics = ['accuracy' , tf.keras.metrics.AUC(name = 'auc')])

In [80]:
batch_size = 32
epochs = 50 


In [81]:
history = model.fit(X_train , y_train , validation_split = 0.20 , epochs = epochs , batch_size = batch_size)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [82]:
fig = px.line(history.history , y=['loss' , 'val_loss'] , labels = {'x':'Epochs' , "y":'loss'} , title = 'loss over time')
fig.show()