Goal: investigate the connection between shots on target, half-time scores, and home/away team wins.

We use historical data downloaded from football-data.co.uk

The first step is to load the data into a dataframe

In [1]:
import pandas as pd
import glob

# Collect the season CSV files (one file per season, e.g. E09293.csv for 1992-1993).
# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps the row
# order of `data` -- and therefore any position-based split of it -- reproducible.
filenames = sorted(glob.glob('data/pred_winner/*.csv'))

# one DataFrame per usable season file
season_data = []

# full-time goals, half-time goals and shots on target, for the home (H) and away (A) team
cols = ['FTHG','FTAG','HTHG','HTAG','HST','AST']

seasons_skipped = 0
# path -> reason, so skipped files can be inspected instead of vanishing silently
skipped_files = {}

for file in filenames:
    # Read only the required columns and drop every row that has a NaN in any of them.
    # Some seasons don't have shots-on-target data; read_csv raises ValueError when a
    # requested column is missing, and those files are skipped.
    try:
        season = pd.read_csv(file, encoding="ISO-8859-1", usecols=cols).dropna(axis=0, how='any')
    except ValueError as err:
        seasons_skipped += 1
        skipped_files[file] = str(err)
    else:
        season_data.append(season)

# Fail with a clear message rather than pandas' "No objects to concatenate".
if not season_data:
    raise ValueError(
        'no usable season files found in data/pred_winner/ '
        '(%d matched, %d skipped)' % (len(filenames), seasons_skipped)
    )

data = pd.concat(season_data).reset_index(drop=True)
print('num seasons skipped =' + str(seasons_skipped))

num seasons skipped =19


Having done so, we get the following DataFrame

FTHG = number of goals scored by the home team at full time  
FTAG = number of goals scored by the away team at full time  
HTHG = number of goals scored by the home team at half time  
HTAG = number of goals scored by the away team at half time  
HST = number of shots on target by the home team  
AST = number of shots on target by the away team

In [2]:
# Preview the first rows of the combined season data.
data.head()

Unnamed: 0,FTHG,FTAG,HTHG,HTAG,HST,AST
0,1.0,0.0,0.0,0.0,7.0,2.0
1,4.0,1.0,1.0,0.0,6.0,5.0
2,4.0,0.0,2.0,0.0,7.0,5.0
3,2.0,2.0,2.0,2.0,5.0,7.0
4,0.0,1.0,0.0,0.0,2.0,2.0


In [3]:
# Summarise row count, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24099 entries, 0 to 24098
Data columns (total 6 columns):
FTHG    24099 non-null float64
FTAG    24099 non-null float64
HTHG    24099 non-null float64
HTAG    24099 non-null float64
HST     24099 non-null float64
AST     24099 non-null float64
dtypes: float64(6)
memory usage: 1.1 MB


We want to use the shots-on-target (HST, AST) and half-time goals (HTHG, HTAG) data to predict whether or not the home team wins the game.

In [4]:
# Show the first rows again; `data` still holds the six raw match columns here.
data.head()


Unnamed: 0,FTHG,FTAG,HTHG,HTAG,HST,AST
0,1.0,0.0,0.0,0.0,7.0,2.0
1,4.0,1.0,1.0,0.0,6.0,5.0
2,4.0,0.0,2.0,0.0,7.0,5.0
3,2.0,2.0,2.0,2.0,5.0,7.0
4,0.0,1.0,0.0,0.0,2.0,2.0


In [5]:
# Replace the raw match columns with the label and the two model features.
#   win_drawloss : binary label -- 1 if the home team won, 0 for a draw or a home loss
#   shots        : home minus away shots on target
#   halftime     : home minus away goals at half time
home_won = data['FTHG'] > data['FTAG']
shot_margin = data['HST'] - data['AST']
halftime_margin = data['HTHG'] - data['HTAG']

# `columns=` fixes the column order explicitly (label first, then the features).
data = pd.DataFrame(
    {
        'win_drawloss': home_won.astype(int),
        'shots': shot_margin,
        'halftime': halftime_margin,
    },
    columns=['win_drawloss', 'shots', 'halftime'],
)
data.head()

Unnamed: 0,win_drawloss,shots,halftime
0,1,5.0,0.0
1,1,1.0,1.0
2,1,2.0,2.0
3,0,-2.0,0.0
4,0,0.0,0.0


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Keep only the matches that were level at half time (half-time goal difference of 0).
half_tie = data[data['halftime']==0]

# Only the first 10,000 such rows are drawn, as in the original cell
# (presumably to keep the swarm plot tractable -- TODO confirm).
swarm_sample = half_tie.iloc[:10000]

# x/y are passed by keyword and by column name: newer seaborn releases reject
# positional x/y arguments, and names do not depend on the column order of `data`.
ax = sns.swarmplot(x='win_drawloss', y='shots', data=swarm_sample)
ax.set_title('Shots-on-target difference for matches level at half time')
plt.show()

In [6]:
import numpy as np
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

# One-hot encode the binary label: column 0 = no home win, column 1 = home win.
target = to_categorical(data.win_drawloss)

# Every remaining column of `data` is a feature.
predictors = np.array(data.drop(['win_drawloss'],axis=1))

# Construct a sequential neural network model
neural_net = Sequential()

# Construct hidden layers: two fully connected ReLU layers of 32 units each
neural_net.add(Dense(32,activation='relu',input_shape=(predictors.shape[1],)))
neural_net.add(Dense(32,activation='relu'))

# Construct output layer: softmax over the two classes of `target`
neural_net.add(Dense(2,activation='softmax'))

# Compile the model
neural_net.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

# Stop training once the validation loss has failed to improve for 2 consecutive epochs
stop_early = EarlyStopping(monitor='val_loss',patience=2)

# Fit the model.
# epochs is set explicitly: the default differs between Keras releases (the recorded run
# used 10), and with a single epoch early stopping could never take effect.
# validation_split holds out the last 30% of the rows, taken before any shuffling.
neural_net.fit(predictors,target,epochs=10,validation_split=.3,callbacks=[stop_early])

Using TensorFlow backend.


Train on 16869 samples, validate on 7230 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


<keras.callbacks.History at 0x1eef65a3e80>

In [12]:
# Each input row is [shots, halftime] (home-minus-away differences), in the same column
# order as the training predictors. Each output row is [P(no home win), P(home win)].
# The rows are passed as a single 2-D array: Keras treats a plain Python list as
# "one array per model input", which makes a nested list ambiguous.
neural_net.predict(np.array([[2,2],[0,0]]))

array([[ 0.07043995,  0.92956001],
       [ 0.71501887,  0.2849811 ]], dtype=float32)