<a href="https://colab.research.google.com/github/annabelcoates/redev_machine_learning_samples/blob/master/fp_train_names_odds.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries


In [0]:
import sys
import os
import urllib.request
import datetime;
import pandas as pd
import numpy as np
import math

from keras import optimizers
from keras.models import Sequential
from keras.optimizers import SGD
from keras.layers import Dense, Dropout, Activation, Flatten, Conv2D
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils, to_categorical
from keras.utils import plot_model


Using TensorFlow backend.


# Load Preprocessed Data

In [0]:
url='https://raw.githubusercontent.com/RedevLtd/redev_machine_learning_samples/master/FootballPredicator/preprocessed_data/preprocessed_data.csv'
raw_df=pd.read_csv(url)

# Add Categoric Names to the Dataframe

In [0]:
def add_categoric_names(df):
  home_list=np.asarray(df.HomeTeam)# Home team
  away_list=np.asarray(df.AwayTeam) # Away team

  # Put all the teams into one column so that the same team
  # is given the same ID regardless of whether it is playing 
  # a home or away game
  all_teams=np.append(home_list,away_list,axis=0)
  # Assign each team an ID
  all_teams_factorised=pd.factorize(all_teams) 
  all_teams_ids=all_teams_factorised[0]
  cat_key=all_teams_factorised[1]
  # Get the number of data points (not the length of all the names together)
  number_dp= len(home_list)
  # 1st half of list is home teams, 2nd is away
  home_id_list=all_teams_ids[0:number_dp]
  away_id_list=all_teams_ids[number_dp:(2*number_dp)]
  # Now put team IDs into categoric form e.g [2,3,0] would
  # become [[0 0 1 0],[0 0 0 1],[1 0 0 0]]
  home_id_array=to_categorical(home_id_list)
  away_id_array=to_categorical(away_id_list)
  # Append the categoric arrays as columns at the end of the original array
  both_ids_array=np.append(home_id_array,away_id_array,axis=1)
  new_df_array=np.append(df,both_ids_array,axis=1)
  # Turn the array back into a dataframe now that manipulation is finished
  column_titles=[  "Div", "MatchNo", "SeasonYear", "SeasonMatchNo", "MatchDate",
                 "HomeTeam","AwayTeam","OddsHomeWin","OddsDraw","OddsAwayWin",
                 "OddsWinDiff","Attendance","F_FTHomeGoals","F_FTAwayGoals",
                 "F_FTResult","F_FTHomeWin","F_FTDraw", "F_FTAwayWin", "F_HTHomeGoals",
                 "F_HTAwayGoals","F_HTResult", "F_HTHomeWin","F_HTDraw", "F_HTAwayWin"]
  start=len(column_titles)
  end=new_df_array.shape[1]
  categoric_columns= list(map(str,range(start,end)))
  column_titles=column_titles + categoric_columns
  new_df=pd.DataFrame(data=new_df_array,columns=column_titles)
  return new_df, cat_key

In [0]:
df, cat_key=add_categoric_names(raw_df)
df_array=np.asarray(df)
# Determine cause of random 0 in names of 

View the data in a table

In [0]:
df.head(3)

Unnamed: 0,Div,MatchNo,SeasonYear,SeasonMatchNo,MatchDate,HomeTeam,AwayTeam,OddsHomeWin,OddsDraw,OddsAwayWin,OddsWinDiff,Attendance,F_FTHomeGoals,F_FTAwayGoals,F_FTResult,F_FTHomeWin,F_FTDraw,F_FTAwayWin,F_HTHomeGoals,F_HTAwayGoals,F_HTResult,F_HTHomeWin,F_HTDraw,F_HTAwayWin,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243
0,E0,20000001,2000,1,2000-08-19,Charlton,Man City,2.1,3.2,3.1,1.0,20043,4,0,H,1,0,0,2,0,H,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,E0,20000002,2000,2,2000-08-19,Chelsea,West Ham,1.44,3.6,6.5,5.06,34914,4,2,H,1,0,0,1,0,H,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,E0,20000003,2000,3,2000-08-19,Coventry,Middlesbrough,2.3,3.2,2.62,0.32,20624,1,3,A,0,0,1,1,1,D,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Get the input data (categoric names and odds) in a single array

In [0]:
odds=df_array[:,[7,8,9]]
teams_categoric=df_array[:,list(range(24,243))]
input_array=np.append(odds,teams_categoric, axis=1)
results=df_array[:,[15,16,17]]

In [0]:
# Get the number of inputs and the number of datapoints
number_dp=input_array.shape[0]
n_inputs= input_array.shape[1]

Reshape to pass into the neural net

In [0]:
input_array=input_array.reshape(number_dp,1,1,n_inputs)
results=results.reshape(number_dp,1,1,3)

# Seperate into Training and Testing Data

Usually approx 80% of the data is used as training data and 20% of the data is used as test data

In [0]:
p=0.8
last_train= math.ceil(p*number_dp)
# Training data
input_train=input_array[0:last_train,:]
results_train=results[0:last_train,:]
# Test data
input_test=input_array[(last_train+1):number_dp,:]
results_test=results[(last_train+1):number_dp,:]

In [0]:
print(input_test.shape)
print(results_test.shape)

(7558, 1, 1, 222)
(7558, 1, 1, 3)


Print input shape

# Define Model Architecture

In [0]:
model=Sequential()
model.add(Dense(9,input_shape=(1,1,n_inputs), activation='relu')) # Adds a densely connected NN layer
model.add(Dense(3,activation='relu')) 
model.add(Dense(3,activation='softmax'))

# Compile Model

Categorial crossentropy loss is a loss algorithm for when there are multiple categories to sort each data point into, in this case  home win, draw or away win. The optimizer is a feature that makes the model work better, adam is the default.

In [0]:
#opt = SGD(lr=0.000001,momentum=0.004)
model.compile(loss='categorical_crossentropy',optimizer='adagrad',metrics=['accuracy']) 
# Want to know the accuracy of the model so specify it as a metric
plot_model(model,to_file='model.png')



# Fit Model from Training Data


* The number of epochs are the number of phases of training
* The shuffle parameter determines whether the training data is shuffled between epochs
* Batch size is the number of data points between updates to the weights
* **Greater number of epochs** can increase accuracy



In [39]:


model.fit(input_train,results_train,batch_size=1000,epochs=100,verbose=1,shuffle=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f7db9ca32e8>

# Evaluate Model

In [40]:
[loss_value,accuracy]=model.evaluate(input_test,results_test,verbose=1)
print("The loss value for the model is "+ str(loss_value))
print("The accuracy value for the model is "+ "{:.3f}".format(accuracy))


The loss value for the model is 1.0347239428898998
The accuracy value for the model is 0.478


# Make Prediction

In [38]:
counter=0
for team in cat_key:
  print(str(counter)+') ' + team)
  counter+=1
home_id=int(input('Enter the home team ID: '))
away_id=int(input('Enter the away team ID: '))
ids=list(range(len(cat_key)))
ids=to_categorical(ids)
home_team=ids[home_id]
away_team= ids[away_id]
input_array=np.asarray([1.2,3,4]) #Odds that favour home team
input_array=np.append(input_array,home_team)
input_array=np.append(input_array,away_team)
print(input_array.shape)
input_array=input_array.reshape(1,1,1,n_inputs)
prediction=model.predict(input_array)
print(prediction)

0) Charlton
1) Chelsea
2) Coventry
3) Derby
4) Leeds
5) Leicester
6) Liverpool
7) Sunderland
8) Tottenham
9) Man United
10) Arsenal
11) Bradford
12) Ipswich
13) Middlesbrough
14) Everton
15) Man City
16) Newcastle
17) Southampton
18) West Ham
19) Aston Villa
20) Barnsley
21) Blackburn
22) Bolton
23) Fulham
24) Gillingham
25) Grimsby
26) Huddersfield
27) Nott'm Forest
28) QPR
29) Sheffield United
30) Wimbledon
31) Wolves
32) Birmingham
33) Burnley
34) Crewe
35) Norwich
36) Portsmouth
37) Preston
38) Sheffield Weds
39) Stockport
40) Tranmere
41) Watford
42) West Brom
43) Crystal Palace
44) Bristol Rvs
45) Cambridge
46) Luton
47) Millwall
48) Northampton
49) Oldham
50) Oxford
51) Rotherham
52) Stoke
53) Swansea
54) Swindon
55) Wrexham
56) Colchester
57) Bournemouth
58) Brentford
59) Bristol City
60) Bury
61) Notts County
62) Port Vale
63) Reading
64) Walsall
65) Wigan
66) Wycombe
67) Peterboro
68) Barnet
69) Blackpool
70) Carlisle
71) Cheltenham
72) Chesterfield
73) Exeter
74) Kidderminst

KeyboardInterrupt: ignored