In [1]:
# Import the libraries all at the beginning 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

In [2]:
# Read in the data 
data = pd.read_csv("EPL 2000-2018.csv")

In [3]:
data.columns

Index(['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
       'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC',
       'HY', 'AY', 'HR', 'AR', 'Season'],
      dtype='object')

In [4]:
data=data.rename(columns={"FTHG":"Full Home Goals", "FTAG":"Full Away Goals", 
                    "HTHG":"Half Home Goals","HTAG":"Half Away Goals",
                    "HS":"Home Shots","AS":"Away Shots", "HST":"Home Shots on Target",
                    "AST":"Away Shots on Target", "HF":"Home Fouls", "AF":"Away Fouls",
                    "HC":"Home Corners","AC":"Away Corners","HY":"Home Yellow Cards",
                    "AY":"Away Yellow Cards","HR":"Home Red Cards","AR":"Away Red Cards"})

In [5]:
#Clean the data 
data.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,Full Home Goals,Full Away Goals,FTR,Half Home Goals,Half Away Goals,HTR,Referee,...,Away Shots on Target,Home Fouls,Away Fouls,Home Corners,Away Corners,Home Yellow Cards,Away Yellow Cards,Home Red Cards,Away Red Cards,Season
0,19/08/00,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,4.0,13.0,12.0,6.0,6.0,1.0,2.0,0.0,0.0,00-01
1,19/08/00,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,5.0,19.0,14.0,7.0,7.0,1.0,2.0,0.0,0.0,00-01
2,19/08/00,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,9.0,15.0,21.0,8.0,4.0,5.0,3.0,1.0,0.0,00-01
3,19/08/00,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,6.0,11.0,13.0,5.0,8.0,1.0,1.0,0.0,0.0,00-01
4,19/08/00,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,6.0,21.0,20.0,6.0,4.0,1.0,3.0,0.0,0.0,00-01


In [6]:
# Functions pertaining to subsetting the data set

## Find all of a particular team's results
def ExploreTeam(TeamName):
    exploration = data[(data.HomeTeam == TeamName) | (data.AwayTeam == TeamName)]
    return exploration

## Find all the meetings between two teams 
def ExploreFixtureWithHomeAndAway(Team1, Team2, Specific):
    if (Specific == True):
        exploration = data[(data.HomeTeam == Team1) & (data.AwayTeam == Team2)]
    else:
        exploration = data[((data.HomeTeam == Team1) & (data.AwayTeam == Team2)) |
                      (data.HomeTeam == Team2) & (data.AwayTeam == Team1)]
    return exploration

## Find the counts of wins per team in their fixtures 
def WhoWonMore(Team1, Team2):
    dataToUse = data[((data.HomeTeam == Team1) & (data.AwayTeam == Team2)) |
                      (data.HomeTeam == Team2) & (data.AwayTeam == Team1)]
    Team1Win = 0
    Team2Win = 0
    draw = 0
    for index, row in dataToUse.iterrows():
        if (row['HomeTeam'] == Team1):
            if (row['Full Home Goals'] > row['Full Away Goals']):
                Team1Win = Team1Win + 1
            elif (row['Full Home Goals'] < row['Full Away Goals']):
                Team2Win = Team2Win + 1
            else:
                draw = draw + 1
        elif (row['HomeTeam'] == Team2):
            if (row['Full Home Goals'] > row['Full Away Goals']):
                Team2Win = Team2Win + 1
            elif (row['Full Home Goals'] < row['Full Away Goals']):
                Team1Win = Team1Win + 1
            else:
                draw = draw + 1
    
    print(str(Team1) + " has beaten " + str(Team2) + " " + str(Team1Win) + " times. ")
    print(str(Team2) + " has beaten " + str(Team1) + " " + str(Team2Win) + " times. ")
    print(str(Team1) + " and " + str(Team2) + " have drawn " + str(draw) + " times. ")
    
def WhoWasTheRef(Team1, Team2, Date): 
    for index, row in data.iterrows():
        if (row['HomeTeam'] == Team1 and row['AwayTeam'] == Team2 and row['Date'] == Date):
            referee = row['Referee']
        elif (row['HomeTeam'] == Team2 and row['AwayTeam'] == Team1 and row['Date'] == Date):
            referee = row['Referee']
    return referee


In [10]:
WhoWonMore('Arsenal','Liverpool')

Arsenal has beaten Liverpool 12 times. 
Liverpool has beaten Arsenal 9 times. 
Arsenal and Liverpool have drawn 13 times. 


In [260]:
WhoWasTheRef("Chelsea","Arsenal","29/10/11")

'A Marriner'

In [261]:
analysisData = data.copy()

In [262]:
analysisData['HomeTeam'] = analysisData['HomeTeam'].astype('category')
analysisData['AwayTeam'] = analysisData['AwayTeam'].astype('category')
analysisData['Referee'] = analysisData['Referee'].astype('category')

In [263]:
analysisData['HomeEncoded'] = analysisData['HomeTeam'].cat.codes
analysisData['AwayEncoded'] = analysisData['AwayTeam'].cat.codes
analysisData['RefereeEncoded'] = analysisData['Referee'].cat.codes

In [264]:
dict(enumerate(analysisData['HomeTeam'].cat.categories))

{0: 'Arsenal',
 1: 'Aston Villa',
 2: 'Birmingham',
 3: 'Blackburn',
 4: 'Blackpool',
 5: 'Bolton',
 6: 'Bournemouth',
 7: 'Bradford',
 8: 'Brighton',
 9: 'Burnley',
 10: 'Cardiff',
 11: 'Charlton',
 12: 'Chelsea',
 13: 'Coventry',
 14: 'Crystal Palace',
 15: 'Derby',
 16: 'Everton',
 17: 'Fulham',
 18: 'Huddersfield',
 19: 'Hull',
 20: 'Ipswich',
 21: 'Leeds',
 22: 'Leicester',
 23: 'Liverpool',
 24: 'Man City',
 25: 'Man United',
 26: 'Middlesboro',
 27: 'Middlesbrough',
 28: 'Newcastle',
 29: 'Norwich',
 30: 'Portsmouth',
 31: 'QPR',
 32: 'Reading',
 33: 'Sheffield United',
 34: 'Southampton',
 35: 'Stoke',
 36: 'Sunderland',
 37: 'Swansea',
 38: 'Tottenham',
 39: 'Watford',
 40: 'West Brom',
 41: 'West Ham',
 42: 'Wigan',
 43: 'Wolves'}

In [265]:
dict(enumerate(analysisData['Referee'].cat.categories ))

{0: "A D'Urso",
 1: 'A Marriner',
 2: 'A Taylor',
 3: 'A Wiley',
 4: 'Alan Wiley',
 5: "Andy D'Urso",
 6: "Andy D'Urso ",
 7: 'Andy Hall',
 8: 'B Knight',
 9: 'Barry Knight',
 10: 'C Foy',
 11: 'C Kavanagh',
 12: 'C Pawson',
 13: 'C Wilkes',
 14: 'Clive Wilkes',
 15: 'D Coote',
 16: 'D Elleray',
 17: 'D Gallagh',
 18: 'D Gallaghe',
 19: 'D Gallagher',
 20: 'D Pugh',
 21: 'David Ellaray',
 22: 'Dermot Gallagher',
 23: 'E Wolstenholme',
 24: 'F Taylor',
 25: 'G Barber',
 26: 'G Poll',
 27: 'G Scott',
 28: 'Graham Barber',
 29: 'Graham Poll',
 30: 'H Webb',
 31: 'I Williamson',
 32: 'Ian Harris',
 33: 'J Moss',
 34: 'J Winter',
 35: 'Jeff Winter',
 36: 'K Friend',
 37: 'K Stroud',
 38: 'L Mason',
 39: 'L Probert',
 40: 'M Atkinson',
 41: 'M Clattenburg',
 42: 'M Dean',
 43: 'M Halsey',
 44: 'M Jones',
 45: 'M Messias',
 46: 'M Oliver',
 47: 'M Riley',
 48: 'Mark Halsey',
 49: 'Mark Halsey ',
 50: 'Matt Messias',
 51: 'Mike Dean',
 52: 'Mike Riley',
 53: 'Mn Atkinson',
 54: 'N Barry',
 55:

In [266]:
analysisData = analysisData.drop(['Half Home Goals', 'Half Away Goals',
                                  'Home Shots on Target', 'Away Shots on Target'], axis=1)


In [267]:
analysisData = analysisData.dropna()
analysisData['Full Home Goals'] = analysisData['Full Home Goals'].apply(np.int64)
analysisData['Full Away Goals'] = analysisData['Full Away Goals'].apply(np.int64)
analysisData['Home Shots'] = analysisData['Home Shots'].apply(np.int64)
analysisData['Away Shots'] = analysisData['Away Shots'].apply(np.int64)
analysisData['Home Corners']= analysisData['Home Corners'].apply(np.int64)
analysisData['Away Corners']= analysisData['Away Corners'].apply(np.int64)
analysisData['Home Yellow Cards']= analysisData['Home Yellow Cards'].apply(np.int64)
analysisData['Away Yellow Cards']= analysisData['Away Yellow Cards'].apply(np.int64)
analysisData['Home Red Cards']= analysisData['Home Red Cards'].apply(np.int64)
analysisData['Away Red Cards']= analysisData['Away Red Cards'].apply(np.int64)

In [268]:
analysisData.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,Full Home Goals,Full Away Goals,FTR,HTR,Referee,Home Shots,Away Shots,...,Home Corners,Away Corners,Home Yellow Cards,Away Yellow Cards,Home Red Cards,Away Red Cards,Season,HomeEncoded,AwayEncoded,RefereeEncoded
0,19/08/00,Charlton,Man City,4,0,H,H,Rob Harris,17,8,...,6,6,1,2,0,0,00-01,11,24,70
1,19/08/00,Chelsea,West Ham,4,2,H,H,Graham Barber,17,12,...,7,7,1,2,0,0,00-01,12,41,28
2,19/08/00,Coventry,Middlesbrough,1,3,A,D,Barry Knight,6,16,...,8,4,5,3,1,0,00-01,13,27,9
3,19/08/00,Derby,Southampton,2,2,D,A,Andy D'Urso,6,13,...,5,8,1,1,0,0,00-01,15,34,5
4,19/08/00,Leeds,Everton,2,0,H,H,Dermot Gallagher,17,12,...,6,4,1,3,0,0,00-01,21,16,22


In [269]:
features = ['HomeEncoded','AwayEncoded','RefereeEncoded', 'Home Shots','Away Shots']
target = ['Full Away Goals']

x_train, x_test, y_train, y_test = train_test_split(analysisData[features], analysisData[target],
                                                    test_size=0.25,random_state=42)
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=20).fit(x_train,y_train)

  model = RandomForestClassifier(n_estimators=20).fit(x_train,y_train)


In [270]:
x_test

Unnamed: 0,HomeEncoded,AwayEncoded,RefereeEncoded,Home Shots,Away Shots
1480,27,40,89,12,15
5754,25,22,42,16,10
4011,16,3,43,15,14
4732,12,37,44,19,8
247,15,36,9,9,17
...,...,...,...,...,...
109,11,7,56,11,12
3139,16,1,39,9,13
6403,0,34,1,13,15
5414,29,40,36,16,7


In [271]:
y_test

Unnamed: 0,Full Away Goals
1480,0
5754,1
4011,1
4732,0
247,0
...,...
109,0
3139,1
6403,2
5414,1


In [272]:
model.predict(np.array([[42,16,42,14,8]]))

array([3])

In [273]:
y_test.shape

(1615, 1)

In [274]:
analysisData[features]

Unnamed: 0,HomeEncoded,AwayEncoded,RefereeEncoded,Home Shots,Away Shots
0,11,24,70,17,8
1,12,41,28,17,12
2,13,27,9,6,16
3,15,34,5,6,13
4,21,16,22,17,12
...,...,...,...,...,...
6456,28,12,40,16,6
6457,34,24,1,8,13
6458,37,35,2,26,8
6459,38,22,12,14,16
