In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functions import *
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC

In [2]:
%%time
path = '/Users/allanbellahsene/Desktop/PROJECT_ALLAN/data/PREMIER_LEAGUE/PL_'
years = ['2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010',
         '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']
data = import_data(init_path=path, years=years)

CPU times: user 254 ms, sys: 23.9 ms, total: 278 ms
Wall time: 303 ms


The variable data is a list, and each element of this list corresponds to the pandas dataframe that captures the data of one Premier League season. For instance, the first element of the list corresponds to the data of the 2000-2001 Premier League season.

In [3]:
data[0].head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,IWA,LBH,LBD,LBA,SBH,SBD,SBA,WHH,WHD,WHA
0,E0,19/08/00,Charlton,Man City,4,0,H,2,0,H,...,2.7,2.2,3.25,2.75,2.2,3.25,2.88,2.1,3.2,3.1
1,E0,19/08/00,Chelsea,West Ham,4,2,H,1,0,H,...,4.2,1.5,3.4,6.0,1.5,3.6,6.0,1.44,3.6,6.5
2,E0,19/08/00,Coventry,Middlesbrough,1,3,A,1,1,D,...,2.7,2.25,3.2,2.75,2.3,3.2,2.75,2.3,3.2,2.62
3,E0,19/08/00,Derby,Southampton,2,2,D,1,2,A,...,3.5,2.2,3.25,2.75,2.05,3.2,3.2,2.0,3.2,3.2
4,E0,19/08/00,Leeds,Everton,2,0,H,2,0,H,...,4.5,1.55,3.5,5.0,1.57,3.6,5.0,1.61,3.5,4.5


In [4]:
def merge_list(data):
    df = pd.merge(data[0], data[1], 'outer')
    for i in range(2, len(data)):
        df = pd.merge(df, data[i], 'outer')
    return df

In [5]:
df = merge_list(data)

In [6]:
df.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbAvAHA,BSH,BSD,BSA,PSH,PSD,PSA,PSCH,PSCD,PSCA
0,E0,19/08/00,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,...,,,,,,,,,,
1,E0,19/08/00,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,...,,,,,,,,,,
2,E0,19/08/00,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,...,,,,,,,,,,
3,E0,19/08/00,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,...,,,,,,,,,,
4,E0,19/08/00,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,...,,,,,,,,,,


Key to results data:

- Div = League Division
- Date = Match Date (dd/mm/yy)
- Time = Time of match kick off
- HomeTeam = Home Team
- AwayTeam = Away Team
- FTHG and HG = Full Time Home Team Goals
- FTAG and AG = Full Time Away Team Goals
- FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)
- HTHG = Half Time Home Team Goals
- HTAG = Half Time Away Team Goals
- HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)

Match Statistics (where available)
- Attendance = Crowd Attendance
- Referee = Match Referee
- HS = Home Team Shots
- AS = Away Team Shots
- HST = Home Team Shots on Target
- AST = Away Team Shots on Target
- HHW = Home Team Hit Woodwork
- AHW = Away Team Hit Woodwork
- HC = Home Team Corners
- AC = Away Team Corners
- HF = Home Team Fouls Committed
- AF = Away Team Fouls Committed
- HFKC = Home Team Free Kicks Conceded
- AFKC = Away Team Free Kicks Conceded
- HO = Home Team Offsides
- AO = Away Team Offsides
- HY = Home Team Yellow Cards
- AY = Away Team Yellow Cards
- HR = Home Team Red Cards
- AR = Away Team Red Cards
- HBP = Home Team Bookings Points (10 = yellow, 25 = red)
- ABP = Away Team Bookings Points (10 = yellow, 25 = red)

Key to 1X2 (match) betting odds data:

- B365H = Bet365 home win odds
- B365D = Bet365 draw odds
- B365A = Bet365 away win odds
- BSH = Blue Square home win odds
- BSD = Blue Square draw odds
- BSA = Blue Square away win odds
- BWH = Bet&Win home win odds
- BWD = Bet&Win draw odds
- BWA = Bet&Win away win odds
- GBH = Gamebookers home win odds
- GBD = Gamebookers draw odds
- GBA = Gamebookers away win odds
- IWH = Interwetten home win odds
- IWD = Interwetten draw odds
- IWA = Interwetten away win odds
- LBH = Ladbrokes home win odds
- LBD = Ladbrokes draw odds
- LBA = Ladbrokes away win odds
- PSH and PH = Pinnacle home win odds
- PSD and PD = Pinnacle draw odds
- PSA and PA = Pinnacle away win odds
- SOH = Sporting Odds home win odds
- SOD = Sporting Odds draw odds
- SOA = Sporting Odds away win odds
- SBH = Sportingbet home win odds
- SBD = Sportingbet draw odds
- SBA = Sportingbet away win odds
- SJH = Stan James home win odds
- SJD = Stan James draw odds
- SJA = Stan James away win odds
- SYH = Stanleybet home win odds
- SYD = Stanleybet draw odds
- SYA = Stanleybet away win odds
- VCH = VC Bet home win odds
- VCD = VC Bet draw odds
- VCA = VC Bet away win odds
- WHH = William Hill home win odds
- WHD = William Hill draw odds
- WHA = William Hill away win odds

In [7]:
df.loc[df.FTR == "H", "FTR"] = 1 #Replace nominal target variables by numbers
df.loc[df.FTR == "D", "FTR"] = 0
df.loc[df.FTR == "A", "FTR"] = 2

In [8]:
df.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbAvAHA,BSH,BSD,BSA,PSH,PSD,PSA,PSCH,PSCD,PSCA
0,E0,19/08/00,Charlton,Man City,4.0,0.0,1,2.0,0.0,H,...,,,,,,,,,,
1,E0,19/08/00,Chelsea,West Ham,4.0,2.0,1,1.0,0.0,H,...,,,,,,,,,,
2,E0,19/08/00,Coventry,Middlesbrough,1.0,3.0,2,1.0,1.0,D,...,,,,,,,,,,
3,E0,19/08/00,Derby,Southampton,2.0,2.0,0,1.0,2.0,A,...,,,,,,,,,,
4,E0,19/08/00,Leeds,Everton,2.0,0.0,1,2.0,0.0,H,...,,,,,,,,,,


In [9]:
[n_observations, n_features] = df.shape
start_date = df['Date'][0]
final_date = df['Date'][len(df)-1]
print('This dataset contains data for ' + str(n_observations), 'Premier League games, between ' + str(start_date), 'and ' + str(final_date), ', with a total of ' + str(n_features), 
      'variables.')

This dataset contains data for 7221 Premier League games, between 19/08/00 and 12/05/2019 , with a total of 103 variables.


In [10]:
print(df.isnull().sum())

Div              1
Date             1
HomeTeam         1
AwayTeam         1
FTHG             1
FTAG             1
FTR              1
HTHG             1
HTAG             1
HTR              1
Attendance    6462
Referee          1
HS               1
AS               1
HST              1
AST              1
HHW           6461
AHW           6461
HC               1
AC               1
HF               1
AF               1
HO            6461
AO            6461
HY               1
AY               1
HR               1
AR               1
HBP           6461
ABP           6461
              ... 
VCH           1901
VCD           1901
VCA           1901
Bb1X2         1901
BbMxH         1901
BbAvH         1901
BbMxD         1901
BbAvD         1901
BbMxA         1901
BbAvA         1901
BbOU          1901
BbMx>2.5      1901
BbAv>2.5      1901
BbMx<2.5      1901
BbAv<2.5      1901
BbAH          1911
BbAHh         1911
BbMxAHH       1911
BbAvAHH       1911
BbMxAHA       1911
BbAvAHA       1911
BSH         

In [11]:
df[['B365H', 'B365D', 'B365A', 'BSH', 'BSD', 'BSA', 'BWH', 'BWD', 'BWA', 'GBH', 'GBD', 'GBA'
   , 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'SBH', 'SBD', 'SBA', 'SJH', 'SJD', 'SJA', 'VCH', 'VCD', 'VCA'
   , 'SOH', 'SOD', 'SOA', 'SYH', 'SYD', 'SYA', 'LBH', 'LBD', 'LBA', 'IWH', 'IWD', 'IWA']].isnull().sum()

B365H     761
B365D     761
B365A     761
BSH      4941
BSD      4941
BSA      4941
BWH      1522
BWD      1522
BWA      1522
GBH      2365
GBD      2365
GBA      2365
PSH      4561
PSD      4561
PSA      4561
WHH       388
WHD       388
WHA       388
SBH      2717
SBD      2717
SBA      2717
SJH      3802
SJD      3802
SJA      3802
VCH      1901
VCD      1901
VCA      1901
SOH      6461
SOD      6461
SOA      6461
SYH      6848
SYD      6848
SYA      6848
LBH       443
LBD       443
LBA       443
IWH        19
IWD        19
IWA        19
dtype: int64

In [12]:
#For now, we select the features to keep manually, as they are the ones of interest for us (for now)
df = df[['Date', 'HomeTeam', 'AwayTeam', 'FTR', 'FTHG', 'FTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC'
          , 'HF', 'AF', 'IWH', 'IWD', 'IWA']]

In [13]:
df.isnull().sum()

Date         1
HomeTeam     1
AwayTeam     1
FTR          1
FTHG         1
FTAG         1
HS           1
AS           1
HST          1
AST          1
HC           1
AC           1
HF           1
AF           1
IWH         19
IWD         19
IWA         19
dtype: int64

In [14]:
df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG,HS,AS,HST,AST,HC,AC,HF,AF,IWH,IWD,IWA
0,19/08/00,Charlton,Man City,1,4.0,0.0,17.0,8.0,14.0,4.0,6.0,6.0,13.0,12.0,2.2,2.9,2.7
1,19/08/00,Chelsea,West Ham,1,4.0,2.0,17.0,12.0,10.0,5.0,7.0,7.0,19.0,14.0,1.6,3.2,4.2
2,19/08/00,Coventry,Middlesbrough,2,1.0,3.0,6.0,16.0,3.0,9.0,8.0,4.0,15.0,21.0,2.2,2.9,2.7
3,19/08/00,Derby,Southampton,0,2.0,2.0,6.0,13.0,4.0,6.0,5.0,8.0,11.0,13.0,1.8,3.0,3.5
4,19/08/00,Leeds,Everton,1,2.0,0.0,17.0,12.0,8.0,6.0,6.0,4.0,21.0,20.0,1.55,3.3,4.5


In [15]:
# Drop row with nan
df.dropna(axis=0,how='any',inplace=True)

df.isnull().sum().sum()

0

In [16]:
df.reset_index(inplace=True)
df = df.drop(['index'], axis=1)

In [18]:
#We create a new date column which is in a data format so we can use later on logical operations with dates.
%%time
dates = []
for i in range(len(df)):
    date = df['Date'].iloc[i]
    try:   
        date = datetime.strptime(date, '%d/%m/%y')
    except:
        date = datetime.strptime(date, '%d/%m/%Y')   
    dates.append(date)
dates = pd.DataFrame(dates, columns=['Date'])
df['date'] = dates

UsageError: Line magic function `%%time` not found.


In [19]:
df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG,HS,AS,HST,AST,HC,AC,HF,AF,IWH,IWD,IWA
0,19/08/00,Charlton,Man City,1,4.0,0.0,17.0,8.0,14.0,4.0,6.0,6.0,13.0,12.0,2.2,2.9,2.7
1,19/08/00,Chelsea,West Ham,1,4.0,2.0,17.0,12.0,10.0,5.0,7.0,7.0,19.0,14.0,1.6,3.2,4.2
2,19/08/00,Coventry,Middlesbrough,2,1.0,3.0,6.0,16.0,3.0,9.0,8.0,4.0,15.0,21.0,2.2,2.9,2.7
3,19/08/00,Derby,Southampton,0,2.0,2.0,6.0,13.0,4.0,6.0,5.0,8.0,11.0,13.0,1.8,3.0,3.5
4,19/08/00,Leeds,Everton,1,2.0,0.0,17.0,12.0,8.0,6.0,6.0,4.0,21.0,20.0,1.55,3.3,4.5


In [20]:
# Feature with more than 10% of NaN
#NaN_feature = df.isnull().sum()[df.isnull().sum()>0.10*len(df)].index

# Drop them
#df.drop(columns = NaN_feature, inplace=True)

In [21]:
df.dtypes

Date         object
HomeTeam     object
AwayTeam     object
FTR          object
FTHG        float64
FTAG        float64
HS          float64
AS          float64
HST         float64
AST         float64
HC          float64
AC          float64
HF          float64
AF          float64
IWH         float64
IWD         float64
IWA         float64
dtype: object

# Feature Engineering

In this section, we will create new features using the ones we already have. The features we will create are:
- $Performance_{t,i}$ : the performance of team $i$ at time $t$ takes into account all relevant statistics of team $t$ for the game played at time $t$. It is computed as follows: $Performance_{t,i} = Attack_{t,i} - Defense_{t,i} + GameOutcome_{t,i}$
- $Attack_{t,i}$: the attack of team $i$ at time $t$ is computed as a linear function of the following features, for team $i$ during the game that occured at time $t$: number of goals scored, number of shots on target, number of shots, number of corners, number of fools obtained. Hence, we have: 
$Attack_{t,i} = \beta_1 Goals_{t,i} + \beta_2 TargetShots_{t,i} + \beta_3 TotalShots_{t,i} + \beta_4 Corners_{t,i} + \beta_5 FoulsObtained_{t,i} $ All coefficients are first determined arbitrarily (but one can imagine performing a hyperparameter selection), but we believe that an order between coefficients must always be respected to take into account the difference of importance between each statistics. For example, Goals are more important than Shots, which are more important than Corners, etc. Hence, the following order must be respected: $\beta_1 > \beta_2 > \beta_3 > \beta_4 > \beta_5$
- $Defense_{t,i}$: the defense of team $i$ at time $t$ is also computed as a linear function of the same statistics, except those are the ones of the adversary team. The formula is just exactly the same as for the attack except all statistics (goals, shots, etc.) are the ones of the opponent team.
- $GameOutcome_{t,i}$ = +10 for a Win, 0 for a Draw, and -10 for a Loss.

The other feature that we would like to create is in fact a function of the previous feature mentioned just above. We call this feature Momentum.

- $Momentum_{t,i}$: the momentum of team $i$ at date $t$ is a function of a team's performances before date $t$. The goal of this feature is to capture the momentum of a team at a certain date, which we believe will have an impact on the game played at time $t$. For example, if a team won all of its 5 previous games, it is more likely to win the next than if the team had really bad performances before the game. Also, one parameter to take into account is the fact that the more recent the performances, the more weight should be given to them to capture that momentum. This is why the feature Momentum is nothing else but the Exponential Weighted Moving Average of a team's perfomances: the more recent the performance, the more importance should be given to it.
- Hence, we have that: $Momentum_{t,i} = \frac{ Performance_{t-1,i} + (1-\gamma)Performance_{t-2,i} + (1-\gamma)^2 Performance_{t-3,i} + ...} {1 + (1-\gamma) + (1 - \gamma)^2 + ...}$ where the parameter $\gamma$ captures the weight given to each performance, which decreases exponentially as the observations become far in time. The higher $\gamma$, the higher the exponential decay.

The final feature we create is called the F2F score. F2F stands for Face-to-Face. This feature aims at capturing the balance of power between two teams. Put simply, if two teams are going to have a match today, if team 1 has most of the time performed better than team 2 during their past confrontations, then this should be taken into account to predict the outcome of the next game between the two teams (this will more likely positively impact the odds of team 1 beating team 2). Hence, the F2F score will be computed as the EWMA of the past games performances between the two teams. Again, we use EWMA to give more weight to the most recent games.

In [25]:
def bookmaker_accuracy(df):
    """
    This function returns the accuracy of the bookmaker Interwetten (the one for which we have the most data).
    To compute the accuracy, we consider that for each game, the lowest odd proposed by the bookmaker corresponds
    to the most probable outcome according to the bookmaker. Hence, we compare the realized outcome with 
    the outcome implied by the lowest odd proposed by the bookmaker. The accuracy is then simply computed as 
    the number of correct outcomes divided by the number of total observations.
    """
    good_outcomes = []
    for i in range(len(df)):
        count = 0
        outcome = df['FTR'].iloc[i]
        min_odd = np.argmin(df[['IWH', 'IWD', 'IWA']].iloc[i])
        if min_odd == 'IWH' and outcome == 1 or min_odd == 'IWD' and outcome == 0 or min_odd == 'IWA' and outcome == 2:
            count = count + 1
        else:
            count = count
        good_outcomes.append(count)
    s = sum(good_outcomes)
    accuracy = s / len(df)
    
    return accuracy   

In [26]:
def choose_team(df, team, date):
    """
    Output: returns all the data of a certain team available at time t-1 to predict a game a time t.
    """
    return df.loc[((df['date'] < date) & (df['HomeTeam'] == team)) | (df['date'] < date) & (df['AwayTeam'] == team)] 

In [27]:
def choose_hometeam(data, team):
    """
    Returns all available home games data for a certain team.
    """
    return data.loc[(data['HomeTeam'] == team)]

In [28]:
def choose_awayteam(data, team):
    """
    Returns all available away games data for a certain team.
    """
    return data.loc[(data['AwayTeam'] == team)]

In [29]:
def choose_game(data, date, hometeam, awayteam):
    """
    Returns the data of a specific game, with a certain home team, a certain away team, at a given date.
    """
    return data.loc[(data['Date'] == date) & (data['HomeTeam'] == hometeam) & (data['AwayTeam'] == awayteam)]

In [30]:
def find_games(data, team1, team2):
    """
        Returns all available games data between team1 and team2.
    """
    return data.loc[(data['HomeTeam'] == team1) & (data['AwayTeam'] == team2) | (data['HomeTeam'] == team2) & (data['AwayTeam'] == team1) ]

In [31]:
def F2F(data, team1, team2, date):
    """
    Returns all available historical data of games between two teams that occured before a certain data.
    """
    return data.loc[(data['date'] < date) & (data['HomeTeam'] == team1) & (data['AwayTeam'] == team2) | (data['date'] < date) & (data['HomeTeam'] == team2) & (data['AwayTeam'] == team1)]
    

In [32]:
def compute_perf(team, game): 
    
    """Output: performance of a certain team at a certain game.
    The performance is calculated as follows: 
    perf = attack_perf - defense_perf + FTR
    FTR = +10 for a Win, 0 for a Deuce, -10 for a Loss
    attack_perf = Nb of goals scored * coef1 + Nb of shots * coef2 + Nb of on target shots * coef3 +
    ... Nb of corners * coef4 + Nb of fools conceeded * coef5
    defense_perf = - Nb of goals conceeded * coef1 -  Nb of shots conceed * coef2 - ... etc.
    
    All coefs are arbitrarily chosen in the code of the function below. One might decide performing hyperparemeter 
    optimization to chose them.
    
    Input: team is a string, game is an observation (row) of the initial DF.
    
    """
    
    w_bonus = 10 #+10 for a Win
    d_bonus = 0 #0 for a Draw
    l_malus = - w_bonus #-10 for a Loss
    g_coef = 0.5 * w_bonus #Coefficient for the number of goals
    ts_coef = 0.1 * g_coef #... for the nb of target shots
    s_coef = 0.5 * ts_coef #... for the nb of shots
    c_coef = 0.1 * s_coef #... for the nb of corners
    f_coef = 0.5 * c_coef #... for the nb of fouls
    
    if game['HomeTeam'] == team:
        attack = game['FTHG'] * g_coef + game['HS'] * s_coef + game['HST'] * ts_coef + game['HC'] * c_coef + game['AF'] * f_coef
        defense = - game['FTAG'] * g_coef - game['AS'] * s_coef - game['AST'] * ts_coef + game['AC'] * c_coef - game['HF'] * f_coef
        diff = attack + defense
        if game['FTR'] == 'H':
            global_perf = w_bonus + diff
        elif game['FTR'] == 'A':
            global_perf = l_malus + diff
        else:
            global_perf = diff
        
    if game['AwayTeam'] == team:
        attack = game['FTAG'] * g_coef + game['AS'] * s_coef + game['AST'] * ts_coef + game['AC'] * c_coef + game['HF'] * f_coef
        defense = -game['FTHG'] * g_coef - game['HS'] * s_coef - game['HST'] * ts_coef - game['HC'] * c_coef - game['AF'] * f_coef
        diff = attack + defense
        if game['FTR'] == 'A':
            global_perf = w_bonus + diff
        elif game['FTR'] == 'H':
            global_perf = l_malus + diff
        else:
            global_perf = diff

    return global_perf

In [33]:
def ewma_momentum(data, team, date, gamma):
    """Inputs: 
    data: Initial Pandas dataframe containing all observations and features about games
    team: Team to evaluate momentum for 
    date: Date at which the momentum of the desired team is computed
    
    Output: The Exponential Weighted Moving Average (EWMA) of all the team's performances before the desired date.
    EWMA gives more weight to more recent recent observations, and the weight decreases exponentially as 
    the observations become far in time. The higher the coefficient gamma, the higher the exponential decay.
    
    """
    subdata = choose_team(data, team, date) #all data available at t-1 to use to predict outcome at date t
    perfs=[]
    for i in range(len(subdata)):
        previous_game = subdata.iloc[i] #compute performance for all games before game at date t
        perfs.append(compute_perf(team, previous_game)) #stores performances of all games that happened until t-1
    
    #gamma = 0.01
    n = []
    d = []
    perf = [i for i in reversed(perfs)] #Now, perform an EWMA. To do so, we need to reverse the list, because
    # we go from the most recent observation (i.e. game) to the earliest one. 
    
    for i in range(len(perf)):
        #Apply EWMA formula
        coef = (1 - gamma)**i
        nominator = perf[i] * coef
        n.append(nominator)
        denominator = coef
        d.append(denominator)
    momentum = sum(n) / sum(d)
    
    return momentum

In [34]:
def F2F_score(data, team1, team2, date, alpha):
    
    """
    The goal of this function is to calculate a Face-to-Face (F2F) "score" between two teams.
    Meaning, taking in consideration all historical confrontations available in the data between two given teams,
    we would like to compute a metric that tells us which of the two teams has globally dominated the other one
    during their past confrontations. Again, we believe it makes sense to use the concept of EWMA,
    to give more importance to the most recent confrontations. The higher the parameter alpha, the less importance
    is given to observations far in the past.
    
    """
    
    subdata = F2F(data, team1, team2, date)
    perfs1=[]
    perfs2=[]
    for i in range(len(subdata)):
        previous_game = subdata.iloc[i]
        perfs1.append(compute_perf(team1, previous_game))
        perfs2.append(compute_perf(team2, previous_game))
    
    #alpha = 0.1
    n = []
    d = []
    perf1 = [i for i in reversed(perfs1)]
    for i in range(len(perf1)):
        coef = (1 - alpha)**i
        nominator = perf1[i] * coef
        n.append(nominator)
        denominator = coef
        d.append(denominator)
    momentum1 = sum(n) / sum(d)
    
    n = []
    d = []
    perf2 = [j for j in reversed(perfs2)]
    for j in range(len(perf2)):
        coef = (1 - alpha)**j
        nominator = perf2[j] * coef
        n.append(nominator)
        denominator = coef
        d.append(denominator)
    momentum2 = sum(n) / sum(d)
        
    return momentum1, momentum2

In [35]:
def ML_algo(df, algo, train_set_length):
    """
    This function returns the train accuracy and the test accuracy for a given ML algo (for now, only 
    Logistic Reg. or SVM).
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import normalize
    from sklearn.svm import LinearSVC
    
    Y = np.array(df.FTR)
    Y=Y.astype('int')
    train_set = int(train_set_length*len(Y))
    Y_train = Y[:train_set]
    x = np.array(df[['Home Momentum', 'Away Momentum', 'F2F Home Score', 'F2F Away Score', 'IWH', 'IWD', 'IWA']]) #Features selection
    x = normalize(x) #We normalize our features
    x_train = x[:train_set,:]
    model = algo(random_state=0).fit(x_train, Y_train)
    Y_test = Y[train_set+1:]
    x_test = x[train_set+1:,:]
    train_accuracy = model.score(x_train, Y_train)
    test_accuracy = model.score(x_test, Y_test)
    return train_accuracy, test_accuracy, model

In [36]:
#First, we fill our new features columns with NaN values.
df['Home Momentum'] = np.nan
df['Away Momentum'] = np.nan
df['F2F Home Score'] = np.nan
df['F2F Away Score'] = np.nan

In [37]:
%%time
gamma = 0.01 #We can later on use Hyperparameter selection to find the optimal gamma and alpha.
alpha = 0.2
k = 100
for i in range(k, len(df)):
    try:
        date = df['date'].iloc[i]
        hometeam = df['HomeTeam'].iloc[i]
        awayteam = df['AwayTeam'].iloc[i]
        df['Home Momentum'].iloc[i] = ewma_momentum(df, hometeam, date, gamma)
        df['Away Momentum'].iloc[i] = ewma_momentum(df, awayteam, date, gamma)
        df['F2F Home Score'].iloc[i] = F2F_score(df, hometeam, awayteam, date, alpha)[0]
        df['F2F Away Score'].iloc[i] = F2F_score(df, hometeam, awayteam, date, alpha)[1]
    except:
        pass
df1  = df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


CPU times: user 44min 15s, sys: 12.1 s, total: 44min 27s
Wall time: 44min 41s


In [38]:
train_set_length = 0.75 #For now we use 75% of the whole date as the train set,
#but we could later on perform Cross validation
algorithm = LogisticRegression
train_acc, test_acc, model1 = ML_algo(df1, algorithm, train_set_length)
print('Train accuracy: ' + str(train_acc), 'Test accuracy: ' + str(test_acc))

Train accuracy: 0.5413672757134059 Test accuracy: 0.5539125077017868




In [39]:
algorithm = LinearSVC
train_acc, test_acc, model2 = ML_algo(df1, algorithm, train_set_length)
print('Train accuracy: ' + str(train_acc), 'Test accuracy: ' + str(test_acc))

Train accuracy: 0.5421884623280641 Test accuracy: 0.5545286506469501


In [40]:
bookmaker_accuracy(df1)

The current behaviour of 'Series.argmin' is deprecated, use 'idxmin'
instead.
The behavior of 'argmin' will be corrected to return the positional
minimum in the future. For now, use 'series.values.argmin' or
'np.argmin(np.array(values))' to get the position of the minimum
row.
  return bound(*args, **kwds)


0.5461123941493456