In [1]:
%pylab inline
%matplotlib inline

# Global Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import os
import sys
import pickle
from pprint import pprint
from time import time
import datetime
from time import gmtime, strftime
import statsmodels.api as sm
from patsy import dmatrices

# Scikit-Learn imports
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression


from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import train_test_split

from sklearn.cross_validation import cross_val_score
from sklearn import metrics

# Local Imports
path = str(os.path.expanduser('~')) + '/git/predictEPL/config'
sys.path.append(path)
import paths

sys.path.append(paths.UTILS)
import useful_methods

Populating the interactive namespace from numpy and matplotlib


In [51]:
# Timestamp of this run in UTC, with the separating space swapped for "_".
utc_stamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
date_now = utc_stamp.replace(" ", "_")

# ***************************************************
# [Step 1]: Data Load
# ***************************************************

# Read Data
# csv_dic_df is a project helper from useful_methods.
# NOTE(review): presumably it returns a pandas DataFrame with one row per
# game -- confirm against the helper's source.
games_csv = paths.DATA_HOME + "EPL/all_game_emolex_counted.csv"
df = useful_methods.csv_dic_df(games_csv)

# Manipulations

# Outcome name -> integer code. These are the same codes Labeling returns;
# dic_res itself is not read by any of the visible cells.
dic_res = dict(home_win=1, away_win=0, draw=2)

def Labeling(goal_diff):
    """Turn a goal difference into an integer outcome code.

    Parameters
    ----------
    goal_diff : number
        Full-time home goals minus away goals.

    Returns
    -------
    int
        1 when goal_diff is positive (home win), 0 when it is negative
        (away win), and 2 in every other case (draw).
    """
    if goal_diff > 0:
        return 1
    if goal_diff < 0:
        return 0
    return 2

# Cast the full-time scores to int so they can be subtracted below.
for score_col in ('score_ft_home', 'score_ft_away'):
    df[score_col] = [int(goals) for goals in df[score_col]]

# The pn_* and emolex_* cells hold bracketed, comma-separated numbers as text
# (e.g. "[1009.0, 647.0]"): strip the surrounding brackets, split on commas
# and store each cell as a float numpy array.
for list_col in ('pn_home', 'pn_away', 'emolex_home', 'emolex_away'):
    df[list_col] = [
        np.array([float(token) for token in raw[1:-1].split(',')])
        for raw in list(df[list_col])
    ]

# Goal difference from the home side's point of view, and its outcome code
# ('home_win': 1, 'away_win': 0, 'draw': 2).
df['goal_diff'] = df['score_ft_home'] - df['score_ft_away']
df['result'] = list(map(Labeling, df['goal_diff']))

# Optional filter that removes draws and leaves a binary home/away problem.
# df = df[df.result != 2].copy().reset_index(drop=True)
df

Unnamed: 0,GW,away_team,date,emolex_away,emolex_home,home_team,pn_away,pn_home,score_ft_away,score_ft_home,score_ht_away,score_ht_home,time,goal_diff,result
0,5,Chelsea,Sat. 12 Sep.,"[1732.0, 1670.0, 1436.0, 1569.0, 1079.0, 2217....","[405.0, 287.0, 192.0, 322.0, 291.0, 484.0, 530...",Everton,"[3901.0, 3699.0]","[1009.0, 647.0]",1,3,1,2,11:45,2,1
1,5,Bournemouth,Sat. 12 Sep.,"[36.0, 36.0, 25.0, 22.0, 27.0, 50.0, 37.0, 37.0]","[67.0, 64.0, 22.0, 54.0, 122.0, 194.0, 122.0, ...",Norwich,"[112.0, 53.0]","[331.0, 100.0]",1,3,0,1,14:00,2,1
2,5,Swansea,Sat. 12 Sep.,"[24.0, 23.0, 10.0, 20.0, 22.0, 51.0, 33.0, 46.0]","[38.0, 22.0, 18.0, 20.0, 57.0, 66.0, 52.0, 58.0]",Watford,"[78.0, 36.0]","[98.0, 55.0]",0,1,0,0,14:00,1,1
3,5,Southampton,Sat. 12 Sep.,"[23.0, 28.0, 17.0, 23.0, 75.0, 65.0, 68.0, 67.0]","[56.0, 30.0, 38.0, 32.0, 43.0, 80.0, 48.0, 94.0]",WestBromwich,"[154.0, 62.0]","[116.0, 116.0]",0,0,0,0,14:00,0,2
4,5,Stoke,Sat. 12 Sep.,"[82.0, 59.0, 40.0, 47.0, 75.0, 130.0, 107.0, 8...","[603.0, 445.0, 421.0, 432.0, 1039.0, 1681.0, 1...",Arsenal,"[255.0, 113.0]","[2529.0, 938.0]",0,2,0,1,14:00,2,1
5,5,City,Sat. 12 Sep.,"[1029.0, 1358.0, 538.0, 805.0, 441.0, 378.0, 3...","[146.0, 113.0, 60.0, 80.0, 100.0, 190.0, 145.0...",Crystal,"[975.0, 1708.0]","[299.0, 225.0]",1,0,0,0,14:00,-1,0
6,5,Liverpool,Sat. 12 Sep.,"[1743.0, 1955.0, 1644.0, 1838.0, 1207.0, 2875....","[5228.0, 3026.0, 2403.0, 2447.0, 2705.0, 7167....",United,"[4881.0, 4464.0]","[11807.0, 9999.0]",1,3,0,0,16:30,2,1
7,5,Tottenham,Sun. 13 Sep.,"[379.0, 365.0, 248.0, 315.0, 297.0, 511.0, 421...","[158.0, 152.0, 87.0, 120.0, 205.0, 329.0, 218....",Sunderland,"[1117.0, 815.0]","[498.0, 330.0]",1,0,0,0,12:30,-1,0
8,5,Villa,Sun. 13 Sep.,"[195.0, 233.0, 203.0, 191.0, 384.0, 725.0, 881...","[70.0, 75.0, 74.0, 65.0, 99.0, 122.0, 99.0, 10...",Leicester,"[1550.0, 480.0]","[276.0, 163.0]",2,3,1,0,15:00,1,1
9,5,Newcastle,Mon. 14 Sep.,"[589.0, 697.0, 473.0, 643.0, 418.0, 919.0, 574...","[96.0, 92.0, 64.0, 68.0, 147.0, 401.0, 297.0, ...",WestHam,"[1560.0, 1563.0]","[655.0, 202.0]",0,2,0,1,19:00,2,1


In [52]:
# Optional filter that removes draws and leaves a binary home/away problem.
# NOTE(review): the recorded outputs of the validation-set cells further down
# contain only classes 0 and 1, so they appear to have been produced with this
# filter enabled -- confirm before comparing numbers across sections.
# df = df[df.result != 2].copy().reset_index(drop=True)
dta = pd.DataFrame()

# First entry of each pn_* array as a fraction of the array's total.
# NOTE(review): presumably pn_* is [positive, negative] tweet counts, which
# would make this the positive share -- confirm against the data source.
for side in ('home', 'away'):
    dta['pos_' + side] = [pn[0] / sum(pn) for pn in df['pn_' + side]]

# Home share minus away share.
dta['diff_pos'] = dta['pos_home'] - dta['pos_away']

# Half-time scores as integers.
for side in ('home', 'away'):
    dta['score_ht_' + side] = [int(goals) for goals in df['score_ht_' + side]]

# 'home_win': 1, 'away_win': 0, 'draw': 2
dta['result'] = df['result']
dta

Unnamed: 0,pos_home,pos_away,diff_pos,score_ht_home,score_ht_away,result
0,0.609300,0.513289,0.096010,2,1,1
1,0.767981,0.678788,0.089194,1,0,1
2,0.640523,0.684211,-0.043688,0,0,1
3,0.500000,0.712963,-0.212963,0,0,2
4,0.729449,0.692935,0.036514,1,0,1
5,0.570611,0.363399,0.207212,0,0,0
6,0.541456,0.522311,0.019145,0,0,1
7,0.601449,0.578157,0.023292,0,0,0
8,0.628702,0.763547,-0.134845,0,1,1
9,0.764294,0.499520,0.264774,1,0,1


In [53]:
# Mean of every feature per outcome code
# ('home_win': 1, 'away_win': 0, 'draw': 2).
by_result = dta.groupby('result')
by_result.mean()

Unnamed: 0_level_0,pos_home,pos_away,diff_pos,score_ht_home,score_ht_away
result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.594051,0.626635,-0.032584,0.375,0.833333
1,0.656051,0.557368,0.098683,1.121622,0.202703
2,0.608007,0.614516,-0.006509,0.565217,0.521739


## Prepare Data for Logistic Regression

In [55]:
# Patsy formula: outcome code on the left, the five features on the right.
# NOTE(review): diff_pos is defined as pos_home - pos_away, so it is an exact
# linear combination of the two preceding terms -- confirm that keeping all
# three is intended.
formula = 'result ~ \
    pos_home + pos_away + \
    diff_pos + \
    score_ht_home + score_ht_away'

y, X = dmatrices(formula, dta, return_type="dataframe")

# scikit-learn expects a 1-D target, so flatten the single-column y frame.
y = np.asarray(y).ravel()

print(X.columns)
X.head()

Index(['Intercept', 'pos_home', 'pos_away', 'diff_pos', 'score_ht_home',
       'score_ht_away'],
      dtype='object')


Unnamed: 0,Intercept,pos_home,pos_away,diff_pos,score_ht_home,score_ht_away
0,1,0.6093,0.513289,0.09601,2,1
1,1,0.767981,0.678788,0.089194,1,0
2,1,0.640523,0.684211,-0.043688,0,0
3,1,0.5,0.712963,-0.212963,0,0
4,1,0.729449,0.692935,0.036514,1,0


In [56]:
# Fit a multinomial logistic regression on the full design matrix
# (fit returns the estimator itself, so the call can be chained).
model = LogisticRegressionCV(multi_class='multinomial').fit(X, y)

# Accuracy on the very rows the model was fitted on.
train_accuracy = model.score(X, y)
print(train_accuracy)


# Accuracy under 10-fold cross-validation, using a fresh estimator per fold.
cv_estimator = LogisticRegressionCV(multi_class='multinomial')
scores = cross_val_score(cv_estimator, X, y, scoring='accuracy', cv=10)
print(scores)
print(scores.mean())

0.59880239521
[ 0.66666667  0.55555556  0.5         0.61111111  0.58823529  0.52941176
  0.625       0.6         0.33333333  0.46666667]
0.547598039216


In [58]:
# Coefficient table: one row per design-matrix column and one `score<i>`
# column per row of model.coef_.
#
# Iterating over model.coef_ instead of indexing rows 0, 1 and 2 keeps the
# cell working when the target is binary (coef_ then has a single row and
# model.coef_[1] would raise IndexError). With three classes the table is
# identical to the hard-coded version: score0, score1, score2.
#
# NOTE(review): the 'Intercept' row is the weight learned for patsy's constant
# column; the estimator's own bias term lives in model.intercept_ when
# fit_intercept is left at its default -- confirm which one is meant here.
coef = pd.DataFrame()
coef['var'] = X.columns
for class_pos, class_weights in enumerate(model.coef_):
    coef['score%d' % class_pos] = class_weights

coef

Unnamed: 0,var,score0,score1,score2
0,Intercept,-5.9e-05,-0.000112,0.000172
1,pos_home,0.049667,0.042654,-0.092321
2,pos_away,-0.306511,-0.067039,0.37355
3,diff_pos,0.356178,0.109693,-0.465871
4,score_ht_home,-0.960745,1.015704,-0.054959
5,score_ht_away,1.12452,-1.320847,0.196327


In [60]:
# Predictions for the rows the model was fitted on, then the confusion
# matrix and the per-class precision/recall/F1 report for those predictions.
predicted = model.predict(X)
train_confusion = metrics.confusion_matrix(y, predicted)
train_report = metrics.classification_report(y, predicted)
print(train_confusion)
print(train_report)

[[23 16  8]
 [ 2 64  8]
 [ 8 25 13]]
             precision    recall  f1-score   support

        0.0       0.70      0.49      0.57        47
        1.0       0.61      0.86      0.72        74
        2.0       0.45      0.28      0.35        46

avg / total       0.59      0.60      0.57       167



## Model Evaluation Using a Validation Set

In [26]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
)

# Default-parameter logistic regression fitted on the training part only.
model2 = LogisticRegression()
model2.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [27]:
# predict class labels for the test set
# This rebinds `predicted`, which an earlier cell used for the training-set
# predictions of `model`.
# NOTE(review): the recorded output holds only the labels 0 and 1, so it
# appears to come from a run in which draws (label 2) had been filtered out.
predicted = model2.predict(X_test)
print(predicted)

[ 1.  1.  1.  1.  0.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  0.  1.  1.  1.  0.  1.  0.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.
  1.]


In [28]:
# generate class probabilities
# One row per test sample; per the scikit-learn docs the columns follow the
# order of model2.classes_.
# NOTE(review): the recorded output has two columns, i.e. a binary target.
probs = model2.predict_proba(X_test)
print(probs)

[[ 0.42838278  0.57161722]
 [ 0.35605579  0.64394421]
 [ 0.45729187  0.54270813]
 [ 0.33739516  0.66260484]
 [ 0.52311483  0.47688517]
 [ 0.38505406  0.61494594]
 [ 0.511596    0.488404  ]
 [ 0.29387505  0.70612495]
 [ 0.35660616  0.64339384]
 [ 0.31029047  0.68970953]
 [ 0.36402715  0.63597285]
 [ 0.41242044  0.58757956]
 [ 0.46405293  0.53594707]
 [ 0.47402904  0.52597096]
 [ 0.35738598  0.64261402]
 [ 0.41391875  0.58608125]
 [ 0.35829015  0.64170985]
 [ 0.43987245  0.56012755]
 [ 0.3798032   0.6201968 ]
 [ 0.59890463  0.40109537]
 [ 0.34640377  0.65359623]
 [ 0.41623275  0.58376725]
 [ 0.45496113  0.54503887]
 [ 0.53926925  0.46073075]
 [ 0.35894869  0.64105131]
 [ 0.53070609  0.46929391]
 [ 0.31515686  0.68484314]
 [ 0.48426414  0.51573586]
 [ 0.51347575  0.48652425]
 [ 0.42579091  0.57420909]
 [ 0.31970059  0.68029941]
 [ 0.45299943  0.54700057]
 [ 0.43038888  0.56961112]
 [ 0.42089156  0.57910844]
 [ 0.43015197  0.56984803]
 [ 0.38263135  0.61736865]
 [ 0.42777012  0.57222988]]


In [29]:
# generate evaluation metrics
print(metrics.accuracy_score(y_test, predicted))

# ROC AUC of the home-win class (label 1) against every other outcome.
# Passing the raw labels together with probs[:, 1] only works for a binary
# target: roc_auc_score raises ValueError as soon as y_test also contains
# draws (label 2). Binarising the labels and looking the column up through
# model2.classes_ gives the same number in the binary case and still runs
# with three classes.
home_win_col = list(model2.classes_).index(1)
print(metrics.roc_auc_score(y_test == 1, probs[:, home_win_col]))

0.756756756757
0.772727272727


In [30]:
# Confusion matrix and per-class report for the held-out predictions.
test_confusion = metrics.confusion_matrix(y_test, predicted)
test_report = metrics.classification_report(y_test, predicted)
print(test_confusion)
print(test_report)

[[ 4  7]
 [ 2 24]]
             precision    recall  f1-score   support

        0.0       0.67      0.36      0.47        11
        1.0       0.77      0.92      0.84        26

avg / total       0.74      0.76      0.73        37



## Model Evaluation Using Cross-Validation

In [None]:
# Accuracy of a default LogisticRegression under 10-fold cross-validation.
# Note that this rebinds `scores` from the earlier LogisticRegressionCV cell.
cv_estimator = LogisticRegression()
scores = cross_val_score(cv_estimator, X, y, scoring='accuracy', cv=10)
print(scores)
print(scores.mean())