In [1]:
%pylab inline
%matplotlib inline

# Global Imports
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
import numpy as np
import pandas as pd
from sklearn import datasets
import os
import sys
import pickle
from pprint import pprint
from time import time
import datetime
from time import gmtime, strftime
import statsmodels.api as sm
from patsy import dmatrices

# Scikit-Learn imports
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import train_test_split

from sklearn.cross_validation import cross_val_score
from sklearn import metrics

# Local Imports
path = str(os.path.expanduser('~')) + '/git/predictEPL/config'
sys.path.append(path)
import paths

sys.path.append(paths.UTILS)
import useful_methods

Populating the interactive namespace from numpy and matplotlib


# 0. Data

In [2]:
def ReadEmolexDf():
    # Read game_infos as df
    dfGameInfo = useful_methods.csv_dic_df(paths.READ_PATH_GAME_INFO + 'game_infos.csv')
    dfGameInfo = useful_methods.DropNanGames(dfGameInfo).copy().reset_index(drop=True)
    dfGameInfo.GW = [int(gw) for gw in dfGameInfo.GW]
    dfGameInfo = dfGameInfo.sort_values(['GW', 'away_team'], ascending=[True, True]).copy().reset_index(drop=True)

    # Read Hash Emolex Model result
    df = useful_methods.csv_dic_df(paths.READ_PATH_RESULTS + RESULT_FILE_NAME)
    df.GW = [int(gw) for gw in df.GW]
    df = df.sort_values(['GW', 'away_team'], ascending=[True, True]).copy().reset_index(drop=True)

    df.emolex_home = [np.array([float(emo.strip()) for emo in emolex_home[1:-2].split('.')]) for emolex_home in list(df.emolex_home)]
    df.emolex_away = [np.array([float(emo.strip()) for emo in emolex_away[1:-2].split('.')]) for emolex_away in list(df.emolex_away)]


    # Combine 2 dfs
    df['score_ht_away'] = [int(item) for item in dfGameInfo.score_ht_away]
    df['score_ht_home'] = [int(item) for item in dfGameInfo.score_ht_home]
    df['score_ft_away'] = [int(item) for item in dfGameInfo.score_ft_away]
    df['score_ft_home'] = [int(item) for item in dfGameInfo.score_ft_home]

    # 'home_win': 1, 'away_win': 0, 'draw': 2
    def Labeling(goal_diff):
        if goal_diff > 0:
            return 1
        elif goal_diff < 0:
            return 0
        else:
            return 2

    df['goal_diff_ht'] = df.score_ht_home - df.score_ht_away
    df['goal_diff_ft'] = df.score_ft_home - df.score_ft_away
    df['result'] = [Labeling(item) for item in df.goal_diff_ft]


    # **********************************************
    # Add previous 4 games points sum
    dfBe4GameSum = useful_methods.csv_dic_df(paths.DATA_HOME + "EPL/info_before_4game_sum.csv")

    team_homes = list(df.home_team)
    team_aways = list(df.away_team)
    GWs = [int(gw) for gw in df.GW]

    be_4game_sum_home = []
    be_4game_sum_away = []

    for index in range(len(GWs)):
        team_home = team_homes[index]
        team_away = team_aways[index]
        gw = GWs[index]

        be_4game_sum_home.append(int(dfBe4GameSum[dfBe4GameSum.team == team_home][str(gw)]))
        be_4game_sum_away.append(int(dfBe4GameSum[dfBe4GameSum.team == team_away][str(gw)]))

    df['be_4game_sum_away'] = be_4game_sum_away
    df['be_4game_sum_home'] = be_4game_sum_home    

    return df

In [3]:
# Create df for models.
def CreateDfForModel(ht_draw=False, ft_wld=False):
    df = ReadEmolexDf()
        
    # only for Win or Lose
    if not ft_wld:
        df = df[df.result != 2].copy().reset_index(drop=True)
    
    # HT: Equal
    if ht_draw:
        df = df[df.goal_diff_ht == 0].copy().reset_index(drop=True)
    
    dta = pd.DataFrame()
    
    # Teams
    dta['team_home'] = df.home_team
    dta['team_away'] = df.away_team
    
    # HF scores
    dta['score_ht_home'] = df.score_ht_home
    dta['score_ht_away'] = df.score_ht_away
    
    dta['goal_diff_ht'] = df['goal_diff_ht']
    dta['goal_diff_ft'] = df['goal_diff_ft']

    # 'home_win': 1, 'away_win': 0, 'draw': 2
    dta['result'] = df.result
    
    # be_4game_sum
    dta['be_4game_sum_home'] = df.be_4game_sum_home / 12.0
    dta['be_4game_sum_away'] = df.be_4game_sum_away / 12.0
    
    # Emolex 8
    dta['anger_home'] = [emolex[0] / sum(emolex[:-2]) for emolex in df.emolex_home]
    dta['fear_home'] = [emolex[1] / sum(emolex[:-2]) for emolex in df.emolex_home]
    dta['disgust_home'] = [emolex[2] / sum(emolex[:-2]) for emolex in df.emolex_home]
    dta['sadness_home'] = [emolex[3] / sum(emolex[:-2])  for emolex in df.emolex_home]
    dta['surprise_home'] = [emolex[4] / sum(emolex[:-2])  for emolex in df.emolex_home]
    dta['trust_home'] = [emolex[5] / sum(emolex[:-2])  for emolex in df.emolex_home]
    dta['joy_home'] = [emolex[6] / sum(emolex[:-2])  for emolex in df.emolex_home]
    dta['anticipation_home'] = [emolex[7] / sum(emolex[:-2])  for emolex in df.emolex_home]
    dta['pos_home'] = [emolex[8] / sum(emolex[-2:])  for emolex in df.emolex_home]
    dta['neg_home'] = [emolex[9] / sum(emolex[-2:])  for emolex in df.emolex_home]

    dta['anger_away'] = [emolex[0] / sum(emolex[:-2])  for emolex in df.emolex_away]
    dta['fear_away'] = [emolex[1] / sum(emolex[:-2])  for emolex in df.emolex_away]
    dta['disgust_away'] = [emolex[2] / sum(emolex[:-2])  for emolex in df.emolex_away]
    dta['sadness_away'] = [emolex[3] / sum(emolex[:-2])  for emolex in df.emolex_away]
    dta['surprise_away'] = [emolex[4] / sum(emolex[:-2])  for emolex in df.emolex_away]
    dta['trust_away'] = [emolex[5] / sum(emolex[:-2])  for emolex in df.emolex_away]
    dta['joy_away'] = [emolex[6] / sum(emolex[:-2])  for emolex in df.emolex_away]
    dta['anticipation_away'] = [emolex[7] / sum(emolex[:-2])  for emolex in df.emolex_away]
    dta['pos_away'] = [emolex[8] / sum(emolex[-2:])  for emolex in df.emolex_away]
    dta['neg_away'] = [emolex[9] / sum(emolex[-2:])  for emolex in df.emolex_away]
    
    
    # Diffs
    dta['diff_anger'] = dta['anger_home'] - dta['anger_away']
    dta['diff_fear'] = dta['fear_home'] - dta['fear_away']
    dta['diff_disgust'] = dta['disgust_home'] - dta['disgust_away']
    dta['diff_sadness'] = dta['sadness_home'] - dta['sadness_away']
    dta['diff_surprise'] = dta['surprise_home'] - dta['surprise_away']
    dta['diff_trust'] = dta['trust_home'] - dta['trust_away']
    dta['diff_joy'] = dta['joy_home'] - dta['joy_away']
    dta['diff_anticipation'] = dta['anticipation_home'] - dta['anticipation_away']
    dta['diff_pos'] = dta['pos_home'] - dta['pos_away']
    dta['diff_neg'] = dta['neg_home'] - dta['neg_away']
    
    
    return dta

### ODDS data

In [4]:
def ReadOddsDf():
    # Read Scores
    dfOdds = useful_methods.OddsPortalDf()

    df = dfOdds.copy()
    drop_index = []

    # drop Nan Games
    for ith_row in range(len(df)):
        # Team names
        week = df.iloc[ith_row]['GW']
        team_home = df.iloc[ith_row]['team_home']
        team_away = df.iloc[ith_row]['team_away']

        isFile = os.path.isfile(
        "/Users/Bya/Dropbox/Research/datas/EPL/ExtractedCsvData/" + "GW" + str(int(week)) + "/SingleGames/" + \
        team_home + "_vs_" + team_away + ".csv")

        if not isFile:
            drop_index.append(ith_row)

    df = df.drop(df.index[drop_index]).copy().reset_index(drop=True)


    # str to int
    str_to_num_cols = ["score_ft_home", "score_ft_away", "odds_home", "odds_away", "odds_draw"]

    for col in str_to_num_cols:
        df[col] = [np.float(item) for item in df[col]]
    
    return df

In [5]:
def CreateDfForModelOdds(ft_wld=True):
    df = CreateDfForModel(ft_wld=ft_wld)
    dfOdds = ReadOddsDf()

    odds_homes = []
    odds_aways = []
    odds_draws = []

    for ith_row in range(len(df)):
        team_home = df.loc[ith_row]['team_home']
        team_away = df.loc[ith_row]['team_away']

        odds_home = dfOdds[(dfOdds.team_home == team_home) & (dfOdds.team_away == team_away)]['odds_home']
        odds_away = dfOdds[(dfOdds.team_home == team_home) & (dfOdds.team_away == team_away)]['odds_away']
        odds_draw = dfOdds[(dfOdds.team_home == team_home) & (dfOdds.team_away == team_away)]['odds_draw']

        odds_homes.append(np.float(odds_home))
        odds_aways.append(np.float(odds_away))
        odds_draws.append(np.float(odds_draw))

    df['odds_home'] = odds_homes
    df['odds_away'] = odds_aways
    df['odds_draw'] = odds_draws
    
    return df

# 1. Emolex Log Model: Each Team

In [37]:
# Single Team Accuracu Score and Report
def SingleTeamAccuracy(df, team):
    print("\n==============================")
    print(team, ":\n")
    
    dfTeam = df[(df.team_home == team) | (df.team_away == team)].copy().reset_index(drop=True)

    # Create X, y for model
    y, X = dmatrices(
        'result ~ \
        anger_home + fear_home + disgust_home + sadness_home + \
        surprise_home + trust_home + joy_home + anticipation_home + \
        anger_away + fear_away + disgust_away + sadness_away + \
        surprise_away + trust_away + joy_away + anticipation_away + \
        pos_home + neg_home + pos_away + neg_away + \
        diff_anger + diff_fear + diff_disgust + diff_sadness + diff_surprise + \
        diff_trust + diff_joy + diff_anticipation + diff_pos + diff_neg',
        dfTeam, return_type="dataframe")
    y = np.ravel(y)


    # prediction reports
    predictions = model.predict(X)
    print(metrics.confusion_matrix(y, predictions))
    print(metrics.classification_report(y, predictions))
    print(metrics.accuracy_score(y, predictions))
    accuracy_score = metrics.accuracy_score(y, predictions)

    return accuracy_score

In [40]:
# ==================================================
# Read Model
MODEL_EMOLEX_WL_LOG = "model_emolex_wld_log.pkl"
with open(paths.DATA_HOME + 'Models/' + MODEL_EMOLEX_WL_LOG, 'rb') as f:
    u = pickle._Unpickler(f)
    u.encoding = 'utf-8'
    model = u.load()
    

# ==================================================
# File Name
RESULT_FILE_NAME = "emolex_all_ht.csv"

# Load data as DF
df = CreateDfForModel(ft_wld=True)

# team names
teams = list(set(df.team_home))
teams.sort()

[SingleTeamAccuracy(df, team) for team in teams]


Arsenal :

[[2 3 0]
 [1 9 0]
 [1 2 0]]
             precision    recall  f1-score   support

        0.0       0.50      0.40      0.44         5
        1.0       0.64      0.90      0.75        10
        2.0       0.00      0.00      0.00         3

avg / total       0.50      0.61      0.54        18

0.611111111111

Bournemouth :

[[3 1 0]
 [0 8 0]
 [2 4 0]]
             precision    recall  f1-score   support

        0.0       0.60      0.75      0.67         4
        1.0       0.62      1.00      0.76         8
        2.0       0.00      0.00      0.00         6

avg / total       0.41      0.61      0.49        18

0.611111111111

Chelsea :

[[3 2 0]
 [0 6 0]
 [2 4 0]]
             precision    recall  f1-score   support

        0.0       0.60      0.60      0.60         5
        1.0       0.50      1.00      0.67         6
        2.0       0.00      0.00      0.00         6

avg / total       0.35      0.53      0.41        17

0.529411764706

City :

[[2 2 0]
 [1 9 0]


  'precision', 'predicted', average, warn_for)


[0.61111111111111116,
 0.61111111111111116,
 0.52941176470588236,
 0.61111111111111116,
 0.58823529411764708,
 0.5,
 0.35294117647058826,
 0.33333333333333331,
 0.61111111111111116,
 0.66666666666666663,
 0.61111111111111116,
 0.72222222222222221,
 0.55555555555555558,
 0.5,
 0.3888888888888889,
 0.47058823529411764,
 0.61111111111111116,
 0.625,
 0.5,
 0.44444444444444442]

# Multi-Class LOG

In [51]:
# Calculate Accuracy
# Log & SVM, cv = 18
def ModelAccuracy(X, y, penalty='l2', multi_class='ovr', log_variables=False):
    # Set Models
    model_log = LogisticRegression(penalty=penalty, multi_class=multi_class)

    # Fit to models
    model_log = model_log.fit(X, y)

    # Cross Validation
    scores_log = cross_val_score(model_log, X, y, scoring='accuracy', cv=18)

    # Accuracy scores
    print("--------------------------\n")
    print("[Log]: \t%.3f (cv: %.3f)" % (model_log.score(X, y), scores_log.mean()))


    # Variable Scores
    if log_variables:
        dfVars = pd.DataFrame()
        dfVars['var'] = X.columns
        dfVars['score'] = np.transpose(model_log.coef_)
        print("\n", dfVars)
    
    return model_log

In [48]:
df = CreateDfForModel(ft_wld=True)

# Create X, y for model
y, X = dmatrices(
    'result ~ \
    anger_home + fear_home + disgust_home + sadness_home + \
    surprise_home + trust_home + joy_home + anticipation_home + \
    anger_away + fear_away + disgust_away + sadness_away + \
    surprise_away + trust_away + joy_away + anticipation_away + \
    pos_home + neg_home + pos_away + neg_away + \
    diff_anger + diff_fear + diff_disgust + diff_sadness + diff_surprise + \
    diff_trust + diff_joy + diff_anticipation + diff_pos + diff_neg',
    df, return_type="dataframe")
y = np.ravel(y)


# prediction reports

## Grid Search

In [81]:
df = CreateDfForModel(ft_wld=True)

# Results
# 1: Win, 2:Lose, 3:Draw
print(df["result"].value_counts())
print(df["result"].value_counts() / df['result'].count())

1    76
0    52
2    49
Name: result, dtype: int64
1    0.429379
0    0.293785
2    0.276836
Name: result, dtype: float64


In [161]:
####################################
# WL, WD, LD
dfBin = df[(df.result != 2)].copy().reset_index(drop=True)


# 2. Train Classifier

# Create X, y for model
y, X = dmatrices(
    'result ~ \
    anger_home + fear_home + disgust_home + sadness_home + \
    surprise_home + trust_home + joy_home + anticipation_home + \
    anger_away + fear_away + disgust_away + sadness_away + \
    surprise_away + trust_away + joy_away + anticipation_away + \
    pos_home + neg_home + pos_away + neg_away + \
    diff_anger + diff_fear + diff_disgust + diff_sadness + diff_surprise + \
    diff_trust + diff_joy + diff_anticipation + diff_pos + diff_neg',
    dfBin, return_type="dataframe")
y = np.ravel(y)

In [162]:
# =============================================================
# =============================================================

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)


# Pipelines
pipeline = Pipeline([
        ('clf', LogisticRegression(penalty='l2'))
    ])


# Hyper Parameters
parameters = {
    'clf__C': (0.1, 1, 10),
}


# Grid Search
grid_search = GridSearchCV(
    pipeline,
    parameters,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy',
    cv=18
)



# =============================================================
# =============================================================

# Computing
# grid_search.fit(X_train, y_train)
grid_search.fit(X_train, y_train)


print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')

best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))
    
    

# =============================================================
# =============================================================

predictions = grid_search.predict(X_test)
print("------------------------------")
print("TEST:\n")
print('\n\nAccuracy:', metrics.accuracy_score(y_test, predictions))
print('\n\nConfusion Matrix:\n', metrics.confusion_matrix(y_test, predictions))
print('\n\nClassification Report:\n', metrics.classification_report(y_test, predictions))

Fitting 18 folds for each of 3 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    0.2s finished


Best score: 0.755
Best parameters set:
	clf__C: 1
------------------------------
TEST:



Accuracy: 0.615384615385


Confusion Matrix:
 [[ 2  8]
 [ 2 14]]


Classification Report:
              precision    recall  f1-score   support

        0.0       0.50      0.20      0.29        10
        1.0       0.64      0.88      0.74        16

avg / total       0.58      0.62      0.56        26



### Win Lose Model => Draw Games

In [186]:
dfDraw = df[df.result == 2].copy().reset_index(drop=True)

# 2. Train Classifier

# Create X, y for model
y_draw, X_draw = dmatrices(
    'result ~ \
    anger_home + fear_home + disgust_home + sadness_home + \
    surprise_home + trust_home + joy_home + anticipation_home + \
    anger_away + fear_away + disgust_away + sadness_away + \
    surprise_away + trust_away + joy_away + anticipation_away + \
    pos_home + neg_home + pos_away + neg_away + \
    diff_anger + diff_fear + diff_disgust + diff_sadness + diff_surprise + \
    diff_trust + diff_joy + diff_anticipation + diff_pos + diff_neg',
    dfDraw, return_type="dataframe")
y_draw = np.ravel(y_draw)


predictions_draw = grid_search.predict_proba(X_draw)

pre_0 = np.array([item[0] for item in predictions_draw])
pre_1 = np.array([item[1] for item in predictions_draw])

print(pre_0.mean(), pre_1.mean())

0.442713419434 0.557286580566


In [187]:
y_wl, X_wl = dmatrices(
    'result ~ \
    anger_home + fear_home + disgust_home + sadness_home + \
    surprise_home + trust_home + joy_home + anticipation_home + \
    anger_away + fear_away + disgust_away + sadness_away + \
    surprise_away + trust_away + joy_away + anticipation_away + \
    pos_home + neg_home + pos_away + neg_away + \
    diff_anger + diff_fear + diff_disgust + diff_sadness + diff_surprise + \
    diff_trust + diff_joy + diff_anticipation + diff_pos + diff_neg',
    dfBin, return_type="dataframe")
y_wl = np.ravel(y_wl)


predictions_wl = grid_search.predict_proba(X_wl)

pre_0 = np.array([item[0] for item in predictions_wl])
pre_1 = np.array([item[1] for item in predictions_wl])

print(pre_0.mean(), pre_1.mean())

0.40732972387 0.59267027613
