In [1]:
%pylab inline
%matplotlib inline

# Global Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import os
import sys
import pickle
from pprint import pprint
from time import time
import datetime
from time import gmtime, strftime
import statsmodels.api as sm
from patsy import dmatrices

# Scikit-Learn imports
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import train_test_split

from sklearn.cross_validation import cross_val_score
from sklearn import metrics

# Local Imports
path = str(os.path.expanduser('~')) + '/git/predictEPL/config'
sys.path.append(path)
import paths

sys.path.append(paths.UTILS)
import useful_methods

Populating the interactive namespace from numpy and matplotlib


## Data Manipulation Functions

In [2]:
# Read Data: all_game_emolex_counted
def ReadEmoleDf():
    """Load the EmoLex count table and add goal-difference / result columns.

    Reads ``EPL/all_game_emolex_counted_nonretweet.csv`` below
    ``paths.DATA_HOME`` through ``useful_methods.csv_dic_df`` and then:

    * casts ``score_ft_home`` / ``score_ft_away`` to ``int``;
    * turns each ``pn_*`` / ``emolex_*`` cell (a delimited, comma-separated
      string) into a float ``np.array``;
    * adds ``goal_diff`` (home minus away full-time score);
    * adds ``result``: 1 = home win, 0 = away win, 2 = draw.

    Returns the augmented frame.
    """
    # Alternative input kept from the original notebook:
    #     paths.DATA_HOME + "EPL/all_game_emolex_counted.csv"
    source_csv = paths.DATA_HOME + "EPL/all_game_emolex_counted_nonretweet.csv"
    df = useful_methods.csv_dic_df(source_csv)

    def parse_vector(text):
        # Strip the first and last character (presumably enclosing brackets
        # -- confirm against the CSV) and parse what is left as floats.
        return np.array([float(token) for token in text[1:-1].split(',')])

    def outcome(diff):
        # 'home_win': 1, 'away_win': 0, 'draw': 2
        if diff > 0:
            return 1
        if diff < 0:
            return 0
        return 2

    # Full-time scores -> integers.
    for column in ('score_ft_home', 'score_ft_away'):
        df[column] = [int(raw) for raw in df[column]]

    # Stringified count vectors -> float arrays.
    for column in ('pn_home', 'pn_away', 'emolex_home', 'emolex_away'):
        df[column] = [parse_vector(raw) for raw in list(df[column])]

    df['goal_diff'] = df.score_ft_home - df.score_ft_away
    df['result'] = [outcome(diff) for diff in df.goal_diff]

    return df

In [3]:
# Create df for models.
def CreateDfModel(draw=False):
    """Assemble the per-match feature frame used by the models.

    Starts from ``ReadEmoleDf()``. Unless ``draw`` is true, rows whose
    ``result`` is 2 (draw) are removed and the index is reset.

    Columns of the returned frame, in order:

    * ``team_home``, ``team_away``
    * ``pos_*`` / ``neg_*``: entries 0 and 1 of ``pn_*`` divided by the
      vector's sum, for home then away
    * ``diff_pos``: ``pos_home - pos_away``
    * ``score_ht_home``, ``score_ht_away`` as ``int``
    * eight emotion shares per side (entry ``i`` of ``emolex_*`` divided by
      the vector's sum), home block first, then away
    * ``result``: 'home_win': 1, 'away_win': 0, 'draw': 2
    """
    games = ReadEmoleDf()

    if not draw:
        games = games[games.result != 2].copy().reset_index(drop=True)

    def share(vectors, position):
        # Entry `position` of each vector as a fraction of that vector's sum.
        return [vec[position] / sum(vec) for vec in vectors]

    sides = ('home', 'away')
    # Position of each emotion inside the emolex vectors.
    emotions = ('anger', 'fear', 'disgust', 'sadness',
                'surprise', 'trust', 'joy', 'anticipation')

    dta = pd.DataFrame()

    # Teams
    dta['team_home'] = games.home_team
    dta['team_away'] = games.away_team

    # Positive / negative shares
    for side in sides:
        polarity = games['pn_' + side]
        dta['pos_' + side] = share(polarity, 0)
        dta['neg_' + side] = share(polarity, 1)

    dta['diff_pos'] = dta['pos_home'] - dta['pos_away']

    # Half-time scores
    for side in sides:
        dta['score_ht_' + side] = [int(goals) for goals in games['score_ht_' + side]]

    # Emolex 8
    for side in sides:
        emotion_vectors = games['emolex_' + side]
        for position, emotion in enumerate(emotions):
            dta[emotion + '_' + side] = share(emotion_vectors, position)

    dta['result'] = games.result

    return dta

In [4]:
# X: df, y: list
def CreateXy(df, team_name=False, emolex=False):
    """Build the patsy design matrix ``X`` and flat label array ``y``.

    The formula always regresses ``result`` on the sentiment shares, the
    half-time scores and ``diff_pos``. Optional term groups are appended in
    a fixed order:

    * ``emolex=True``    -- the 16 ``<emotion>_home`` / ``<emotion>_away``
      columns (home block first);
    * ``team_name=True`` -- ``C(team_home)`` and ``C(team_away)``, i.e. the
      team names treated as categorical factors.

    The formula is assembled once from shared term lists so that the four
    flag combinations cannot drift apart.

    Parameters
    ----------
    df : frame holding every column named in the selected formula
        (as produced by ``CreateDfModel``).
    team_name : include the categorical team terms.
    emolex : include the emotion-share terms.

    Returns
    -------
    (X, y) : ``X`` is what ``dmatrices(..., return_type="dataframe")``
        yields for the right-hand side; ``y`` is the left-hand side
        flattened with ``np.ravel``.
    """
    base_terms = [
        'pos_home', 'neg_home', 'pos_away', 'neg_away',
        'score_ht_home', 'score_ht_away',
        'diff_pos',
    ]
    emotions = ('anger', 'fear', 'disgust', 'sadness',
                'surprise', 'trust', 'joy', 'anticipation')
    emolex_terms = [
        emotion + '_' + side
        for side in ('home', 'away')
        for emotion in emotions
    ]
    team_terms = ['C(team_home)', 'C(team_away)']

    # Term order matches the original hand-written formulas:
    # base, then emolex, then teams.
    terms = list(base_terms)
    if emolex:
        terms += emolex_terms
    if team_name:
        terms += team_terms

    formula = 'result ~ ' + ' + '.join(terms)
    y, X = dmatrices(formula, df, return_type="dataframe")

    # flatten y into a 1-D array
    y = np.ravel(y)

    return X, y

## Model Manipulation Functions

In [5]:
# Print Training Parameters
def DetecterParams(detecter, title="", all_tunes=True):
    """Print the best score and best parameters of a fitted search object.

    detecter  -- object exposing ``best_score_`` and ``best_params_``
                 (plus ``grid_scores_`` when ``all_tunes`` is true)
    title     -- label inserted into the summary heading
    all_tunes -- when true, pretty-print ``detecter.grid_scores_`` first
    """
    banner = "\n\n### PARAMS ################################\n"
    print(banner)

    # Optional dump of every evaluated parameter combination.
    if all_tunes:
        print("[All Params Results]:\n")
        pprint(detecter.grid_scores_)
        print("\n")

    heading = "[%s Detecter Params]: \n" % title
    print(heading)

    summary = (
        ("Best Score: ", "best_score_"),
        ("Best Params: ", "best_params_"),
    )
    for caption, attribute in summary:
        print(caption, getattr(detecter, attribute))


# Print Test Prediction
def DetecterMetrics(features, labels, detecter, title=""):
    """Print a classification report and the accuracy for ``detecter``.

    features -- input passed to ``detecter.predict``
    labels   -- true labels the predictions are compared against
    detecter -- fitted object exposing ``predict``
    title    -- label inserted into the results heading
    """
    # Predict before printing anything, so a failing predict leaves no
    # partial report behind.
    predicted = detecter.predict(features)

    print("\n\n### METRICS ###############################\n")
    print("[%s Results]: \n" % title)

    report = metrics.classification_report(labels, predicted)
    print(report)

    accuracy = metrics.accuracy_score(labels, predicted)
    print('[Accuracy]: ', accuracy)


# Receiver Operating Characteristic = ROC curve
# Visualizes a classifier's performance
# for all values of the discrimination threshold. 
# fall out: F = FP / (TN + FP)
# AUC (area under the curve)
def PlotRocAuc(features, labels, detecter, title=""):
    """Draw the ROC curve of ``detecter`` on ``features`` / ``labels``.

    features -- input passed to ``detecter.predict_proba``
    labels   -- true labels handed to ``metrics.roc_curve``
    detecter -- fitted object exposing ``predict_proba``
    title    -- text appended to the plot title

    Column 1 of the ``predict_proba`` output is used as the score
    (presumably the probability of class 1 -- confirm for the classifier
    in use). The AUC is shown in the legend and the figure is displayed
    with ``plt.show()``.
    """
    # Per-class scores for every sample.
    class_scores = detecter.predict_proba(features)

    # Fall-out (x axis) and recall (y axis) across thresholds.
    fall_out, recall, _thresholds = metrics.roc_curve(
        labels, class_scores[:, 1])

    # Area under the ROC curve.
    area = metrics.auc(fall_out, recall)

    unit_range = [0.0, 1.0]

    plt.title('Receiver Operating Characteristic: ' + title)
    plt.plot(fall_out, recall, 'b', label='AUC = %0.2f' % area)
    plt.legend(loc='lower right')

    # Dashed diagonal as a visual reference line.
    plt.plot([0, 1], [0, 1], 'r--')

    plt.xlim(unit_range)
    plt.ylim(unit_range)
    plt.ylabel('Recall')
    plt.xlabel('Fall-out')
    plt.show()

# ****************************************************************************
# ****************************************************************************

# Define Logistic Regression
def Log(y_train, n_folds=10):
    """Return an unfitted grid search over a logistic-regression pipeline.

    y_train -- training labels, used to build the stratified folds
    n_folds -- number of folds passed to ``StratifiedKFold``

    The search scores candidates by accuracy and refits on the best
    parameter combination.
    """
    # Values tried for the classifier's C parameter. Two further grids were
    # left disabled in the original notebook:
    #     'clf__multi_class': ('ovr', 'multinomial'),
    #     'clf__penalty': ('l1', 'l2')
    search_space = {
        'clf__C': (1, 10, 100),
    }

    # One-step pipeline; the step name gives the 'clf__' prefix used above.
    estimator = Pipeline([
        ('clf', LogisticRegression()),
    ])

    # Stratified folds built from the training labels
    # (a plain `cv=n_folds` was the disabled alternative).
    folds = StratifiedKFold(y_train, n_folds=n_folds)

    return GridSearchCV(
        estimator,
        search_space,
        refit=True,          # refit on all data with the best combination
        n_jobs=-1,           # -1 means "all cores"
        scoring='accuracy',
        cv=folds,
    )


# ****************************************************************************
# ****************************************************************************

# Define Support Vector Machine
# return: gridsearch SVM
def SVM(y_train, n_folds=10):
    """Return an unfitted grid search over a support-vector pipeline.

    y_train -- training labels, used to build the stratified folds
    n_folds -- number of folds passed to ``StratifiedKFold``

    The search scores candidates by accuracy and refits on the best
    parameter combination.
    """
    # Candidate kernels, gamma values and C values.
    # NOTE(review): the original spelled the last gamma `00.1`, which is
    # 0.1; it may have been meant as 0.001 -- confirm before changing.
    search_space = {
        'clf__kernel': ('linear', 'poly', 'rbf'),
        'clf__gamma': (0.00001, 0.0001, 0.1),
        'clf__C': (1, 10, 100),
    }

    # One-step pipeline; the step name gives the 'clf__' prefix used above.
    estimator = Pipeline([
        ('clf', SVC()),
    ])

    # Stratified folds built from the training labels.
    folds = StratifiedKFold(y_train, n_folds=n_folds)

    return GridSearchCV(
        estimator,
        search_space,
        refit=True,          # refit on all data with the best combination
        n_jobs=-1,           # -1 means "all cores"
        scoring='accuracy',
        cv=folds,
    )



# ****************************************************************************
# ****************************************************************************

# Define Random Forest (an ensemble of decision trees)
def DT(y_train, n_folds=10):
    """Return an unfitted grid search over a random-forest pipeline.

    Despite the name, the estimator is a ``RandomForestClassifier`` (with
    the entropy split criterion), not a single decision tree.

    y_train -- training labels, used to build the stratified folds
    n_folds -- number of folds passed to ``StratifiedKFold``

    The search scores candidates by accuracy and refits on the best
    parameter combination.
    """
    # putting the steps explicitly into Pipeline
    pipeline_dt = Pipeline([
            # train on vectors with classifier
            ('clf', RandomForestClassifier(criterion='entropy'))
        ])

    # tuning parameters
    params_dt = {
        'clf__n_estimators': (5, 10, 20, 50),
        'clf__max_depth': (50, 150, 250),
        # An integer min_samples_split must be at least 2: a node holding a
        # single sample cannot be split, and scikit-learn >= 0.18 rejects 1
        # with a ValueError. The former candidate 1 was therefore either a
        # duplicate of 2 or a crash, so it is no longer searched.
        'clf__min_samples_split': (2, 3),
        'clf__min_samples_leaf': (1, 2, 3)
    }

    # grid search
    grid_dt = GridSearchCV(
        pipeline_dt,         # pipeline from above
        params_dt,           # parameters to tune via cross validation
        refit=True,          # fit using all available data at the end, on the best found param combination
        n_jobs=-1,           # number of cores to use for parallelization; -1 for "all cores"
        scoring='accuracy',  # what score are we optimizing?
        cv=StratifiedKFold(y_train, n_folds=n_folds),  # what type of cross validation to use
    )

    return grid_dt

## Model Training

In [6]:
def ModelTrain(X, y, model, n_folds=10, test_size=0.2, random_state=None):
    """Split the data, grid-search the chosen model and print its best result.

    Parameters
    ----------
    X, y : features and labels, handed to ``train_test_split``.
    model : 'LOG', 'SVM' or 'DT'; selects the ``Log``, ``SVM`` or ``DT``
        grid-search builder. The same string is used as the report title.
    n_folds : number of cross-validation folds given to the builder.
    test_size : forwarded to ``train_test_split``.
    random_state : forwarded to ``train_test_split``. The default ``None``
        keeps the original unseeded split; pass an int for a repeatable one.

    Raises
    ------
    ValueError
        If ``model`` is not one of the known keys. This is checked before
        any work is done; previously an unknown key failed after the split
        with an unbound ``grid_search`` variable.

    Returns
    -------
    None. The best score and parameters are printed via ``DetecterParams``.
    The held-out split is produced but not evaluated while the metric
    reports at the end stay commented out.
    """
    known_models = ('LOG', 'SVM', 'DT')
    if model not in known_models:
        raise ValueError(
            "Unknown model %r; expected one of %s" % (model, ", ".join(known_models)))

    # ***************************************************
    # [Step 2]: Data Split(train=0.8, test=0.2)
    # ***************************************************

    # Only consumed by the disabled timing report further down.
    date_now = strftime("%Y-%m-%d %H:%M:%S", gmtime()).replace(" ", "_")

    # Split data Train and Test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

#     print(
#         "\n\n### DATA ##################################\n",
#         "\n\tTrain data: \t", len(X_train),
#         "\n\tTest data: \t", len(X_test),
#         "\n\tAll data: \t", len(y_train) + len(y_test)
#     )

    # ***************************************************
    # [Step 3]: Define Classifier
    # ***************************************************

    builders = {'LOG': Log, 'SVM': SVM, 'DT': DT}
    grid_search = builders[model](y_train, n_folds)

    # ***************************************************
    # [Step 4]: Compute Classifier
    # ***************************************************

    # Only consumed by the disabled timing report below.
    start_time = time()

    # fitting training sets to classifier
    grid_search.fit(X_train, y_train)

    # ***************************************************
    # [Step 5]: Print Classifier Details
    # ***************************************************

    # print trained parameters
    DetecterParams(grid_search, title=model, all_tunes=False)

    # print computed time
#     print("\n\n### COMPUTED TIME #########################\n")
#     taken_time = time() - start_time
#     print("[Started Time]: ", date_now)
#     print("\n[Taken Time]: ", str(datetime.timedelta(seconds=taken_time)))

    # print classifier test results
#     DetecterMetrics(X_train, y_train, grid_search, title=model + ": Train")
#     DetecterMetrics(X_test, y_test, grid_search, title=model + ": Test")

In [8]:
# ***************************************************
# [Step 1]: Data Prepare
# ***************************************************

# Model-ready frames.
# WL: win / lose only; WLD: win / lose / draw
dfWL = CreateDfModel()
dfWLD = CreateDfModel(draw=True)

# Per-result feature means ('home_win': 1, 'away_win': 0, 'draw': 2).
# NOTE(review): this expression sits mid-cell, so its value is discarded
# and never displayed -- move it to the end of its own cell if the table
# is meant to be shown.
dfWLD.groupby('result').mean()

# Train on the two-class frame first, then on the three-class frame.
# X and y stay bound to the last (WLD) design matrix afterwards.
for model_frame in (dfWL, dfWLD):
    # Prepare data: sentiment + half-time score + emolex features, no team names
    X, y = CreateXy(model_frame, team_name=False, emolex=True)

    # 'DT' is also supported by ModelTrain but was left disabled here.
    for model_name in ('LOG', 'SVM'):
        ModelTrain(X, y, model=model_name, n_folds=30, test_size=0.1)



### PARAMS ################################

[LOG Detecter Params]: 

Best Score:  0.75652173913
Best Params:  {'clf__C': 1}


### PARAMS ################################

[SVM Detecter Params]: 

Best Score:  0.773913043478
Best Params:  {'clf__kernel': 'rbf', 'clf__C': 1, 'clf__gamma': 0.1}


### PARAMS ################################

[LOG Detecter Params]: 

Best Score:  0.509433962264
Best Params:  {'clf__C': 1}


### PARAMS ################################

[SVM Detecter Params]: 

Best Score:  0.603773584906
Best Params:  {'clf__kernel': 'rbf', 'clf__C': 1, 'clf__gamma': 0.1}


In [9]:
X

Unnamed: 0,Intercept,pos_home,neg_home,pos_away,neg_away,score_ht_home,score_ht_away,diff_pos,anger_home,fear_home,...,joy_home,anticipation_home,anger_away,fear_away,disgust_away,sadness_away,surprise_away,trust_away,joy_away,anticipation_away
0,1,0.613380,0.386620,0.496168,0.503832,2,1,0.117212,0.135486,0.096101,...,0.191020,0.148878,0.136524,0.124393,0.108403,0.120534,0.082598,0.154940,0.132003,0.140604
1,1,0.783270,0.216730,0.696429,0.303571,1,0,0.086841,0.067368,0.056842,...,0.200000,0.200000,0.117978,0.117978,0.084270,0.084270,0.095506,0.207865,0.140449,0.151685
2,1,0.646465,0.353535,0.661290,0.338710,0,0,-0.014826,0.116505,0.072816,...,0.165049,0.169903,0.111940,0.097015,0.067164,0.082090,0.111940,0.216418,0.149254,0.164179
3,1,0.410596,0.589404,0.676056,0.323944,0,0,-0.265460,0.141791,0.111940,...,0.104478,0.152985,0.072289,0.068273,0.056225,0.068273,0.156627,0.192771,0.204819,0.180723
4,1,0.684932,0.315068,0.634259,0.365741,1,0,0.050672,0.090319,0.068477,...,0.171389,0.169225,0.118932,0.114078,0.067961,0.094660,0.118932,0.191748,0.148058,0.145631
5,1,0.597668,0.402332,0.443396,0.556604,0,0,0.154271,0.133224,0.129934,...,0.149671,0.157895,0.150000,0.173611,0.075000,0.152778,0.093519,0.125000,0.098148,0.131944
6,1,0.497205,0.502795,0.452648,0.547352,0,0,0.044557,0.119541,0.104870,...,0.151124,0.167697,0.132926,0.132235,0.126418,0.134898,0.071788,0.141110,0.122868,0.137758
7,1,0.610577,0.389423,0.518409,0.481591,0,0,0.092167,0.108454,0.107600,...,0.137489,0.168232,0.130530,0.116513,0.098555,0.115199,0.106001,0.158563,0.136662,0.137976
8,1,0.604972,0.395028,0.734076,0.265924,0,1,-0.129104,0.108202,0.113438,...,0.141361,0.146597,0.063565,0.071856,0.058498,0.063565,0.124827,0.192077,0.229387,0.196223
9,1,0.765568,0.234432,0.500470,0.499530,1,0,0.265097,0.066882,0.065804,...,0.224380,0.168285,0.134342,0.126778,0.111952,0.117095,0.080787,0.160363,0.113767,0.154917
