In [1]:
%pylab inline
%matplotlib inline

# Global Imports
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
import numpy as np
import pandas as pd
from sklearn import datasets
import os
import sys
import pickle
from pprint import pprint
from time import time
import datetime
from time import gmtime, strftime
import statsmodels.api as sm
from patsy import dmatrices

# Scikit-Learn imports
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import train_test_split

from sklearn.cross_validation import cross_val_score
from sklearn import metrics

# Local Imports
path = str(os.path.expanduser('~')) + '/git/predictEPL/config'
sys.path.append(path)
import paths

sys.path.append(paths.UTILS)
import useful_methods

Populating the interactive namespace from numpy and matplotlib


## Data Manipulation Functions

In [2]:
def ReadHashDf():
    """Load the sentiment results and attach scores, labels and recent form.

    Reads ``game_infos.csv`` and the results file named by the notebook-level
    ``RESULT_FILE_NAME`` (looked up when this function is *called*, so that
    name must be defined before the first call). Both frames are sorted by
    ``GW`` then ``away_team`` and the score columns are copied from the game
    info onto the results frame by row position.

    Returns:
        pandas.DataFrame: the results frame with
            * ``score_ht_*`` / ``score_ft_*`` as int (taken from game_infos),
            * ``pn_*`` columns converted to float,
            * ``goal_diff_ht`` / ``goal_diff_ft`` (home minus away),
            * ``result``: 1 home win, 0 away win, 2 draw,
            * ``res2``: 1 home win or draw, 0 home loss,
            * ``be_4game_sum_home`` / ``be_4game_sum_away`` looked up in
              ``info_before_4game_sum.csv`` by team and game week.

    Raises:
        ValueError: if a team does not match exactly one row of
            ``info_before_4game_sum.csv``.
    """
    # Read game_infos as df
    dfGameInfo = useful_methods.csv_dic_df(paths.READ_PATH_GAME_INFO + 'game_infos.csv')
    dfGameInfo = useful_methods.DropNanGames(dfGameInfo).copy().reset_index(drop=True)
    # Cast GW to int before sorting so the order is by week number
    dfGameInfo['GW'] = [int(gw) for gw in dfGameInfo.GW]
    dfGameInfo = dfGameInfo.sort_values(['GW', 'away_team'], ascending=[True, True]).copy().reset_index(drop=True)

    # Read Hash Emolex Model result
    df = useful_methods.csv_dic_df(paths.READ_PATH_RESULTS + RESULT_FILE_NAME)
    df['GW'] = [int(gw) for gw in df.GW]
    df = df.sort_values(['GW', 'away_team'], ascending=[True, True]).copy().reset_index(drop=True)

    # Combine 2 dfs
    # NOTE(review): this is a positional copy -- it assumes both frames hold the
    # same games in the same order after sorting; confirm against the CSVs.
    for column in ('score_ht_away', 'score_ht_home', 'score_ft_away', 'score_ft_home'):
        df[column] = [int(item) for item in dfGameInfo[column]]

    for column in ('pn_away_neg', 'pn_away_pos', 'pn_home_neg', 'pn_home_pos'):
        df[column] = [float(item) for item in df[column]]

    # ******************************************************
    # 'home_win': 1, 'away_win': 0, 'draw': 2
    def Labeling3(goal_diff):
        if goal_diff > 0:
            return 1
        elif goal_diff < 0:
            return 0
        else:
            return 2

    df['goal_diff_ht'] = df.score_ht_home - df.score_ht_away
    df['goal_diff_ft'] = df.score_ft_home - df.score_ft_away
    df['result'] = [Labeling3(item) for item in df.goal_diff_ft]

    # 'home_win, draw': 1, 'home_lose': 0
    def Labeling2(goal_diff):
        if goal_diff < 0:
            return 0
        else:
            return 1
    df['res2'] = [Labeling2(item) for item in df.goal_diff_ft]

    # **********************************************
    # Add previous 4 games points sum
    dfBe4GameSum = useful_methods.csv_dic_df(paths.DATA_HOME + "EPL/info_before_4game_sum.csv")

    def _points_before(team, gw):
        # Column names of the form table are game weeks as strings.
        rows = dfBe4GameSum.loc[dfBe4GameSum.team == team, str(int(gw))]
        # int() on a one-row Series is deprecated in recent pandas, so the
        # scalar is extracted explicitly after checking there is exactly one.
        if len(rows) != 1:
            raise ValueError(
                "expected exactly one row for team %r in info_before_4game_sum.csv, found %d"
                % (team, len(rows)))
        return int(rows.iloc[0])

    be_4game_sum_home = []
    be_4game_sum_away = []

    for team_home, team_away, gw in zip(df.home_team, df.away_team, df.GW):
        be_4game_sum_home.append(_points_before(team_home, gw))
        be_4game_sum_away.append(_points_before(team_away, gw))

    df['be_4game_sum_away'] = be_4game_sum_away
    df['be_4game_sum_home'] = be_4game_sum_home

    return df

In [3]:
# Build the modelling table (sentiment shares, scores, labels, form).
def CreateDfForModel(ht_draw=False, ft_wld=False):
    """Return the feature/label frame derived from ``ReadHashDf()``.

    ht_draw: when True keep only games with ``goal_diff_ht == 0``.
    ft_wld:  when False drop rows whose ``result`` is 2 (full-time draws);
             when True keep all three outcomes.
    """
    games = ReadHashDf()

    # Win / lose only unless draws were asked for
    if not ft_wld:
        decisive = games.result != 2
        games = games[decisive].copy().reset_index(drop=True)

    # Level at half time
    if ht_draw:
        level_at_ht = games.goal_diff_ht == 0
        games = games[level_at_ht].copy().reset_index(drop=True)

    dta = pd.DataFrame()

    # Team names come first; later cells rely on them being the first two columns
    dta['team_home'] = games['home_team']
    dta['team_away'] = games['away_team']

    # Positive / negative sentiment as a share of each side's total
    dta['pn_home_pos'] = games['pn_home_pos'] / (games['pn_home_pos'] + games['pn_home_neg'])
    dta['pn_home_neg'] = games['pn_home_neg'] / (games['pn_home_pos'] + games['pn_home_neg'])
    dta['pn_away_pos'] = games['pn_away_pos'] / (games['pn_away_pos'] + games['pn_away_neg'])
    dta['pn_away_neg'] = games['pn_away_neg'] / (games['pn_away_pos'] + games['pn_away_neg'])

    dta['pn_diff_pos'] = dta['pn_home_pos'] - dta['pn_away_pos']
    dta['pn_diff_neg'] = dta['pn_home_neg'] - dta['pn_away_neg']

    # HT / FT scores, goal differences and the two label encodings, copied as-is
    #   result: 'home_win': 1, 'away_win': 0, 'draw': 2
    #   res2:   'home_win or draw': 1, 'home_lose': 0
    for column in ('score_ht_home', 'score_ht_away',
                   'score_ft_home', 'score_ft_away',
                   'goal_diff_ht', 'goal_diff_ft',
                   'result', 'res2'):
        dta[column] = games[column]

    # Form over the previous four games, scaled by 12.0
    # (presumably the maximum of 4 games x 3 points -- confirm)
    dta['be_4game_sum_home'] = games['be_4game_sum_home'] / 12.0
    dta['be_4game_sum_away'] = games['be_4game_sum_away'] / 12.0

    # Sentiment shares weighted by (half-time goals + 1)
    home_weight = dta['score_ht_home'] + 1
    away_weight = dta['score_ht_away'] + 1
    dta['sp_home'] = dta['pn_home_pos'] * home_weight
    dta['sn_home'] = dta['pn_home_neg'] * home_weight
    dta['sp_away'] = dta['pn_away_pos'] * away_weight
    dta['sn_away'] = dta['pn_away_neg'] * away_weight
    dta['sp_diff'] = dta['sp_home'] - dta['sp_away']
    dta['sn_diff'] = dta['sn_home'] - dta['sn_away']

    return dta

In [4]:
# Returns X as a DataFrame (patsy design matrix) and y as a flat array.
def CreateXy(df, team_name=False, hash_emolex=True, score_ht=True, be_4game_sum=False, score_sent=False):
    """Build the design matrix and label vector for ``result`` from ``df``.

    The flags select one of a fixed set of formulas; the first matching
    combination below wins. ``team_name`` only has an effect when
    ``hash_emolex``, ``score_ht`` and ``be_4game_sum`` are all set, and
    ``score_sent`` is only consulted when none of the earlier combinations
    match. Any other combination falls back to the half-time score formula.
    """
    print("--------------------------\n")

    # Pick the formula first, then build the matrices in a single place.
    if team_name and hash_emolex and score_ht and be_4game_sum:
        formula = 'result ~ \
            be_4game_sum_home + be_4game_sum_away + \
            score_ht_home + score_ht_away + \
            pn_home_pos + pn_home_neg + pn_away_pos + pn_away_neg + \
            pn_diff_pos + \
            C(team_home) + C(team_away)'
    elif hash_emolex and score_ht and be_4game_sum:
        formula = 'result ~ \
            be_4game_sum_home + be_4game_sum_away + \
            score_ht_home + score_ht_away + \
            pn_home_pos + pn_home_neg + pn_away_pos + pn_away_neg + \
            pn_diff_pos'
    elif hash_emolex and be_4game_sum:
        formula = 'result ~ \
            be_4game_sum_home + be_4game_sum_away + \
            pn_home_pos + pn_home_neg + pn_away_pos + pn_away_neg + \
            pn_diff_pos'
    elif score_ht and be_4game_sum:
        formula = 'result ~ \
            score_ht_home + score_ht_away + \
            be_4game_sum_home + be_4game_sum_away'
    elif score_ht and hash_emolex:
        formula = 'result ~ \
            score_ht_home + score_ht_away + \
            pn_home_pos + pn_home_neg + pn_away_pos + pn_away_neg'
    elif hash_emolex:
        formula = 'result ~ \
            pn_home_pos + pn_home_neg + pn_away_pos + pn_away_neg'
    elif score_sent:
        formula = 'result ~ \
            sp_home + sn_home + sp_away + sn_away'
    else:
        formula = 'result ~ \
            score_ht_home + score_ht_away'

    labels, X = dmatrices(formula, df, return_type="dataframe")

    # The label frame is flattened to a 1-D array
    return X, np.ravel(labels)

## Models: Logistic, SVC

In [None]:
def ModelScoresWL(df, team_name, hash_emolex, score_ht, be_4game_sum, variable_scores=False, score_sent=False):
    """Print training and 18-fold CV accuracy for an L1 logistic model and an SVC.

    The feature flags are forwarded to ``CreateXy``. With
    ``variable_scores=True`` the logistic coefficients are printed per
    design-matrix column as well. Nothing is returned.
    """
    features, target = CreateXy(df, team_name, hash_emolex, score_ht, be_4game_sum, score_sent)

    # Models fitted on the full data (used for the training accuracy)
    logit = LogisticRegression(penalty='l1').fit(features, target)
    svm = SVC().fit(features, target)

    # Cross-validated accuracy on fresh, unfitted estimators
    cv_logit = cross_val_score(LogisticRegression(penalty='l1'), features, target, scoring='accuracy', cv=18)
    cv_svm = cross_val_score(SVC(), features, target, scoring='accuracy', cv=18)

    print("[Log]: \t%.3f (cv: %.3f)" % (logit.score(features, target), cv_logit.mean()))
    print("[SVC]: \t%.3f (cv: %.3f)" % (svm.score(features, target), cv_svm.mean()))

    if not variable_scores:
        return

    # One row per design-matrix column with its logistic coefficient
    coef_table = pd.DataFrame()
    coef_table['var'] = features.columns
    coef_table['score'] = np.transpose(logit.coef_)
    print("\n", coef_table)

In [None]:
def ModelScoresWLD(df, team_name, hash_emolex, score_ht, be_4game_sum, score_sent=False):
    """Print training and 18-fold CV accuracy for an L1 logistic model and an SVC.

    The feature flags are forwarded to ``CreateXy``; the labels are whatever
    ``CreateXy`` returns for ``df``. Nothing is returned.
    """
    features, target = CreateXy(df, team_name, hash_emolex, score_ht, be_4game_sum, score_sent)

    # Models fitted on the full data (used for the training accuracy)
    logit = LogisticRegression(penalty='l1').fit(features, target)
    svm = SVC().fit(features, target)

    # Cross-validated accuracy on fresh, unfitted estimators
    cv_logit = cross_val_score(LogisticRegression(penalty='l1'), features, target, scoring='accuracy', cv=18)
    cv_svm = cross_val_score(SVC(), features, target, scoring='accuracy', cv=18)

    print("[Log]: \t%.3f (cv: %.3f)" % (logit.score(features, target), cv_logit.mean()))
    print("[SVC]: \t%.3f (cv: %.3f)" % (svm.score(features, target), cv_svm.mean()))
### Accuracy

In [None]:
# Name of the results CSV that ReadHashDf() loads from paths.READ_PATH_RESULTS.
# ReadHashDf() looks this name up when it is called, so this cell has to run
# before ReadHashDf() / CreateDfForModel() are used.
RESULT_FILE_NAME = "hash_all_ht.csv"

In [None]:
# Modelling frames (ht_draw is left at its default of False for both):
#   dfWL  -- full-time draws dropped (win / lose)
#   dfWLD -- all outcomes kept (win / lose / draw)
dfWL = CreateDfForModel(ft_wld=False)
dfWLD = CreateDfForModel(ft_wld=True)

In [None]:
# Win / lose: one entry per feature-set configuration, scored in this order.
wl_feature_sets = [
    dict(hash_emolex=False, score_ht=True, be_4game_sum=False),
    dict(hash_emolex=False, score_ht=True, be_4game_sum=True),
    dict(hash_emolex=True, score_ht=False, be_4game_sum=False),
    # disabled: hash_emolex=True, score_ht=False, be_4game_sum=True (variable_scores=False)
    dict(hash_emolex=True, score_ht=True, be_4game_sum=False),
    # disabled: team_name=True, hash_emolex=True, score_ht=True, be_4game_sum=True (variable_scores=False)
    dict(hash_emolex=False, score_ht=False, be_4game_sum=False, score_sent=True),
]

for feature_set in wl_feature_sets:
    ModelScoresWL(dfWL, team_name=False, variable_scores=True, **feature_set)

In [None]:
# Win / lose / draw: one entry per feature-set configuration, scored in this order.
wld_feature_sets = [
    dict(hash_emolex=False, score_ht=True, be_4game_sum=False),
    # disabled: hash_emolex=False, score_ht=True, be_4game_sum=True
    dict(hash_emolex=True, score_ht=False, be_4game_sum=False),
    # disabled: hash_emolex=True, score_ht=True, be_4game_sum=True
    # disabled: team_name=True, hash_emolex=True, score_ht=True, be_4game_sum=True
    dict(hash_emolex=True, score_ht=True, be_4game_sum=False),
    dict(hash_emolex=False, score_ht=False, be_4game_sum=False, score_sent=True),
]

for feature_set in wld_feature_sets:
    ModelScoresWLD(dfWLD, team_name=False, **feature_set)

## Tune Variables

In [None]:
def ModelAccuracy(X, y, penalty='l2'):
    """Print accuracy of a logistic model and an SVC on ``X``/``y``.

    ``penalty`` is passed to ``LogisticRegression``. Both the accuracy on the
    data the models were fitted to and the mean 18-fold cross-validated
    accuracy are printed, followed by the logistic coefficient of every
    column of ``X``. Nothing is returned.
    """
    logit = LogisticRegression(penalty=penalty).fit(X, y)
    svm = SVC().fit(X, y)

    # The already-fitted estimators are handed to cross_val_score, as before
    cv_logit = cross_val_score(logit, X, y, scoring='accuracy', cv=18)
    cv_svm = cross_val_score(svm, X, y, scoring='accuracy', cv=18)

    print("--------------------------\n")
    print("[Log]: \t%.3f (cv: %.3f)" % (logit.score(X, y), cv_logit.mean()))
    print("[SVC]: \t%.3f (cv: %.3f)" % (svm.score(X, y), cv_svm.mean()))

    # One row per column of X with its logistic coefficient
    coef_table = pd.DataFrame()
    coef_table['var'] = X.columns
    coef_table['score'] = np.transpose(logit.coef_)
    print("\n", coef_table)

In [None]:
# Half-time score features only, on the frame without full-time draws.
formula = 'result ~ \
            score_ht_home + score_ht_away + goal_diff_ht'

df = CreateDfForModel(ft_wld=False)
y, X = dmatrices(formula, df, return_type="dataframe")
y = np.ravel(y)  # label frame -> flat array

ModelAccuracy(X, y, penalty='l1')

In [None]:
# Two negative-sentiment features, on the frame without full-time draws.
# Wider alternative that was tried:
#   'result ~ pn_home_pos + pn_home_neg + pn_away_pos + pn_away_neg + pn_diff_pos + pn_diff_neg'
formula = 'result ~ pn_away_neg + pn_diff_neg'

df = CreateDfForModel(ft_wld=False)
y, X = dmatrices(formula, df, return_type="dataframe")
y = np.ravel(y)  # label frame -> flat array

ModelAccuracy(X, y, penalty='l1')

In [None]:
# Half-time score features plus all sentiment shares, without full-time draws.
formula = 'result ~ \
            score_ht_home + score_ht_away + goal_diff_ht + \
            pn_home_pos + pn_home_neg + pn_away_pos + pn_away_neg + pn_diff_pos + pn_diff_neg'

df = CreateDfForModel(ft_wld=False)
y, X = dmatrices(formula, df, return_type="dataframe")
y = np.ravel(y)  # label frame -> flat array

ModelAccuracy(X, y, penalty='l1')

In [None]:
# Half-time scores, score-weighted sentiment and plain sentiment shares together,
# on the frame without full-time draws.
formula = 'result ~ \
            score_ht_home + score_ht_away + goal_diff_ht + \
            sp_home + sn_home + sp_away + sn_away + sp_diff + sn_diff + \
            pn_home_pos + pn_home_neg + pn_away_pos + pn_away_neg + pn_diff_pos + pn_diff_neg'

df = CreateDfForModel(ft_wld=False)
y, X = dmatrices(formula, df, return_type="dataframe")
y = np.ravel(y)  # label frame -> flat array

ModelAccuracy(X, y, penalty='l1')

## Scatter Plot

In [None]:
import numpy as np

# Correlation between the home-minus-away positive-sentiment share and the
# half-time goal difference. The frame is built here (draws excluded, as in the
# preceding cells) so this cell does not depend on whichever cell last set `df`.
df = CreateDfForModel(ft_wld=False)

# Variables are named after the columns they actually hold.
pn_diff_pos = np.array(df.pn_diff_pos)
goal_diff_ht = np.array(df.goal_diff_ht)

np.corrcoef(pn_diff_pos, goal_diff_ht)

In [None]:
def PlotScatter(x, y, xlabel='Percentage', ylabel='Result',
                title='Sentiment Percentage against Result'):
    """Scatter ``y`` against ``x`` (points coloured by ``x``) and print their correlation.

    Args:
        x, y: equal-length numeric sequences (converted with ``np.array``).
        xlabel, ylabel, title: axis labels and figure title. The defaults are
            the texts this function always used, so existing calls are
            unaffected; pass other texts when the inputs are not a sentiment
            percentage and a result.

    Prints the Pearson correlation coefficient of ``x`` and ``y`` after
    showing the figure. Nothing is returned.
    """
    x = np.array(x)
    y = np.array(y)

    # plt.get_cmap works on old and new Matplotlib alike; plt.cm.get_cmap was
    # deprecated and later removed from Matplotlib.
    cm = plt.get_cmap('seismic')

    # Scatter Plot
    plt.figure(figsize=(10, 6))

    # Integer ticks from one below the smallest y up to the largest y
    plt.yticks(np.arange(min(y) - 1, max(y) + 1, 1))

    # Colour encodes the x value
    sc = plt.scatter(x, y, c=x, marker="o", s=500, alpha=0.3, cmap=cm)
    plt.colorbar(sc)
    plt.grid(True)
    plt.axhline(y=0, c="k", alpha=0.5)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()

    # Correlation
    print("Correlation: ", np.corrcoef(x, y)[0][1])

In [None]:
df = CreateDfForModel(ft_wld=False)

# (x column, y column) pairs, plotted in this order
scatter_pairs = [
    ('pn_diff_pos', 'goal_diff_ht'),
    ('pn_diff_pos', 'goal_diff_ft'),
    ('sp_diff', 'goal_diff_ft'),
    ('goal_diff_ht', 'goal_diff_ft'),
]

for x_col, y_col in scatter_pairs:
    PlotScatter(df[x_col], df[y_col])

## Pandas DataFrame Plot

In [None]:
import matplotlib
matplotlib.style.use('ggplot')

# All outcomes kept (draws included)
df = CreateDfForModel(ft_wld=True)

# Plot Histogram
plt.figure()
df.goal_diff_ft.plot(kind='hist')
plt.xlabel("FT: Goal Difference")

plt.figure()
df.result.plot(kind='hist')
plt.xlabel("0:Home Lose, 1: Home Win, 2: Draw")

# The two-class label was the only histogram without an axis label
plt.figure()
df.res2.plot(kind='hist')
plt.xlabel("0: Home Lose, 1: Home Win or Draw")

In [None]:
df = CreateDfForModel(ft_wld=False)

# Marker and figure styling, kept apart from the choice of columns
scatter_style = dict(
    marker="o",
    s=500,
    alpha=0.4,
    figsize=(10, 7),
    grid=True,
    colormap='seismic',
)

# Full-time goal difference against the positive-sentiment difference,
# coloured by that same difference
df.plot(kind='scatter', x='pn_diff_pos', y='goal_diff_ft', c='pn_diff_pos', **scatter_style)

In [None]:
# The correlation matrix of the numeric columns of `df` (as set by the previous cell).
# The team-name columns are text; recent pandas raises on them instead of
# silently skipping them, so the numeric columns are selected explicitly.
cor = df.select_dtypes(include=[np.number]).corr()
cor

In [None]:
# Correlation of every column with the `result` label
cor['result']

## Scatter Matrix Plot

In [None]:
# The plotting helpers live in pandas.plotting since pandas 0.20;
# pandas.tools.plotting is kept as a fallback for older installations.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

df = CreateDfForModel(ft_wld=True)
dta = df[[
        'pn_home_pos', 'pn_away_neg',
        'pn_diff_pos',
        'goal_diff_ht',
        'sp_diff',
        'goal_diff_ft']]

# Pairwise scatter plots with kernel-density estimates on the diagonal
scatter_matrix(dta, alpha=0.8, figsize=(15, 10), diagonal='kde')

## Density Plot

In [None]:
# Kernel-density estimate of the positive-sentiment difference, draws included
df = CreateDfForModel(ft_wld=True)

df['pn_diff_pos'].plot(kind='kde')

## Andrews Curves

In [None]:
# The plotting helpers live in pandas.plotting since pandas 0.20;
# pandas.tools.plotting is kept as a fallback for older installations.
try:
    from pandas.plotting import andrews_curves
except ImportError:
    from pandas.tools.plotting import andrews_curves

df = CreateDfForModel(ft_wld=False)
data = df[['pn_diff_pos'] + ['result']]
# data = df[df.columns[2:]]

# One curve per game, grouped by the `result` label
plt.figure(figsize=(10, 6))
andrews_curves(data, 'result')

## Parallel Coordinates

In [None]:
# The plotting helpers live in pandas.plotting since pandas 0.20;
# pandas.tools.plotting is kept as a fallback for older installations.
try:
    from pandas.plotting import parallel_coordinates
except ImportError:
    from pandas.tools.plotting import parallel_coordinates

df = CreateDfForModel(ft_wld=False)

# Copy of the label, so it can be drawn as an axis while `result` is the class column
df['res'] = df.result

# Sentiment features
data = df[[
        'pn_home_pos',
        'pn_away_neg',
        'pn_diff_pos',
        'res'
          ] + ['result']]
# data = df[df.columns[2:]]

plt.figure(figsize=(16, 6))
parallel_coordinates(data, 'result')

# Half-time score features
data = df[[
        'score_ht_home',
        'score_ht_away',
        'goal_diff_ht',
        'res'
          ] + ['result']]
plt.figure(figsize=(16, 6))
parallel_coordinates(data, 'result')

## Lag Plot

In [None]:
# The plotting helpers live in pandas.plotting since pandas 0.20;
# pandas.tools.plotting is kept as a fallback for older installations.
try:
    from pandas.plotting import lag_plot
except ImportError:
    from pandas.tools.plotting import lag_plot

df = CreateDfForModel(ft_wld=False)
data = df.pn_home_pos

# Each value of the series against the next one, in row order
plt.figure(figsize=(16, 6))
lag_plot(data)

## Autocorrelation Plot

In [None]:
# The plotting helpers live in pandas.plotting since pandas 0.20;
# pandas.tools.plotting is kept as a fallback for older installations.
try:
    from pandas.plotting import autocorrelation_plot
except ImportError:
    from pandas.tools.plotting import autocorrelation_plot

df = CreateDfForModel(ft_wld=False)
data = df.pn_diff_pos

plt.figure(figsize=(16, 6))
autocorrelation_plot(data)

## Bootstrap Plot

In [None]:
# The plotting helpers live in pandas.plotting since pandas 0.20;
# pandas.tools.plotting is kept as a fallback for older installations.
try:
    from pandas.plotting import bootstrap_plot
except ImportError:
    from pandas.tools.plotting import bootstrap_plot

df = CreateDfForModel(ft_wld=False)
data = df.pn_diff_pos

# 500 resamples of 50 values each
bootstrap_plot(data, size=50, samples=500, color='grey')

## RadViz

In [None]:
# The plotting helpers live in pandas.plotting since pandas 0.20;
# pandas.tools.plotting is kept as a fallback for older installations.
try:
    from pandas.plotting import radviz
except ImportError:
    from pandas.tools.plotting import radviz


df = CreateDfForModel(ft_wld=True)

# Every column after the two leading team-name columns, `result` included.
# A narrower selection (pn_diff_pos, score_ht_home, pn_diff_neg, score_ht_away,
# result) used to be assigned to `data` just before this line and was
# overwritten without being used; that dead assignment has been removed.
data = df[df.columns[2:]]

plt.figure(figsize=(16, 10))

radviz(data, 'result')
# Cross-hair through the origin
plt.axhline(y=0, c="k", alpha=0.3)
plt.axvline(x=0, c="k", alpha=0.3)