In [2]:
%pylab inline
%matplotlib inline

# Global Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import os
import sys
import pickle
from pprint import pprint
from time import time
import datetime
from time import gmtime, strftime
import statsmodels.api as sm
from patsy import dmatrices

# Scikit-Learn imports
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import train_test_split

from sklearn.cross_validation import cross_val_score
from sklearn import metrics

# Local Imports
path = str(os.path.expanduser('~')) + '/git/predictEPL/config'
sys.path.append(path)
import paths

sys.path.append(paths.UTILS)
import useful_methods

Populating the interactive namespace from numpy and matplotlib


In [3]:
def ReadEmolexDf(result_file_name=None):
    """Load the Emolex result file and attach match scores and a result label.

    Two CSV files are read through ``useful_methods.csv_dic_df``: the game
    information file (``game_infos.csv``) and the Emolex result file. Both
    frames are sorted by ``['GW', 'away_team']`` and re-indexed, then the
    score columns of the game information frame are copied onto the Emolex
    frame row by row.

    Parameters
    ----------
    result_file_name : str, optional
        File name (below ``paths.READ_PATH_RESULTS``) of the Emolex result
        file. When omitted, the module-level ``RESULT_FILE_NAME`` is used,
        which is what this function always did before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        The Emolex frame with ``emolex_home`` / ``emolex_away`` converted to
        numpy arrays, plus ``score_ht_*``, ``score_ft_*``, ``GW``,
        ``goal_diff_ht``, ``goal_diff_ft`` and ``result``
        (1 = home win, 0 = away win, 2 = draw, based on the full-time score).
    """
    if result_file_name is None:
        # Backward-compatible default: the notebook-level constant.
        result_file_name = RESULT_FILE_NAME

    def _parse_emolex(text):
        # The stored value is sliced with [1:-2] and split on '.', i.e. it is
        # expected to look like "[ 3.  0.  12.]" (presumably the str() of a
        # numpy array of whole-number floats -- TODO confirm against the CSV).
        # NOTE(review): a value with a fractional part would be split in two
        # by this scheme.
        return np.array([float(token.strip()) for token in text[1:-2].split('.')])

    def _label(goal_diff):
        # 'home_win': 1, 'away_win': 0, 'draw': 2
        if goal_diff > 0:
            return 1
        if goal_diff < 0:
            return 0
        return 2

    sort_keys = ['GW', 'away_team']

    # Read game_infos as df
    dfGameInfo = useful_methods.csv_dic_df(paths.READ_PATH_GAME_INFO + 'game_infos.csv')
    dfGameInfo = useful_methods.DropNanGames(dfGameInfo).copy().reset_index(drop=True)
    dfGameInfo['GW'] = [int(gw) for gw in dfGameInfo['GW']]
    dfGameInfo = dfGameInfo.sort_values(sort_keys, ascending=[True, True]).copy().reset_index(drop=True)

    # Read Hash Emolex Model result
    df = useful_methods.csv_dic_df(paths.READ_PATH_RESULTS + result_file_name)
    df['GW'] = [int(gw) for gw in df['GW']]
    df = df.sort_values(sort_keys, ascending=[True, True]).copy().reset_index(drop=True)

    for column in ('emolex_home', 'emolex_away'):
        df[column] = [_parse_emolex(text) for text in list(df[column])]

    # Combine 2 dfs.
    # The values are assigned as plain lists, i.e. strictly by row position:
    # this relies on both frames holding the same games in the same order
    # after the identical sort above. A length mismatch raises ValueError.
    for column in ('score_ht_away', 'score_ht_home', 'score_ft_away', 'score_ft_home', 'GW'):
        df[column] = [int(item) for item in dfGameInfo[column]]

    df['goal_diff_ht'] = df.score_ht_home - df.score_ht_away
    df['goal_diff_ft'] = df.score_ft_home - df.score_ft_away
    df['result'] = [_label(item) for item in df.goal_diff_ft]

    return df

In [4]:
# Create df for models.
def CreateDfForModel(ht_draw=False, ft_wld=False):
    """Build the modelling frame of normalised Emolex features per game.

    Parameters
    ----------
    ht_draw : bool
        When true, keep only games whose half-time goal difference is 0.
    ft_wld : bool
        When false, games labelled as a draw (``result == 2``) are removed;
        when true, wins, losses and draws are all kept.

    Returns
    -------
    pandas.DataFrame
        Team names, ``GW``, half-time scores, goal differences, ``result``,
        then for each side (home, away) the eight emotion counts divided by
        the sum of all entries except the last two and the ``pos`` / ``neg``
        counts divided by the sum of the last two entries, and finally the
        home-minus-away ``diff_*`` columns.
    """
    games = ReadEmolexDf()

    # only for Win or Lose
    if not ft_wld:
        games = games[games.result != 2].copy().reset_index(drop=True)

    # HT: Equal
    if ht_draw:
        games = games[games.goal_diff_ht == 0].copy().reset_index(drop=True)

    # Position of each emotion inside an emolex vector; 'pos' and 'neg' are
    # the two trailing entries (indices 8 and 9).
    emotions = ['anger', 'fear', 'disgust', 'sadness', 'surprise', 'trust', 'joy', 'anticipation']
    polarities = ['pos', 'neg']

    dta = pd.DataFrame()

    # Teams, game week, half-time scores, goal differences and label.
    # 'home_win': 1, 'away_win': 0, 'draw': 2
    passthrough = [
        ('team_home', 'home_team'),
        ('team_away', 'away_team'),
        ('GW', 'GW'),
        ('score_ht_home', 'score_ht_home'),
        ('score_ht_away', 'score_ht_away'),
        ('goal_diff_ht', 'goal_diff_ht'),
        ('goal_diff_ft', 'goal_diff_ft'),
        ('result', 'result'),
    ]
    for target, source in passthrough:
        dta[target] = games[source]

    # Emolex shares, home block first and away block second.
    # NOTE: `sum` resolves through the notebook namespace; after
    # `%pylab inline` that is presumably numpy's `sum` rather than the
    # builtin -- kept exactly as before.
    for side in ('home', 'away'):
        vectors = games['emolex_' + side]
        for slot, name in enumerate(emotions):
            dta[name + '_' + side] = [vec[slot] / sum(vec[:-2]) for vec in vectors]
        for slot, name in enumerate(polarities, start=len(emotions)):
            dta[name + '_' + side] = [vec[slot] / sum(vec[-2:]) for vec in vectors]

    # Diffs (home minus away)
    for name in emotions + polarities:
        dta['diff_' + name] = dta[name + '_home'] - dta[name + '_away']

    return dta

In [5]:
def EplEmolexTable(RESULT_FILE_NAME):
    """Build a per-team table of average Emolex shares.

    For every team that occurs in ``team_home`` and for every emotion, the
    value is the mean of two numbers: the team's mean ``<emotion>_home`` over
    its home games and its mean ``<emotion>_away`` over its away games
    (each side counts equally, regardless of the number of games).

    Parameters
    ----------
    RESULT_FILE_NAME : str
        Name of the Emolex result file the table should be built from.

    Returns
    -------
    pandas.DataFrame
        One row per team with ``team`` and the ``*_avg`` columns, sorted by
        ``pos_avg`` in descending order. ``pos_avg_emo`` / ``neg_avg_emo``
        are only present when the model frame carries the matching
        ``*_home_emo`` / ``*_away_emo`` source columns.
    """
    # ReadEmolexDf() (reached through CreateDfForModel) looks the file name up
    # in the module-level RESULT_FILE_NAME. Bind the requested name there for
    # the duration of the call and restore the previous state afterwards.
    module_ns = globals()
    unset = object()
    previous = module_ns.get('RESULT_FILE_NAME', unset)
    module_ns['RESULT_FILE_NAME'] = RESULT_FILE_NAME
    try:
        # The file name must not be passed positionally: the first positional
        # parameter of CreateDfForModel is `ht_draw`, so a non-empty string
        # would silently restrict the data to half-time draws.
        df = CreateDfForModel(ft_wld=True)
    finally:
        if previous is unset:
            module_ns.pop('RESULT_FILE_NAME', None)
        else:
            module_ns['RESULT_FILE_NAME'] = previous

    teams = sorted(set(df.team_home))

    def _team_average(home_col, away_col):
        # Mean of (home-game mean, away-game mean) for every team in `teams`.
        return [
            (df.loc[df.team_home == team, home_col].mean()
             + df.loc[df.team_away == team, away_col].mean()) / 2
            for team in teams
        ]

    required = ['pos', 'neg', 'anger', 'fear', 'disgust', 'sadness',
                'surprise', 'trust', 'joy', 'anticipation']
    optional_emo = ['pos', 'neg']

    # Create new df
    dfSentTable = pd.DataFrame()
    dfSentTable['team'] = teams

    for name in required:
        dfSentTable[name + '_avg'] = _team_average(name + '_home', name + '_away')

    for name in optional_emo:
        home_col = name + '_home_emo'
        away_col = name + '_away_emo'
        # The visible CreateDfForModel does not create the *_emo columns;
        # only aggregate them when they are actually there.
        if home_col in df.columns and away_col in df.columns:
            dfSentTable[name + '_avg_emo'] = _team_average(home_col, away_col)

    return dfSentTable.sort_values(['pos_avg'], ascending=[False]).reset_index(drop=True)

## EPL Emolex Table

In [6]:
# Single Team's Sentiment's Average
# Only Team's Sent
def SentAvg(df, team, sentiment):
    """Average of a team's own sentiment value over all games it played.

    For each game of ``team`` the value is read from ``<sentiment>_home``
    when the team is the home side and from ``<sentiment>_away`` when it is
    the away side; the mean of the collected values is returned.
    """
    home_col = sentiment + '_home'
    away_col = sentiment + '_away'

    involved = (df.team_home == team) | (df.team_away == team)
    games = df[involved].copy().reset_index(drop=True)

    collected = []
    for game in games.to_dict('records'):
        if game['team_home'] == team:
            collected.append(game[home_col])
        if game['team_away'] == team:
            collected.append(game[away_col])

    return np.array(collected).mean()

# Difference with opponent team
def SentDiffAvg(df, team, sentiment):
    """Average sentiment gap between a team and its opponent.

    For each game of ``team`` the difference "own value minus opponent
    value" is taken from the ``<sentiment>_home`` / ``<sentiment>_away``
    columns; the mean of those differences is returned.
    """
    home_col = sentiment + '_home'
    away_col = sentiment + '_away'

    involved = (df.team_home == team) | (df.team_away == team)
    games = df[involved].copy().reset_index(drop=True)

    gaps = []
    for game in games.to_dict('records'):
        if game['team_home'] == team:
            gaps.append(game[home_col] - game[away_col])
        if game['team_away'] == team:
            gaps.append(game[away_col] - game[home_col])

    return np.array(gaps).mean()


# RESULT_FILE_NAME_HT = "emolex_all_ht.csv"
# Module-level file name that ReadEmolexDf() reads when it loads the results.
RESULT_FILE_NAME = "emolex_all_ft.csv"
df = CreateDfForModel(ft_wld=True)

teams = sorted(set(df.team_home))


# Create Sentiment Table: one row per team, one column per sentiment.
dfSentTable = pd.DataFrame()
dfSentTable['team'] = teams
# Each sentiment is listed exactly once ('fear' used to appear twice, which
# only recomputed and overwrote the same column).
emolex = ['pos', 'neg', 'joy', 'trust', 'anticipation', 'fear', 'anger', 'disgust', 'sadness', 'surprise']

for sent in emolex:
    dfSentTable[sent] = [SentAvg(df, team, sent) for team in teams]


# Displayed sorted by 'neg' (highest first); dfSentTable itself keeps the
# alphabetical team order.
dfSentTable.sort_values(['neg'], ascending=[False])

Unnamed: 0,team,pos,neg,joy,trust,anticipation,fear,anger,disgust,sadness,surprise
16,Villa,0.52043,0.47957,0.130534,0.150511,0.148755,0.116566,0.125867,0.106008,0.125518,0.096242
8,Newcastle,0.529778,0.470222,0.130052,0.158811,0.144549,0.117359,0.125577,0.10605,0.116183,0.101419
18,WestBromwich,0.53925,0.46075,0.135514,0.156945,0.147638,0.111404,0.136913,0.096614,0.117751,0.09722
12,Sunderland,0.543223,0.456777,0.129889,0.159992,0.147653,0.127444,0.120696,0.092144,0.117594,0.104589
2,Chelsea,0.545317,0.454683,0.145637,0.167233,0.154896,0.109752,0.122432,0.101639,0.103111,0.0953
7,Liverpool,0.551401,0.448599,0.144163,0.168848,0.148478,0.114988,0.119636,0.095069,0.110795,0.098023
15,United,0.552746,0.447254,0.151158,0.174627,0.15873,0.104791,0.115351,0.094022,0.102529,0.098792
3,City,0.562058,0.437942,0.142082,0.175854,0.150609,0.116331,0.123396,0.082207,0.107912,0.101608
4,Crystal,0.569221,0.430779,0.151822,0.170744,0.149121,0.109474,0.128563,0.081765,0.104875,0.103637
9,Norwich,0.570346,0.429654,0.143715,0.178415,0.158733,0.108766,0.111332,0.083775,0.109039,0.106224


In [24]:
# Work on a copy so dfSentTable keeps the raw averages; label the rows
# 1..n (derived from the number of teams instead of a hard-coded 20).
dfRank = dfSentTable.copy()
dfRank.index = range(1, len(dfRank) + 1)

def RankSent(dfRank, sent):
    """Replace the values of column ``sent`` by their rank (1 = largest).

    The frame is sorted by ``sent`` in descending order and every row
    receives its 1-based position in that ordering. Works for any number of
    rows and any unique index (it no longer assumes exactly 20 rows labelled
    1..20).

    Parameters
    ----------
    dfRank : pandas.DataFrame
        Frame holding the column to rank. It is modified in place.
    sent : str
        Name of the column to turn into ranks.

    Returns
    -------
    pandas.DataFrame
        The same ``dfRank`` object, with ``sent`` overwritten by the ranks.
    """
    ordered_labels = dfRank.sort_values([sent], ascending=[False]).index

    # index label -> 1-based position in the descending ordering
    rank_of = {label: position for position, label in enumerate(ordered_labels, start=1)}

    dfRank[sent] = [rank_of[label] for label in dfRank.index]
    return dfRank

# Rank by Sentiments: every sentiment column is replaced by its rank.
for sent in ['pos', 'joy', 'trust', 'anticipation',
             'neg', 'fear', 'anger', 'disgust', 'sadness', 'surprise']:
    dfRank = RankSent(dfRank, sent).copy()

# Positive Table: ordered by the 'pos' rank, rows labelled 1..n
# (derived from the number of teams instead of a hard-coded 20).
dfTablePos = dfRank[['team', 'pos', 'joy', 'trust', 'anticipation', 'surprise']].copy()
dfTablePos = dfTablePos.sort_values(['pos'], ascending=True).reset_index(drop=True)
dfTablePos.index = range(1, len(dfTablePos) + 1)
dfTablePos

Unnamed: 0,team,pos,joy,trust,anticipation,surprise
1,Leicester,1,3,1,1,6
2,Swansea,2,4,3,3,1
3,Stoke,3,9,4,7,4
4,Bournemouth,4,11,5,6,20
5,Tottenham,5,1,8,4,10
6,Watford,6,7,6,5,2
7,Southampton,7,2,13,2,3
8,Arsenal,8,10,2,11,13
9,WestHam,9,12,7,19,17
10,Everton,10,5,11,8,12


In [26]:
# Negative Table: ordered by the 'neg' rank, rows labelled 1..n
# (derived from the number of teams instead of a hard-coded 20).
dfTableNeg = dfRank[['team', 'neg', 'anger', 'fear', 'disgust', 'sadness']].copy()
dfTableNeg = dfTableNeg.sort_values(['neg'], ascending=True).reset_index(drop=True)
dfTableNeg.index = range(1, len(dfTableNeg) + 1)
dfTableNeg

Unnamed: 0,team,neg,anger,fear,disgust,sadness
1,Villa,1,5,4,2,1
2,Newcastle,2,6,3,1,4
3,WestBromwich,3,1,9,4,2
4,Sunderland,4,10,1,7,3
5,Chelsea,5,9,11,3,9
6,Liverpool,6,12,6,5,5
7,United,7,14,14,6,11
8,City,8,7,5,10,7
9,Crystal,9,3,12,11,8
10,Norwich,10,17,13,8,6


## EPL Score Table
Exclude GW: [1, 2, 3, 4, 13]

In [30]:
# Load the league score table and order it by overall points (best first).
# The `pts` column is used as delivered by ReadEplScoreTable(); the two
# steps below are disabled and kept for reference only:
#   drop weeks:     dfScore = dfScore.drop(dfScore.columns[[1, 2, 3, 4, 5, 13, 25]], axis=1)
#   sum all scores: dfScore['pts'] = dfScore.sum(axis=1)
dfScore = (
    useful_methods.ReadEplScoreTable()
    .sort_values(['pts'], ascending=False)
    .reset_index(drop=True)
)
dfScore[['team', 'pts']]

Unnamed: 0,team,pts
0,Leicester,53
1,Arsenal,48
2,Tottenham,48
3,City,47
4,United,41
5,WestHam,39
6,Southampton,37
7,Everton,35
8,Liverpool,35
9,Watford,33
