In [1]:
%pylab inline
%matplotlib inline

# Global Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import os
import sys
import pickle
from pprint import pprint
from time import time
import datetime
from time import gmtime, strftime
import statsmodels.api as sm
from patsy import dmatrices

# Scikit-Learn imports
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
# NOTE(review): `sklearn.grid_search` and `sklearn.cross_validation` are the
# legacy module names; scikit-learn deprecated them in 0.18 and removed them in
# 0.20 in favour of `sklearn.model_selection`. As written, these imports tie the
# notebook to an old scikit-learn release -- confirm the target version before
# migrating them.
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import train_test_split

from sklearn.cross_validation import cross_val_score
from sklearn import metrics

# Local Imports
# The project's config directory (~/git/predictEPL/config) is appended to
# sys.path first, presumably so that the `import paths` below can resolve.
path = str(os.path.expanduser('~')) + '/git/predictEPL/config'
sys.path.append(path)
import paths

# `paths.UTILS` is appended next, ahead of importing `useful_methods`
# (presumably the directory that holds that module -- verify in `paths`).
sys.path.append(paths.UTILS)
import useful_methods

Populating the interactive namespace from numpy and matplotlib


In [2]:
# UTC timestamp for this run; formatted first, then the single space between
# the date and the time is swapped for an underscore.
date_now = strftime("%Y-%m-%d %H:%M:%S", gmtime())
date_now = date_now.replace(" ", "_")

# ***************************************************
# [Step 1]: Data Load
# ***************************************************

# Read Data
df = useful_methods.csv_dic_df(
    paths.DATA_HOME + "EPL/all_game_emolex_counted.csv"
)

# Manipulations

# Integer code used for each match outcome.
dic_res = dict(
    home_win=1,
    away_win=0,
    draw=2,
)

def Labeling(goal_diff):
    """Turn a goal difference into an outcome code.

    Returns 1 when ``goal_diff`` is positive, 0 when it is negative and 2 in
    every other case (the same codes as ``dic_res``: home win, away win, draw).
    """
    home_win, away_win, draw = 1, 0, 2
    if goal_diff > 0:
        return home_win
    return away_win if goal_diff < 0 else draw

def _parse_float_vector(raw):
    """Return ``raw`` as a 1-D float array.

    ``raw`` is either text of the form ``[a, b, ...]`` (first and last
    character stripped, remainder split on commas) or a sequence that an
    earlier run of this cell already converted. Accepting both keeps the cell
    re-runnable: the original string slicing fails on an already-parsed array.
    """
    if isinstance(raw, str):
        return np.array([float(item) for item in raw[1:-1].split(',')])
    return np.asarray(raw, dtype=float)


# Full-time scores as integers.
for score_col in ('score_ft_home', 'score_ft_away'):
    df[score_col] = [int(score) for score in df[score_col]]

# pn_* and emolex_* count vectors as float arrays (one array per match).
for vector_col in ('pn_home', 'pn_away', 'emolex_home', 'emolex_away'):
    df[vector_col] = [_parse_float_vector(raw) for raw in df[vector_col]]

# Home-minus-away goal difference and the outcome code derived from it.
df['goal_diff'] = df.score_ft_home - df.score_ft_away
df['result'] = [Labeling(goal_diff) for goal_diff in df.goal_diff]
# Uncomment to drop draws (result == 2) and keep only home/away wins:
# df = df[df.result != 2].copy().reset_index(drop=True)
df

Unnamed: 0,GW,away_team,date,emolex_away,emolex_home,home_team,pn_away,pn_home,score_ft_away,score_ft_home,score_ht_away,score_ht_home,time,goal_diff,result
0,5,Chelsea,Sat. 12 Sep.,"[1732.0, 1670.0, 1436.0, 1569.0, 1079.0, 2217....","[405.0, 287.0, 192.0, 322.0, 291.0, 484.0, 530...",Everton,"[3901.0, 3699.0]","[1009.0, 647.0]",1,3,1,2,11:45,2,1
1,5,Bournemouth,Sat. 12 Sep.,"[36.0, 36.0, 25.0, 22.0, 27.0, 50.0, 37.0, 37.0]","[67.0, 64.0, 22.0, 54.0, 122.0, 194.0, 122.0, ...",Norwich,"[112.0, 53.0]","[331.0, 100.0]",1,3,0,1,14:00,2,1
2,5,Swansea,Sat. 12 Sep.,"[24.0, 23.0, 10.0, 20.0, 22.0, 51.0, 33.0, 46.0]","[38.0, 22.0, 18.0, 20.0, 57.0, 66.0, 52.0, 58.0]",Watford,"[78.0, 36.0]","[98.0, 55.0]",0,1,0,0,14:00,1,1
3,5,Southampton,Sat. 12 Sep.,"[23.0, 28.0, 17.0, 23.0, 75.0, 65.0, 68.0, 67.0]","[56.0, 30.0, 38.0, 32.0, 43.0, 80.0, 48.0, 94.0]",WestBromwich,"[154.0, 62.0]","[116.0, 116.0]",0,0,0,0,14:00,0,2
4,5,Stoke,Sat. 12 Sep.,"[82.0, 59.0, 40.0, 47.0, 75.0, 130.0, 107.0, 8...","[603.0, 445.0, 421.0, 432.0, 1039.0, 1681.0, 1...",Arsenal,"[255.0, 113.0]","[2529.0, 938.0]",0,2,0,1,14:00,2,1
5,5,City,Sat. 12 Sep.,"[1029.0, 1358.0, 538.0, 805.0, 441.0, 378.0, 3...","[146.0, 113.0, 60.0, 80.0, 100.0, 190.0, 145.0...",Crystal,"[975.0, 1708.0]","[299.0, 225.0]",1,0,0,0,14:00,-1,0
6,5,Liverpool,Sat. 12 Sep.,"[1743.0, 1955.0, 1644.0, 1838.0, 1207.0, 2875....","[5228.0, 3026.0, 2403.0, 2447.0, 2705.0, 7167....",United,"[4881.0, 4464.0]","[11807.0, 9999.0]",1,3,0,0,16:30,2,1
7,5,Tottenham,Sun. 13 Sep.,"[379.0, 365.0, 248.0, 315.0, 297.0, 511.0, 421...","[158.0, 152.0, 87.0, 120.0, 205.0, 329.0, 218....",Sunderland,"[1117.0, 815.0]","[498.0, 330.0]",1,0,0,0,12:30,-1,0
8,5,Villa,Sun. 13 Sep.,"[195.0, 233.0, 203.0, 191.0, 384.0, 725.0, 881...","[70.0, 75.0, 74.0, 65.0, 99.0, 122.0, 99.0, 10...",Leicester,"[1550.0, 480.0]","[276.0, 163.0]",2,3,1,0,15:00,1,1
9,5,Newcastle,Mon. 14 Sep.,"[589.0, 697.0, 473.0, 643.0, 418.0, 919.0, 574...","[96.0, 92.0, 64.0, 68.0, 147.0, 401.0, 297.0, ...",WestHam,"[1560.0, 1563.0]","[655.0, 202.0]",0,2,0,1,19:00,2,1


In [3]:
# df = df[df.result != 2].copy().reset_index(drop=True)

# Position of each share inside the pn_* vectors (index 0, 1) and inside the
# emolex_* vectors (index 0..7); the names become the column prefixes.
POLARITIES = ('pos', 'neg')
EMOTIONS = ('anger', 'fear', 'disgust', 'sadness',
            'surprise', 'trust', 'joy', 'anticipation')


def _share_columns(vectors, names, side):
    """Normalise each count vector by its own total, one column per name.

    vectors: iterable of per-match count vectors (indexable, summable).
    names:   names[i] labels element i of every vector.
    side:    suffix appended to every column name ('home' or 'away').

    Returns a list of ``(column_name, values)`` pairs in the order of
    ``names``, where ``values[k] = vectors[k][i] / sum(vectors[k])``. Each
    row total is computed once instead of once per output column.
    """
    totals = [sum(vector) for vector in vectors]
    return [
        ('%s_%s' % (name, side),
         [vector[idx] / total for vector, total in zip(vectors, totals)])
        for idx, name in enumerate(names)
    ]


dta = pd.DataFrame()

dta['team_home'] = df.home_team
dta['team_away'] = df.away_team

# pos_/neg_ shares, home side first.
for side in ('home', 'away'):
    for column, values in _share_columns(df['pn_' + side], POLARITIES, side):
        dta[column] = values

# Half-time scores as integers.
for side in ('home', 'away'):
    dta['score_ht_' + side] = [int(score) for score in df['score_ht_' + side]]

# Emotion shares, home side first.
for side in ('home', 'away'):
    for column, values in _share_columns(df['emolex_' + side], EMOTIONS, side):
        dta[column] = values

# 'home_win': 1, 'away_win': 0, 'draw': 2
dta['result'] = df.result
dta

Unnamed: 0,team_home,team_away,pos_home,neg_home,pos_away,neg_away,score_ht_home,score_ht_away,anger_home,fear_home,...,anticipation_home,anger_away,fear_away,disgust_away,sadness_away,surprise_away,trust_away,joy_away,anticipation_away,result
0,Everton,Chelsea,0.609300,0.390700,0.513289,0.486711,2,1,0.136824,0.096959,...,0.151689,0.130648,0.125971,0.108320,0.118353,0.081391,0.167232,0.124085,0.143999,1
1,Norwich,Bournemouth,0.767981,0.232019,0.678788,0.321212,1,0,0.082209,0.078528,...,0.208589,0.133333,0.133333,0.092593,0.081481,0.100000,0.185185,0.137037,0.137037,1
2,Watford,Swansea,0.640523,0.359477,0.684211,0.315789,0,0,0.114804,0.066465,...,0.175227,0.104803,0.100437,0.043668,0.087336,0.096070,0.222707,0.144105,0.200873,1
3,WestBromwich,Southampton,0.500000,0.500000,0.712963,0.287037,0,0,0.133017,0.071259,...,0.223278,0.062842,0.076503,0.046448,0.062842,0.204918,0.177596,0.185792,0.183060,2
4,Arsenal,Stoke,0.729449,0.270551,0.692935,0.307065,1,0,0.085133,0.062826,...,0.175491,0.130366,0.093800,0.063593,0.074722,0.119237,0.206677,0.170111,0.141494,1
5,Crystal,City,0.570611,0.429389,0.363399,0.636601,0,0,0.145563,0.112662,...,0.168495,0.173232,0.228620,0.090572,0.135522,0.074242,0.063636,0.057239,0.176936,0
6,United,Liverpool,0.541456,0.458544,0.522311,0.477689,0,0,0.150100,0.086879,...,0.175423,0.110031,0.123414,0.103781,0.116028,0.076195,0.181491,0.136039,0.153021,1
7,Sunderland,Tottenham,0.601449,0.398551,0.578157,0.421843,0,0,0.104153,0.100198,...,0.163481,0.128518,0.123771,0.084096,0.106816,0.100712,0.173279,0.142760,0.140047,0
8,Leicester,Villa,0.628702,0.371298,0.763547,0.236453,0,1,0.099010,0.106082,...,0.145686,0.054561,0.065193,0.056799,0.053442,0.107443,0.202854,0.246503,0.213206,1
9,WestHam,Newcastle,0.764294,0.235706,0.499520,0.500480,1,0,0.066992,0.064201,...,0.187020,0.115422,0.136586,0.092691,0.126004,0.081913,0.180090,0.112483,0.154811,1


In [4]:
# 'home_win': 1, 'away_win': 0, 'draw': 2
# Mean of every numeric feature per outcome. The two team-name columns are
# dropped explicitly: older pandas skipped non-numeric columns silently, but
# pandas >= 2.0 raises a TypeError when asked to average strings.
dta.drop(['team_home', 'team_away'], axis=1).groupby('result').mean()

Unnamed: 0_level_0,pos_home,neg_home,pos_away,neg_away,score_ht_home,score_ht_away,anger_home,fear_home,disgust_home,sadness_home,...,joy_home,anticipation_home,anger_away,fear_away,disgust_away,sadness_away,surprise_away,trust_away,joy_away,anticipation_away
result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.594051,0.405949,0.626635,0.373365,0.375,0.833333,0.121734,0.110601,0.074734,0.095972,...,0.14644,0.155577,0.114128,0.102344,0.067767,0.100756,0.102807,0.195737,0.153609,0.162851
1,0.656051,0.343949,0.557368,0.442632,1.121622,0.202703,0.104693,0.09181,0.058235,0.084646,...,0.16945,0.171276,0.125709,0.115874,0.085068,0.112246,0.100829,0.171509,0.135504,0.153261
2,0.608007,0.391993,0.614516,0.385484,0.565217,0.521739,0.115646,0.095839,0.072673,0.093601,...,0.14883,0.168049,0.111951,0.103521,0.069515,0.094951,0.115612,0.18906,0.155669,0.15972


In [5]:
# Per-home-team feature means. `team_away` is a string column, so it is dropped
# explicitly rather than relying on pandas to skip it (pandas >= 2.0 raises a
# TypeError instead). Note that the `result` mean averages the outcome codes
# 0/1/2, so it is not a win rate.
dta.drop('team_away', axis=1).groupby('team_home').mean()

Unnamed: 0_level_0,pos_home,neg_home,pos_away,neg_away,score_ht_home,score_ht_away,anger_home,fear_home,disgust_home,sadness_home,...,anticipation_home,anger_away,fear_away,disgust_away,sadness_away,surprise_away,trust_away,joy_away,anticipation_away,result
team_home,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Arsenal,0.674028,0.325972,0.604427,0.395573,1.428571,0.285714,0.087535,0.07936,0.065175,0.080684,...,0.173979,0.131546,0.112774,0.073552,0.091902,0.112734,0.184227,0.142611,0.150654,1.0
Bournemouth,0.636743,0.363257,0.55944,0.44056,0.875,0.875,0.132075,0.101392,0.046557,0.096514,...,0.170126,0.128001,0.123743,0.072871,0.107966,0.105363,0.174576,0.1451,0.14238,1.125
Chelsea,0.587701,0.412299,0.571347,0.428653,0.7,0.4,0.115459,0.097462,0.084468,0.083157,...,0.18643,0.115445,0.100063,0.080941,0.101501,0.101971,0.19444,0.146099,0.15954,1.0
City,0.653726,0.346274,0.62249,0.37751,1.5,0.7,0.103295,0.10512,0.055878,0.082814,...,0.153972,0.103916,0.094016,0.07782,0.101908,0.108459,0.194847,0.163838,0.155196,0.9
Crystal,0.56212,0.43788,0.514373,0.485627,0.555556,0.333333,0.148816,0.115339,0.087146,0.084467,...,0.14304,0.124001,0.139743,0.100168,0.103325,0.102001,0.161737,0.126199,0.142825,0.777778
Everton,0.639618,0.360382,0.615428,0.384572,1.222222,1.0,0.117751,0.081691,0.070181,0.082253,...,0.159858,0.124908,0.109058,0.087617,0.110956,0.103262,0.172262,0.14939,0.142547,1.0
Leicester,0.70007,0.29993,0.638328,0.361672,0.333333,0.666667,0.088135,0.081606,0.090948,0.098801,...,0.172251,0.080828,0.101874,0.0655,0.091126,0.150371,0.187528,0.151451,0.171322,1.333333
Liverpool,0.658567,0.341433,0.622791,0.377209,0.5,0.375,0.099561,0.093701,0.061253,0.087246,...,0.174109,0.101352,0.098561,0.065389,0.101308,0.09788,0.194377,0.165031,0.176101,1.375
Newcastle,0.579765,0.420235,0.608512,0.391488,0.8,0.7,0.118234,0.112589,0.087671,0.098036,...,0.156816,0.113834,0.111332,0.075308,0.095299,0.10161,0.201677,0.124188,0.176752,1.1
Norwich,0.604774,0.395226,0.630585,0.369415,0.428571,0.428571,0.104064,0.097863,0.069471,0.10016,...,0.171095,0.124168,0.102282,0.067101,0.093551,0.115334,0.173201,0.150809,0.173553,1.0


In [6]:
# Per-away-team feature means. `team_home` is a string column, so it is dropped
# explicitly rather than relying on pandas to skip it (pandas >= 2.0 raises a
# TypeError instead). Note that the `result` mean averages the outcome codes
# 0/1/2, so it is not a win rate.
dta.drop('team_home', axis=1).groupby('team_away').mean()

Unnamed: 0_level_0,pos_home,neg_home,pos_away,neg_away,score_ht_home,score_ht_away,anger_home,fear_home,disgust_home,sadness_home,...,anticipation_home,anger_away,fear_away,disgust_away,sadness_away,surprise_away,trust_away,joy_away,anticipation_away,result
team_away,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Arsenal,0.600755,0.399245,0.590703,0.409297,0.7,0.8,0.117791,0.100137,0.078352,0.094417,...,0.178027,0.128318,0.118139,0.071463,0.103249,0.098151,0.188957,0.125852,0.16587,0.9
Bournemouth,0.664152,0.335848,0.594083,0.405917,1.222222,0.333333,0.090264,0.090615,0.084665,0.094271,...,0.171861,0.134527,0.127724,0.063448,0.119158,0.094954,0.178487,0.133322,0.14838,1.0
Chelsea,0.630274,0.369726,0.537337,0.462663,0.666667,0.333333,0.110732,0.096167,0.06377,0.082874,...,0.162579,0.132889,0.113136,0.098605,0.103512,0.095563,0.162789,0.141628,0.151879,1.333333
City,0.666822,0.333178,0.566743,0.433257,0.714286,0.142857,0.106365,0.096917,0.053663,0.064,...,0.174152,0.125931,0.130465,0.076511,0.110272,0.108959,0.186684,0.117813,0.143367,1.0
Crystal,0.650413,0.349587,0.580069,0.419931,0.25,0.125,0.121361,0.101535,0.072095,0.062247,...,0.156592,0.116451,0.100094,0.072079,0.104291,0.127421,0.163158,0.153908,0.162598,1.0
Everton,0.600658,0.399342,0.589254,0.410746,0.375,0.5,0.107743,0.092162,0.065691,0.09025,...,0.19522,0.106303,0.093313,0.075017,0.087116,0.119528,0.202582,0.161172,0.15497,1.375
Leicester,0.651022,0.348978,0.667925,0.332075,0.6,0.6,0.106247,0.108894,0.062456,0.089747,...,0.176463,0.104739,0.092808,0.076198,0.080378,0.103457,0.20028,0.163661,0.178478,0.7
Liverpool,0.62374,0.37626,0.570534,0.429466,0.666667,0.555556,0.113013,0.105413,0.068313,0.094596,...,0.155555,0.123107,0.10866,0.087233,0.100966,0.09808,0.191242,0.142758,0.147954,0.888889
Newcastle,0.628853,0.371147,0.522571,0.477429,1.0,0.285714,0.10443,0.102145,0.063481,0.096201,...,0.154208,0.131937,0.129423,0.100075,0.116894,0.098965,0.160772,0.129819,0.132114,0.857143
Norwich,0.603571,0.396429,0.572978,0.427022,0.8,0.4,0.138881,0.111157,0.072955,0.10926,...,0.124477,0.120823,0.10793,0.075595,0.107063,0.105775,0.179944,0.146762,0.156107,1.1


## Prepare Data for Logistic Regression

In [18]:
# Build the response (y) and the design matrix (X) from `dta` with a patsy
# formula: `result` is the target, the sentiment/emotion shares and half-time
# scores enter as numeric terms, and C(...) marks the two team columns as
# categorical. In the printed column index these appear as
# `C(team_home)[T.<team>]` / `C(team_away)[T.<team>]` indicator columns next to
# an `Intercept` column.
# NOTE(review): pos_<side> and neg_<side> are both shares of the same pn total,
# so when pn_* holds exactly two counts each pair sums to 1 -- confirm that
# keeping both terms is intended.
y, X = dmatrices('result ~ \
    pos_home + neg_home + pos_away + neg_away + \
    score_ht_home + score_ht_away + \
    anger_home + fear_home + disgust_home + sadness_home + \
    surprise_home + trust_home + joy_home + anticipation_home + \
    anger_away + fear_away + disgust_away + sadness_away + \
    surprise_away + trust_away + joy_away + anticipation_away + \
    C(team_home) + C(team_away)',
    dta, return_type="dataframe")

# flatten y into a 1-D array (dmatrices returns it as a one-column frame here)
y = np.ravel(y)

# Show the design-matrix column names, then its first rows.
print(X.columns)
X.head()

Index(['Intercept', 'C(team_home)[T.Bournemouth]', 'C(team_home)[T.Chelsea]',
       'C(team_home)[T.City]', 'C(team_home)[T.Crystal]',
       'C(team_home)[T.Everton]', 'C(team_home)[T.Leicester]',
       'C(team_home)[T.Liverpool]', 'C(team_home)[T.Newcastle]',
       'C(team_home)[T.Norwich]', 'C(team_home)[T.Southampton]',
       'C(team_home)[T.Stoke]', 'C(team_home)[T.Sunderland]',
       'C(team_home)[T.Swansea]', 'C(team_home)[T.Tottenham]',
       'C(team_home)[T.United]', 'C(team_home)[T.Villa]',
       'C(team_home)[T.Watford]', 'C(team_home)[T.WestBromwich]',
       'C(team_home)[T.WestHam]', 'C(team_away)[T.Bournemouth]',
       'C(team_away)[T.Chelsea]', 'C(team_away)[T.City]',
       'C(team_away)[T.Crystal]', 'C(team_away)[T.Everton]',
       'C(team_away)[T.Leicester]', 'C(team_away)[T.Liverpool]',
       'C(team_away)[T.Newcastle]', 'C(team_away)[T.Norwich]',
       'C(team_away)[T.Southampton]', 'C(team_away)[T.Stoke]',
       'C(team_away)[T.Sunderland]', 'C(team_aw

Unnamed: 0,Intercept,C(team_home)[T.Bournemouth],C(team_home)[T.Chelsea],C(team_home)[T.City],C(team_home)[T.Crystal],C(team_home)[T.Everton],C(team_home)[T.Leicester],C(team_home)[T.Liverpool],C(team_home)[T.Newcastle],C(team_home)[T.Norwich],...,joy_home,anticipation_home,anger_away,fear_away,disgust_away,sadness_away,surprise_away,trust_away,joy_away,anticipation_away
0,1,0,0,0,0,1,0,0,0,0,...,0.179054,0.151689,0.130648,0.125971,0.10832,0.118353,0.081391,0.167232,0.124085,0.143999
1,1,0,0,0,0,0,0,0,0,1,...,0.149693,0.208589,0.133333,0.133333,0.092593,0.081481,0.1,0.185185,0.137037,0.137037
2,1,0,0,0,0,0,0,0,0,0,...,0.1571,0.175227,0.104803,0.100437,0.043668,0.087336,0.09607,0.222707,0.144105,0.200873
3,1,0,0,0,0,0,0,0,0,0,...,0.114014,0.223278,0.062842,0.076503,0.046448,0.062842,0.204918,0.177596,0.185792,0.18306
4,1,0,0,0,0,0,0,0,0,0,...,0.172102,0.175491,0.130366,0.0938,0.063593,0.074722,0.119237,0.206677,0.170111,0.141494


In [19]:
# Fit a logistic regression on the full design matrix; fit() hands back the
# estimator itself, so construction and fitting are chained.
model = LogisticRegression().fit(X, y)

# Accuracy on the same data the model was trained on.
print(model.score(X, y))


# Accuracy under 10-fold cross-validation with a fresh, unfitted estimator.
scores = cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    scoring='accuracy',
    cv=10,
)
print(scores)
print(scores.mean())

0.724550898204
[ 0.44444444  0.38888889  0.44444444  0.55555556  0.52941176  0.52941176
  0.625       0.53333333  0.4         0.53333333]
0.498382352941


In [15]:
# Coefficient table: one row per design-matrix column and one `score<i>` column
# per row of `model.coef_`. Iterating over the rows (instead of indexing 0, 1, 2
# by hand) yields the same score0/score1/score2 columns for the three-outcome
# model and still works when draws are filtered out, where `coef_` has a
# single row.
coef = pd.DataFrame()
coef['var'] = X.columns
for class_idx, class_coef in enumerate(model.coef_):
    coef['score%d' % class_idx] = class_coef

# coef[0:20]
# coef[20:39]
coef[39::]

Unnamed: 0,var,score0,score1,score2
39,pos_home,-0.035034,0.118177,-0.495306
40,neg_home,-0.247385,-0.271698,0.230171
41,pos_away,-0.289551,-0.455166,0.336903
42,neg_away,0.007132,0.301645,-0.602038
43,score_ht_home,-1.251178,1.316965,-0.25121
44,score_ht_away,1.299358,-1.557288,-0.033655
45,anger_home,-0.038109,-0.103828,0.015318
46,fear_home,0.063424,-0.009623,-0.160277
47,disgust_home,-0.039264,-0.191978,0.136822
48,sadness_home,-0.185705,0.02695,0.047048


## Model Evaluation Using a Validation Set

In [30]:
# Hold out 30% of the rows as a validation set (seeded so the split repeats),
# then fit a second model on the remaining rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
)
model2 = LogisticRegression()
model2.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [31]:
# Hard class predictions of the hold-out model for every validation row.
predicted = model2.predict(X=X_test)
print(predicted)

[ 1.  1.  0.  1.  0.  1.  0.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.
  1.  0.  1.  1.  0.  0.  1.  0.  1.  0.  0.  1.  1.  0.  0.  0.  1.  0.
  0.]


In [32]:
# Per-class probabilities for the validation rows: one row per sample, one
# column per class.
probs = model2.predict_proba(X=X_test)
print(probs)

[[ 0.46095175  0.53904825]
 [ 0.14858166  0.85141834]
 [ 0.66331607  0.33668393]
 [ 0.13269906  0.86730094]
 [ 0.8608485   0.1391515 ]
 [ 0.21941747  0.78058253]
 [ 0.77886447  0.22113553]
 [ 0.01450591  0.98549409]
 [ 0.42324699  0.57675301]
 [ 0.00147993  0.99852007]
 [ 0.00550222  0.99449778]
 [ 0.08038558  0.91961442]
 [ 0.54797909  0.45202091]
 [ 0.32690864  0.67309136]
 [ 0.31757799  0.68242201]
 [ 0.28261236  0.71738764]
 [ 0.12384658  0.87615342]
 [ 0.20704192  0.79295808]
 [ 0.09086022  0.90913978]
 [ 0.98333418  0.01666582]
 [ 0.04392126  0.95607874]
 [ 0.10166496  0.89833504]
 [ 0.68073728  0.31926272]
 [ 0.9335452   0.0664548 ]
 [ 0.2814178   0.7185822 ]
 [ 0.87222308  0.12777692]
 [ 0.00643372  0.99356628]
 [ 0.83277918  0.16722082]
 [ 0.64833495  0.35166505]
 [ 0.14007145  0.85992855]
 [ 0.05526444  0.94473556]
 [ 0.65033055  0.34966945]
 [ 0.55808716  0.44191284]
 [ 0.55711246  0.44288754]
 [ 0.14941533  0.85058467]
 [ 0.51488937  0.48511063]
 [ 0.61355786  0.38644214]]


In [33]:
# generate evaluation metrics
print(metrics.accuracy_score(y_test, predicted))

if probs.shape[1] == 2:
    # Binary target: score with the probability of the second class.
    print(metrics.roc_auc_score(y_test, probs[:, 1]))
else:
    # More than two outcomes (draws kept): a single probability column is not a
    # valid score and roc_auc_score raises on it. Build a one-hot indicator of
    # the true labels, with columns in model2.classes_ order to match `probs`,
    # and report the macro-averaged one-vs-rest AUC instead.
    # Still raises if an outcome never occurs in y_test, because AUC is
    # undefined for a class with no positive samples.
    y_test_onehot = (
        np.asarray(y_test)[:, None] == model2.classes_[None, :]
    ).astype(int)
    print(metrics.roc_auc_score(y_test_onehot, probs, average='macro'))

0.675675675676
0.786713286713


In [34]:
# Confusion matrix of true vs. predicted labels, followed by the per-class
# precision / recall / F1 report for the validation set.
print(metrics.confusion_matrix(y_true=y_test, y_pred=predicted))
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

[[ 7  4]
 [ 8 18]]
             precision    recall  f1-score   support

        0.0       0.47      0.64      0.54        11
        1.0       0.82      0.69      0.75        26

avg / total       0.71      0.68      0.69        37



## Model Evaluation Using Cross-Validation

In [35]:
# 10-fold cross-validated accuracy on the full data set: the per-fold scores
# are printed first, their mean on the following line.
scores = cross_val_score(
    LogisticRegression(), X, y,
    scoring='accuracy',
    cv=10,
)
print(scores, scores.mean(), sep='\n')

[ 0.69230769  0.61538462  0.76923077  0.84615385  0.83333333  0.66666667
  0.83333333  0.63636364  0.63636364  0.72727273]
0.725641025641
