# EDA: Split season

In [2]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as st

In [15]:
data = pd.read_csv('../data/ML/E0_ML_n3_date.csv')
data['Date']=pd.to_datetime(data['Date'])
data.head()

In [30]:
ODDS_FILEPATH = '../data/ML/E0_home_win_odds.csv'
data_odds = pd.read_csv(ODDS_FILEPATH)

In [10]:
data_0 = data[data['home_win']==0]
data_1 = data[data['home_win']==1]
features = data.drop(['id','Date','home_win'],1).columns.values

best_feat ={} 
worst_feat ={}
for feature in features:
    group1 = data_0[feature]
    group2 = data_1[feature]
    stats,p_value=st.kruskal(group1,group2)
    if p_value<0.01:
        best_feat[feature]=p_value
    else:
        worst_feat[feature]=p_value

print(len(best_feat))
print(len(worst_feat))

41
15


In [23]:
import operator
sorted_best_feat = sorted(best_feat.items(), key=operator.itemgetter(1))
for best in sorted_best_feat:
    print(best)

('diff_season_wages', 2.7201169807474926e-23)
('diff_nb_goals_diff', 1.2985388216484946e-11)
('a_season_wages', 1.7210649221095588e-11)
('h_season_wages', 7.0440796124706071e-11)
('diff_nb_defeats', 5.2703217765154178e-10)
('diff_nb_points', 1.8365461030176566e-09)
('capacity_home_stadium', 2.6104616799398929e-08)
('diff_nb_victories', 1.5460295637888435e-07)
('a_nb_goals_diff', 1.5040983192003147e-06)
('h_nb_goals_diff', 2.3771137087521503e-06)
('a_nb_defeats', 3.50085584662456e-05)
('h_nb_points', 4.4619614913992122e-05)
('h_nb_victories', 0.00011419692974934507)
('h_nb_goals_scored', 0.00033273180394677651)
('h_nb_points_home', 0.0031554745724446734)
('h_diff_goals_home', 0.004339316578661153)
('a_nb_defeats_away', 0.0049705400744598548)
('h_mean_nb_goals_scored_home', 0.0058819525074810539)
('h_last_n_games_points_home', 0.0060693719635523729)
('a_diff_goals_away', 0.0068602409140050419)
('a_last_n_games_defeats_away', 0.0072183652302253249)
('h_nb_goals_scored_home', 0.00885630188

In [24]:
sorted_worst_feat = sorted(worst_feat.items(), key=operator.itemgetter(1))
for worst in sorted_worst_feat:
    print worst

('a_nb_goals_conceded', 0.012064063327298993)
('h_nb_victories_home', 0.014071356920327531)
('h_last_n_games_victories_home', 0.025280297001821422)
('a_nb_goals_conceded_away', 0.055319738499393513)
('h_nb_games_home', 0.071788496170479388)
('Week', 0.087448188713616273)
('a_nb_games_away', 0.093999518083994399)
('h_nb_games', 0.099121307647731222)
('h_nb_defeats', 0.10296141927834866)
('a_nb_games', 0.1041527037975372)
('a_mean_nb_goals_conceded_away', 0.13101423461771713)
('h_mean_nb_goals_conceded_home', 0.17395863094250236)
('Month', 0.20921952643062416)
('h_nb_defeats_home', 0.28680070406171948)
('a_nb_draws', 0.30200820302854769)
('a_nb_victories', 0.35459832299007388)
('h_nb_draws', 0.40459531808512661)
('h_last_n_games_defeats_home', 0.4830468469364162)
('a_nb_goals_scored', 0.48459970818389919)
('a_nb_draws_away', 0.49352407277425414)
('a_nb_points', 0.49843492000835454)
('h_nb_draws_home', 0.56122552019045258)
('a_last_n_games_draws_away', 0.58414384264987673)
('distance_km',

In [27]:
data_first_part = data[(data['Date'].dt.month >= 8)
                        & (data['Date'].dt.month <= 12)]

data_second_part = data[(data['Date'].dt.month >= 1)
                        & (data['Date'].dt.month <= 5)]


In [34]:
def get_profit_multipliers(y, predictions, id_strings, home_win_odds):
    """
    We define a profit multiplier as follow:
    It is equal to 'decimal_odd -1'. If it is positive you
    make (decimal_odd-1) time the money you bet,
    otherwise it is equal to -1 and you've lost the money you bet

    Parameters
    ----------
    y: numpy.ndarray
        Real outcome: 1 means 'home team won' and 0 'home team lost or draw'
    predictions: numpy.ndarray
        Predicted outcome
    id_strings: numpy.ndarray
        List of ids in the same order of association as for the elements in
        'y' and 'predictions'
    home_win_odds: pandas.DataFrame
        'Home win' odds associated with each game

    Returns
    -------
    profits_multipliers: list
        List that contains
    """
    profits_multipliers = []
    home_win_odds.index = home_win_odds['id']
    for i in range(len(y)):
        pred = predictions[i]
        real = y[i]
        id_str = id_strings[i]
        home_win_odd = home_win_odds.loc[id_str]['BbAvH']

        # If algo thinks home team will win
        if pred == 1:
            # If this is indeed true, we make money
            if pred == real:
                profit_m = home_win_odd - 1
                profits_multipliers.append(profit_m)

            # If the decision was wrong, we loose what we bet
            else:
                profit_m = -1
                profits_multipliers.append(profit_m)

        else:
            profit_m = 0
            profits_multipliers.append(profit_m)

    return profits_multipliers


In [46]:
def get_earning_coeff(y, predictions, id_strings, home_win_odds):
    sum_profit_multipliers = 0
    home_win_odds.index = home_win_odds['id']
    for i in range(len(y)):
        pred = predictions[i]
        real = y[i]
        id_str = id_strings[i]
        home_win_odd = home_win_odds.loc[id_str]['BbAvH']

        # If algo thinks home team will win
        if pred == 1:
            # If this is indeed true, we make money
            if pred == real:
                sum_profit_multipliers += home_win_odd - 1

            # If the decision was wrong, we loose what we bet
            else:
                sum_profit_multipliers -= 1

    return sum_profit_multipliers

In [74]:
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, make_scorer
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

params={'estimator__C': [10**i for i in range(-5, 3)],
       'estimator__penalty':['l1','l2']}

for i_month in range(8,13):
    dfp_part1 = data_first_part[(data_first_part['Date'].dt.month >= 8)
                        & (data_first_part['Date'].dt.month <= i_month)]
    
    y=dfp_part1['home_win'].values
    id_str = dfp_part1['id'].values
    dfp_part1=dfp_part1.drop(['Date','id','home_win'],1)
    X=dfp_part1.values
    standardizer = StandardScaler()
    X = standardizer.fit_transform(X)
    
    
    earning_coeff_score = make_scorer(get_earning_coeff, id_strings=id_str,
                                      home_win_odds=data_odds,
                                      greater_is_better=True)

    classifier = LogisticRegression()
    selector = RFECV(classifier, step=1, cv=5, n_jobs=1)

    grid_search = GridSearchCV(selector, param_grid=params,
                                   scoring=earning_coeff_score, cv=5,
                                   verbose=0, n_jobs=-1)
    grid_search.fit(X, y)

    print('Best score: %f' % grid_search.best_score_)
    print(grid_search.best_params_)
    print("Optimal number of features : %d" %
              grid_search.best_estimator_.n_features_)
    best_features = [features[i]
                         for i in range(len(grid_search.best_estimator_.ranking_)) if grid_search.best_estimator_.ranking_[i] == 1]
    print(best_features)
    print('')
    
    X_sub=dfp_part1[best_features].values
    stdardizer = StandardScaler()
    X_sub = stdardizer.fit_transform(X_sub)
    classif=LogisticRegression(C=grid_search.best_params_['estimator__C'])
    proba = cross_val_predict(classif, X_sub, y,
                              method='predict_proba',
                              cv=10, n_jobs=-1)
    proba_home_win = [p[1] for p in proba]
    predictions = [1 if p[1] > 0.5 else 0 for p in proba]
    auc = roc_auc_score(y, proba_home_win)
    fpr, tpr, thresholds = roc_curve(y, proba_home_win, pos_label=1)
    
    profits_multipliers = get_profit_multipliers(y, predictions,
                                                 id_str, data_odds)
    
    print(i_month)
    print('Area under the curve: %f' % auc)
    print(np.sum(profits_multipliers))
    print(len(profits_multipliers))
    print('')
                                     

Best score: 15.343223
{'estimator__penalty': 'l2', 'estimator__C': 0.01}
Optimal number of features : 1
['diff_season_wages']

8
Area under the curve: 0.740494
0.7
242

Best score: 28.698664
{'estimator__penalty': 'l2', 'estimator__C': 0.01}
Optimal number of features : 2
['h_season_wages', 'diff_season_wages']

9
Area under the curve: 0.709227
6.45
494

Best score: 42.457766
{'estimator__penalty': 'l2', 'estimator__C': 0.01}
Optimal number of features : 3
['h_season_wages', 'diff_nb_goals_diff', 'diff_season_wages']

10
Area under the curve: 0.709119
-2.31
779

Best score: 55.730452
{'estimator__penalty': 'l2', 'estimator__C': 0.01}
Optimal number of features : 4
['h_season_wages', 'a_season_wages', 'diff_nb_defeats', 'diff_season_wages']

11
Area under the curve: 0.704777
-11.91
1083

Best score: 68.788044
{'estimator__penalty': 'l2', 'estimator__C': 0.001}
Optimal number of features : 6
['h_season_wages', 'a_season_wages', 'diff_nb_defeats', 'diff_nb_points', 'diff_nb_goals_diff', '

In [73]:
params={'estimator__C': [10**i for i in range(-5, 3)],
       'estimator__penalty':['l1','l2']}

for i_month in range(1,6):
    dsp_part1 = data_second_part[(data_second_part['Date'].dt.month >= 1)
                        & (data_second_part['Date'].dt.month <= i_month)]
    
    y=dsp_part1['home_win'].values
    id_str = dsp_part1['id'].values
    dsp_part1=dsp_part1.drop(['Date','id','home_win'],1)
    X=dsp_part1.values
    standardizer = StandardScaler()
    X = standardizer.fit_transform(X)
    
    
    earning_coeff_score = make_scorer(get_earning_coeff, id_strings=id_str,
                                      home_win_odds=data_odds,
                                      greater_is_better=True)

    classifier = LogisticRegression()
    selector = RFECV(classifier, step=1, cv=5, n_jobs=1)

    grid_search = GridSearchCV(selector, param_grid=params,
                                   scoring=earning_coeff_score, cv=5,
                                   verbose=0, n_jobs=-1)
    grid_search.fit(X, y)

    print('Best score: %f' % grid_search.best_score_)
    print(grid_search.best_params_)
    print("Optimal number of features : %d" %
              grid_search.best_estimator_.n_features_)
    best_features = [features[i]
                         for i in range(len(grid_search.best_estimator_.ranking_)) if grid_search.best_estimator_.ranking_[i] == 1]
    print(best_features)
    print('')
    
    X_sub=dsp_part1[best_features].values
    stdardizer = StandardScaler()
    X_sub = stdardizer.fit_transform(X_sub)
    classif=LogisticRegression(C=grid_search.best_params_['estimator__C'])
    proba = cross_val_predict(classif, X_sub, y,
                              method='predict_proba',
                              cv=10, n_jobs=-1)
    proba_home_win = [p[1] for p in proba]
    predictions = [1 if p[1] > 0.5 else 0 for p in proba]
    auc = roc_auc_score(y, proba_home_win)
    fpr, tpr, thresholds = roc_curve(y, proba_home_win, pos_label=1)
    
    profits_multipliers = get_profit_multipliers(y, predictions,
                                                 id_str, data_odds)
    
    print(i_month)
    print('Area under the curve: %f' % auc)
    print(np.sum(profits_multipliers))
    print(len(profits_multipliers))
    print('')
                                     

Best score: 18.217389
{'estimator__C': 0.1}
Optimal number of features : 1
['diff_season_wages']

1
Area under the curve: 0.718637
-0.16
314

Best score: 33.772791
{'estimator__C': 0.1}
Optimal number of features : 4
['h_nb_goals_scored_home', 'a_season_wages', 'diff_nb_goals_diff', 'diff_season_wages']

2
Area under the curve: 0.715760
12.7
609

Best score: 57.143775
{'estimator__C': 0.01}
Optimal number of features : 56
['h_nb_victories', 'h_nb_draws', 'h_nb_defeats', 'h_nb_points', 'h_nb_goals_scored', 'h_nb_goals_conceded', 'h_nb_goals_diff', 'h_nb_games', 'h_nb_games_home', 'h_nb_victories_home', 'h_nb_draws_home', 'h_nb_defeats_home', 'h_nb_points_home', 'h_nb_goals_scored_home', 'h_nb_goals_conceded_home', 'h_diff_goals_home', 'h_last_n_games_points_home', 'h_last_n_games_victories_home', 'h_last_n_games_draws_home', 'h_last_n_games_defeats_home', 'h_mean_nb_goals_scored_home', 'h_mean_nb_goals_conceded_home', 'h_season_wages', 'a_nb_victories', 'a_nb_draws', 'a_nb_defeats', 'a_