In [1]:
import pandas as pd
import numpy as np

from boruta import BorutaPy

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
year_weights = {2008: 1,
 2009: 1,
 2010: 1,
 2011: 1,
 2012: 1,
 2013: 1,
 2014: 1,
 2015: 1,
 2016: 1,
 2017: 1,
 2018: 1}

In [3]:
afl_ML = pd.read_pickle('afl_ML.pkl')
test_ids = pd.read_pickle('test_fw_ids.pkl')
afl_ML = afl_ML.dropna(axis = 1)

In [None]:
train = afl_ML[~(afl_ML.fw_game_id.isin(test_ids.fw_game_id.values))]
test = afl_ML[afl_ML.fw_game_id.isin(test_ids.fw_game_id.values)]

In [None]:
target = 'Win'
M_cols = ['fw_game_id', 'year', 'round_index']

M_train = train[M_cols]
M_test = test[M_cols]
sample_weights = M_train.year.map(year_weights)

Y_train = train[target]
Y_test = test[target]

M_cols.append(target)
X_cols = [col for col in afl_ML.columns if col not in (M_cols)]

X_train = train[X_cols]
X_test = test[X_cols]

In [None]:
train_scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = train_scaler.transform(X_train)
X_test_scaled = train_scaler.transform(X_test)

In [None]:
# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
Y_trainval = Y_train.values
np.random.seed(80085)
feat_selector.fit(X_train_scaled, Y_trainval)

In [None]:
criteria = pd.Series(feat_selector.support_)
X_train_boruta = feat_selector.transform(X_train_scaled)
X_train_boruta = pd.DataFrame(X_train_boruta, index = X_train.index, columns = X_train.columns[criteria].values)

In [None]:
criteria = pd.Series(feat_selector.support_)
X_test_boruta = feat_selector.transform(X_test_scaled)
X_test_boruta = pd.DataFrame(X_test_boruta, index = X_test.index, columns = X_test.columns[criteria].values)

In [None]:
X_test_boruta.columns

In [None]:
param_set = [{'C':[0.1, 0.5, 1, 5, 10, 100, 1000], 'kernel': ['linear']},
             {'C': [0.1, 0.5, 1, 5, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']}]
score = 'neg_log_loss'

In [None]:
SVC_clf = GridSearchCV(svm.SVC(probability = True),
            param_grid = param_set,
            scoring = score,
            cv = 5,
            verbose = 3)

In [None]:
SVC_clf.fit(X_train_boruta, Y_train, sample_weight = sample_weights.values)

In [None]:
Y_test_predictions = SVC_clf.predict(X_test_boruta)
Y_test_probabilities = SVC_clf.predict_proba(X_test_boruta)
Y_test_predictions = pd.DataFrame(Y_test_predictions, index = Y_test.index, columns = ['predict'])
Y_test_probabilities = pd.DataFrame(Y_test_probabilities, index = Y_test.index)
Y_test_outcomes = Y_test_probabilities.join(Y_test_predictions)
Y_test_outcomes = Y_test_outcomes.join(Y_test)
s = abs(Y_test_outcomes.iloc[0:,2] - Y_test_outcomes.iloc[0:,3])
1 - sum(s)/len(s)

In [None]:
afl_DF.loc[afl_DF['Margin'] == 0, 'Result'] = 0
afl_DF.loc[afl_DF['Margin'] > 0, 'Result'] = 1
afl_DF.loc[afl_DF['Margin'] < 0, 'Result'] = -1

In [80]:
prediction_set = Y_test_probabilities.join(M_test).join(X_test.home_game)
prediction_set = prediction_set.drop(0,axis=1)
prediction_set = pd.merge(prediction_set.loc[prediction_set['home_game'] == 1,[1,'fw_game_id','year','round_index','home_game']],
         prediction_set.loc[prediction_set['home_game'] == 0, [1,'fw_game_id']],
         left_on = ['fw_game_id'],
         right_on = ['fw_game_id'],
        how = 'left')
prediction_set[['fin_pred']] = pd.DataFrame((1-prediction_set[['1_y']].values+prediction_set[['1_x']].values)/2)
prediction_set

Unnamed: 0,1_x,fw_game_id,year,round_index,home_game,1_y,fin_pred
0,0.921113,5814,2014,164,1,0.078675,0.921219
1,0.429241,5881,2014,172,1,0.570575,0.429333
2,0.941456,9339,2017,230,1,0.060942,0.940257
3,0.841626,9375,2017,234,1,0.158090,0.841768
4,0.621214,9611,2018,260,1,0.381225,0.619995
5,0.540663,6131,2015,200,1,0.458875,0.540894
6,0.169483,6159,2015,203,1,0.829932,0.169775
7,0.259955,6184,2016,205,1,0.739379,0.260288
8,0.525448,6202,2016,207,1,0.473991,0.525728
9,0.169442,9437,2017,242,1,0.830032,0.169705


In [60]:
prediction_set.loc[prediction_set['home_game'] == 0, [1,'fw_game_id','home_join']]

Unnamed: 0,1,fw_game_id,home_join
128,0.471972,5853,1
145,0.656503,6013,1
184,0.620386,9318,1
196,0.786150,9426,1
226,0.814600,9703,1
372,0.285508,6003,1
377,0.171611,6050,1
388,0.067912,6146,1
403,0.135305,6288,1
418,0.098045,9367,1


In [None]:
SVC_clf.best_estimator_

In [None]:
Y_test_outcomes.loc[Y_test_outcomes.predict == Y_test_outcomes.Win,'score'] = Y_test_outcomes[Y_test_outcomes.predict == Y_test_outcomes.Win].iloc[0:,0:2].max(axis = 1).values

In [None]:
Y_test_outcomes.loc[~(Y_test_outcomes.predict == Y_test_outcomes.Win),'score'] = Y_test_outcomes[~(Y_test_outcomes.predict == Y_test_outcomes.Win)].iloc[0:,0:2].min(axis = 1).values

In [None]:
Y_test_outcomes.loc[0:,]

In [None]:
Y_test_outcomes['points'] = 1 + np.log2(Y_test_outcomes['score'])
sum(Y_test_outcomes['points']) / len(Y_test_outcomes['points']) * 207

In [40]:
SVC_clf.best_estimator_

NameError: name 'SVC_clf' is not defined

In [None]:
SVC_clf.grid_scores_

In [42]:
afl_ML.Win.unique

<bound method Series.unique of 0       1.0
1       1.0
2       0.0
3       0.0
4       0.0
5       0.0
6       0.0
7       1.0
8       1.0
9       1.0
10      1.0
11      0.0
12      1.0
13      1.0
14      0.0
15      1.0
16      0.0
17      1.0
18      0.0
19      0.0
20      0.0
21      1.0
22      1.0
23      1.0
24      1.0
25      1.0
26      1.0
27      1.0
28      0.0
29      1.0
       ... 
3913    0.0
3914    0.0
3915    1.0
3916    1.0
3917    1.0
3918    1.0
3919    0.0
3920    0.0
3921    0.0
3922    0.0
3923    0.0
3924    1.0
3925    0.0
3926    0.0
3927    1.0
3928    1.0
3929    1.0
3930    0.0
3931    0.0
3932    0.0
3933    0.0
3934    0.0
3936    0.0
3937    0.0
3938    0.0
3939    0.0
3940    1.0
3941    1.0
3942    1.0
3943    0.0
Name: Win, Length: 3902, dtype: float64>