In [3]:
import sys
sys.path.append('../src/')
import get_modeling_data
import data_constants as dc
import sklearn.preprocessing
import numpy as np
import os
import pandas as pd
import time
import sklearn.ensemble
import model_evaluation.model_performance_functions as mpf
import statsmodels.api as sm

  from pandas.core import datetools


In [4]:
# Pull the training and validation frames from the project's data module.
train, validation = (
    get_modeling_data.get_train(),
    get_modeling_data.get_validation(),
)

In [5]:
print(train.shape)
# Remove every row whose value in any of the columns at positions 5..14 is the
# string '0'. The column labels are taken from `train` and applied to both
# frames, so `validation` is assumed to share those labels.
# NOTE(review): '0' presumably marks an unfilled role slot -- TODO confirm.
for col_name in train.columns[5:15]:
    train = train[train[col_name] != '0']
    validation = validation[validation[col_name] != '0']
print(train.shape)

(171321, 90)
(171284, 90)


In [6]:
# Keep only the outcome column plus one pick column per role for each team.
outcome_col = ['team_100_win']
team_100_picks = [
    '100_TOP_SOLO',
    '100_JUNGLE_NONE',
    '100_MIDDLE_SOLO',
    '100_BOTTOM_DUO_CARRY',
    '100_BOTTOM_DUO_SUPPORT',
]
team_200_picks = [
    '200_TOP_SOLO',
    '200_JUNGLE_NONE',
    '200_MIDDLE_SOLO',
    '200_BOTTOM_DUO_CARRY',
    '200_BOTTOM_DUO_SUPPORT',
]
cols_to_keep = outcome_col + team_100_picks + team_200_picks
train, validation = train[cols_to_keep], validation[cols_to_keep]

In [7]:
# Build a lookup from each value returned by get_champs_four_letters() to its
# index label, i.e. the inverse of the returned collection.
# NOTE(review): presumably a pandas Series of four-letter champion names -- confirm.
champ_labels = dc.get_champs_four_letters()
champ_ids = {label: position for label, position in zip(champ_labels, champ_labels.index)}

In [8]:
# Preview the first five rows while the pick columns still hold text labels.
train.head(n=5)

Unnamed: 0,team_100_win,100_TOP_SOLO,100_JUNGLE_NONE,100_MIDDLE_SOLO,100_BOTTOM_DUO_CARRY,100_BOTTOM_DUO_SUPPORT,200_TOP_SOLO,200_JUNGLE_NONE,200_MIDDLE_SOLO,200_BOTTOM_DUO_CARRY,200_BOTTOM_DUO_SUPPORT
1,0,Rene,Mast,Fizz,Cait,Lux,Kled,Skar,Kass,Ezre,Sona
2,0,Kled,Reng,Ryze,Varu,Sora,Gang,Seju,Ahri,Jinx,Blit
5,1,Jarv,Malp,Kata,Jinx,Thre,Kled,Rek',Malz,Tris,Sona
6,1,Shac,Lee,Malz,Varu,Zile,Vlad,Elis,Drav,Miss,Sona
7,1,Kled,Mast,Twis,Tris,Tari,Trun,Nida,Yasu,Xaya,Raka


In [9]:
# Replace the champion label in every pick column (all kept columns except the
# first, which is the outcome) with its id from `champ_ids`.
# Series.map yields NaN for any label that is missing from the lookup.
pick_columns = cols_to_keep[1:]
for frame in (train, validation):
    for column in pick_columns:
        frame[column] = frame[column].map(champ_ids)
train.head()

Unnamed: 0,team_100_win,100_TOP_SOLO,100_JUNGLE_NONE,100_MIDDLE_SOLO,100_BOTTOM_DUO_CARRY,100_BOTTOM_DUO_SUPPORT,200_TOP_SOLO,200_JUNGLE_NONE,200_MIDDLE_SOLO,200_BOTTOM_DUO_CARRY,200_BOTTOM_DUO_SUPPORT
1,0,90,70,29,14,66,58,102,51,26,103
2,0,58,91,94,120,104,31,95,1,46,11
5,1,42,67,52,46,112,58,89,68,113,103
6,1,96,61,68,120,137,126,24,22,71,103
7,1,58,70,116,113,110,114,78,132,129,87


In [10]:
# Outcome column for each split.
target_name = 'team_100_win'
Y_train, Y_validation = train[target_name], validation[target_name]

In [11]:
# Feature matrices: everything except the outcome column.
X_train = train.drop('team_100_win', axis=1)
X_validation = validation.drop('team_100_win', axis=1)
for feature_frame in (X_train, X_validation):
    print(feature_frame.shape)

(171284, 10)
(57093, 10)


In [12]:
# One-hot encoder with a fixed number of possible values (140) per input column.
# NOTE(review): `n_values` belongs to the legacy OneHotEncoder API; newer
# scikit-learn releases replaced it with `categories` -- confirm the pinned version.
ohe = sklearn.preprocessing.OneHotEncoder(n_values=140)

In [13]:
# Fit the encoder on the training features only, then reuse that fitted
# encoder for validation. Calling fit_transform on validation would re-fit
# the encoder on held-out data, so the two matrices would no longer be
# guaranteed to come from the same learned encoding.
X_train = ohe.fit_transform(X_train)
X_validation = ohe.transform(X_validation)

In [14]:
# Random forest on the one-hot encoded picks.
# n_estimators is 2500 so the code matches the parameter dict printed in this
# cell's recorded output (the run that produced the stored results); the code
# had drifted to 10000 without the cell being re-executed.
rf_params = { 'max_features': 'sqrt',
              'n_estimators': 2500,
              'min_samples_split': 100,
              'min_samples_leaf': 50,
              'max_depth': 10,
              'random_state': 414}
print(rf_params)
start_time = time.time()
model = sklearn.ensemble.RandomForestClassifier(**rf_params)
model_fit = model.fit(X_train, Y_train)

# NOTE(review): predict() returns hard class labels. If mpf.ks_gini is meant
# to rank by score, predict_proba(...)[:, 1] would be the better input -- confirm
# against the helper's contract before changing.
train_pred_rf = model.predict(X_train)
validation_pred_rf = model.predict(X_validation)

# Elapsed time covers fitting plus both prediction passes.
train_time_rf = time.time() - start_time
print(train_time_rf)

# Performance on each split, using the project's evaluation helpers.
ks_gini_train = mpf.ks_gini(Y_train, train_pred_rf)
ks_gini_validation = mpf.ks_gini(Y_validation, validation_pred_rf)
correct_pred_train = mpf.correct_prediction_rate(Y_train, train_pred_rf)
correct_pred_validation = mpf.correct_prediction_rate(Y_validation, validation_pred_rf)

{'max_features': 'sqrt', 'n_estimators': 2500, 'min_samples_split': 100, 'min_samples_leaf': 50, 'max_depth': 10, 'random_state': 414}


In [15]:
# Random forest summary: correct-prediction rate (train, validation) followed
# by the KS/Gini results (train, validation).
# NOTE(review): these metric names are reassigned by the gradient boosting
# cell further down, so this cell must run before that one.
rf_perf = [
    correct_pred_train,
    correct_pred_validation,
    ks_gini_train,
    ks_gini_validation,
]
print(rf_perf)

[0.5472606898484389, 0.5373688543253989, {'ks': 0.04466087523900575, 'gini': 0.022258804252409753}, {'ks': 0.03349658202315553, 'gini': 0.017004103739299208}]


In [17]:
def _prediction_at_stage(fitted_model, features, stage):
    """Return the staged prediction made after `stage` boosting iterations.

    Stages are counted from 1. Iteration stops as soon as the requested stage
    is reached, so later stages are never computed.

    Raises:
        ValueError: if `staged_predict` is exhausted before `stage` is reached
            (for example when `stage` is below 1 or above the number of
            fitted estimators).
    """
    staged = fitted_model.staged_predict(features)
    for stage_number, prediction in enumerate(staged, start=1):
        if stage_number == stage:
            return prediction
    raise ValueError('stage {!r} was never reached by staged_predict'.format(stage))


# Gradient boosting on the one-hot encoded picks.
gbm_params = {'learning_rate': 0.02,
              'n_estimators': 2500,
              'min_samples_split': 300,
              'min_samples_leaf': 50,
              'max_depth': 5,
              'random_state': 414}
print(gbm_params)
start_time = time.time()
model = sklearn.ensemble.GradientBoostingClassifier(**gbm_params)
model_fit = model.fit(X_train, Y_train)
n_est_performance = mpf.gbm_best_iter(model_fit, X_validation, Y_validation)
print(n_est_performance)

# Get training and validation predictions using best iteration.
# NOTE(review): assumes n_est_performance['best_iter'] is a 1-based stage
# count, as the original counter loop did -- confirm against mpf.gbm_best_iter.
best_stage = n_est_performance['best_iter']
train_pred = _prediction_at_stage(model_fit, X_train, best_stage)
validation_pred = _prediction_at_stage(model_fit, X_validation, best_stage)

# Elapsed time covers fitting, best-iteration search and staged predictions.
train_time = time.time() - start_time

# Performance on each split, using the project's evaluation helpers.
ks_gini_train = mpf.ks_gini(Y_train, train_pred)
ks_gini_validation = mpf.ks_gini(Y_validation, validation_pred)
correct_pred_train = mpf.correct_prediction_rate(Y_train, train_pred)
correct_pred_validation = mpf.correct_prediction_rate(Y_validation, validation_pred)

{'learning_rate': 0.02, 'n_estimators': 2500, 'min_samples_split': 300, 'min_samples_leaf': 50, 'max_depth': 5, 'random_state': 414}
{'scores': [0.002515567536321206, 0.0027750771933302554, 0.002859414074752875, 0.002842470620982751, 0.003600898922589635, 0.003581966229648237, 0.0037255194561983673, 0.008086401903746987, 0.008182493380273437, 0.008050799511266193, 0.008892888103543162, 0.009227605343708478, 0.00966684244806204, 0.009474174345142305, 0.009835916553085466, 0.013035502374283525, 0.013974455066075309, 0.013430139988110934, 0.013527458681781401, 0.015442703144708525, 0.01616685978624577, 0.01584985550490925, 0.015873896613929794, 0.01597534594091421, 0.015553309207740895, 0.015530023884888333, 0.016578729574136752, 0.016758574255639314, 0.01686616465703128, 0.017087996003941508, 0.01772822920153916, 0.017748243192513002, 0.017818727112119626, 0.01775338964733486, 0.018638982713214736, 0.020303308662726782, 0.020133239938117664, 0.020547080881187618, 0.020290815118275574, 0.

In [18]:
# Gradient boosting summary: correct-prediction rate (train, validation)
# followed by the KS/Gini results (train, validation).
gbm_perf = [
    correct_pred_train,
    correct_pred_validation,
    ks_gini_train,
    ks_gini_validation,
]
print(gbm_perf)

[0.5939142009761565, 0.550067433836022, {'ks': 0.09453389810920693, 'gini': 0.04774320601190053}, {'ks': 0.050397614488421894, 'gini': 0.024913188978973447}]
