In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import numpy as np
from sklearn.metrics import roc_auc_score

In [2]:
import pandas as pd
# Load the player-skill features, drop the 'Unnamed: 0' column (presumably an index
# written out with the sheet -- confirm), and make every value finite:
# +/-inf is turned into NaN first, then all NaN are filled with 0.
player_skill2 = (
    pd.read_excel('player_skill.xlsx')
    .drop(columns=['Unnamed: 0'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

In [3]:
# Hero-presence features; the 'Unnamed: 0' column is discarded right after loading.
has_hero = pd.read_excel('has_hero.xlsx').drop(columns=['Unnamed: 0'])

In [4]:
# Team-level features; the 'Unnamed: 0' column is discarded right after loading.
team = pd.read_excel('team.xlsx').drop(columns=['Unnamed: 0'])

In [5]:
# Training targets as a DataFrame; the `radiant_win` column is pulled out of it in a later cell.
y = pd.read_csv('./mlcourse-dota2-win-prediction/train_targets.csv')

In [6]:
from sklearn.pipeline import Pipeline, FeatureUnion,make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

In [7]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold
from scipy import stats

In [8]:
# Planned per-frame preprocessing (NOT what the pipeline below implements):
#   team          -> StandardScaler
#   has_hero      -> left untouched
#   player_skill2 -> VarianceThreshold, then StandardScaler
#   everything combined through a FeatureUnion
#
# What is actually built: one pipeline that runs VarianceThreshold and then
# StandardScaler over every column it receives before fitting the XGBoost classifier.
pipe = Pipeline(
    steps=[
        ('vt', VarianceThreshold()),
        ('sc', StandardScaler()),
        ('xgb', XGBClassifier(objective='binary:logistic')),
    ]
)

In [9]:
# Full feature matrix: the three feature frames placed side by side (column-wise concat).
X = pd.concat(objs=[team, player_skill2, has_hero], axis='columns')

In [10]:
# Reduce the targets to a compact int8 Series of the `radiant_win` column.
# On a first run `y` is still the DataFrame read from disk; on a re-run it is already
# the Series produced here, so the column lookup is skipped instead of raising
# AttributeError. This keeps the cell safe to execute more than once.
if isinstance(y, pd.DataFrame):
    y = y['radiant_win']
y = y.astype('int8')

In [11]:
# Search space for the 'xgb' step of the pipeline.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so e.g. the learning rate
# is drawn from [0.01, 0.08] and subsample from [0.3, 1.0].
param_dist = dict(
    xgb__n_estimators=stats.randint(150, 500),
    xgb__learning_rate=stats.uniform(0.01, 0.07),
    xgb__subsample=stats.uniform(0.3, 0.7),
    xgb__max_depth=list(range(3, 10)),
    xgb__colsample_bytree=stats.uniform(0.5, 0.45),
    xgb__min_child_weight=list(range(1, 4)),
)

# Randomized search: 25 sampled candidates ranked by ROC AUC, run on all cores.
# A candidate whose fit raises is given a score of 0 instead of aborting the search.
grid = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=25,
    scoring='roc_auc',
    error_score=0,
    verbose=3,
    n_jobs=-1,
    random_state=42,
)

In [None]:
%%time
# Run the randomized hyper-parameter search on the whole feature matrix and target;
# %%time reports how long the cell took.
grid.fit(X,y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 25 candidates, totalling 75 fits


In [47]:
# Outer cross-validation splitter used to evaluate the tuned pipeline.
numFolds = 5
# Shuffling without a seed yields different folds (and a different score) on every run;
# the seed matches the one given to the randomized search so results are reproducible.
folds = KFold(n_splits=numFolds, shuffle=True, random_state=42)

In [None]:
%%time
estimators = []
results = np.zeros(len(X))
score = 0.0
i = 0
for train_index, test_index in folds.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index].values.ravel(), y.iloc[test_index].values.ravel()
    grid.fit(X_train, y_train)

    estimators.append(grid.best_estimator_)
    results[test_index] = grid.predict(X_test)
    score += roc_auc_score(y_test, results[test_index])
    i +=1
    print(score, i)
score /= numFolds

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  3.2min


In [None]:
# Per-row predictions written fold by fold by the cross-validation loop.
results

In [None]:
# One `grid.best_estimator_` per outer fold, in the order the folds were processed.
estimators