In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import numpy as np
from sklearn.metrics import roc_auc_score

In [2]:
import pandas as pd
player_skill = pd.read_excel('player_skill.xlsx')
player_skill.drop(['Unnamed: 0'], axis = 1, inplace=True)

player_skill2 = player_skill.copy()
player_skill2.replace([np.inf, -np.inf], np.nan, inplace=True)
player_skill2.fillna(0, inplace=True)

del player_skill

has_hero = pd.read_excel('has_hero.xlsx')
has_hero.drop(['Unnamed: 0'], axis = 1, inplace=True)

team = pd.read_excel('team.xlsx')
team.drop(['Unnamed: 0'], axis = 1, inplace=True)

In [5]:
y = pd.read_csv('./mlcourse-dota2-win-prediction/train_targets.csv')

In [6]:
from sklearn.pipeline import Pipeline, FeatureUnion,make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

In [7]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold
from scipy import stats

In [8]:
#team - apply SC
#has_hero - nothing
#player_skill2 - apply variance selector, then SC
#for all - feature union

pipe = Pipeline([('vt', VarianceThreshold()), ('sc', StandardScaler()), ('xgb', XGBClassifier(objective = 'binary:logistic'))])

In [9]:
X = pd.concat([team, player_skill2, has_hero], axis = 1)

In [10]:
y = y.radiant_win.astype('int8')

In [11]:
param_dist = {'xgb__n_estimators': stats.randint(150, 500),
              'xgb__learning_rate': stats.uniform(0.01, 0.07),
              'xgb__subsample': stats.uniform(0.3, 0.7),
              'xgb__max_depth': [3, 4, 5, 6, 7, 8, 9],
              'xgb__colsample_bytree': stats.uniform(0.5, 0.45),
              'xgb__min_child_weight': [1, 2, 3]
             }

grid = RandomizedSearchCV(pipe, param_distributions = param_dist, n_iter = 25, scoring = 'roc_auc',
                          error_score = 0, verbose = 3, n_jobs = -1, random_state = 42)

In [12]:
%%time
grid.fit(X,y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed: 32.4min finished


RandomizedSearchCV(cv='warn', error_score=0,
                   estimator=Pipeline(memory=None,
                                      steps=[('vt',
                                              VarianceThreshold(threshold=0.0)),
                                             ('sc',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('xgb',
                                              XGBClassifier(base_score=0.5,
                                                            booster='gbtree',
                                                            colsample_bylevel=1,
                                                            colsample_bytree=1,
                                                            gamma=0,
                                                       

In [14]:
grid.best_score_

0.8358532195374792

In [None]:
#xgboost = 0.8358532195374792 - CV 

In [15]:
from sklearn.externals import joblib
joblib.dump(grid, 'grid_xgboost.pkl')



['grid_xgboost.pkl']

In [17]:
f = joblib.load('grid_xgboost.pkl')

In [18]:
f.best_score_

0.8358532195374792

In [None]:
#predicting

In [21]:
import pandas as pd
player_skill = pd.read_excel('player_skill_test.xlsx')
player_skill.drop(['Unnamed: 0'], axis = 1, inplace=True)

player_skill2 = player_skill.copy()
player_skill2.replace([np.inf, -np.inf], np.nan, inplace=True)
player_skill2.fillna(0, inplace=True)

del player_skill

has_hero = pd.read_excel('has_hero_test.xlsx')
has_hero.drop(['Unnamed: 0'], axis = 1, inplace=True)

team = pd.read_excel('team_test.xlsx')
team.drop(['Unnamed: 0'], axis = 1, inplace=True)

x_test_comb = pd.concat([team, player_skill2, has_hero], axis = 1)

In [22]:
x_test_comb

Unnamed: 0,r_kills,r_deaths,r_assists,r_denies,r_gold,r_lh,r_xp,r_health,r_max_health,r_max_mana,...,r_HasHeroId_112,d_HasHeroId_112,r_HasHeroId_113,d_HasHeroId_113,r_HasHeroId_114,d_HasHeroId_114,r_HasHeroId_119,d_HasHeroId_119,r_HasHeroId_120,d_HasHeroId_120
0,0,0,0,0,575,0,0,3185,3280,1598.68902,...,0,0,0,0,0,0,0,0,0,1
1,31,12,49,24,39315,304,37183,6644,6830,3062.69046,...,0,0,0,0,0,0,0,1,0,0
2,22,13,35,58,36130,326,34281,5532,6965,4071.69131,...,0,0,0,0,0,0,0,0,0,0
3,5,4,6,8,12961,107,13339,3405,4660,2700.69011,...,0,0,0,0,0,0,0,0,0,0
4,14,23,27,34,42630,482,51062,7343,7375,4495.69178,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,18,43,41,20,41941,368,57015,4719,6760,4510.69180,...,0,0,0,0,0,0,0,0,0,0
9996,28,21,28,49,79788,716,85067,7584,9485,5301.69260,...,0,0,0,0,0,0,1,0,0,0
9997,2,7,2,18,7281,57,7957,3253,3720,2078.68949,...,0,0,0,0,0,1,0,0,0,0
9998,14,14,22,41,38008,385,39953,4645,6590,4080.69133,...,0,0,0,0,0,0,0,0,0,0


In [20]:
y_pred = grid.predict_proba(x_test_comb)[:,1]

ValueError: X has a different shape than during fitting.

In [None]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

write_to_submission_file(y_pred, 'my_submit_1_xgboost835.csv')

In [47]:
numFolds = 5
folds = KFold(n_splits = numFolds, shuffle = True)

In [None]:
%%time
estimators = []
results = np.zeros(len(X))
score = 0.0
i = 0
for train_index, test_index in folds.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index].values.ravel(), y.iloc[test_index].values.ravel()
    grid.fit(X_train, y_train)

    estimators.append(grid.best_estimator_)
    results[test_index] = grid.predict(X_test)
    score += roc_auc_score(y_test, results[test_index])
    i +=1
    print(score, i)
score /= numFolds

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  3.2min


In [None]:
results

In [None]:
estimators