Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

Import Train and Test Data

In [2]:
data = pd.read_csv('./features.csv', index_col='match_id')
test_data = pd.read_csv('./features_test.csv', index_col='match_id')

From Train Data delete columns that are absent in the Test Data (except the target column)

In [3]:
set(data) - set(test_data)

{'barracks_status_dire',
 'barracks_status_radiant',
 'duration',
 'radiant_win',
 'tower_status_dire',
 'tower_status_radiant'}

In [4]:
for var in (set(data) - set(test_data)):
    if var!='radiant_win':
        del data[var]

Create a List of Variables with Missing Values

In [5]:
missing = []
for var in data:
    if data[var].count()<97230:
        missing.append(var)

In [6]:
missing

['first_blood_time',
 'first_blood_team',
 'first_blood_player1',
 'first_blood_player2',
 'radiant_bottle_time',
 'radiant_courier_time',
 'radiant_flying_courier_time',
 'radiant_first_ward_time',
 'dire_bottle_time',
 'dire_courier_time',
 'dire_flying_courier_time',
 'dire_first_ward_time']

Substitute Missing Values with 0

In [8]:
data.fillna(0, inplace=True)

Target Variable

In [9]:
y = data['radiant_win']

In [10]:
X = data.loc[:, data.columns!='radiant_win']

Cross-Validation Splits Generator

In [11]:
kf = KFold(n_splits=5, shuffle=True)

Model 1: Gradient Boosting Classifier

In [14]:
trees = [5, 10, 15, 20, 25, 30, 50]

In [15]:
for t in trees:
    gbm = GradientBoostingClassifier(n_estimators=t,verbose=True)
    score = np.mean(cross_val_score(gbm, X, y, scoring='roc_auc', cv=kf))
    print(score)

      Iter       Train Loss   Remaining Time 
         1           1.3785            2.33s
         2           1.3728            1.77s
         3           1.3678            1.18s
         4           1.3631            0.59s
         5           1.3581            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3784            2.31s
         2           1.3728            1.77s
         3           1.3678            1.19s
         4           1.3633            0.60s
         5           1.3588            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3786            2.28s
         2           1.3730            1.78s
         3           1.3679            1.19s
         4           1.3635            0.62s
         5           1.3586            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3786            2.72s
         2           1.3730            1.95s
         3           1.3679            1.27s
      

Measure time for 30 trees

In [16]:
import time
import datetime

start_time = datetime.datetime.now()

gbm = GradientBoostingClassifier(n_estimators=30,verbose=True)
score = np.mean(cross_val_score(gbm, X, y, scoring='roc_auc', cv=kf))
print(score)

print 'Time elapsed:', datetime.datetime.now() - start_time

      Iter       Train Loss   Remaining Time 
         1           1.3787           17.27s
         2           1.3732           16.94s
         3           1.3682           16.64s
         4           1.3638           16.17s
         5           1.3594           15.72s
         6           1.3546           15.04s
         7           1.3503           14.24s
         8           1.3462           13.45s
         9           1.3419           12.80s
        10           1.3381           12.12s
        20           1.3085            5.82s
        30           1.2885            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3785           16.95s
         2           1.3729           16.56s
         3           1.3680           16.15s
         4           1.3635           15.58s
         5           1.3587           14.86s
         6           1.3541           14.21s
         7           1.3501           13.92s
         8           1.3458           13.25s
        

Model 2: Logistic Regression Classifier

In [18]:
Xstd = StandardScaler().fit_transform(X)

In [19]:
Cvalues = [0.001, 0.01, 0.1, 1, 10, 100, 1000] 

In [20]:
for v in Cvalues:
    logit = LogisticRegression(C=v)
    logit.fit(Xstd, y)
    print('C is '+str(v))
    score = np.mean(cross_val_score(logit, Xstd, y, scoring='roc_auc', cv=kf))
    print(score)

C is 0.001
0.716127903798
C is 0.01
0.716361152102
C is 0.1
0.716281649488
C is 1
0.716452998587
C is 10
0.716485372928
C is 100
0.716414660912
C is 1000
0.71637882822


Deleting Categorical Features

In [21]:
categorical = ['lobby_type','r1_hero', 'r2_hero','r3_hero','r4_hero','r5_hero', 'd1_hero', 'd2_hero','d3_hero','d4_hero','d5_hero']

In [22]:
newX = X.copy()
for var in categorical:
    newX=newX.loc[:,newX.columns!=var]

In [23]:
newXstd = StandardScaler().fit_transform(newX)

Logistic Regression trained on new X

In [24]:
for v in Cvalues:
    logit = LogisticRegression(C=v)
    logit.fit(Xstd, y)
    print('C is '+str(v))
    score = np.mean(cross_val_score(logit, newXstd, y, scoring='roc_auc', cv=kf))
    print(score)

C is 0.001
0.71648597506
C is 0.01
0.716497728884
C is 0.1
0.716355588333
C is 1
0.716389887871
C is 10
0.716439873022
C is 100
0.71646889065
C is 1000
0.716514323097


Counting Unique Heroes

In [34]:
allheroes = X['r1_hero']
other = ['r2_hero','r3_hero','r4_hero','r5_hero', 'd1_hero', 'd2_hero','d3_hero','d4_hero','d5_hero']
for var in other:
    allheroes = np.hstack((allheroes, X[var]))
np.unique(allheroes)

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 109, 110, 112])

In [29]:
np.unique(allheroes).max()

112

In [46]:
len(np.unique(allheroes))

108

In [47]:
N = np.unique(allheroes).max()

Logistic Regression with Heroes IDs recoded

In [48]:
X_pick = np.zeros((X.shape[0], N))
for i, match_id in enumerate(X.index):
    for p in xrange(5):
        X_pick[i, X.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, X.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [49]:
Xplus = np.hstack((newXstd,X_pick))

In [50]:
for v in Cvalues:
    logit = LogisticRegression(C=v)
    logit.fit(Xplus, y)
    print('C is '+str(v))
    score = np.mean(cross_val_score(logit, Xplus, y, scoring='roc_auc', cv=kf))
    print(score)

C is 0.001
0.746197513009
C is 0.01
0.751624091277
C is 0.1
0.751767297531
C is 1
0.751688106664
C is 10
0.751861752586
C is 100
0.751802703829
C is 1000
0.75183624217


In [51]:
# The best model is the last one, with C = 10

Building Predictions for Test Data

In [53]:
test_data.fillna(0, inplace=True)    
newtest = test_data.copy()
for var in categorical:
    newtest=newtest.loc[:,newtest.columns!=var]
newteststd = StandardScaler().fit_transform(newtest)

In [54]:
test_pick = np.zeros((test_data.shape[0], N))
for i, match_id in enumerate(test_data.index):
    for p in xrange(5):
        test_pick[i, test_data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        test_pick[i, test_data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [55]:
testplus = np.hstack((newteststd ,test_pick))

In [56]:
logit = LogisticRegression(C=10)
logit.fit(Xplus, y)
test_data["predicted"]=logit.predict_proba(testplus)[:,1]

In [57]:
test_data['predicted']

match_id
6         0.825778
7         0.758760
10        0.186977
13        0.861085
16        0.239727
18        0.378841
19        0.529665
24        0.565862
33        0.215163
37        0.674097
41        0.154910
42        0.326742
55        0.231960
60        0.673145
62        0.554989
63        0.612092
64        0.088299
71        0.554179
72        0.316118
83        0.500696
85        0.783005
89        0.930880
92        0.757268
100       0.959741
102       0.884724
108       0.537060
111       0.775833
126       0.181362
130       0.051853
140       0.790707
            ...   
114165    0.813820
114168    0.666934
114171    0.463057
114183    0.145547
114188    0.633010
114192    0.841987
114195    0.515675
114202    0.477959
114209    0.384252
114211    0.808393
114217    0.755161
114232    0.385117
114236    0.500069
114238    0.664195
114242    0.581018
114247    0.288299
114285    0.255999
114286    0.410673
114314    0.536098
114327    0.480521
114330    0.413029
114

In [59]:
test_data['predicted'].to_csv("output.csv")

In [60]:
test_data['predicted'].min()

0.0086170028517682865

In [62]:
test_data['predicted'].max()

0.9965172470351864