In [85]:
import pandas
import numpy as np
import time
import datetime
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
pandas.set_option('display.max_columns', None)

# Read data
# Both tables are indexed by their match_id column.
features = pandas.read_csv('features.csv', index_col='match_id')
features_test = pandas.read_csv('features_test.csv', index_col='match_id')
print(features.head())

# Split the training table into the target (radiant_win) and the predictors.
# The columns below are removed from the predictors; presumably they describe
# the final outcome of a match and are absent from the test table
# (TODO confirm against features_test.csv).
outcome_columns = [
    'duration',
    'radiant_win',
    'tower_status_radiant',
    'tower_status_dire',
    'barracks_status_radiant',
    'barracks_status_dire',
]
y_train = features['radiant_win']
X_train = features.drop(outcome_columns, axis=1)
X_test = features_test  # same object as features_test, not a copy

# Report every column that has gaps as "<name> : <share of non-missing rows>".
length = len(X_train)
counts = X_train.count()
for column, filled in counts[counts < length].items():
    print(column, ':', '%.3f' % (filled / length))

# Replace the missing values with zeros, in place, in both tables.
for table in (X_train, X_test):
    table.fillna(value=0, inplace=True)

# GradientBoosting
# Cross-validated ROC AUC of gradient boosting for a growing number of trees.
print('Gradient Boosting')
# Shuffled 5-fold splitter with a fixed seed; the same object is passed to
# every cross_val_score call in this script.
cv = KFold(n_splits=5, shuffle=True, random_state=241)
nums = [5, 10, 20, 30, 50, 100, 200, 300, 500]
scores = []
for n_estimators in nums:
    print('Number of estimators:', n_estimators)
    estimator = GradientBoostingClassifier(n_estimators=n_estimators, random_state=241)
    start_time = datetime.datetime.now()
    fold_scores = cross_val_score(estimator, X_train, y_train,
                                  scoring='roc_auc', cv=cv, n_jobs=-1)
    score = fold_scores.mean()  # average ROC AUC over the folds
    print('Scoring:', '{:.3f}'.format(score))
    print('Time elapsed:', datetime.datetime.now() - start_time)
    scores.append(score)
print('Max score', '{:.3f}'.format(max(scores)))

# LogisticRegression
# Cross-validated ROC AUC of logistic regression on all standardized features,
# for every regularization constant C = 10**-5 ... 10**5.
print('Logistic Regression')
scaler = StandardScaler()
X_train_tr = scaler.fit_transform(X_train)
c_range = [10 ** power for power in range(-5, 6)]
scores = []
for C in c_range:
    print('C:', C)
    estimator = LogisticRegression(C=C, random_state=241)
    start_time = datetime.datetime.now()
    fold_scores = cross_val_score(estimator, X_train_tr, y_train,
                                  scoring='roc_auc', cv=cv, n_jobs=-1)
    score = fold_scores.mean()  # average ROC AUC over the folds
    print('Scoring:', '{:.3f}'.format(score))
    print('Time elapsed:', datetime.datetime.now() - start_time)
    scores.append(score)
# Report the first C that reaches the best mean score, then the score itself.
best_score = max(scores)
print('C_max_scores:', c_range[scores.index(best_score)])
print('Max score', '{:.3f}'.format(best_score))

# LogisticRegression with cleared data
# Same experiment as before, but without lobby_type and the ten hero-id
# columns (presumably dropped because they are categorical identifiers rather
# than quantities -- TODO confirm).
print('Logistic Regression, Cleared Data')
dropped_columns = ['lobby_type']
for slot in range(1, 6):
    dropped_columns += ['r{}_hero'.format(slot), 'd{}_hero'.format(slot)]
# drop() returns a new frame, so X_train itself keeps these columns.
X_train_clear = X_train.drop(dropped_columns, axis=1)

X_train_tr = scaler.fit_transform(X_train_clear)
scores = []
for C in c_range:
    print('C:', C)
    estimator = LogisticRegression(C=C, random_state=241)
    start_time = datetime.datetime.now()
    fold_scores = cross_val_score(estimator, X_train_tr, y_train,
                                  scoring='roc_auc', cv=cv, n_jobs=-1)
    score = fold_scores.mean()  # average ROC AUC over the folds
    print('Scoring:', '{:.3f}'.format(score))
    print('Time elapsed:', datetime.datetime.now() - start_time)
    scores.append(score)
# Report the first C that reaches the best mean score, then the score itself.
best_score = max(scores)
print('C_max_scores:', c_range[scores.index(best_score)])
print('Max score', '{:.3f}'.format(best_score))

# LogisticRegression with BagOfWords
# Replace the ten rN_hero / dN_hero id columns with one column per hero:
# +1 when the hero id appears in an r*_hero column, -1 when it appears in a
# d*_hero column, 0 otherwise. The result is appended to X_train_clear.
print('Logistic Regression, Bag of Words')
heroes = pandas.read_csv('heroes.csv', index_col='id')
hero_count = heroes['name'].unique().shape[0]
print(hero_count, 'heroes')
# Hero ids are used below as 1-based column positions, so the matrix has to be
# at least as wide as the largest id in heroes.csv, not only as wide as the
# number of distinct names (the two differ whenever the ids have gaps).
N = max(hero_count, int(heroes.index.max()))

row_positions = np.arange(X_train.shape[0])
X_pick = np.zeros((X_train.shape[0], N))
for p in range(1, 6):
    # Whole-column writes instead of a per-cell .loc lookup. For any single
    # match the write order is still r1, d1, r2, d2, ..., so the result is
    # the same as filling the matrix row by row.
    X_pick[row_positions, X_train['r%d_hero' % p].values.astype(int) - 1] = 1
    X_pick[row_positions, X_train['d%d_hero' % p].values.astype(int) - 1] = -1

# String labels keep every column name of X_train_full a string; recent
# scikit-learn releases refuse frames whose labels mix strings and integers.
X_pick = pandas.DataFrame(
    X_pick,
    index=X_train.index,
    columns=['hero_%d' % hero_id for hero_id in range(1, N + 1)],
)
X_train_full = pandas.concat([X_train_clear, X_pick], axis=1)

X_train_tr = scaler.fit_transform(X_train_full)
scores = []
for C in c_range:
    print('C:', C)
    estimator = LogisticRegression(C=C, random_state=241)
    start_time = datetime.datetime.now()
    fold_scores = cross_val_score(estimator, X_train_tr, y_train,
                                  scoring='roc_auc', cv=cv, n_jobs=-1)
    score = fold_scores.mean()  # average ROC AUC over the folds
    print('Scoring:', '%.3f' % score)
    print('Time elapsed:', datetime.datetime.now() - start_time)
    scores.append(score)
# `scores` is read again by the final-model section to pick the best C.
print('C_max_scores:', c_range[scores.index(max(scores))])
print('Max score', '%.3f' % max(scores))

# LogisticRegression with best params
# Fit the final model on the bag-of-words training matrix with the C that
# scored best in the preceding search, then predict probabilities for the
# test matches.
print('Logistic Regression with Best Parameters')
C = c_range[scores.index(max(scores))]
X_train_tr = scaler.fit_transform(X_train_full)
model = LogisticRegression(C=C, random_state=241)
model.fit(X=X_train_tr, y=y_train)

# Encode the test picks exactly like the training picks: +1 for r*_hero ids,
# -1 for d*_hero ids, written column-wise instead of cell by cell.
row_positions = np.arange(X_test.shape[0])
X_pick = np.zeros((X_test.shape[0], N))
for p in range(1, 6):
    X_pick[row_positions, X_test['r%d_hero' % p].values.astype(int) - 1] = 1
    X_pick[row_positions, X_test['d%d_hero' % p].values.astype(int) - 1] = -1

# Reuse the labels of the last N training columns (the training pick columns)
# so the test frame carries the same feature names the scaler was fitted on.
X_pick = pandas.DataFrame(X_pick, index=X_test.index,
                          columns=X_train_full.columns[-N:])
dropped_columns = ['lobby_type']
for slot in range(1, 6):
    dropped_columns += ['r{}_hero'.format(slot), 'd{}_hero'.format(slot)]
X_test_full = pandas.concat([X_test, X_pick], axis=1).drop(dropped_columns, axis=1)

# transform(), not fit_transform(): the test data is scaled with the
# statistics learned from the training data.
X_test_tr = scaler.transform(X_test_full)
y_test = pandas.DataFrame(model.predict_proba(X_test_tr))
# Column 1 is the probability of the class listed second in model.classes_
# (presumably radiant_win == 1 -- TODO confirm). Selecting the column as a
# Series makes min()/max() return scalars, so float() is never applied to a
# one-element Series (deprecated in pandas).
win_proba = y_test[1]
print('min and max probas', float(win_proba.min()), float(win_proba.max()))

          start_time  lobby_type  r1_hero  r1_level  r1_xp  r1_gold  r1_lh  \
match_id                                                                     
0         1430198770           7       11         5   2098     1489     20   
1         1430220345           0       42         4   1188     1033      9   
2         1430227081           7       33         4   1319     1270     22   
3         1430263531           1       29         4   1779     1056     14   
4         1430282290           7       13         4   1431     1090      8   

          r1_kills  r1_deaths  r1_items  r2_hero  r2_level  r2_xp  r2_gold  \
match_id                                                                     
0                0          0         7       67         3    842      991   
1                0          1        12       49         4   1596      993   
2                0          0        12       98         3   1314      775   
3                0          0         5       30         2    5

first_blood_time : 0.799
first_blood_team : 0.799
first_blood_player1 : 0.799
first_blood_player2 : 0.548
radiant_bottle_time : 0.839
radiant_courier_time : 0.993
radiant_flying_courier_time : 0.717
radiant_first_ward_time : 0.981
dire_bottle_time : 0.834
dire_courier_time : 0.993
dire_flying_courier_time : 0.732
dire_first_ward_time : 0.981
Gradient Boosting
Number of estimators: 5
Scoring: 0.636
Time elapsed: 0:00:50.658489
Number of estimators: 10
Scoring: 0.664
Time elapsed: 0:00:54.591844
Number of estimators: 20
Scoring: 0.683
Time elapsed: 0:01:13.945494
Number of estimators: 30
Scoring: 0.689
Time elapsed: 0:01:38.709876
Number of estimators: 50
Scoring: 0.697
Time elapsed: 0:02:31.238377
Number of estimators: 100
Scoring: 0.706
Time elapsed: 0:04:39.978904
Number of estimators: 200
Scoring: 0.714
Time elapsed: 0:12:45.966853
Number of estimators: 300
Scoring: 0.717
Time elapsed: 0:16:41.318507
Number of estimators: 500
Scoring: 0.721
Time elapsed: 0:22:40.470583
Max score 0.72