In [62]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [63]:
df_train = pd.read_csv('main_train.csv')
df_test = pd.read_csv('main_test.csv')
tf_train = pd.read_csv('teamfights_train.csv')
tf_test = pd.read_csv('teamfights_test.csv')

In [64]:
df_train.head()

Unnamed: 0,match_id,radiant,hero,gold_0,lh_0,xp_0,gold_60,lh_60,xp_60,gold_120,...,xp_600,level_180,level_240,level_300,level_360,level_420,level_480,level_540,level_600,radiant_win
0,0,1,Rubick,0.0,0.0,0.0,100.0,0.0,46.0,250.0,...,1741.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0,1,Wraith King,0.0,0.0,0.0,175.0,2.0,124.0,526.0,...,2319.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
2,0,1,Riki,0.0,0.0,0.0,137.0,1.0,93.0,439.0,...,3859.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1
3,0,1,Tusk,0.0,0.0,0.0,100.0,0.0,62.0,200.0,...,1676.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,0,1,Templar Assassin,0.0,0.0,0.0,320.0,3.0,352.0,668.0,...,4453.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1


In [65]:
tf_train.head()

Unnamed: 0,teamfight_id,match_id,player_slot,radiant,buybacks,damage,deaths_player,gold_delta,xp_end,xp_start,start,end,last_death
0,3288,1,0,1,0,673,0,89,2681,2263,423,467,452
1,3288,1,1,1,0,154,0,199,2016,1694,423,467,452
2,3288,1,2,1,0,403,0,320,1640,1230,423,467,452
3,3288,1,3,1,0,201,0,158,2367,1957,423,467,452
4,3288,1,4,1,0,369,1,165,2445,2061,423,467,452


In [66]:
from pandas import get_dummies
df_train = get_dummies(df_train)

In [67]:
df_train = df_train.fillna(method = 'ffill')

In [68]:
radiant_tf = tf_train[tf_train['radiant'] == 1].groupby('match_id')
dare_tf = tf_train[tf_train['radiant'] == 0].groupby('match_id')

features = ['damage', 'deaths_player', 'buybacks', 'gold_delta']

tf_rad_sum = radiant_tf.sum()[features].reset_index()
tf_dare_sum = dare_tf.sum()[features].reset_index()

tf_rad_max = radiant_tf.max()[features].reset_index()
tf_dare_max = dare_tf.max()[features].reset_index()

tf_rad_min = radiant_tf.min()[features].reset_index()
tf_dare_min = dare_tf.min()[features].reset_index()

In [69]:
for feature in df_train:
    if 'hero' in feature:
        df_train[feature + '_level'] = df_train[feature]*df_train['level_600']
        df_train[feature + '_gold'] = df_train[feature]*df_train['gold_600']
        df_train[feature + '_xp'] = df_train[feature]*df_train['xp_600']
        df_train[feature + '_lh'] = df_train[feature]*df_train['lh_600']

In [70]:
radiant_df = df_train[df_train['radiant'] == 1].groupby('match_id')
dare_df = df_train[df_train['radiant'] == 0].groupby('match_id')

features = ['level_600', 'gold_600', 'xp_600', 'lh_600',
           'level_300', 'gold_300', 'xp_300', 'lh_300',
           'level_180', 'gold_180', 'xp_180', 'lh_180',
           'level_420', 'gold_420', 'xp_420', 'lh_420']

df_rad_max = radiant_df[features].max().reset_index()
df_dare_max = dare_df[features].max().reset_index()

df_rad_min = radiant_df[features].min().reset_index()
df_dare_min = dare_df[features].min().reset_index()

features = features + [f for f in df_train if 'hero' in f]

df_rad_sum = radiant_df.sum()[features].reset_index()
df_dare_sum = (-dare_df.sum()[features]).reset_index()

In [71]:
y_train_pd_ = df_train[df_train['radiant'] == 1].groupby('match_id')
y_train_pd = y_train_pd_.aggregate('sum')[['radiant_win']].reset_index()
y_train_pd['radiant_win'] = y_train_pd['radiant_win'] / 5;

In [72]:
feature_tables = [df_rad_sum, df_dare_sum, df_rad_max, df_dare_max, df_rad_min, df_dare_min, tf_rad_sum, 
                  tf_dare_sum, tf_rad_max, tf_dare_max, tf_rad_min, tf_dare_min]

X_all = y_train_pd
for table in feature_tables:
    X_all = pd.merge(X_all, table, how = 'outer', on = ['match_id'])
    
X_all = X_all.fillna(0)

In [73]:
X_train, y_train = X_all.drop(['match_id', 'radiant_win'], axis = 1), X_all['radiant_win']

In [74]:
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(X = X_train, y = y_train, estimator = LogisticRegression(penalty = 'l2'), cv = 3, 
                        scoring = 'roc_auc'))

0.75213205401542771

In [85]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 100, max_depth = 20)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [75]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Test

In [76]:
df_test = get_dummies(df_test)
df_test = df_test.fillna(method = 'ffill')

In [77]:
radiant_tf = tf_test[tf_test['radiant'] == 1].groupby('match_id')
dare_tf = tf_test[tf_test['radiant'] == 0].groupby('match_id')

features = ['damage', 'deaths_player', 'buybacks', 'gold_delta']

tf_rad_sum = radiant_tf.sum()[features].reset_index()
tf_dare_sum = dare_tf.sum()[features].reset_index()

tf_rad_max = radiant_tf.max()[features].reset_index()
tf_dare_max = dare_tf.max()[features].reset_index()

tf_rad_min = radiant_tf.min()[features].reset_index()
tf_dare_min = dare_tf.min()[features].reset_index()

In [78]:
for feature in df_test:
    if 'hero' in feature:
        df_test[feature + '_level'] = df_test[feature]*df_test['level_600']
        df_test[feature + '_gold'] = df_test[feature]*df_test['gold_600']
        df_test[feature + '_xp'] = df_test[feature]*df_test['xp_600']
        df_test[feature +'_lh'] = df_test[feature]*df_test['lh_600']

In [79]:
radiant_df = df_test[df_test['radiant'] == 1].groupby('match_id')
dare_df = df_test[df_test['radiant'] == 0].groupby('match_id')

features = ['level_600', 'gold_600', 'xp_600', 'lh_600',
           'level_300', 'gold_300', 'xp_300', 'lh_300',
           'level_180', 'gold_180', 'xp_180', 'lh_180',
           'level_420', 'gold_420', 'xp_420', 'lh_420']

df_rad_max = radiant_df[features].max().reset_index()
df_dare_max = dare_df[features].max().reset_index()

df_rad_min = radiant_df[features].min().reset_index()
df_dare_min = dare_df[features].min().reset_index()

features += [f for f in df_test if 'hero' in f]

df_rad_sum = radiant_df.sum()[features].reset_index()
df_dare_sum = (-dare_df.sum()[features]).reset_index()

In [80]:
feature_tables = [df_dare_sum, df_rad_max, df_dare_max, df_rad_min, df_dare_min, tf_rad_sum, tf_dare_sum, 
                  tf_rad_max, tf_dare_max, tf_rad_min, tf_dare_min]

X_all = df_rad_sum
for table in feature_tables:
    X_all = pd.merge(X_all, table, how = 'outer', on = ['match_id'])
    
match_ids = X_all['match_id']
X_all = X_all.drop('match_id', axis = 1)
X_all = X_all.fillna(0)

In [81]:
predictions = clf.predict_proba(X_all)

In [86]:
rfc_predictions = rfc.predict_proba(X_all)
rfc_predictions

array([[ 0.33677688,  0.66322312],
       [ 0.14855707,  0.85144293],
       [ 0.54337093,  0.45662907],
       ..., 
       [ 0.41847613,  0.58152387],
       [ 0.71882323,  0.28117677],
       [ 0.44340194,  0.55659806]])

In [89]:
result = pd.DataFrame()
result['index'] = match_ids
result['proba'] = (predictions[:,1] + rfc_predictions[:,1]) / 2
result.to_csv('result', index = None)

In [90]:
result

Unnamed: 0,index,proba
0,39959,0.753010
1,39960,0.865801
2,39961,0.370812
3,39962,0.409118
4,39963,0.318283
5,39964,0.655359
6,39965,0.305807
7,39966,0.176060
8,39967,0.397669
9,39968,0.181758
