In [112]:
import pandas as pd
import pickle

In [4]:
# Load the dataset; the first CSV column is used as the row index.
df_matches = pd.read_csv(
    './data/dataset.csv',
    index_col=0,
)

In [5]:
# Preview the first rows to sanity-check the load.
df_matches.head()

Unnamed: 0,account_id,assists,attack_type,camps_stacked,creeps_stacked,deaths,denies,duration,first_blood_time,gold_per_min,...,cm_enabled,int_gain,legs,move_speed,pro_ban,pro_pick,pro_win,projectile_speed,str_gain,turn_rate
0,90882159,5,Ranged,0.0,0.0,4,12,1704,0,400,...,True,2.5,2,295,218.0,244.0,124.0,1500,2.0,0.5
1,114239371,6,Melee,0.0,0.0,6,1,1704,0,216,...,True,2.0,2,285,96.0,196.0,99.0,900,3.5,0.6
2,148215639,7,Ranged,0.0,0.0,6,13,1704,0,408,...,True,2.1,2,320,70.0,135.0,66.0,3000,2.1,0.6
3,111189717,6,Melee,2.0,6.0,3,12,1704,0,382,...,True,1.6,2,315,13.0,21.0,13.0,900,2.6,0.5
4,119576842,8,Ranged,0.0,0.0,5,3,1704,0,204,...,True,2.5,2,300,40.0,155.0,69.0,1200,2.2,0.5


In [10]:
# Confirm that the 'lane_role' column is present in the loaded frame.
'lane_role' in df_matches.columns

True

In [21]:
# Lane name -> integer code.
# NOTE(review): presumably these codes line up with the values of the
# `lane_role` column -- confirm against the data source.
LANES = dict(safe=1, mid=2, off=3, jungle=4)

In [22]:
# Count how many rows fall under each lane_role value.
df_matches['lane_role'].value_counts()

2.0    2140
1.0    1932
3.0    1454
4.0      70
Name: lane_role, dtype: int64

In [32]:
# Print one (column name, dtype) tuple per line for every column of the frame.
for name_and_dtype in zip(df_matches.columns, df_matches.dtypes):
    print(name_and_dtype)

('account_id', dtype('int64'))
('assists', dtype('int64'))
('attack_type', dtype('O'))
('camps_stacked', dtype('float64'))
('creeps_stacked', dtype('float64'))
('deaths', dtype('int64'))
('denies', dtype('int64'))
('duration', dtype('int64'))
('first_blood_time', dtype('int64'))
('gold_per_min', dtype('int64'))
('gold_spent', dtype('int64'))
('hero_damage', dtype('int64'))
('hero_healing', dtype('int64'))
('hero_id', dtype('int64'))
('is_roaming', dtype('bool'))
('kills', dtype('int64'))
('lane', dtype('float64'))
('lane_role', dtype('float64'))
('last_hits', dtype('int64'))
('leagueid', dtype('int64'))
('leaguename', dtype('O'))
('level', dtype('int64'))
('localized_name', dtype('O'))
('match_id', dtype('int64'))
('observers_placed', dtype('float64'))
('player_slot', dtype('int64'))
('primary_attr', dtype('O'))
('roshans_killed', dtype('float64'))
('rune_pickups', dtype('float64'))
('start_time', dtype('int64'))
('stuns', dtype('float64'))
('t_experience_cnt_10', dtype('float64'))
('t

In [70]:
# Columns that are never used as features: identifiers, timestamps, the league
# name, and the 'win' column (which is used as the target further down).
id_columns = [
    'hero_id',
    'match_id',
    'account_id',
    'team_id',
    'start_time',
    'id',
    'datetime',
    'leaguename',
    'win',
]

# Split the remaining columns by dtype in a single pass: object-dtype columns
# are treated as categorical, everything else as numerical.
categorical_columns = []
numerical_columns = []
for column_name, column_dtype in df_matches.dtypes.items():
    if column_name in id_columns:
        continue
    if column_dtype.name == 'object':
        categorical_columns.append(column_name)
    else:
        numerical_columns.append(column_name)

# Feature columns, numerical ones first.
valid_columns = numerical_columns + categorical_columns

## Transform

In [106]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Drop id columns

In [97]:
# Features: every column kept in valid_columns. Target: the 'win' column.
X = df_matches.loc[:, valid_columns]
y = df_matches.loc[:, 'win']

### One-hot encoding

In [98]:
# Rebuild the features from df_matches instead of from X: the previous version
# overwrote X in place, so running this cell a second time raised a KeyError
# because the categorical columns had already been dropped.
features_raw = df_matches[valid_columns]

# One indicator column per level of each categorical (object-dtype) column.
df_dummies = pd.get_dummies(features_raw[categorical_columns])
binary_columns = df_dummies.columns

# Final design matrix: the numerical columns followed by the indicator columns.
X = pd.concat([features_raw.drop(categorical_columns, axis=1), df_dummies], axis=1)

### Data Partition

In [99]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12345,
)

# Report the number of rows in each partition.
N_train = X_train.shape[0]
N_test = X_test.shape[0]
print(N_train, N_test)

3917 1679


In [101]:
# L1-penalised logistic regression fitted without an intercept term.
# The solver is named explicitly: this cell relied on 'liblinear' being the
# default, but scikit-learn >= 0.22 defaults to 'lbfgs', which rejects
# penalty='l1' with a ValueError. 'liblinear' supports the L1 penalty.
lm = LogisticRegression(penalty='l1', C=1, fit_intercept=False, solver='liblinear')
lm.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [104]:
# Score each partition once and reuse the results: the previous version called
# lm.predict / lm.predict_proba again for every metric on the same data.
train_pred = lm.predict(X_train)
test_pred = lm.predict(X_test)
# Second column of predict_proba, as used for the AUC.
train_proba = lm.predict_proba(X_train)[:, 1]
test_proba = lm.predict_proba(X_test)[:, 1]

print("Train accuracy = %s" % metrics.accuracy_score(y_train, train_pred))
print("Test accuracy = %s" % metrics.accuracy_score(y_test, test_pred))

print("Train AUC = %s" % metrics.roc_auc_score(y_train, train_proba))
print("Test AUC = %s" % metrics.roc_auc_score(y_test, test_proba))

print("Train Recall = %s" % metrics.recall_score(y_train, train_pred))
print("Test Recall = %s" % metrics.recall_score(y_test, test_pred))

print("Train Precision = %s" % metrics.precision_score(y_train, train_pred))
print("Test Precision = %s" % metrics.precision_score(y_test, test_pred))

Train accuracy = 0.983660965024
Test accuracy = 0.95771292436
Train AUC = 0.99878770539
Test AUC = 0.989698992089
Train Recall = 0.985707244948
Test Recall = 0.961152882206
Train Precision = 0.982800982801
Test Precision = 0.950433705081


In [108]:
# Fit one L1-penalised logistic regression per lane_role value and report its
# train/test metrics. The fitted estimators are collected in `model`, keyed by
# the lane_role value.
#
# NOTE(review): the groupby keys are the raw column values (floats such as 1.0
# according to the value_counts output), so any name derived from them carries
# the ".0" suffix -- confirm this is intended.
# NOTE(review): the indicator columns are built separately for every role, so
# the models may not share a feature layout, and that layout is not stored
# alongside the estimator -- confirm how the models are meant to be applied.
model = {}
for role, df_role in df_matches.groupby('lane_role'):
    print('Model for role: {}'.format(role))
    X = df_role[valid_columns]
    y = df_role['win']

    # One-hot encode the categorical columns of this role's rows only.
    df_dummies = pd.get_dummies(X[categorical_columns])
    binary_columns = df_dummies.columns
    X = pd.concat([X, df_dummies], axis=1)
    X.drop(categorical_columns, axis=1, inplace=True)

    # Same 70/30 split and seed as the single global model.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12345)
    N_train, _ = X_train.shape
    N_test, _ = X_test.shape
    print(N_train, N_test)

    # The solver is named explicitly: scikit-learn >= 0.22 defaults to 'lbfgs',
    # which rejects penalty='l1'; 'liblinear' (the former default) supports it.
    lm = LogisticRegression(penalty='l1', C=1, fit_intercept=False, solver='liblinear')
    lm.fit(X_train, y_train)
    model[role] = lm

    # Score each partition once and reuse the results for every metric.
    train_pred = lm.predict(X_train)
    test_pred = lm.predict(X_test)
    train_proba = lm.predict_proba(X_train)[:, 1]
    test_proba = lm.predict_proba(X_test)[:, 1]

    print("Train accuracy = %s" % metrics.accuracy_score(y_train, train_pred))
    print("Test accuracy = %s" % metrics.accuracy_score(y_test, test_pred))

    print("Train AUC = %s" % metrics.roc_auc_score(y_train, train_proba))
    print("Test AUC = %s" % metrics.roc_auc_score(y_test, test_proba))

    print("Train Recall = %s" % metrics.recall_score(y_train, train_pred))
    print("Test Recall = %s" % metrics.recall_score(y_test, test_pred))

    print("Train Precision = %s" % metrics.precision_score(y_train, train_pred))
    print("Test Precision = %s" % metrics.precision_score(y_test, test_pred))

    print()

Model for role: 1.0
1352 580
Train accuracy = 1.0
Test accuracy = 0.929310344828
Train AUC = 1.0
Test AUC = 0.974701159679
Train Recall = 1.0
Test Recall = 0.926315789474
Train Precision = 1.0
Test Precision = 0.929577464789

Model for role: 2.0
1498 642
Train accuracy = 1.0
Test accuracy = 0.933021806854
Train AUC = 1.0
Test AUC = 0.972606197096
Train Recall = 1.0
Test Recall = 0.953987730061
Train Precision = 1.0
Test Precision = 0.917404129794

Model for role: 3.0
1017 437
Train accuracy = 1.0
Test accuracy = 0.949656750572
Train AUC = 1.0
Test AUC = 0.973769117955
Train Recall = 1.0
Test Recall = 0.944186046512
Train Precision = 1.0
Test Precision = 0.953051643192

Model for role: 4.0
49 21
Train accuracy = 1.0
Test accuracy = 0.904761904762
Train AUC = 1.0
Test AUC = 0.990384615385
Train Recall = 1.0
Test Recall = 0.923076923077
Train Precision = 1.0
Test Precision = 0.923076923077



In [116]:
import os

In [119]:
# Pickle every fitted estimator into the 'data' directory (resolved against the
# current working directory), one file per key of `model`.
output_dir = os.path.abspath('data')
for role, model_obj in model.items():
    file_path = os.path.join(output_dir, 'model_{}.sklearn'.format(role))
    with open(file_path, 'wb') as model_file:
        pickle.dump(model_obj, model_file)