In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [3]:
# Import data
skill_lookup = {
    1: "Novice",
    2: "Novice",
    3: None,
    4: "Proficient",
    5: "Proficient",
    6: None,
    7: "Expert"
}
league_lookup = {
    1: "Bronze",
    2: "Silver",
    3: "Gold",
    4: "Platinum",
    5: "Diamond",
    6: "Master",
    7: "Grandmaster"
}
data = pd.read_csv("sc2_prediction_data2021-07-31.csv")
# filter players in specified ranks of skill levels
data_uid = data[data['rank'].isin([1, 2, 4, 5, 7])].copy()
# mutate a new variable "Skill" to map ranks to skill levels
data_uid["Skill"] = data_uid.apply(lambda r: skill_lookup[r['rank']], axis = 1)
# average each player stats (1 player per row)
data_uid = data_uid.groupby(["uid", "Skill", "win"], as_index = False).mean()
# sample an equal number of players in each skill level
data_uid = data_uid.groupby('Skill', as_index = False).apply(lambda r: r.sample(n = 3000))
# data_uid

In [4]:
def powerset(s):
    x = len(s)
    masks = [1 << i for i in range(x)]
    for i in range(1 << x):
        yield [ss for mask, ss in zip(masks, s) if i & mask]

def predict(df, models, predictors_list, responses):
    for model in models:
        print(type(model), '\n')
        print('\n')
        for predictors in predictors_list:
            # Take X, y 
            X = df[predictors].values
            y = df[responses].values.ravel()
            # cross validation method - stratified k-fold
            skf = StratifiedKFold(n_splits=3)
            scores = []
            var_importances = []
            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                model.fit(X_train, y_train)
                scores.append(model.score(X_test, y_test))
                var_importances.append(model.feature_importances_)
            var_importances = pd.DataFrame(var_importances, columns = predictors)
            print("Accuracy:", np.mean(scores), "\n")
            print("Variables importances", "\n")
            print(var_importances.mean(), "\n")
            print("====================0.0====================", "\n")

In [5]:
predictors = ['scout_freq', 'scout_freq_fb', 'scout_mb',
             'scout_first', 'apm', 'rel_apm', 'cps', 'rel_cps']
predictors_powerset = list(powerset(predictors))
predictors_powerset.pop(0)

[]

In [7]:
# Skill 
# gradient boosting machine
clf = GradientBoostingClassifier(n_estimators = 100, learning_rate = 1.0, 
                                     max_depth = 1, random_state = 0)
models = [clf]
predict(data_uid, models, predictors_powerset, ['Skill'])

<class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'> 



Accuracy: 0.5287777777777778 

Variables importances 

scout_freq    1.0
dtype: float64 


Accuracy: 0.4676666666666667 

Variables importances 

scout_freq_fb    1.0
dtype: float64 


Accuracy: 0.5541111111111111 

Variables importances 

scout_freq       0.748148
scout_freq_fb    0.251852
dtype: float64 


Accuracy: 0.5343333333333334 

Variables importances 

scout_mb    1.0
dtype: float64 


Accuracy: 0.5682222222222223 

Variables importances 

scout_freq    0.709566
scout_mb      0.290434
dtype: float64 


Accuracy: 0.537 

Variables importances 

scout_freq_fb    0.404546
scout_mb         0.595454
dtype: float64 


Accuracy: 0.5777777777777778 

Variables importances 

scout_freq       0.641952
scout_freq_fb    0.189899
scout_mb         0.168149
dtype: float64 


Accuracy: 0.4808888888888889 

Variables importances 

scout_first    1.0
dtype: float64 


Accuracy: 0.5523333333333333 

Variables importance

Accuracy: 0.6141111111111112 

Variables importances 

scout_freq_fb    0.238717
scout_mb         0.245819
scout_first      0.288462
rel_apm          0.227002
dtype: float64 


Accuracy: 0.6265555555555555 

Variables importances 

scout_freq       0.466288
scout_freq_fb    0.140123
scout_mb         0.112976
scout_first      0.092740
rel_apm          0.187873
dtype: float64 


Accuracy: 0.8402222222222222 

Variables importances 

apm        0.899517
rel_apm    0.100483
dtype: float64 


Accuracy: 0.8407777777777778 

Variables importances 

scout_freq    0.009262
apm           0.892581
rel_apm       0.098157
dtype: float64 


Accuracy: 0.8377777777777777 

Variables importances 

scout_freq_fb    0.004151
apm              0.896678
rel_apm          0.099171
dtype: float64 


Accuracy: 0.8403333333333333 

Variables importances 

scout_freq       0.008368
scout_freq_fb    0.003422
apm              0.890700
rel_apm          0.097510
dtype: float64 


Accuracy: 0.8395555555555556 

Variab

Accuracy: 0.7920000000000001 

Variables importances 

scout_freq       0.010666
scout_freq_fb    0.008732
scout_mb         0.004871
apm              0.341691
cps              0.634041
dtype: float64 


Accuracy: 0.7877777777777778 

Variables importances 

scout_first    0.254091
apm            0.263266
cps            0.482643
dtype: float64 


Accuracy: 0.791 

Variables importances 

scout_freq     0.010179
scout_first    0.117835
apm            0.306814
cps            0.565171
dtype: float64 


Accuracy: 0.789 

Variables importances 

scout_freq_fb    0.011895
scout_first      0.012060
apm              0.331130
cps              0.644915
dtype: float64 


Accuracy: 0.7903333333333333 

Variables importances 

scout_freq       0.008471
scout_freq_fb    0.007570
scout_first      0.115563
apm              0.305444
cps              0.562952
dtype: float64 


Accuracy: 0.7912222222222222 

Variables importances 

scout_mb       0.009126
scout_first    0.009986
apm            0.343920
cp

Accuracy: 0.8445555555555555 

Variables importances 

scout_mb       0.002071
scout_first    0.005486
apm            0.345173
rel_apm        0.079605
cps            0.567665
dtype: float64 


Accuracy: 0.8456666666666667 

Variables importances 

scout_freq     0.007212
scout_mb       0.002061
scout_first    0.004122
apm            0.343020
rel_apm        0.078394
cps            0.565191
dtype: float64 


Accuracy: 0.844 

Variables importances 

scout_freq_fb    0.003966
scout_mb         0.001750
scout_first      0.005191
apm              0.343989
rel_apm          0.078857
cps              0.566247
dtype: float64 


Accuracy: 0.8443333333333333 

Variables importances 

scout_freq       0.006082
scout_freq_fb    0.002397
scout_mb         0.002019
scout_first      0.027883
apm              0.334509
rel_apm          0.075470
cps              0.551640
dtype: float64 


Accuracy: 0.5292222222222221 

Variables importances 

rel_cps    1.0
dtype: float64 


Accuracy: 0.5886666666666667 



Accuracy: 0.6109999999999999 

Variables importances 

scout_mb    0.390388
rel_apm     0.138714
rel_cps     0.470898
dtype: float64 


Accuracy: 0.6350000000000001 

Variables importances 

scout_freq    0.422473
scout_mb      0.139850
rel_apm       0.132297
rel_cps       0.305379
dtype: float64 


Accuracy: 0.6165555555555556 

Variables importances 

scout_freq_fb    0.212425
scout_mb         0.224968
rel_apm          0.119144
rel_cps          0.443463
dtype: float64 


Accuracy: 0.6455555555555555 

Variables importances 

scout_freq       0.395441
scout_freq_fb    0.108849
scout_mb         0.085085
rel_apm          0.128649
rel_cps          0.281976
dtype: float64 


Accuracy: 0.5877777777777778 

Variables importances 

scout_first    0.333726
rel_apm        0.162077
rel_cps        0.504197
dtype: float64 


Accuracy: 0.6206666666666667 

Variables importances 

scout_freq     0.450240
scout_first    0.074541
rel_apm        0.149334
rel_cps        0.325884
dtype: float64 


Accur

Accuracy: 0.8196666666666667 

Variables importances 

scout_freq     0.013351
scout_first    0.106422
cps            0.800394
rel_cps        0.079833
dtype: float64 


Accuracy: 0.8134444444444444 

Variables importances 

scout_freq_fb    0.005764
scout_first      0.023804
cps              0.729461
rel_cps          0.240971
dtype: float64 


Accuracy: 0.8203333333333335 

Variables importances 

scout_freq       0.013511
scout_freq_fb    0.005852
scout_first      0.028451
cps              0.864899
rel_cps          0.087288
dtype: float64 


Accuracy: 0.8194444444444445 

Variables importances 

scout_mb       0.002865
scout_first    0.116215
cps            0.799912
rel_cps        0.081009
dtype: float64 


Accuracy: 0.8204444444444444 

Variables importances 

scout_freq     0.014677
scout_mb       0.002204
scout_first    0.035046
cps            0.861424
rel_cps        0.086649
dtype: float64 


Accuracy: 0.8201111111111111 

Variables importances 

scout_freq_fb    0.006038
scout_mb

Accuracy: 0.8203333333333332 

Variables importances 

scout_mb       0.003003
scout_first    0.075914
rel_apm        0.026798
cps            0.826893
rel_cps        0.067392
dtype: float64 


Accuracy: 0.8222222222222223 

Variables importances 

scout_freq     0.014379
scout_mb       0.002028
scout_first    0.020910
rel_apm        0.028094
cps            0.865819
rel_cps        0.068770
dtype: float64 


Accuracy: 0.8224444444444444 

Variables importances 

scout_freq_fb    0.006233
scout_mb         0.001574
scout_first      0.159663
rel_apm          0.024176
cps              0.749421
rel_cps          0.058933
dtype: float64 


Accuracy: 0.823 

Variables importances 

scout_freq       0.013351
scout_freq_fb    0.005424
scout_mb         0.001781
scout_first      0.026018
rel_apm          0.027706
cps              0.858737
rel_cps          0.066983
dtype: float64 


Accuracy: 0.8496666666666667 

Variables importances 

apm        0.320138
rel_apm    0.061199
cps        0.575486
rel_

In [8]:
# Winner
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=1, random_state=0, loss='ls')
# models = [est]
predict(data_uid, models, predictors_powerset, ['win'])

<class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'> 



Accuracy: 0.5500016328520333 

Variables importances 

scout_freq    1.0
dtype: float64 


Accuracy: 0.5735568199631035 

Variables importances 

scout_freq_fb    1.0
dtype: float64 


Accuracy: 0.5663360413336342 

Variables importances 

scout_freq       0.299156
scout_freq_fb    0.700844
dtype: float64 


Accuracy: 0.5463342254445437 

Variables importances 

scout_mb    1.0
dtype: float64 


Accuracy: 0.5521141514200908 

Variables importances 

scout_freq    0.901588
scout_mb      0.098412
dtype: float64 


Accuracy: 0.5707790787408853 

Variables importances 

scout_freq_fb    0.894118
scout_mb         0.105882
dtype: float64 


Accuracy: 0.5663362266546424 

Variables importances 

scout_freq       0.266244
scout_freq_fb    0.667933
scout_mb         0.065823
dtype: float64 


Accuracy: 0.5403322621480292 

Variables importances 

scout_first    1.0
dtype: float64 


Accuracy: 0.5494465217162802 

Variabl

Accuracy: 0.6696701969386639 

Variables importances 

scout_freq_fb    0.072640
scout_mb         0.020117
scout_first      0.034120
rel_apm          0.873123
dtype: float64 


Accuracy: 0.6691144931361782 

Variables importances 

scout_freq       0.031181
scout_freq_fb    0.065991
scout_mb         0.020958
scout_first      0.028822
rel_apm          0.853048
dtype: float64 


Accuracy: 0.6442262692103262 

Variables importances 

apm        0.184524
rel_apm    0.815476
dtype: float64 


Accuracy: 0.6448927137782028 

Variables importances 

scout_freq    0.036569
apm           0.176437
rel_apm       0.786994
dtype: float64 


Accuracy: 0.6385614166796635 

Variables importances 

scout_freq_fb    0.055358
apm              0.170300
rel_apm          0.774342
dtype: float64 


Accuracy: 0.6461146767781739 

Variables importances 

scout_freq       0.035682
scout_freq_fb    0.054783
apm              0.162546
rel_apm          0.746988
dtype: float64 


Accuracy: 0.6447829730252687 

Variab

Accuracy: 0.5990120051988647 

Variables importances 

scout_freq       0.030231
scout_freq_fb    0.065304
scout_mb         0.017234
apm              0.631631
cps              0.255600
dtype: float64 


Accuracy: 0.5996796720508278 

Variables importances 

scout_first    0.050449
apm            0.673966
cps            0.275584
dtype: float64 


Accuracy: 0.6013467834089018 

Variables importances 

scout_freq     0.025206
scout_first    0.048334
apm            0.658311
cps            0.268148
dtype: float64 


Accuracy: 0.6093459322853505 

Variables importances 

scout_freq_fb    0.061363
scout_first      0.044685
apm              0.635775
cps              0.258178
dtype: float64 


Accuracy: 0.6065683023347497 

Variables importances 

scout_freq       0.020173
scout_freq_fb    0.060669
scout_first      0.035699
apm              0.628026
cps              0.255433
dtype: float64 


Accuracy: 0.6004571535322764 

Variables importances 

scout_mb       0.017855
scout_first    0.052393


Accuracy: 0.6607829743339106 

Variables importances 

scout_mb       0.012759
scout_first    0.027053
apm            0.203702
rel_apm        0.594562
cps            0.161923
dtype: float64 


Accuracy: 0.6615613077537255 

Variables importances 

scout_freq     0.013526
scout_mb       0.010101
scout_first    0.025959
apm            0.202188
rel_apm        0.588834
cps            0.159392
dtype: float64 


Accuracy: 0.6633382337165938 

Variables importances 

scout_freq_fb    0.038470
scout_mb         0.010624
scout_first      0.025046
apm              0.193890
rel_apm          0.579015
cps              0.152955
dtype: float64 


Accuracy: 0.661116455766026 

Variables importances 

scout_freq       0.014115
scout_freq_fb    0.036116
scout_mb         0.010546
scout_first      0.020631
apm              0.193275
rel_apm          0.572965
cps              0.152352
dtype: float64 


Accuracy: 0.563334633284095 

Variables importances 

rel_cps    1.0
dtype: float64 


Accuracy: 0.56044733

Accuracy: 0.6842272724820426 

Variables importances 

scout_mb    0.026256
rel_apm     0.779442
rel_cps     0.194301
dtype: float64 


Accuracy: 0.6857826799635077 

Variables importances 

scout_freq    0.024440
scout_mb      0.020313
rel_apm       0.770611
rel_cps       0.184637
dtype: float64 


Accuracy: 0.6841153464572608 

Variables importances 

scout_freq_fb    0.049423
scout_mb         0.019443
rel_apm          0.753941
rel_cps          0.177193
dtype: float64 


Accuracy: 0.6876712356301372 

Variables importances 

scout_freq       0.020952
scout_freq_fb    0.045984
scout_mb         0.018112
rel_apm          0.741484
rel_cps          0.173469
dtype: float64 


Accuracy: 0.6806720869512195 

Variables importances 

scout_first    0.026227
rel_apm        0.782705
rel_cps        0.191068
dtype: float64 


Accuracy: 0.6814495315314294 

Variables importances 

scout_freq     0.025433
scout_first    0.023885
rel_apm        0.766948
rel_cps        0.183734
dtype: float64 


Accur

Accuracy: 0.5668886703086177 

Variables importances 

scout_first    0.097461
cps            0.274174
rel_cps        0.628365
dtype: float64 


Accuracy: 0.5631104478641239 

Variables importances 

scout_freq     0.196539
scout_first    0.096691
cps            0.188114
rel_cps        0.518655
dtype: float64 


Accuracy: 0.5682233370988894 

Variables importances 

scout_freq_fb    0.277372
scout_first      0.062279
cps              0.192920
rel_cps          0.467428
dtype: float64 


Accuracy: 0.5742214120122556 

Variables importances 

scout_freq       0.073960
scout_freq_fb    0.247604
scout_first      0.057673
cps              0.180012
rel_cps          0.440750
dtype: float64 


Accuracy: 0.5681122631112391 

Variables importances 

scout_mb       0.048023
scout_first    0.090027
cps            0.262320
rel_cps        0.599630
dtype: float64 


Accuracy: 0.5675548554814037 

Variables importances 

scout_freq     0.194758
scout_mb       0.011597
scout_first    0.095875
cps       

Accuracy: 0.6756711976424787 

Variables importances 

scout_freq       0.014061
scout_freq_fb    0.044001
scout_first      0.018733
rel_apm          0.723308
cps              0.046582
rel_cps          0.153315
dtype: float64 


Accuracy: 0.6751158268153388 

Variables importances 

scout_mb       0.019044
scout_first    0.022589
rel_apm        0.742897
cps            0.046327
rel_cps        0.169143
dtype: float64 


Accuracy: 0.6707821599140672 

Variables importances 

scout_freq     0.015752
scout_mb       0.018241
scout_first    0.022683
rel_apm        0.735882
cps            0.043024
rel_cps        0.164418
dtype: float64 


Accuracy: 0.6748926047288079 

Variables importances 

scout_freq_fb    0.043052
scout_mb         0.013545
scout_first      0.021236
rel_apm          0.720382
cps              0.049319
rel_cps          0.152466
dtype: float64 


Accuracy: 0.6768929382473635 

Variables importances 

scout_freq       0.012937
scout_freq_fb    0.043944
scout_mb         0.014552

In [17]:
# Logistic regression
clf = LogisticRegression(random_state=0)
# models = [clf]
# predict(data_uid, models, predictors_powerset, ['Win'])
X = data_uid[predictors].values
y = data_uid['Win'].values.ravel()
clf.fit(X, y)
clf.score(X, y)



0.6878888888888889