In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid

In [16]:
# Import data
skill_lookup = {
    1: "Novice",
    2: "Novice",
    3: None,
    4: "Proficient",
    5: "Proficient",
    6: None,
    7: "Expert"
}
league_lookup = {
    1: "Bronze",
    2: "Silver",
    3: "Gold",
    4: "Platinum",
    5: "Diamond",
    6: "Master",
    7: "Grandmaster"
}
data = pd.read_csv("sc2_prediction_data2021-08-03.csv")
# filter players in specified ranks of skill levels
data_uid = data[data['rank'].isin([1, 2, 4, 5, 7])].copy()
# mutate a new variable "Skill" to map ranks to skill levels
data_uid["Skill"] = data_uid.apply(lambda r: skill_lookup[r['rank']], axis = 1)
# average each player stats (1 player per row)
data_uid = data_uid.groupby(["uid", "Skill", "win"], as_index = False).mean()
# sample an equal number of players in each skill level
data_uid = data_uid.groupby('Skill', as_index = False).apply(lambda r: r.sample(n = 3000))
# data_uid

In [17]:
def powerset(s):
    x = len(s)
    masks = [1 << i for i in range(x)]
    for i in range(1 << x):
        yield [ss for mask, ss in zip(masks, s) if i & mask]
        
def score_est(est, X, y, cv):
#     est.fit(X_train, y_train)
#     return est.score(X_test, y_test)
    return np.mean(cross_val_score(est, X, y, cv=cv))

def predict(df, models, predictors_list, responses, grid, pool):
    for model in models:
        print(type(model), '\n')
        print('\n')
        for predictors in predictors_list:
            # Take X, y 
            X = df[predictors].values
            y = df[responses].values.ravel()
            # cross validation method - stratified k-fold
            skf = StratifiedKFold(n_splits=3)
            tasks = []
            var_importances = []
            for params in grid:
                param_scores = []
                est = model(**params)
#                 for train_index, test_index in skf.split(X, y):
#                     X_train, X_test = X[train_index], X[test_index]
#                     y_train, y_test = y[train_index], y[test_index]
#                     param_scores.append(pool.apply_async(score_est, (est, X_train, y_train, X_test, y_test)))
                
                tasks.append((pool.apply_async(score_est, (est, X, y, skf)), params))
            scores = [(task.get(), params) for task, params in tasks]
            avg_score, best_params = max(scores, key=lambda x: x[0])
            best_model = model(**best_params)
            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                best_model.fit(X_train, y_train)
                var_importances.append(best_model.feature_importances_)
            var_importances = pd.DataFrame(var_importances, columns = predictors)
            print("Accuracy:", avg_score, "\n")
            print("Variables importances", "\n")
            print(var_importances.mean(), "\n")
            print("Parameters", best_params)
            print("====================0.0====================", "\n")

In [18]:
predictors = ['scout_freq', 'scout_freq_fb', 'scout_mb',
             'scout_first', 'apm', 'rel_apm', 'cps', 'rel_cps']
predictors_powerset = list(powerset(predictors))
predictors_powerset.pop(0)

[]

In [7]:
# Skill 
# gradient boosting machine
grid = {}
clf = GradientBoostingClassifier(n_estimators = 100, learning_rate = 1.0, 
                                     max_depth = 1, random_state = 0)
models = [GradientBoostingClassifier]

predict(data_uid, models, predictors_powerset, ['Skill'])

<class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'> 



Accuracy: 0.5287777777777778 

Variables importances 

scout_freq    1.0
dtype: float64 


Accuracy: 0.4676666666666667 

Variables importances 

scout_freq_fb    1.0
dtype: float64 


Accuracy: 0.5541111111111111 

Variables importances 

scout_freq       0.748148
scout_freq_fb    0.251852
dtype: float64 


Accuracy: 0.5343333333333334 

Variables importances 

scout_mb    1.0
dtype: float64 


Accuracy: 0.5682222222222223 

Variables importances 

scout_freq    0.709566
scout_mb      0.290434
dtype: float64 


Accuracy: 0.537 

Variables importances 

scout_freq_fb    0.404546
scout_mb         0.595454
dtype: float64 


Accuracy: 0.5777777777777778 

Variables importances 

scout_freq       0.641952
scout_freq_fb    0.189899
scout_mb         0.168149
dtype: float64 


Accuracy: 0.4808888888888889 

Variables importances 

scout_first    1.0
dtype: float64 


Accuracy: 0.5523333333333333 

Variables importance

Accuracy: 0.6141111111111112 

Variables importances 

scout_freq_fb    0.238717
scout_mb         0.245819
scout_first      0.288462
rel_apm          0.227002
dtype: float64 


Accuracy: 0.6265555555555555 

Variables importances 

scout_freq       0.466288
scout_freq_fb    0.140123
scout_mb         0.112976
scout_first      0.092740
rel_apm          0.187873
dtype: float64 


Accuracy: 0.8402222222222222 

Variables importances 

apm        0.899517
rel_apm    0.100483
dtype: float64 


Accuracy: 0.8407777777777778 

Variables importances 

scout_freq    0.009262
apm           0.892581
rel_apm       0.098157
dtype: float64 


Accuracy: 0.8377777777777777 

Variables importances 

scout_freq_fb    0.004151
apm              0.896678
rel_apm          0.099171
dtype: float64 


Accuracy: 0.8403333333333333 

Variables importances 

scout_freq       0.008368
scout_freq_fb    0.003422
apm              0.890700
rel_apm          0.097510
dtype: float64 


Accuracy: 0.8395555555555556 

Variab

Accuracy: 0.7920000000000001 

Variables importances 

scout_freq       0.010666
scout_freq_fb    0.008732
scout_mb         0.004871
apm              0.341691
cps              0.634041
dtype: float64 


Accuracy: 0.7877777777777778 

Variables importances 

scout_first    0.254091
apm            0.263266
cps            0.482643
dtype: float64 


Accuracy: 0.791 

Variables importances 

scout_freq     0.010179
scout_first    0.117835
apm            0.306814
cps            0.565171
dtype: float64 


Accuracy: 0.789 

Variables importances 

scout_freq_fb    0.011895
scout_first      0.012060
apm              0.331130
cps              0.644915
dtype: float64 


Accuracy: 0.7903333333333333 

Variables importances 

scout_freq       0.008471
scout_freq_fb    0.007570
scout_first      0.115563
apm              0.305444
cps              0.562952
dtype: float64 


Accuracy: 0.7912222222222222 

Variables importances 

scout_mb       0.009126
scout_first    0.009986
apm            0.343920
cp

Accuracy: 0.8445555555555555 

Variables importances 

scout_mb       0.002071
scout_first    0.005486
apm            0.345173
rel_apm        0.079605
cps            0.567665
dtype: float64 


Accuracy: 0.8456666666666667 

Variables importances 

scout_freq     0.007212
scout_mb       0.002061
scout_first    0.004122
apm            0.343020
rel_apm        0.078394
cps            0.565191
dtype: float64 


Accuracy: 0.844 

Variables importances 

scout_freq_fb    0.003966
scout_mb         0.001750
scout_first      0.005191
apm              0.343989
rel_apm          0.078857
cps              0.566247
dtype: float64 


Accuracy: 0.8443333333333333 

Variables importances 

scout_freq       0.006082
scout_freq_fb    0.002397
scout_mb         0.002019
scout_first      0.027883
apm              0.334509
rel_apm          0.075470
cps              0.551640
dtype: float64 


Accuracy: 0.5292222222222221 

Variables importances 

rel_cps    1.0
dtype: float64 


Accuracy: 0.5886666666666667 



Accuracy: 0.6109999999999999 

Variables importances 

scout_mb    0.390388
rel_apm     0.138714
rel_cps     0.470898
dtype: float64 


Accuracy: 0.6350000000000001 

Variables importances 

scout_freq    0.422473
scout_mb      0.139850
rel_apm       0.132297
rel_cps       0.305379
dtype: float64 


Accuracy: 0.6165555555555556 

Variables importances 

scout_freq_fb    0.212425
scout_mb         0.224968
rel_apm          0.119144
rel_cps          0.443463
dtype: float64 


Accuracy: 0.6455555555555555 

Variables importances 

scout_freq       0.395441
scout_freq_fb    0.108849
scout_mb         0.085085
rel_apm          0.128649
rel_cps          0.281976
dtype: float64 


Accuracy: 0.5877777777777778 

Variables importances 

scout_first    0.333726
rel_apm        0.162077
rel_cps        0.504197
dtype: float64 


Accuracy: 0.6206666666666667 

Variables importances 

scout_freq     0.450240
scout_first    0.074541
rel_apm        0.149334
rel_cps        0.325884
dtype: float64 


Accur

Accuracy: 0.8196666666666667 

Variables importances 

scout_freq     0.013351
scout_first    0.106422
cps            0.800394
rel_cps        0.079833
dtype: float64 


Accuracy: 0.8134444444444444 

Variables importances 

scout_freq_fb    0.005764
scout_first      0.023804
cps              0.729461
rel_cps          0.240971
dtype: float64 


Accuracy: 0.8203333333333335 

Variables importances 

scout_freq       0.013511
scout_freq_fb    0.005852
scout_first      0.028451
cps              0.864899
rel_cps          0.087288
dtype: float64 


Accuracy: 0.8194444444444445 

Variables importances 

scout_mb       0.002865
scout_first    0.116215
cps            0.799912
rel_cps        0.081009
dtype: float64 


Accuracy: 0.8204444444444444 

Variables importances 

scout_freq     0.014677
scout_mb       0.002204
scout_first    0.035046
cps            0.861424
rel_cps        0.086649
dtype: float64 


Accuracy: 0.8201111111111111 

Variables importances 

scout_freq_fb    0.006038
scout_mb

Accuracy: 0.8203333333333332 

Variables importances 

scout_mb       0.003003
scout_first    0.075914
rel_apm        0.026798
cps            0.826893
rel_cps        0.067392
dtype: float64 


Accuracy: 0.8222222222222223 

Variables importances 

scout_freq     0.014379
scout_mb       0.002028
scout_first    0.020910
rel_apm        0.028094
cps            0.865819
rel_cps        0.068770
dtype: float64 


Accuracy: 0.8224444444444444 

Variables importances 

scout_freq_fb    0.006233
scout_mb         0.001574
scout_first      0.159663
rel_apm          0.024176
cps              0.749421
rel_cps          0.058933
dtype: float64 


Accuracy: 0.823 

Variables importances 

scout_freq       0.013351
scout_freq_fb    0.005424
scout_mb         0.001781
scout_first      0.026018
rel_apm          0.027706
cps              0.858737
rel_cps          0.066983
dtype: float64 


Accuracy: 0.8496666666666667 

Variables importances 

apm        0.320138
rel_apm    0.061199
cps        0.575486
rel_

In [None]:
# Winner
grid = {"loss": ["deviance", "exponential"], "max_depth": [2, 3, 5, 7], 
       "n_estimators": [500, 1000, 2000], "min_samples_leaf": [5, 10, 20, 50, 70], 
       "min_samples_split": [200, 400, 600, 800, 1000, 1500], "n_iter_no_change": [100]
       "learning_rate": [0.01, 0.05, 0.1], "subsample": [0.5, 1.0]}
grid_quick = {"loss": ["deviance", "exponential"], "max_depth": [3, 5], 
       "n_estimators": [500], "n_iter_no_change": [100],
       "learning_rate": [0.01, 0.1], "subsample": [0.5, 1.0]}
with Pool(10) as pool:
    predict(data_uid, [GradientBoostingClassifier], predictors_powerset, ['win'], ParameterGrid(grid), pool)

<class 'abc.ABCMeta'> 



Accuracy: 0.5675574491113214 

Variables importances 

scout_freq    1.0
dtype: float64 

Parameters {'loss': 'deviance', 'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 1500, 'n_estimators': 100}

Accuracy: 0.5756672644074738 

Variables importances 

scout_freq_fb    1.0
dtype: float64 

Parameters {'loss': 'deviance', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 1000, 'n_estimators': 50}

Accuracy: 0.5802235238766879 

Variables importances 

scout_freq       0.34882
scout_freq_fb    0.65118
dtype: float64 

Parameters {'loss': 'deviance', 'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 400, 'n_estimators': 50}

Accuracy: 0.560447263469449 

Variables importances 

scout_mb    1.0
dtype: float64 

Parameters {'loss': 'deviance', 'max_depth': 20, 'min_samples_leaf': 20, 'min_samples_split': 600, 'n_estimators': 100}

Accuracy: 0.5670017825063709 

Variables importances 

scout_freq    0.864313
scout_mb      0.135687
dty

In [17]:
# Logistic regression
clf = LogisticRegression(random_state=0)
# models = [clf]
# predict(data_uid, models, predictors_powerset, ['Win'])
X = data_uid[predictors].values
y = data_uid['Win'].values.ravel()
clf.fit(X, y)
clf.score(X, y)



0.6878888888888889