In [1]:
import pickle
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import sys
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE

with open('../data/ludo_qval_features_11_to_15.pkl', "rb") as input_file:
    feats, y = pickle.load(input_file)

In [2]:
X_train, X_test, y_train, y_test = train_test_split(feats, y, test_size=0.25, random_state=42)

skill_1 = X_train[:,10] - 3*X_train[:,11]
skill_2 = X_train[:,22] - 3*X_train[:,23]
bins = skill_1 - skill_2

a_bins = np.ceil(bins)

X_train = X_train[np.logical_and(a_bins >= -10, a_bins <= 10)]
y_train = y_train[np.logical_and(a_bins >= -10, a_bins <= 10)]
a_bins = a_bins[np.logical_and(a_bins >= -10, a_bins <= 10)]

a_bins = np.expand_dims(a_bins, -1)

under_sampler = RandomUnderSampler(random_state=42)
_, a_bins = under_sampler.fit_resample(a_bins, a_bins)

indices = under_sampler.sample_indices_
X_train = X_train[indices]
y_train = y_train[indices]

In [3]:
clf = Pipeline([('scaler', StandardScaler()), ('gbc', GradientBoostingClassifier(random_state=42, verbose=1))])
# clf = Pipeline([('scaler', StandardScaler()), ('gbc', LogisticRegression(random_state=42, verbose=1))])
# clf = GradientBoostingClassifier(random_state=42, verbose=1)
clf = clf.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           1.3764            7.41s
         2           1.3750            7.30s
         3           1.3738            7.23s
         4           1.3725            7.13s
         5           1.3713            7.05s
         6           1.3701            6.96s
         7           1.3693            6.90s
         8           1.3684            6.82s
         9           1.3675            6.78s
        10           1.3667            6.69s
        20           1.3590            5.94s
        30           1.3531            5.22s
        40           1.3486            4.50s
        50           1.3448            3.78s
        60           1.3405            3.03s
        70           1.3361            2.27s
        80           1.3327            1.52s
        90           1.3289            0.76s
       100           1.3259            0.00s


In [4]:
y_pred = clf.predict(X_test)
y_probs = clf.predict_proba(X_test)

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay

def perf(ytrue, ypred):
    accu = accuracy_score(ytrue, ypred)
    prec = precision_score(ytrue, ypred)
    rec = recall_score(ytrue, ypred)
    return accu*100, prec*100, rec*100

def acc_func(ytrue, ypred):
    accu = accuracy_score(ytrue, ypred)
    return accu*100

In [6]:
acc, precision, recall = perf(y_test, y_pred)
print(acc, precision, recall)

53.978013787963484 54.72375836098383 85.39259356662667


In [7]:
skill_1 = X_test[:,10] - 3*X_test[:,11]
skill_2 = X_test[:,22] - 3*X_test[:,23]
bins = np.abs(skill_1 - skill_2)

In [8]:
u1 = []
tot = []
for i in range(25):
    idx = np.logical_and(bins>i, bins<(i+1))
    temp = y_test[idx]
    u1.append(np.sum(temp[skill_1[idx]>skill_2[idx]]) + np.sum(temp[skill_1[idx]<skill_2[idx]]==0))
    tot.append(len(skill_1[idx]))

In [9]:
import pandas as pd
pd.DataFrame(np.array([u1, tot]).T, columns=['1', '2'])

Unnamed: 0,1,2
0,29185,58515
1,3485,7061
2,838,1775
3,896,1919
4,2977,6263
5,9699,20877
6,1465,3244
7,310,667
8,233,535
9,1496,3325


In [10]:
np.array(u1)/np.array(tot)

array([0.498761  , 0.49355615, 0.47211268, 0.46690985, 0.47533131,
       0.46457824, 0.45160296, 0.46476762, 0.43551402, 0.44992481,
       0.44166192, 0.41813261, 0.44148936, 0.4494382 , 0.43345112,
       0.44009217, 0.45833333, 0.3974359 , 0.47126437, 0.44034918,
       0.44102019, 0.46621622, 0.57142857, 0.38888889, 0.53846154])

In [11]:
bins = np.abs(y_probs[:,1]-0.5)

In [12]:
u1 = []
tot = []
for i in range(25):
    idx = np.logical_and(bins>i*0.05, bins<(i+1)*0.05)
    temp = y_test[idx]
    u1.append(np.sum(temp[y_probs[:,1][idx]>=0.5]==1) + np.sum(temp[y_probs[:,1][idx]<0.5]==0))
    tot.append(len(y_test[idx]))

In [13]:
pd.DataFrame(np.array([u1, tot]).T, columns=['1', '2'])

Unnamed: 0,1,2
0,30651,58560
1,23446,42467
2,5870,10185
3,784,1343
4,82,143
5,3,8
6,1,1
7,0,0
8,0,0
9,0,0


In [14]:
np.array(u1)/np.array(tot)

  np.array(u1)/np.array(tot)


array([0.52341189, 0.55209928, 0.57633775, 0.58376768, 0.57342657,
       0.375     , 1.        ,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan])