In [4]:
%load_ext autoreload
%autoreload 2
import sys
import pickle
import pandas as pd
import numpy as np
import preprocess_all as pal
import read_all as ra
from sklearn.preprocessing import StandardScaler
from columns import colwep,colpla,maps,all_round_status
from sklearn.ensemble import GradientBoostingClassifier,ExtraTreesClassifier,RandomForestClassifier
from catboost import CatBoostClassifier


from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import accuracy_score,hamming_loss,classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from lightgbm import LGBMModel, LGBMClassifier

# Multi Label Pkgs
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset

from sklearn.multioutput import MultiOutputClassifier

import imblearn
from collections import Counter

import pickle
import matplotlib.pyplot as plt
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
## Importing Preprocessed Data
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
# "{:02}" is a zero-padded two-digit placeholder, presumably filled with the file number by ra.read_all.
template="D:/Skybox/Dataset/Win_Reason_Prepro/dataset_{:02}.json"
# Per the recorded output below, this call reads files 0 through 7 and reports a growing shape.
X,y,final_cols = ra.read_all(template,0,8)

Reading 0th file
(3293, 234)
Reading 1th file
(5455, 234)
Reading 2th file
(8806, 234)
Reading 3th file
(11112, 234)
Reading 4th file
(13455, 234)
Reading 5th file
(16985, 234)
Reading 6th file
(20596, 234)
Reading 7th file
(23150, 234)


In [3]:
# Hold out 33% of the rows for testing. shuffle=False keeps the original row
# order, so the test set is the trailing part of the data rather than a random
# sample; stratification is intentionally left disabled.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    shuffle=False,  # stratify=y,
)

In [7]:
# Every y_test entry unpacks into two values; the first is kept as the round
# outcome and the second as the win reason, giving two parallel lists.
y_test_round_outcome = [outcome for outcome, _reason in y_test]
y_test_win_reason = [reason for _outcome, reason in y_test]

In [5]:
# summarize class distribution
y_train_winreason=[j for i,j in y_train]
counter = Counter(y_train_winreason)
print(counter)

Counter({2: 5641, 3: 3831, 1: 3264, 0: 2050, 4: 724})


In [9]:
def model_score(X_tr,y_tr,X_tes,y_tes_winreason,y_tes_round_outcome,win_classes=(2,3)):
    """Fit a random forest on win-reason labels and print test-set metrics.

    The round outcome is not predicted by a separate model: it is derived from
    the win-reason probabilities by summing the probability of the labels in
    ``win_classes`` and predicting outcome 1 when that sum exceeds 0.5.

    Parameters
    ----------
    X_tr, y_tr
        Training features and win-reason labels.
    X_tes
        Test features.
    y_tes_winreason
        True win-reason labels for ``X_tes``.
    y_tes_round_outcome
        True round outcomes for ``X_tes``; compared against 0/1 predictions.
    win_classes
        Win-reason labels whose probabilities add up to the probability of
        round outcome 1. Defaults to ``(2, 3)``, the labels that were
        previously hard-coded.

    Returns
    -------
    None
        Round-outcome accuracy and per-class F1, plus win-reason macro and
        per-class F1, are printed.
    """
    model = RandomForestClassifier(n_estimators=1000, random_state=2021)
    model.fit(X_tr, y_tr)
    pred = model.predict(X_tes)
    prob = model.predict_proba(X_tes)

    # predict_proba columns follow model.classes_, i.e. only the labels seen
    # during fit. Select columns by label rather than by fixed position so a
    # training set that lacks a class cannot shift or overrun the indices.
    win_columns = [idx for idx, label in enumerate(model.classes_) if label in win_classes]
    win_prob = prob[:, win_columns].sum(axis=1)
    t_win = (win_prob > 0.5).astype(int)

    print('round_outcome_accuracy: {}'.format(accuracy_score(y_tes_round_outcome,t_win)))
    print('round_outcome_f1_scores: {}'.format(f1_score(y_tes_round_outcome,t_win,average=None)))
    print('win_reason_f1 macro-score: {}'.format(f1_score(y_tes_winreason,pred,average='macro')))
    print('win_reason_f1_scores: {}'.format(f1_score(y_tes_winreason,pred,average=None)))
    

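## Balancing

Each resampler below is fitted on the training split only; the test split is never resampled, and every variant is scored with `model_score` so it can be compared against the unbalanced baseline.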

In [10]:
# Baseline: train on the original, unresampled training split.
model_score(X_train,y_train_winreason,X_test,y_test_win_reason,y_test_round_outcome)

round_outcome_accuracy: 0.7738219895287958
round_outcome_f1_scores: [0.70860034 0.81518717]
win_reason_f1 macro-score: 0.4203231982627402
win_reason_f1_scores: [0.50744417 0.48052632 0.72745728 0.37015616 0.01603206]


In [11]:
## RandomOverSampler
# Seeded so the resampled rows (and the scores computed from them) are
# reproducible on re-run; the seed matches the other seeded resamplers here.
from imblearn.over_sampling import RandomOverSampler

# NOTE(review): the name `os` would clash with the standard-library module if
# that is ever imported in this notebook; kept because other cells may use it.
os = RandomOverSampler(random_state=1)
X_train_os, y_train_os = os.fit_resample(X_train, y_train_winreason)
print("The number of classes before fit {}".format(Counter(y_train_winreason)))
print("The number of classes after fit {}".format(Counter(y_train_os)))

The number of classes before fit Counter({2: 5641, 3: 3831, 1: 3264, 0: 2050, 4: 724})
The number of classes after fit Counter({0: 5641, 1: 5641, 2: 5641, 3: 5641, 4: 5641})


In [12]:
# Train on the RandomOverSampler output; evaluate on the untouched test split.
model_score(X_train_os,y_train_os,X_test,y_test_win_reason,y_test_round_outcome)

round_outcome_accuracy: 0.7617801047120419
round_outcome_f1_scores: [0.68846286 0.80716253]
win_reason_f1 macro-score: 0.4166856968283764
win_reason_f1_scores: [0.51234568 0.47344461 0.72013829 0.35428907 0.02321083]


In [13]:
# Random under-sampling, version 1 (seeded)
from imblearn.under_sampling import RandomUnderSampler

us1 = RandomUnderSampler(random_state=1)
X_train_us1, y_train_us1 = us1.fit_resample(X_train, y_train_winreason)
print("The number of classes before fit {}".format(Counter(y_train_winreason)))
print("The number of classes after fit {}".format(Counter(y_train_us1)))

The number of classes before fit Counter({2: 5641, 3: 3831, 1: 3264, 0: 2050, 4: 724})
The number of classes after fit Counter({0: 724, 1: 724, 2: 724, 3: 724, 4: 724})


In [14]:
# Train on the RandomUnderSampler output; evaluate on the untouched test split.
model_score(X_train_us1,y_train_us1,X_test,y_test_win_reason,y_test_round_outcome)

round_outcome_accuracy: 0.756675392670157
round_outcome_f1_scores: [0.71639969 0.7869341 ]
win_reason_f1 macro-score: 0.4304043800084975
win_reason_f1_scores: [0.558495   0.46852123 0.70145037 0.36420812 0.05934718]


In [15]:
## SMOTETomek
# Seeded so the synthetic samples (and the scores computed from them) are
# reproducible on re-run; the seed matches the other seeded resamplers here.
from imblearn.combine import SMOTETomek

st = SMOTETomek(random_state=1)
X_train_st, y_train_st = st.fit_resample(X_train, y_train_winreason)
print("The number of classes before fit {}".format(Counter(y_train_winreason)))
print("The number of classes after fit {}".format(Counter(y_train_st)))

The number of classes before fit Counter({2: 5641, 3: 3831, 1: 3264, 0: 2050, 4: 724})
The number of classes after fit Counter({2: 5641, 4: 5641, 0: 5640, 3: 5639, 1: 5638})


In [16]:
# Train on the SMOTETomek output; evaluate on the untouched test split.
model_score(X_train_st,y_train_st,X_test,y_test_win_reason,y_test_round_outcome)

round_outcome_accuracy: 0.7609947643979058
round_outcome_f1_scores: [0.69697975 0.80267992]
win_reason_f1 macro-score: 0.41476630447007634
win_reason_f1_scores: [0.51533742 0.46435845 0.70796771 0.348      0.03816794]


In [17]:
## TomekLinks (under-sampling with default settings)
from imblearn.under_sampling import TomekLinks

tl = TomekLinks()
X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train_winreason)
print("The number of classes before fit {}".format(Counter(y_train_winreason)))
print("The number of classes after fit {}".format(Counter(y_train_tl)))

The number of classes before fit Counter({2: 5641, 3: 3831, 1: 3264, 0: 2050, 4: 724})
The number of classes after fit Counter({2: 5639, 3: 3827, 1: 3261, 0: 2048, 4: 724})


In [18]:
# Train on the TomekLinks output; evaluate on the untouched test split.
model_score(X_train_tl,y_train_tl,X_test,y_test_win_reason,y_test_round_outcome)

round_outcome_accuracy: 0.762041884816754
round_outcome_f1_scores: [0.69196882 0.80614203]
win_reason_f1 macro-score: 0.42480328734304296
win_reason_f1_scores: [0.50188679 0.48853388 0.72562125 0.39194245 0.01603206]


In [19]:
## SMOTEENN (combined over- and under-sampling, seeded)
from imblearn.combine import SMOTEENN

senn = SMOTEENN(random_state=1)
X_train_senn, y_train_senn = senn.fit_resample(X_train, y_train_winreason)
print("The number of classes before fit {}".format(Counter(y_train_winreason)))
print("The number of classes after fit {}".format(Counter(y_train_senn)))

The number of classes before fit Counter({2: 5641, 3: 3831, 1: 3264, 0: 2050, 4: 724})
The number of classes after fit Counter({4: 5638, 0: 5609, 2: 5541, 1: 5518, 3: 5510})


In [20]:
# Train on the SMOTEENN output; evaluate on the untouched test split.
model_score(X_train_senn,y_train_senn,X_test,y_test_win_reason,y_test_round_outcome)

round_outcome_accuracy: 0.7647905759162303
round_outcome_f1_scores: [0.70973995 0.80228848]
win_reason_f1 macro-score: 0.4196318581788918
win_reason_f1_scores: [0.5135468  0.46801181 0.71283666 0.35498354 0.04878049]


In [21]:
## CondensedNearestNeighbour (under-sampling, seeded)
from imblearn.under_sampling import CondensedNearestNeighbour

cnn = CondensedNearestNeighbour(random_state=1)
X_train_cnn, y_train_cnn = cnn.fit_resample(X_train, y_train_winreason)
print("The number of classes before fit {}".format(Counter(y_train_winreason)))
print("The number of classes after fit {}".format(Counter(y_train_cnn)))

The number of classes before fit Counter({2: 5641, 3: 3831, 1: 3264, 0: 2050, 4: 724})
The number of classes after fit Counter({4: 724, 3: 114, 1: 110, 2: 91, 0: 58})


In [22]:
# Train on the CondensedNearestNeighbour output; evaluate on the untouched test split.
model_score(X_train_cnn,y_train_cnn,X_test,y_test_win_reason,y_test_round_outcome)

round_outcome_accuracy: 0.7460732984293194
round_outcome_f1_scores: [0.73805023 0.75361951]
win_reason_f1 macro-score: 0.42202119851482295
win_reason_f1_scores: [0.54039301 0.40038406 0.5525075  0.35174679 0.26507463]


In [23]:
## EditedNearestNeighbours (under-sampling with default settings)
from imblearn.under_sampling import EditedNearestNeighbours

enn = EditedNearestNeighbours()
X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train_winreason)
print("The number of classes before fit {}".format(Counter(y_train_winreason)))
print("The number of classes after fit {}".format(Counter(y_train_enn)))

The number of classes before fit Counter({2: 5641, 3: 3831, 1: 3264, 0: 2050, 4: 724})
The number of classes after fit Counter({2: 5545, 3: 3658, 1: 3109, 0: 1985, 4: 724})


In [24]:
# Train on the EditedNearestNeighbours output; evaluate on the untouched test split.
model_score(X_train_enn,y_train_enn,X_test,y_test_win_reason,y_test_round_outcome)

round_outcome_accuracy: 0.7691099476439791
round_outcome_f1_scores: [0.70590197 0.80995475]
win_reason_f1 macro-score: 0.4282152096592413
win_reason_f1_scores: [0.50530256 0.4824173  0.72389712 0.39443962 0.03501946]
