In [2]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_csv('../dataframe/UFC_kombinasi2.csv')
df.head()

Unnamed: 0,B_avg_SIG_STR_att,B_avg_SIG_STR_landed,B_avg_HEAD_att,B_avg_HEAD_landed,B_avg_BODY_att,B_avg_BODY_landed,B_avg_DISTANCE_att,B_avg_DISTANCE_landed,B_avg_opp_DISTANCE_att,R_avg_opp_SIG_STR_att,...,R_avg_opp_HEAD_landed,R_avg_opp_BODY_att,R_avg_opp_BODY_landed,R_avg_opp_DISTANCE_att,R_avg_opp_DISTANCE_landed,R_total_rounds_fought,R_losses,R_win_by_Decision_Split,R_age,Winner
0,33.5,11.0,25.0625,5.744141,0.0,0.0,24.125,11.0,71.0,78.0,...,19.543213,5.0,4.0,54.875,18.339844,0.0,0.0,0.0,21.0,Red
1,135.625,53.234375,109.0,39.976562,13.625,7.765625,117.5,45.5,88.695312,117.8125,...,27.65625,17.59375,8.3125,110.25,38.40625,17.0,1.0,0.0,30.0,Blue
2,66.0,22.112305,55.0,16.0,5.125,2.0,60.0,16.211914,64.5,91.625,...,26.0,2.96875,1.0,82.229492,24.0,0.0,0.0,0.0,31.0,Blue
3,39.21875,19.015625,30.367188,11.804688,5.429688,4.546875,25.703125,8.875,24.234375,109.998363,...,27.688076,8.554874,4.713242,102.755116,33.409025,33.0,5.0,1.0,34.0,Blue
4,74.522461,29.986572,58.093994,18.651855,8.149902,5.452637,56.636475,20.501221,69.770264,126.03125,...,35.625,19.15918,13.152161,102.640625,43.398438,22.0,2.0,0.0,31.0,Red


In [4]:
X = df.drop('Winner', axis=1).values
X

array([[ 33.5       ,  11.        ,  25.0625    , ...,   0.        ,
          0.        ,  21.        ],
       [135.625     ,  53.234375  , 109.        , ...,   1.        ,
          0.        ,  30.        ],
       [ 66.        ,  22.11230469,  55.        , ...,   0.        ,
          0.        ,  31.        ],
       ...,
       [ 44.75      ,  16.75      ,  31.        , ...,   0.        ,
          0.        ,  30.        ],
       [ 75.3125    ,  38.421875  ,  39.171875  , ...,   2.        ,
          1.        ,  34.        ],
       [ 40.25      ,  14.5       ,  36.        , ...,   3.        ,
          0.        ,  31.        ]])

In [6]:
y = df['Winner'].values
y

array(['Red', 'Blue', 'Blue', ..., 'Red', 'Blue', 'Red'], dtype=object)

In [7]:
from collections import Counter

counter_y = Counter(y)
print(counter_y)

Counter({'Red': 3581, 'Blue': 1730, 'Draw': 99})


In [8]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks

# Over Sampling
adasyn = ADASYN()
randomOver = RandomOverSampler()
smote = SMOTE()
borderSmote = BorderlineSMOTE()
svmSmote = SVMSMOTE()

# Melakukan resampling
X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
X_randomOver, y_randomOver = randomOver.fit_resample(X, y)
X_smote, y_smote = smote.fit_resample(X, y)
X_borderSmote, y_borderSmote = borderSmote.fit_resample(X, y)
X_svmSmote, y_svmSmote = svmSmote.fit_resample(X, y)


# Under Sampling
rand_under = RandomUnderSampler(sampling_strategy='majority')
nearmiss = NearMiss()
nearmiss2 = NearMiss(version=2)
nearmiss3 = NearMiss(version=3)
tomek = TomekLinks()

# Melakukan resampling
X_rand_under, y_rand_under = rand_under.fit_resample(X, y)
X_nearmiss, y_nearmiss = nearmiss.fit_resample(X, y)
X_nearmiss2, y_nearmiss2 = nearmiss2.fit_resample(X, y)
X_nearmiss3, y_nearmiss3 = nearmiss3.fit_resample(X, y)
X_tomek, y_tomek = tomek.fit_resample(X, y)


In [9]:
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV

def evaluate_classification(classifier):
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

  scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
  print("ADASYN OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_adasyn, y_adasyn, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Random OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_smote, y_smote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Borderline SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("SVM SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_svmSmote, y_svmSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Random Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_rand_under, y_rand_under , scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 1 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss, y_nearmiss, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 2 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss2, y_nearmiss2, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 3 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss3, y_nearmiss3, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Tomek Links Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_tomek, y_tomek, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

In [10]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt

In [11]:
evaluate_classification(dt)

ADASYN OverSampling
f1_micro score: 0.7292982354094363
f1_macro score: 0.7249997201367104
precision_micro score: 0.7313390077705751
precision_macro score: 0.7243952644260222
recall_micro score: 0.7298548645265809
recall_macro score: 0.7287194784457581

Random OverSampling
f1_micro score: 0.8326347276850434
f1_macro score: 0.8264645411189153
precision_micro score: 0.8342166856005955
precision_macro score: 0.8389985728442955
recall_micro score: 0.8323550947866272
recall_macro score: 0.8294680400697091

SMOTE OverSampling
f1_micro score: 0.7252173935648905
f1_macro score: 0.723130718591313
precision_micro score: 0.7280095198234335
precision_macro score: 0.7253549587061264
recall_micro score: 0.7266135650118326
recall_macro score: 0.7279185463952065

Borderline SMOTE OverSampling
f1_micro score: 0.75928486074247
f1_macro score: 0.7593464737500603
precision_micro score: 0.7577030328081227
precision_macro score: 0.7538227944361461
recall_micro score: 0.7582614753906585
recall_macro score: 0.

Karena Random Oversampling menghasilkan score terbaik maka akan digunakan Random Oversampling

In [12]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
print("Random OverSampling")
for i in range(len(scoring)):
    score = cross_val_score(dt, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

Random OverSampling
f1_micro score: 0.8329137973315724
f1_macro score: 0.8294110661888034
precision_micro score: 0.8322620282440493
precision_macro score: 0.8351664621729938
recall_micro score: 0.8302147376156075
recall_macro score: 0.8304001776489564


Hyperparameter Tuning

In [21]:
from sklearn.model_selection import GridSearchCV

param_grid_c = {'criterion': ['gini', 'entropy'],
               'min_samples_split': [2, 5, 10],
               'max_depth' : [None, 10, 20, 30],
               'min_samples_leaf': [1, 2, 4]
               }

clf_dtc = GridSearchCV(estimator= DecisionTreeClassifier(), param_grid=param_grid_c, cv= 5)
clf_dtc.fit(X_randomOver, y_randomOver)

In [22]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_dtc.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [23]:
best_hypeparam_dtc = DecisionTreeClassifier(**clf_dtc.best_params_)
best_hypeparam_dtc.fit(X_randomOver, y_randomOver)

In [24]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
print("Random OverSampling")
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_dtc, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

Random OverSampling
f1_micro score: 0.832448507945751
f1_macro score: 0.8285250941983012
precision_micro score: 0.8311455330225914
precision_macro score: 0.8374684219724461
recall_micro score: 0.8291900524517489
recall_macro score: 0.8318890265408581
