In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
df = pd.read_csv('dataframe/UFC_kombinasi3.csv')
df.head()

Unnamed: 0,B_avg_DISTANCE_landed,R_avg_opp_SIG_STR_pct,R_age,B_avg_CTRL_time(seconds),B_avg_opp_SIG_STR_pct,B_avg_HEAD_att,B_avg_opp_LEG_att,B_age,B_avg_DISTANCE_att,R_losses,...,R_avg_opp_LEG_att,R_avg_LEG_att,B_avg_GROUND_att,R_avg_GROUND_landed,R_avg_opp_BODY_landed,R_avg_SUB_ATT,R_avg_opp_DISTANCE_att,R_avg_BODY_landed,R_avg_opp_CTRL_time(seconds),Winner
0,11.0,0.445,21.0,132.289062,0.44375,25.0625,2.5,26.0,24.125,0.0,...,4.600342,7.75,2.0,8.09375,4.0,1.0,54.875,4.5,204.625,Red
1,45.5,0.339062,30.0,35.945312,0.352422,109.0,2.78125,27.0,117.5,1.0,...,7.96875,9.375,4.757812,7.25,8.3125,0.5625,110.25,13.875,115.0,Blue
2,16.211914,0.225,31.0,16.1875,0.313125,55.0,5.063477,28.0,60.0,0.0,...,3.0,5.0,8.78125,0.84375,1.0,1.25,82.229492,10.116486,78.577091,Blue
3,8.875,0.325567,34.0,68.28125,0.185547,30.367188,1.796875,28.0,25.703125,5.0,...,8.843031,0.218128,1.648438,4.844223,4.713242,0.093866,102.755116,7.289246,116.292816,Blue
4,20.501221,0.42,31.0,209.931152,0.456558,58.093994,6.700195,34.0,56.636475,2.0,...,12.164062,11.414062,7.906982,0.828125,13.152161,0.0,102.640625,13.875,162.796875,Red


In [3]:
X = df.drop('Winner', axis=1).values
X

array([[1.10000000e+01, 4.45000000e-01, 2.10000000e+01, ...,
        5.48750000e+01, 4.50000000e+00, 2.04625000e+02],
       [4.55000000e+01, 3.39062500e-01, 3.00000000e+01, ...,
        1.10250000e+02, 1.38750000e+01, 1.15000000e+02],
       [1.62119141e+01, 2.25000000e-01, 3.10000000e+01, ...,
        8.22294922e+01, 1.01164856e+01, 7.85770912e+01],
       ...,
       [1.32500000e+01, 4.12500000e-01, 3.00000000e+01, ...,
        3.63281250e+01, 4.16406250e+00, 2.40000000e+02],
       [3.64531250e+01, 3.52890625e-01, 3.40000000e+01, ...,
        6.13671875e+01, 3.70312500e+00, 3.00468750e+01],
       [1.00000000e+01, 3.05000000e-01, 3.10000000e+01, ...,
        4.12656250e+01, 6.30859375e+00, 1.05898438e+02]])

In [4]:
y = df['Winner'].values
y

array(['Red', 'Blue', 'Blue', ..., 'Red', 'Blue', 'Red'], dtype=object)

In [5]:
counter_y = Counter(y)
print(counter_y)

Counter({'Red': 3581, 'Blue': 1730, 'Draw': 99})


In [5]:
# Over Sampling
adasyn = ADASYN()
randomOver = RandomOverSampler()
smote = SMOTE()
borderSmote = BorderlineSMOTE()
svmSmote = SVMSMOTE()

# Melakukan resampling
X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
X_randomOver, y_randomOver = randomOver.fit_resample(X, y)
X_smote, y_smote = smote.fit_resample(X, y)
X_borderSmote, y_borderSmote = borderSmote.fit_resample(X, y)
X_svmSmote, y_svmSmote = svmSmote.fit_resample(X, y)


# Under Sampling
rand_under = RandomUnderSampler(sampling_strategy='majority')
nearmiss = NearMiss()
nearmiss2 = NearMiss(version=2)
nearmiss3 = NearMiss(version=3)
tomek = TomekLinks()

# Melakukan resampling
X_rand_under, y_rand_under = rand_under.fit_resample(X, y)
X_nearmiss, y_nearmiss = nearmiss.fit_resample(X, y)
X_nearmiss2, y_nearmiss2 = nearmiss2.fit_resample(X, y)
X_nearmiss3, y_nearmiss3 = nearmiss3.fit_resample(X, y)
X_tomek, y_tomek = tomek.fit_resample(X, y)


In [6]:
def evaluate_classification(classifier):
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

  scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
  print("ADASYN OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_adasyn, y_adasyn, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Random OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_smote, y_smote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Borderline SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("SVM SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_svmSmote, y_svmSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Random Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_rand_under, y_rand_under , scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 1 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss, y_nearmiss, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 2 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss2, y_nearmiss2, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 3 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss3, y_nearmiss3, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Tomek Links Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_tomek, y_tomek, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

# Decision Tree

In [8]:
dt = DecisionTreeClassifier()
dt

In [9]:
evaluate_classification(dt)

ADASYN OverSampling
f1_micro score: 0.702835002835003
f1_macro score: 0.7005566205146466
precision_micro score: 0.7045129334784508
precision_macro score: 0.6960460821568295
recall_micro score: 0.703488595902389
recall_macro score: 0.7056320455914139

Random OverSampling
f1_micro score: 0.8304963635591627
f1_macro score: 0.8263762441896023
precision_micro score: 0.8309608297306876
precision_macro score: 0.8335216302960629
recall_micro score: 0.8272380380463652
recall_macro score: 0.8289121079191439

SMOTE OverSampling
f1_micro score: 0.7029693772947099
f1_macro score: 0.6999362818034833
precision_micro score: 0.7067855388110879
precision_macro score: 0.7005276616152447
recall_micro score: 0.7037129564398321
recall_macro score: 0.704552384365853

Borderline SMOTE OverSampling
f1_micro score: 0.7505353492551643
f1_macro score: 0.7490011246303704
precision_micro score: 0.7491383112668575
precision_macro score: 0.7509326964320964
recall_micro score: 0.749790470297995
recall_macro score: 0.7

Karena Random Oversampling menghasilkan score terbaik maka akan digunakan Random Oversampling

In [10]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(dt, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8283544899407544
f1_macro score: 0.8238821293105734
precision_micro score: 0.8249104212864153
precision_macro score: 0.8342203033058565
recall_micro score: 0.825747890188412
recall_macro score: 0.82667928909251


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_randomOver, y_randomOver, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [12]:
param_grid_c = {'criterion': ['gini', 'entropy'],
               'min_samples_split': [2, 5, 10],
               'max_depth' : [None, 10, 20, 30],
               'min_samples_leaf': [1, 2, 4]
               }

clf_dtc = GridSearchCV(estimator= DecisionTreeClassifier(), param_grid=param_grid_c, cv= 5)
clf_dtc.fit(X_train, y_train)

In [13]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_dtc.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [14]:
best_hypeparam_dtc = DecisionTreeClassifier(**clf_dtc.best_params_)
best_hypeparam_dtc.fit(X_train, y_train)

In [15]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_dtc, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7998006108746781
f1_macro score: 0.7970722770924606
precision_micro score: 0.8014141302121118
precision_macro score: 0.8019866692340244
recall_micro score: 0.8004208042413123
recall_macro score: 0.8005694245912698


In [7]:
def classification_metrics(prediction, y_test):
  accuracy = accuracy_score(y_test, prediction)
  f1 = f1_score(y_test, prediction, average="macro")
  recall = recall_score(y_test, prediction, average="macro")
  precision = precision_score(y_test, prediction, average="macro")

  print('Accuracy: ' + str(accuracy))
  print('F1 Score: ' + str(f1))
  print('Recall Score: ' + str(recall))
  print('Precision Score: ' + str(precision))

In [17]:
prediction = best_hypeparam_dtc.predict(X_test)
classification_metrics(prediction, y_test)

Accuracy: 0.8134772896500372
F1 Score: 0.8112140389682986
Recall Score: 0.8152794526616297
Precision Score: 0.8193106995847267


# Random Forest

In [8]:
rf = RandomForestClassifier()

In [9]:
evaluate_classification(rf)

ADASYN OverSampling
f1_micro score: 0.8100298707195259
f1_macro score: 0.8110063721296648
precision_micro score: 0.8147839858184686
precision_macro score: 0.8070683195451629
recall_micro score: 0.8119874781943747
recall_macro score: 0.8084455975940944

Random OverSampling
f1_micro score: 0.8811321882855738
f1_macro score: 0.8780414229044453
precision_micro score: 0.8791770110042088
precision_macro score: 0.8808997970254578
recall_micro score: 0.8826215562562988
recall_macro score: 0.8785247345005182

SMOTE OverSampling
f1_micro score: 0.8155091840386547
f1_macro score: 0.8121740322330304
precision_micro score: 0.8183936402796157
precision_macro score: 0.8116483118088291
recall_micro score: 0.8140189495265652
recall_macro score: 0.8154181892792491

Borderline SMOTE OverSampling
f1_micro score: 0.8347761247057008
f1_macro score: 0.8351369717176779
precision_micro score: 0.8389655055878921
precision_macro score: 0.8356413494203325
recall_micro score: 0.8319836085035437
recall_macro score:

Karena score nya paling tinggi maka akan digunakan Random Oversampling

In [10]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(rf, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8788979413576797
f1_macro score: 0.8801981963762888
precision_micro score: 0.8768500008232143
precision_macro score: 0.8794490520906819
recall_micro score: 0.8811314083983456
recall_macro score: 0.8792683148542058


## Hyperparameter Tuning

In [13]:
param_grid_c = {'criterion': ['gini', 'entropy'],
               'min_samples_split': [2, 5, 10],
               'max_depth' : [None, 10, 20, 30],
               'min_samples_leaf': [1, 2, 4]
               }

clf_rfc = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid_c, cv= 5)
clf_rfc.fit(X_train, y_train)

In [14]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_rfc.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5}


In [15]:
best_hypeparam_rfc = RandomForestClassifier(**clf_rfc.best_params_)
best_hypeparam_rfc.fit(X_train, y_train)

In [16]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_rfc, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8514345389097595
f1_macro score: 0.8516829584640437
precision_micro score: 0.8483295673510127
precision_macro score: 0.8482454871977225
recall_micro score: 0.8494470398146735
recall_macro score: 0.8514488421276493


In [17]:
prediction_rf = best_hypeparam_rfc.predict(X_test)
classification_metrics(prediction_rf, y_test)

Accuracy: 0.8637379002233805
F1 Score: 0.8646699528240364
Recall Score: 0.8647002846618923
Precision Score: 0.8647036963204151


# Logistic Regression

In [26]:
logistic = LogisticRegression(max_iter=2000)
logistic

In [27]:
evaluate_classification(logistic)

ADASYN OverSampling
f1_micro score: 0.4458431906707768
f1_macro score: 0.4439989425091329
precision_micro score: 0.4458431906707768
precision_macro score: 0.4450332778087656
recall_micro score: 0.4458431906707768
recall_macro score: 0.4454834657576436

Random OverSampling
f1_micro score: 0.44587184026523097
f1_macro score: 0.44540381668357165
precision_micro score: 0.44587184026523097
precision_macro score: 0.4459935859887395
recall_micro score: 0.44587184026523097
recall_macro score: 0.44587264855374525

SMOTE OverSampling
f1_micro score: 0.4552728608776505
f1_macro score: 0.4546964765934982
precision_micro score: 0.4552728608776505
precision_macro score: 0.4551908796251566
recall_micro score: 0.4552728608776505
recall_macro score: 0.45527129125338617

Borderline SMOTE OverSampling
f1_micro score: 0.5585023305630006
f1_macro score: 0.5542394896061771
precision_micro score: 0.5585023305630006
precision_macro score: 0.5539669712735242
recall_micro score: 0.5585023305630006
recall_macro 

Karena Nearmiss 1 undersampling menghasilkan nilai terbaik maka akan digunakan

In [28]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(logistic, X_nearmiss, y_nearmiss, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.5757062146892655
f1_macro score: 0.5717860087561478
precision_micro score: 0.5757062146892655
precision_macro score: 0.5804405573587159
recall_micro score: 0.5757062146892655
recall_macro score: 0.5759649122807018


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_nearmiss, y_nearmiss, test_size=0.25, random_state=42)

In [30]:
logistic.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(logistic, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.5804040404040404
f1_macro score: 0.569183727922118
precision_micro score: 0.5804040404040404
precision_macro score: 0.5819031137993986
recall_micro score: 0.5804040404040404
recall_macro score: 0.5776587301587301


In [32]:
prediction_logreg = logistic.predict(X_test)
classification_metrics(prediction_logreg, y_test)

Accuracy: 0.49333333333333335
F1 Score: 0.49551414768806074
Recall Score: 0.49551414768806074
Precision Score: 0.49551414768806074


# Softmax Regression

In [33]:
softmax = LogisticRegression(max_iter=2000, multi_class='multinomial')
softmax

In [34]:
evaluate_classification(softmax)

ADASYN OverSampling
f1_micro score: 0.4458431906707768
f1_macro score: 0.4439989425091329
precision_micro score: 0.4458431906707768
precision_macro score: 0.4450332778087656
recall_micro score: 0.4458431906707768
recall_macro score: 0.4454834657576436

Random OverSampling
f1_micro score: 0.44587184026523097
f1_macro score: 0.44540381668357165
precision_micro score: 0.44587184026523097
precision_macro score: 0.4459935859887395
recall_micro score: 0.44587184026523097
recall_macro score: 0.44587264855374525

SMOTE OverSampling
f1_micro score: 0.4552728608776505
f1_macro score: 0.4546964765934982
precision_micro score: 0.4552728608776505
precision_macro score: 0.4551908796251566
recall_micro score: 0.4552728608776505
recall_macro score: 0.45527129125338617

Borderline SMOTE OverSampling
f1_micro score: 0.5585023305630006
f1_macro score: 0.5542394896061771
precision_micro score: 0.5585023305630006
precision_macro score: 0.5539669712735242
recall_micro score: 0.5585023305630006
recall_macro 

Karena Nearmiss 1 undersampling menghasilkan nilai terbaik maka akan digunakan

In [35]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(softmax, X_nearmiss, y_nearmiss, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.5757062146892655
f1_macro score: 0.5717860087561478
precision_micro score: 0.5757062146892655
precision_macro score: 0.5804405573587159
recall_micro score: 0.5757062146892655
recall_macro score: 0.5759649122807018


In [36]:
softmax.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
prediction_softmax = softmax.predict(X_test)
classification_metrics(prediction_softmax, y_test)

Accuracy: 0.49333333333333335
F1 Score: 0.49551414768806074
Recall Score: 0.49551414768806074
Precision Score: 0.49551414768806074


# KNN

In [38]:
knn = KNeighborsClassifier()
knn

In [39]:
evaluate_classification(knn)

ADASYN OverSampling
f1_micro score: 0.7068410682203787
f1_macro score: 0.6833224580931183
precision_micro score: 0.7068410682203787
precision_macro score: 0.7066274023495769
recall_micro score: 0.7068410682203787
recall_macro score: 0.7061337093413604

Random OverSampling
f1_micro score: 0.7326636701666273
f1_macro score: 0.7257812181803915
precision_micro score: 0.7326636701666273
precision_macro score: 0.7250963631975063
recall_micro score: 0.7326636701666273
recall_macro score: 0.7326640330988055

SMOTE OverSampling
f1_micro score: 0.7239130321755475
f1_macro score: 0.704522202786958
precision_micro score: 0.7239130321755474
precision_macro score: 0.7245775435973159
recall_micro score: 0.7239130321755474
recall_macro score: 0.723914951860769

Borderline SMOTE OverSampling
f1_micro score: 0.7520253238048445
f1_macro score: 0.7421913811280898
precision_micro score: 0.7520253238048444
precision_macro score: 0.7562351861758008
recall_micro score: 0.7520253238048444
recall_macro score: 0

Karena Borderline SMOTE Oversampling menghasilkan nilai terbaik maka akan digunakan

In [40]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(knn, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7520253238048445
f1_macro score: 0.7421913811280898
precision_micro score: 0.7520253238048444
precision_macro score: 0.7562351861758008
recall_micro score: 0.7520253238048444
recall_macro score: 0.7520249124091951


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_borderSmote, y_borderSmote, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [42]:
tuned_params = [{'n_neighbors': [2,3,4,5,6,7,8,9,10],'metric': ['euclidean', 'manhattan', 'jaccard']}]

clf_knn = GridSearchCV(knn, tuned_params, cv=cv, verbose = 2)
clf_knn.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=euclidean, 

Traceback (most recent call last):
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 706, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\neighbors\_classification.py", line 254, in predict
    probabilities = self.predict_proba(X)
                    ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\neighbo

[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=6; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=6; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=6; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=6; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=6; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=7; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=7; total time=   0.0s
[CV] END ...................



[CV] END ......................metric=jaccard, n_neighbors=2; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s


 0.71986875 0.72073747 0.7118017         nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.43886825 0.46233417 0.44967207 0.45923089 0.45786597 0.47350327
 0.45910667 0.4681649  0.45985193]


In [43]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_knn.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'metric': 'euclidean', 'n_neighbors': 3}


In [44]:
best_hypeparam_knn = KNeighborsClassifier(**clf_knn.best_params_)
best_hypeparam_knn.fit(X_train, y_train)

In [45]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_knn, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7594647838295342
f1_macro score: 0.7500975536267998
precision_micro score: 0.7594647838295342
precision_macro score: 0.7636387127555694
recall_micro score: 0.7594647838295342
recall_macro score: 0.7587012696054963


In [46]:
prediction_knn = best_hypeparam_knn.predict(X_test)
classification_metrics(prediction_knn, y_test)

Accuracy: 0.7658227848101266
F1 Score: 0.7594674386584481
Recall Score: 0.7680850224324266
Precision Score: 0.7732846922128259


# Naive Bayes

# Neural Network (MLP)

In [47]:
mlp = MLPClassifier()
mlp

In [48]:
evaluate_classification(mlp)

ADASYN OverSampling
f1_micro score: 0.7002276691931864
f1_macro score: 0.6830034126039927
precision_micro score: 0.6998502770916565
precision_macro score: 0.6915418383357
recall_micro score: 0.7031175472554783
recall_macro score: 0.7045290135335006

Random OverSampling
f1_micro score: 0.7024110646933787
f1_macro score: 0.7055198811523413
precision_micro score: 0.7076257373183837
precision_macro score: 0.7093512232652989
recall_micro score: 0.7168419679847627
recall_macro score: 0.7115156260956967

SMOTE OverSampling
f1_micro score: 0.7174891444030527
f1_macro score: 0.7125149262882131
precision_micro score: 0.6947780051004624
precision_macro score: 0.7021735872879269
recall_micro score: 0.7111626558799597
recall_macro score: 0.7163664295416708

Borderline SMOTE OverSampling
f1_micro score: 0.7304279067913446
f1_macro score: 0.7002962567501547
precision_micro score: 0.7323851637719853
precision_macro score: 0.7344314183248708
recall_micro score: 0.7415971917127451
recall_macro score: 0.

Karena Borderline SMOTE Oversampling menghasilkan nilai terbaik maka akan digunakan

In [49]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(mlp, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7284740293220268
f1_macro score: 0.7107740643942613
precision_micro score: 0.7164686619648132
precision_macro score: 0.7376570626038922
recall_micro score: 0.7218650483140138
recall_macro score: 0.7319956418867151


In [50]:
X_train, X_test, y_train, y_test = train_test_split(X_borderSmote, y_borderSmote, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [51]:
param_grid = {
    'hidden_layer_sizes': [(5), (10), (5, 10)],
    'alpha': [1e-5],
    'activation': ['identity', 'logistic', 'relu', 'tanh'],
    'solver': ['sgd', 'adam'],
    'max_iter': [1000],
    'random_state': [42]
}

tuned_mlp = GridSearchCV(mlp, param_grid, cv=cv)
tuned_mlp.fit(X_train, y_train)

In [52]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(tuned_mlp.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'activation': 'logistic', 'alpha': 1e-05, 'hidden_layer_sizes': 10, 'max_iter': 1000, 'random_state': 42, 'solver': 'adam'}


In [53]:
best_hypeparam_mlp = MLPClassifier(**tuned_mlp.best_params_)
best_hypeparam_mlp.fit(X_train, y_train)

In [54]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_mlp, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.6772999832109582
f1_macro score: 0.6691314011395298
precision_micro score: 0.6772999832109582
precision_macro score: 0.6725550203804888
recall_micro score: 0.6772999832109582
recall_macro score: 0.6765719877911749


In [55]:
prediction_mlp = best_hypeparam_mlp.predict(X_test)
classification_metrics(prediction_mlp, y_test)

Accuracy: 0.6764705882352942
F1 Score: 0.6711579034863586
Recall Score: 0.6789742022007209
Precision Score: 0.6713217505283345


# Kesimpulan

Dari semua model classification, untuk kobinasi pre processing ini yang menghasilkan hasil paling bagus adalah Random Forest dengan data yang di Random Oversampling

# Kaggle

In [18]:
df = pd.read_csv('dataframe/UFC_kombinasi3_kaggle.csv')
df.head()

Unnamed: 0,B_avg_DISTANCE_landed,R_avg_opp_SIG_STR_pct,R_age,B_avg_CTRL_time(seconds),B_avg_opp_SIG_STR_pct,B_avg_HEAD_att,B_avg_opp_LEG_att,B_age,B_avg_DISTANCE_att,R_losses,R_avg_opp_HEAD_landed,R_avg_opp_LEG_att,R_avg_LEG_att,B_avg_GROUND_att,R_avg_GROUND_landed,R_avg_opp_BODY_landed,R_avg_SUB_ATT,R_avg_opp_DISTANCE_att,R_avg_BODY_landed,R_avg_opp_CTRL_time(seconds)
0,17.6875,0.26625,27.0,349.5,0.44875,60.3125,0.625,35.0,58.125,1.0,20.5625,4.9375,15.211609,12.875,1.9375,13.1875,0.0,126.8125,13.5625,94.9375
1,16.0625,0.536279,31.0,9.6875,0.371875,37.125,8.125,28.0,42.6875,5.0,22.541016,14.3125,13.258789,0.125,1.556641,13.320312,0.073242,67.904297,8.336914,273.668945
2,14.0,0.36,30.0,349.5,0.48,61.0,4.0,24.0,40.0,1.0,41.5,14.3125,15.211609,19.0,1.0,13.320312,0.0,126.8125,5.0,40.5
3,0.0,0.4,23.0,349.5,0.605,11.0,4.0,27.0,2.0,0.0,30.21875,5.632935,2.6875,9.0,11.0,11.0,0.0,126.8125,6.0,37.28125
4,5.71875,0.33,24.0,219.75,0.439375,24.25,3.046875,33.0,18.984375,1.0,13.0,0.0,4.5,7.703125,0.5,1.0,1.125,5.0,1.0,0.0


In [20]:
prediction_kaggle = best_hypeparam_rfc.predict(df)
prediction_kaggle



array(['Red', 'Blue', 'Blue', 'Red', 'Red', 'Red', 'Blue', 'Blue', 'Red',
       'Blue', 'Red', 'Red', 'Red', 'Red', 'Red', 'Red', 'Blue', 'Red',
       'Red', 'Red', 'Red', 'Red', 'Red', 'Blue', 'Blue', 'Red', 'Blue',
       'Blue', 'Red', 'Blue', 'Red', 'Red', 'Red', 'Red', 'Blue', 'Blue',
       'Red', 'Red', 'Blue', 'Red', 'Red', 'Red', 'Blue', 'Blue', 'Red',
       'Red', 'Red', 'Red', 'Blue', 'Blue', 'Blue', 'Red', 'Red', 'Blue',
       'Red', 'Red', 'Red', 'Blue', 'Red', 'Red', 'Red', 'Red', 'Red',
       'Red', 'Red', 'Blue', 'Red', 'Blue', 'Red', 'Red', 'Blue', 'Red',
       'Red', 'Red', 'Red', 'Red', 'Red', 'Red', 'Blue', 'Red', 'Blue',
       'Blue', 'Blue', 'Red', 'Red', 'Red', 'Blue', 'Red', 'Red', 'Red',
       'Red', 'Blue', 'Red', 'Red', 'Blue', 'Red', 'Blue', 'Red', 'Blue',
       'Red', 'Red', 'Red', 'Red', 'Red', 'Blue', 'Red', 'Red', 'Blue',
       'Blue', 'Red', 'Red', 'Red', 'Blue', 'Red', 'Red', 'Blue', 'Red',
       'Red', 'Red', 'Blue', 'Blue', 'Blue', 'Blue',

In [32]:
df_kaggle = pd.read_csv('dataframe/UFC_Test_Classif_X.csv')
df_kaggle.head()

Unnamed: 0,id,R_fighter,B_fighter,Referee,date,location,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,0,Tecia Torres,Juliana Lima,Chris Tognoni,2017-07-07,"Las Vegas, Nevada, USA",False,WomenStrawweight,0.0,0.0,...,4,0,0,0,Orthodox,154.94,152.4,115.0,35.0,27.0
1,1,John Howard,Lorenz Larkin,Herb Dean,2015-01-18,"Boston, Massachusetts, USA",False,Welterweight,0.0,0.25,...,1,2,0,0,Orthodox,170.18,182.88,170.0,28.0,31.0
2,2,Kyle Bochniak,Jeremy Kennedy,Todd Ronald Anderson,2017-07-22,"Uniondale, New York, USA",False,Featherweight,0.0,0.5,...,0,0,0,0,Orthodox,170.18,177.8,145.0,24.0,30.0
3,3,Yao Zhikui,Royston Wee,Steve Perceval,2014-08-23,"Macau, China",False,Bantamweight,0.0,0.0,...,0,0,0,0,Orthodox,165.1,162.56,125.0,27.0,23.0
4,4,Carlos Newton,Pat Miletich,John McCarthy,2001-05-04,"Atlantic City, New Jersey, USA",True,Welterweight,0.0,0.0,...,0,0,1,0,Orthodox,175.26,,170.0,33.0,24.0


In [33]:
df_kaggle = df_kaggle[['id']]
df_kaggle.head()

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4


In [34]:
df_kaggle['Winner'] = pd.DataFrame({'Winner': prediction_kaggle})

In [35]:
df_kaggle.head()

Unnamed: 0,id,Winner
0,0,Red
1,1,Blue
2,2,Blue
3,3,Red
4,4,Red


In [None]:
df_kaggle.to_csv('dataframe/Kaggle_prediction.csv', index=False)
