In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [2]:
df = pd.read_csv('dataframe/UFC_kombinasi2.csv')
df.head()

Unnamed: 0,B_avg_SIG_STR_att,B_avg_SIG_STR_landed,B_avg_HEAD_att,B_avg_HEAD_landed,B_avg_BODY_att,B_avg_BODY_landed,B_avg_DISTANCE_att,B_avg_DISTANCE_landed,B_avg_opp_DISTANCE_att,R_avg_opp_SIG_STR_att,...,R_avg_opp_HEAD_landed,R_avg_opp_BODY_att,R_avg_opp_BODY_landed,R_avg_opp_DISTANCE_att,R_avg_opp_DISTANCE_landed,R_total_rounds_fought,R_losses,R_win_by_Decision_Split,R_age,Winner
0,33.5,11.0,25.0625,5.744141,0.0,0.0,24.125,11.0,71.0,78.0,...,19.543213,5.0,4.0,54.875,18.339844,0.0,0.0,0.0,21.0,Red
1,135.625,53.234375,109.0,39.976562,13.625,7.765625,117.5,45.5,88.695312,117.8125,...,27.65625,17.59375,8.3125,110.25,38.40625,17.0,1.0,0.0,30.0,Blue
2,66.0,22.112305,55.0,16.0,5.125,2.0,60.0,16.211914,64.5,91.625,...,26.0,2.96875,1.0,82.229492,24.0,0.0,0.0,0.0,31.0,Blue
3,39.21875,19.015625,30.367188,11.804688,5.429688,4.546875,25.703125,8.875,24.234375,109.998363,...,27.688076,8.554874,4.713242,102.755116,33.409025,33.0,5.0,1.0,34.0,Blue
4,74.522461,29.986572,58.093994,18.651855,8.149902,5.452637,56.636475,20.501221,69.770264,126.03125,...,35.625,19.15918,13.152161,102.640625,43.398438,22.0,2.0,0.0,31.0,Red


In [3]:
X = df.drop('Winner', axis=1).values
X

array([[ 33.5       ,  11.        ,  25.0625    , ...,   0.        ,
          0.        ,  21.        ],
       [135.625     ,  53.234375  , 109.        , ...,   1.        ,
          0.        ,  30.        ],
       [ 66.        ,  22.11230469,  55.        , ...,   0.        ,
          0.        ,  31.        ],
       ...,
       [ 44.75      ,  16.75      ,  31.        , ...,   0.        ,
          0.        ,  30.        ],
       [ 75.3125    ,  38.421875  ,  39.171875  , ...,   2.        ,
          1.        ,  34.        ],
       [ 40.25      ,  14.5       ,  36.        , ...,   3.        ,
          0.        ,  31.        ]])

In [4]:
y = df['Winner'].values
y

array(['Red', 'Blue', 'Blue', ..., 'Red', 'Blue', 'Red'], dtype=object)

In [5]:
counter_y = Counter(y)
print(counter_y)

Counter({'Red': 3581, 'Blue': 1730, 'Draw': 99})


In [6]:
# Over Sampling
adasyn = ADASYN()
randomOver = RandomOverSampler()
smote = SMOTE()
borderSmote = BorderlineSMOTE()
svmSmote = SVMSMOTE()

# Melakukan resampling
X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
X_randomOver, y_randomOver = randomOver.fit_resample(X, y)
X_smote, y_smote = smote.fit_resample(X, y)
X_borderSmote, y_borderSmote = borderSmote.fit_resample(X, y)
X_svmSmote, y_svmSmote = svmSmote.fit_resample(X, y)


# Under Sampling
rand_under = RandomUnderSampler(sampling_strategy='majority')
nearmiss = NearMiss()
nearmiss2 = NearMiss(version=2)
nearmiss3 = NearMiss(version=3)
tomek = TomekLinks()

# Melakukan resampling
X_rand_under, y_rand_under = rand_under.fit_resample(X, y)
X_nearmiss, y_nearmiss = nearmiss.fit_resample(X, y)
X_nearmiss2, y_nearmiss2 = nearmiss2.fit_resample(X, y)
X_nearmiss3, y_nearmiss3 = nearmiss3.fit_resample(X, y)
X_tomek, y_tomek = tomek.fit_resample(X, y)


In [7]:
def evaluate_classification(classifier):
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

  scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
  print("ADASYN OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_adasyn, y_adasyn, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Random OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_smote, y_smote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Borderline SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("SVM SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_svmSmote, y_svmSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Random Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_rand_under, y_rand_under , scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 1 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss, y_nearmiss, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 2 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss2, y_nearmiss2, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 3 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss3, y_nearmiss3, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Tomek Links Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_tomek, y_tomek, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

# Decision Tree

In [8]:
dt = DecisionTreeClassifier()
dt

In [9]:
evaluate_classification(dt)

ADASYN OverSampling
f1_micro score: 0.7244698795959388
f1_macro score: 0.7219082102072548
precision_micro score: 0.7257699160036795
precision_macro score: 0.7211668947293566
recall_micro score: 0.723355673467805
recall_macro score: 0.724463877019702

Random OverSampling
f1_micro score: 0.8299363178751019
f1_macro score: 0.8216358753121351
precision_micro score: 0.8288182195521194
precision_macro score: 0.8310548048085111
recall_micro score: 0.8271431950939896
recall_macro score: 0.826494108236003

SMOTE OverSampling
f1_micro score: 0.717768430684923
f1_macro score: 0.7173283516236711
precision_micro score: 0.7179546504242154
precision_macro score: 0.7125087295626578
recall_micro score: 0.7197226114437185
recall_macro score: 0.7188809154635106

Borderline SMOTE OverSampling
f1_micro score: 0.7516514545330079
f1_macro score: 0.7491815184508303
precision_micro score: 0.7485805185903451
precision_macro score: 0.7466010781302475
recall_micro score: 0.7534213219435136
recall_macro score: 0.7

Karena Random Oversampling menghasilkan score terbaik maka akan digunakan Random Oversampling

In [10]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(dt, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8252812576634752
f1_macro score: 0.8234853078334055
precision_micro score: 0.8299352346983959
precision_macro score: 0.8315767318577576
recall_micro score: 0.8290047859079577
recall_macro score: 0.8308661165782316


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_randomOver, y_randomOver, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [12]:
param_grid_c = {'criterion': ['gini', 'entropy'],
               'min_samples_split': [2, 5, 10],
               'max_depth' : [None, 10, 20, 30],
               'min_samples_leaf': [1, 2, 4]
               }

clf_dtc = GridSearchCV(estimator= DecisionTreeClassifier(), param_grid=param_grid_c, cv= 5)
clf_dtc.fit(X_train, y_train)

In [13]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_dtc.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [14]:
best_hypeparam_dtc = DecisionTreeClassifier(**clf_dtc.best_params_)
best_hypeparam_dtc.fit(X_train, y_train)

In [15]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_dtc, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8041441208318124
f1_macro score: 0.7977636680793492
precision_micro score: 0.8004203421575922
precision_macro score: 0.8038958154689982
recall_micro score: 0.8012892905936697
recall_macro score: 0.7995691124174968


In [8]:
def classification_metrics(prediction, y_test):
  accuracy = accuracy_score(y_test, prediction)
  f1 = f1_score(y_test, prediction, average="macro")
  recall = recall_score(y_test, prediction, average="macro")
  precision = precision_score(y_test, prediction, average="macro")

  print('Accuracy: ' + str(accuracy))
  print('F1 Score: ' + str(f1))
  print('Recall Score: ' + str(recall))
  print('Precision Score: ' + str(precision))

In [17]:
prediction = best_hypeparam_dtc.predict(X_test)
classification_metrics(prediction, y_test)

Accuracy: 0.8235294117647058
F1 Score: 0.8224311932218423
Recall Score: 0.8251273595353531
Precision Score: 0.8266015969963821


# Random Forest

In [18]:
rf = RandomForestClassifier()

In [19]:
evaluate_classification(rf)

ADASYN OverSampling
f1_micro score: 0.8274227412659123
f1_macro score: 0.827091449805685
precision_micro score: 0.8296519290716887
precision_macro score: 0.8242787380149454
recall_micro score: 0.8258450150478147
recall_macro score: 0.8307923754385493

Random OverSampling
f1_micro score: 0.8735919352728263
f1_macro score: 0.8714942681822674
precision_micro score: 0.8748955167749409
precision_macro score: 0.8756839405492329
recall_micro score: 0.8735924551976451
recall_macro score: 0.8728436039882711

SMOTE OverSampling
f1_micro score: 0.8306802436367701
f1_macro score: 0.8271677022059413
precision_micro score: 0.8280742937904512
precision_macro score: 0.8284696153147945
recall_micro score: 0.8263988793887069
recall_macro score: 0.831050128691605

Borderline SMOTE OverSampling
f1_micro score: 0.8371960281210002
f1_macro score: 0.8350852274031121
precision_micro score: 0.835520787027529
precision_macro score: 0.8348517797006089
recall_micro score: 0.8337518728125246
recall_macro score: 0.

Karena score nya paling tinggi maka akan digunakan Random Oversampling

In [20]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(rf, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8744306173327336
f1_macro score: 0.8735626150053946
precision_micro score: 0.8779677092025826
precision_macro score: 0.8739678054924603
recall_micro score: 0.8751746297485383
recall_macro score: 0.8758233795376451


## Hyperparameter Tuning

In [21]:
param_grid_c = {'criterion': ['gini', 'entropy'],
               'min_samples_split': [2, 5, 10],
               'max_depth' : [None, 10, 20, 30],
               'min_samples_leaf': [1, 2, 4]
               }

clf_rfc = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid_c, cv= 5)
clf_rfc.fit(X_train, y_train)

In [22]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_rfc.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [23]:
best_hypeparam_rfc = RandomForestClassifier(**clf_rfc.best_params_)
best_hypeparam_rfc.fit(X_train, y_train)

In [24]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_rfc, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8367875631706954
f1_macro score: 0.8366808904435894
precision_micro score: 0.8387733679588066
precision_macro score: 0.8408403000609596
recall_micro score: 0.8423711518052841
recall_macro score: 0.8376035553310915


In [25]:
prediction_rf = best_hypeparam_rfc.predict(X_test)
classification_metrics(prediction_rf, y_test)

Accuracy: 0.8652271034996277
F1 Score: 0.8661082276351827
Recall Score: 0.86617404572638
Precision Score: 0.86608608834517


# Logistic Regression

In [26]:
logistic = LogisticRegression(max_iter=2000)
logistic

In [27]:
evaluate_classification(logistic)

ADASYN OverSampling
f1_micro score: 0.45516263919497957
f1_macro score: 0.4541026722143016
precision_micro score: 0.45516263919497957
precision_macro score: 0.4545835384835259
recall_micro score: 0.45516263919497957
recall_macro score: 0.45515577841125043

Random OverSampling
f1_micro score: 0.45415497919000913
f1_macro score: 0.4524813556538317
precision_micro score: 0.45415497919000913
precision_macro score: 0.4527090738670207
recall_micro score: 0.45415497919000913
recall_macro score: 0.4541517651917128

SMOTE OverSampling
f1_micro score: 0.4586254660909366
f1_macro score: 0.457558853694432
precision_micro score: 0.4586254660909366
precision_macro score: 0.45846515497157075
recall_micro score: 0.4586254660909366
recall_macro score: 0.4586280513935314

Borderline SMOTE OverSampling
f1_micro score: 0.5404457748742865
f1_macro score: 0.5366230128525545
precision_micro score: 0.5404457748742865
precision_macro score: 0.5363768180967612
recall_micro score: 0.5404457748742865
recall_macro

Karena Borderline SMOTE Oversampling menghasilkan nilai terbaik maka akan digunakan

In [28]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(logistic, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.5404457748742865
f1_macro score: 0.5366230128525545
precision_micro score: 0.5404457748742865
precision_macro score: 0.5363768180967612
recall_micro score: 0.5404457748742865
recall_macro score: 0.5404456547428895


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_borderSmote, y_borderSmote, test_size=0.25, random_state=42)

In [30]:
logistic.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(logistic, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.546728909343795
f1_macro score: 0.5423471743644245
precision_micro score: 0.546728909343795
precision_macro score: 0.5424417233557037
recall_micro score: 0.546728909343795
recall_macro score: 0.5463879275250638


In [32]:
prediction_logreg = logistic.predict(X_test)
classification_metrics(prediction_logreg, y_test)

Accuracy: 0.5238272524199553
F1 Score: 0.5199217982465247
Recall Score: 0.5247631779189458
Precision Score: 0.5205229046580845


# Softmax Regression

In [33]:
softmax = LogisticRegression(max_iter=2000, multi_class='multinomial')
softmax

In [34]:
evaluate_classification(softmax)

ADASYN OverSampling
f1_micro score: 0.45516263919497957
f1_macro score: 0.4541026722143016
precision_micro score: 0.45516263919497957
precision_macro score: 0.4545835384835259
recall_micro score: 0.45516263919497957
recall_macro score: 0.45515577841125043

Random OverSampling
f1_micro score: 0.45415497919000913
f1_macro score: 0.4524813556538317
precision_micro score: 0.45415497919000913
precision_macro score: 0.4527090738670207
recall_micro score: 0.45415497919000913
recall_macro score: 0.4541517651917128

SMOTE OverSampling
f1_micro score: 0.4586254660909366
f1_macro score: 0.457558853694432
precision_micro score: 0.4586254660909366
precision_macro score: 0.45846515497157075
recall_micro score: 0.4586254660909366
recall_macro score: 0.4586280513935314

Borderline SMOTE OverSampling
f1_micro score: 0.5404457748742865
f1_macro score: 0.5366230128525545
precision_micro score: 0.5404457748742865
precision_macro score: 0.5363768180967612
recall_micro score: 0.5404457748742865
recall_macro

Karena Borderline SMOTE Oversampling menghasilkan nilai terbaik maka akan digunakan

In [35]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(softmax, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.5404457748742865
f1_macro score: 0.5366230128525545
precision_micro score: 0.5404457748742865
precision_macro score: 0.5363768180967612
recall_micro score: 0.5404457748742865
recall_macro score: 0.5404456547428895


In [36]:
softmax.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
prediction_softmax = softmax.predict(X_test)
classification_metrics(prediction_softmax, y_test)

Accuracy: 0.5238272524199553
F1 Score: 0.5199217982465247
Recall Score: 0.5247631779189458
Precision Score: 0.5205229046580845


# KNN

In [38]:
knn = KNeighborsClassifier()
knn

In [39]:
evaluate_classification(knn)

ADASYN OverSampling
f1_micro score: 0.7105459653113939
f1_macro score: 0.6843118320422545
precision_micro score: 0.7105459653113939
precision_macro score: 0.7113195050785386
recall_micro score: 0.7105459653113939
recall_macro score: 0.7096932925048314

Random OverSampling
f1_micro score: 0.7267981816496001
f1_macro score: 0.7194536277811502
precision_micro score: 0.7267981816496001
precision_macro score: 0.7188620761717777
recall_micro score: 0.7267981816496001
recall_macro score: 0.7267946050816951

SMOTE OverSampling
f1_micro score: 0.7187006125580907
f1_macro score: 0.6957747188672782
precision_micro score: 0.7187006125580907
precision_macro score: 0.7186983647409728
recall_micro score: 0.7187006125580907
recall_macro score: 0.7187020977640646

Borderline SMOTE OverSampling
f1_micro score: 0.7417832381437643
f1_macro score: 0.7264252070831398
precision_micro score: 0.7417832381437643
precision_macro score: 0.7457789336847773
recall_micro score: 0.7417832381437643
recall_macro score:

Karena Borderline SMOTE Oversampling menghasilkan nilai terbaik maka akan digunakan

In [40]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(knn, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7417832381437643
f1_macro score: 0.7264252070831398
precision_micro score: 0.7417832381437643
precision_macro score: 0.7457789336847773
recall_micro score: 0.7417832381437643
recall_macro score: 0.7417837357705525


## Hyperparameter Tuning

In [41]:
tuned_params = [{'n_neighbors': [2,3,4,5,6,7,8,9,10],'metric': ['euclidean', 'manhattan', 'jaccard']}]

clf_knn = GridSearchCV(knn, tuned_params, cv=cv, verbose = 2)
clf_knn.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=euclidean, 

Traceback (most recent call last):
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 706, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\neighbors\_classification.py", line 254, in predict
    probabilities = self.predict_proba(X)
                    ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\neighbo

[CV] END ....................metric=manhattan, n_neighbors=8; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=8; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=8; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=8; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ...................metric=manhattan, n_neighbors=10; total time=   0.0s
[CV] END ...................metric=manhattan, n_neighbors=10; total time=   0.0s
[CV] END ...................metric=manhattan, n_neighbors=10; total time=   0.0s
[CV] END ...................



[CV] END ......................metric=jaccard, n_neighbors=2; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.5s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.5s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.5s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.5s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s


 0.70088012 0.69852233 0.69169628        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.4010135  0.39878788 0.39219887 0.39505324 0.39269523 0.39704074
 0.3856218  0.39989588 0.40858652]


In [42]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_knn.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'metric': 'euclidean', 'n_neighbors': 3}


In [43]:
best_hypeparam_knn = KNeighborsClassifier(**clf_knn.best_params_)
best_hypeparam_knn.fit(X_train, y_train)

In [44]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_knn, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7487904188480868
f1_macro score: 0.7353391624785711
precision_micro score: 0.7487904188480868
precision_macro score: 0.7533327517488011
recall_micro score: 0.7487904188480868
recall_macro score: 0.7479423285546525


In [45]:
prediction_knn = best_hypeparam_knn.predict(X_test)
classification_metrics(prediction_knn, y_test)

Accuracy: 0.7557706626954579
F1 Score: 0.7428574509221031
Recall Score: 0.7583056227870433
Precision Score: 0.7641957642818168


# Naive Bayes

In [9]:
gaussNB = GaussianNB()

In [11]:
evaluate_classification(gaussNB)

ADASYN OverSampling
f1_micro score: 0.4454140249511512
f1_macro score: 0.442781934916377
precision_micro score: 0.4454140249511512
precision_macro score: 0.4477103375011936
recall_micro score: 0.4454140249511512
recall_macro score: 0.44553488848087996

Random OverSampling
f1_micro score: 0.43032691139527895
f1_macro score: 0.4204981300637397
precision_micro score: 0.43032691139527895
precision_macro score: 0.4340359996006171
recall_micro score: 0.43032691139527895
recall_macro score: 0.43032563780910005

SMOTE OverSampling
f1_micro score: 0.4528557303947183
f1_macro score: 0.4502657963139683
precision_micro score: 0.4528557303947183
precision_macro score: 0.4558401535731285
recall_micro score: 0.4528557303947183
recall_macro score: 0.4528548498944236

Borderline SMOTE OverSampling
f1_micro score: 0.4843159045868634
f1_macro score: 0.48169732493535056
precision_micro score: 0.4843159045868634
precision_macro score: 0.49999267507682943
recall_micro score: 0.4843159045868634
recall_macro 

Karena menghasilkan nilai paling baik maka Nearmiss 2 Undersampling akan digunakan

In [12]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(gaussNB, X_nearmiss2, y_nearmiss2, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.652994350282486
f1_macro score: 0.6580980120538886
precision_micro score: 0.652994350282486
precision_macro score: 0.6674644248379611
recall_micro score: 0.652994350282486
recall_macro score: 0.6531578947368422


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_nearmiss2, y_nearmiss2, test_size=0.25, random_state=42)

In [14]:
gaussNB.fit(X_train, y_train)

In [15]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(gaussNB, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.6574747474747474
f1_macro score: 0.6536522028269299
precision_micro score: 0.6574747474747474
precision_macro score: 0.6656520930050341
recall_micro score: 0.6574747474747474
recall_macro score: 0.6531349206349206


In [16]:
prediction_nb= gaussNB.predict(X_test)
classification_metrics(prediction_nb, y_test)

Accuracy: 0.6666666666666666
F1 Score: 0.6795959595959595
Recall Score: 0.6773636991028296
Precision Score: 0.6828541828541829


# Neural Network (MLP)

In [4]:
mlp = MLPClassifier()
mlp

In [47]:
evaluate_classification(mlp)

ADASYN OverSampling


f1_micro score: 0.6394446203792867
f1_macro score: 0.6544128575662563
precision_micro score: 0.6441708190879968
precision_macro score: 0.6584146428527446
recall_micro score: 0.6775866622718861
recall_macro score: 0.6584486218328665

Random OverSampling
f1_micro score: 0.7025048677961168
f1_macro score: 0.6742079733786197
precision_micro score: 0.6878868348970072
precision_macro score: 0.6986580870668404
recall_micro score: 0.6951504879061154
recall_macro score: 0.6899442639385605

SMOTE OverSampling
f1_micro score: 0.6928201848679347
f1_macro score: 0.6837227249328687
precision_micro score: 0.6805372643115806
precision_macro score: 0.6738429683234923
recall_micro score: 0.6872374487982371
recall_macro score: 0.6639064849660675

Borderline SMOTE OverSampling
f1_micro score: 0.7113464493034306
f1_macro score: 0.6840406829125928
precision_micro score: 0.7116278152845765
precision_macro score: 0.7054268795514508
recall_micro score: 0.7075328440840788
recall_macro score: 0.7117439205877999


Karena Borderline SMOTE Oversampling menghasilkan nilai terbaik maka akan digunakan

In [14]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(mlp, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7079960537706248
f1_macro score: 0.6932011575816885
precision_micro score: 0.699155078842266
precision_macro score: 0.7143909340944332
recall_micro score: 0.7125571808983089
recall_macro score: 0.7232614945887192


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_borderSmote, y_borderSmote, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [15]:
param_grid = {
    'hidden_layer_sizes': [(5), (10), (5, 10)],
    'alpha': [1e-5],
    'activation': ['identity', 'logistic', 'relu', 'tanh'],
    'solver': ['sgd', 'adam'],
    'max_iter': [1000],
    'random_state': [42]
}

tuned_mlp = GridSearchCV(mlp, param_grid, cv=cv)
tuned_mlp.fit(X_train, y_train)

In [16]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(tuned_mlp.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'activation': 'relu', 'alpha': 1e-05, 'hidden_layer_sizes': 10, 'max_iter': 1000, 'random_state': 42, 'solver': 'adam'}


In [17]:
best_hypeparam_mlp = MLPClassifier(**tuned_mlp.best_params_)
best_hypeparam_mlp.fit(X_train, y_train)

In [18]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_mlp, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.622564703272939
f1_macro score: 0.6134008510663851
precision_micro score: 0.622564703272939
precision_macro score: 0.6138430878795382
recall_micro score: 0.622564703272939
recall_macro score: 0.6220264912778826


In [21]:
prediction_mlp = best_hypeparam_mlp.predict(X_test)
classification_metrics(prediction_mlp, y_test)

Accuracy: 0.6083395383469844
F1 Score: 0.5996622409438358
Recall Score: 0.6101521201616359
Precision Score: 0.5954253402946197


# Kesimpulan

Dari semua model classification, untuk kobinasi pre processing ini yang menghasilkan hasil paling bagus adalah Random Forest dengan data yang di Random Oversampling