In [6]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [7]:
df = pd.read_csv("dataframe/UFC_kombinasi7_FS_lasso.csv")
df.head()

Unnamed: 0,weight_class,B_avg_SIG_STR_att,B_avg_SIG_STR_landed,B_avg_opp_SIG_STR_att,B_avg_opp_SIG_STR_landed,B_avg_TOTAL_STR_att,B_avg_TOTAL_STR_landed,B_avg_opp_TOTAL_STR_landed,B_avg_TD_att,B_avg_opp_TD_att,...,R_avg_CTRL_time(seconds),R_avg_opp_CTRL_time(seconds),R_total_title_bouts,R_win_by_Decision_Unanimous,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,Winner
0,7,66.304047,29.5,64.0,27.270996,89.625,48.0,44.5,2.0,2.038061,...,104.125,89.53125,0.0,0.0,182.88,177.8,185.0,26.0,21.0,Red
1,5,133.9375,53.234375,89.953125,31.296875,141.210938,56.71875,33.078125,2.0,1.835938,...,252.3125,115.0,0.0,4.0,180.34,187.96,170.0,27.0,30.0,Blue
2,6,66.304047,29.5,64.0,27.270996,89.625,48.0,44.5,2.0,2.038061,...,104.125,89.53125,0.0,0.0,185.42,182.88,170.0,28.0,31.0,Blue
3,6,39.21875,19.015625,37.226562,14.53125,51.5,30.742188,28.671875,2.578125,1.625,...,145.693754,116.292816,1.0,4.0,177.8,185.42,170.0,28.0,34.0,Blue
4,3,74.522461,29.986572,93.299805,42.413086,102.17041,51.280762,71.359619,5.9375,3.682373,...,110.148438,162.796875,1.0,2.0,160.02,167.64,135.0,34.0,31.0,Red


In [8]:
X = df.drop('Winner', axis=1).values
X

array([[  7.        ,  66.30404663,  29.5       , ..., 185.        ,
         26.        ,  21.        ],
       [  5.        , 133.9375    ,  53.234375  , ..., 170.        ,
         27.        ,  30.        ],
       [  6.        ,  66.30404663,  29.5       , ..., 170.        ,
         28.        ,  31.        ],
       ...,
       [  4.        ,  44.75      ,  16.75      , ..., 145.        ,
         24.        ,  30.        ],
       [  5.        ,  75.3125    ,  38.421875  , ..., 155.        ,
         27.        ,  34.        ],
       [  8.        ,  40.25      ,  14.5       , ..., 225.        ,
         32.        ,  31.        ]])

In [9]:
y = df['Winner'].values
y

array(['Red', 'Blue', 'Blue', ..., 'Red', 'Blue', 'Red'], dtype=object)

In [10]:
counter_y = Counter(y)
print(counter_y)

Counter({'Red': 3581, 'Blue': 1730, 'Draw': 99})


In [11]:
# Over Sampling
adasyn = ADASYN()
randomOver = RandomOverSampler()
smote = SMOTE()
borderSmote = BorderlineSMOTE()
svmSmote = SVMSMOTE()

# Melakukan resampling
X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
X_randomOver, y_randomOver = randomOver.fit_resample(X, y)
X_smote, y_smote = smote.fit_resample(X, y)
X_borderSmote, y_borderSmote = borderSmote.fit_resample(X, y)
X_svmSmote, y_svmSmote = svmSmote.fit_resample(X, y)


# Under Sampling
rand_under = RandomUnderSampler(sampling_strategy='majority')
nearmiss = NearMiss()
nearmiss2 = NearMiss(version=2)
nearmiss3 = NearMiss(version=3)
tomek = TomekLinks()

# Melakukan resampling
X_rand_under, y_rand_under = rand_under.fit_resample(X, y)
X_nearmiss, y_nearmiss = nearmiss.fit_resample(X, y)
X_nearmiss2, y_nearmiss2 = nearmiss2.fit_resample(X, y)
X_nearmiss3, y_nearmiss3 = nearmiss3.fit_resample(X, y)
X_tomek, y_tomek = tomek.fit_resample(X, y)


In [12]:
def evaluate_classification(classifier):
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

  scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
  print("ADASYN OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_adasyn, y_adasyn, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Random OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_smote, y_smote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Borderline SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("SVM SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_svmSmote, y_svmSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Random Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_rand_under, y_rand_under , scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 1 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss, y_nearmiss, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 2 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss2, y_nearmiss2, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 3 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss3, y_nearmiss3, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Tomek Links Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_tomek, y_tomek, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

# Decision Tree

In [13]:
dt = DecisionTreeClassifier()
dt

In [14]:
evaluate_classification(dt)

ADASYN OverSampling


f1_micro score: 0.7298127263365842
f1_macro score: 0.7258091123630892
precision_micro score: 0.7262860872880973
precision_macro score: 0.7214814902382742
recall_micro score: 0.7285134223922686
recall_macro score: 0.729033112729264

Random OverSampling
f1_micro score: 0.833474146305111
f1_macro score: 0.834135809574024
precision_micro score: 0.8341257854114295
precision_macro score: 0.8376984626994162
recall_micro score: 0.834870664368599
recall_macro score: 0.8322639333660581

SMOTE OverSampling
f1_micro score: 0.7246586043658088
f1_macro score: 0.7245613035724524
precision_micro score: 0.7255893564457246
precision_macro score: 0.722831291799117
recall_micro score: 0.7261487522237619
recall_macro score: 0.7278253066652123

Borderline SMOTE OverSampling
f1_micro score: 0.7570509171040534
f1_macro score: 0.7570221541015818
precision_micro score: 0.760495938953894
precision_macro score: 0.7583512124260074
recall_micro score: 0.7598445164829166
recall_macro score: 0.7603113012266609

SVM S

Karena Random Oversampling menghasilkan score terbaik maka akan digunakan Random Oversampling

In [15]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(dt, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8349636875841087
f1_macro score: 0.8309974749302811
precision_micro score: 0.8330085103027434
precision_macro score: 0.8417263142584265
recall_micro score: 0.8342181153938475
recall_macro score: 0.8379407784868154


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_randomOver, y_randomOver, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [17]:
param_grid_c = {'criterion': ['gini', 'entropy'],
               'min_samples_split': [2, 5, 10],
               'max_depth' : [None, 10, 20, 50],
               'min_samples_leaf': [1, 2, 4]
               }

clf_dtc = GridSearchCV(estimator= DecisionTreeClassifier(), param_grid=param_grid_c, cv= 5)
clf_dtc.fit(X_train, y_train)

In [18]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_dtc.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [19]:
best_hypeparam_dtc = DecisionTreeClassifier(**clf_dtc.best_params_)
best_hypeparam_dtc.fit(X_train, y_train)

In [20]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_dtc, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8140758402607385
f1_macro score: 0.8104824940674824
precision_micro score: 0.8150687041478175
precision_macro score: 0.8121099229247083
recall_micro score: 0.8103510604051241
recall_macro score: 0.8138917055303512


In [21]:
def classification_metrics(prediction, y_test):
  accuracy = accuracy_score(y_test, prediction)
  f1 = f1_score(y_test, prediction, average="macro")
  recall = recall_score(y_test, prediction, average="macro")
  precision = precision_score(y_test, prediction, average="macro")

  print('Accuracy: ' + str(accuracy))
  print('F1 Score: ' + str(f1))
  print('Recall Score: ' + str(recall))
  print('Precision Score: ' + str(precision))

In [22]:
prediction = best_hypeparam_dtc.predict(X_test)
classification_metrics(prediction, y_test)

Accuracy: 0.8224125093075205
F1 Score: 0.8204250899326139
Recall Score: 0.8240846930655706
Precision Score: 0.8259674692288138


In [23]:
dt.fit(X_train, y_train)
prediction = dt.predict(X_test)
classification_metrics(prediction, y_test)

Accuracy: 0.8324646314221892
F1 Score: 0.8309387236299332
Recall Score: 0.8340179164718332
Precision Score: 0.836254146368419


# Random Forest

In [24]:
rf = RandomForestClassifier()

In [25]:
evaluate_classification(rf)

ADASYN OverSampling


f1_micro score: 0.8434187084084648
f1_macro score: 0.8441352498068749
precision_micro score: 0.8397068422855444
precision_macro score: 0.8433414604700097
recall_micro score: 0.8462962986899676
recall_macro score: 0.8448651281177033

Random OverSampling
f1_micro score: 0.8800140033084549
f1_macro score: 0.8828594304372503
precision_micro score: 0.8768500874773506
precision_macro score: 0.883912998761738
recall_micro score: 0.8840169478160125
recall_macro score: 0.8813150438018955

SMOTE OverSampling
f1_micro score: 0.8438982923069325
f1_macro score: 0.8419453411801208
precision_micro score: 0.8435265460614396
precision_macro score: 0.8431291815773714
recall_micro score: 0.844270688458449
recall_macro score: 0.8437112529186115

Borderline SMOTE OverSampling
f1_micro score: 0.8438975990738407
f1_macro score: 0.8438616654035747
precision_micro score: 0.8425943641882716
precision_macro score: 0.8452169586096276
recall_micro score: 0.8438046625124673
recall_macro score: 0.8451988032070311

S

Karena score nya paling tinggi maka akan digunakan Random Oversampling

In [26]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(rf, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.882156050235136
f1_macro score: 0.8823977621632315
precision_micro score: 0.8829934758100645
precision_macro score: 0.8799728776681175
recall_micro score: 0.8802939394963488
recall_macro score: 0.8807585922099376


## Hyperparameter Tuning

In [27]:
param_grid_c = {
                'criterion': ['gini', 'entropy'],
               'min_samples_split': [2, 5, 10],
               'max_depth' : [None, 10, 20, 30],
               'min_samples_leaf': [1, 2, 4]
}

clf_rfc = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid_c, cv=cv)
clf_rfc.fit(X_train, y_train)

In [28]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_rfc.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [29]:
best_hypeparam_rfc = RandomForestClassifier(**clf_rfc.best_params_)
best_hypeparam_rfc.fit(X_train, y_train)

In [30]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_rfc, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8540403830365986
f1_macro score: 0.8500024401797353
precision_micro score: 0.8527989951219361
precision_macro score: 0.855224479544572
recall_micro score: 0.8490760636012034
recall_macro score: 0.8505440757207289


In [31]:
prediction_rf = best_hypeparam_rfc.predict(X_test)
classification_metrics(prediction_rf, y_test)

Accuracy: 0.8659717051377513
F1 Score: 0.8668668545479159
Recall Score: 0.8669775797996703
Precision Score: 0.8671818799948757


# Logistic Regression

In [32]:
logistic = LogisticRegression(max_iter=2000)
logistic

In [33]:
evaluate_classification(logistic)

ADASYN OverSampling


f1_micro score: 0.5136419589518879
f1_macro score: 0.5115914898535976
precision_micro score: 0.5136419589518879
precision_macro score: 0.5127663993200582
recall_micro score: 0.5136419589518879
recall_macro score: 0.5133729167425353

Random OverSampling
f1_micro score: 0.4858047959598375
f1_macro score: 0.48460548022276645
precision_micro score: 0.4858047959598375
precision_macro score: 0.48540420100027
recall_micro score: 0.4858047959598375
recall_macro score: 0.48580353687644307

SMOTE OverSampling
f1_micro score: 0.5120558433917123
f1_macro score: 0.5107872699238839
precision_micro score: 0.5120558433917123
precision_macro score: 0.5114053573431475
recall_micro score: 0.5120558433917123
recall_macro score: 0.5120516896129902

Borderline SMOTE OverSampling
f1_micro score: 0.6529832852836147
f1_macro score: 0.6439265760553193
precision_micro score: 0.6529832852836147
precision_macro score: 0.6452321188873411
recall_micro score: 0.6529832852836147
recall_macro score: 0.6529795158286779


Karena Nearmiss 1 undersampling menghasilkan nilai terbaik maka akan digunakan

In [34]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(logistic, X_nearmiss, y_nearmiss, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7106214689265536
f1_macro score: 0.715894068868365
precision_micro score: 0.7106214689265536
precision_macro score: 0.7407151860738226
recall_micro score: 0.7106214689265536
recall_macro score: 0.7114035087719298


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_nearmiss, y_nearmiss, test_size=0.25, random_state=42)

In [36]:
logistic.max_iter = 10000
logistic.fit(X_train, y_train)

In [37]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(logistic, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.6795959595959596
f1_macro score: 0.6845910561321404
precision_micro score: 0.6795959595959596
precision_macro score: 0.7197487479840421
recall_micro score: 0.6795959595959596
recall_macro score: 0.6776190476190476


In [38]:
prediction_logreg = logistic.predict(X_test)
classification_metrics(prediction_logreg, y_test)

Accuracy: 0.72
F1 Score: 0.7327897666243531
Recall Score: 0.7269668737060041
Precision Score: 0.7402298850574712


# Softmax Regression

In [39]:
softmax = LogisticRegression(max_iter=2000, multi_class='multinomial')
softmax

In [40]:
evaluate_classification(softmax)

ADASYN OverSampling


f1_micro score: 0.5136419589518879
f1_macro score: 0.5115914898535976
precision_micro score: 0.5136419589518879
precision_macro score: 0.5127663993200582
recall_micro score: 0.5136419589518879
recall_macro score: 0.5133729167425353

Random OverSampling
f1_micro score: 0.4858047959598375
f1_macro score: 0.48460548022276645
precision_micro score: 0.4858047959598375
precision_macro score: 0.48540420100027
recall_micro score: 0.4858047959598375
recall_macro score: 0.48580353687644307

SMOTE OverSampling
f1_micro score: 0.5120558433917123
f1_macro score: 0.5107872699238839
precision_micro score: 0.5120558433917123
precision_macro score: 0.5114053573431475
recall_micro score: 0.5120558433917123
recall_macro score: 0.5120516896129902

Borderline SMOTE OverSampling
f1_micro score: 0.6529832852836147
f1_macro score: 0.6439265760553193
precision_micro score: 0.6529832852836147
precision_macro score: 0.6452321188873411
recall_micro score: 0.6529832852836147
recall_macro score: 0.6529795158286779


Karena Nearmiss 1 undersampling menghasilkan nilai terbaik maka akan digunakan

In [41]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(softmax, X_nearmiss, y_nearmiss, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7106214689265536
f1_macro score: 0.715894068868365
precision_micro score: 0.7106214689265536
precision_macro score: 0.7407151860738226
recall_micro score: 0.7106214689265536
recall_macro score: 0.7114035087719298


In [42]:
softmax.max_iter = 10000
softmax.fit(X_train, y_train)

In [43]:
prediction_softmax = softmax.predict(X_test)
classification_metrics(prediction_softmax, y_test)

Accuracy: 0.72
F1 Score: 0.7327897666243531
Recall Score: 0.7269668737060041
Precision Score: 0.7402298850574712


# KNN

In [44]:
knn = KNeighborsClassifier()
knn

In [45]:
evaluate_classification(knn)

ADASYN OverSampling
f1_micro score: 0.7215510990182835
f1_macro score: 0.7013045689062141
precision_micro score: 0.7215510990182835
precision_macro score: 0.7215729304860842
recall_micro score: 0.7215510990182835
recall_macro score: 0.7206720226032364

Random OverSampling
f1_micro score: 0.7281033229261716
f1_macro score: 0.7215774165272746
precision_micro score: 0.7281033229261716
precision_macro score: 0.7194711871483047
recall_micro score: 0.7281033229261716
recall_macro score: 0.728102688368928

SMOTE OverSampling
f1_micro score: 0.7293128846902072
f1_macro score: 0.7116162458185753
precision_micro score: 0.7293128846902072
precision_macro score: 0.7278910789258819
recall_micro score: 0.7293128846902072
recall_macro score: 0.7293136361157212

Borderline SMOTE OverSampling
f1_micro score: 0.7592845574529923
f1_macro score: 0.7517289736510588
precision_micro score: 0.7592845574529923
precision_macro score: 0.7660439516711102
recall_micro score: 0.7592845574529923
recall_macro score: 

Karena Borderline SMOTE oversampling menghasilkan nilai terbaik maka akan digunakan

In [46]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(knn, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7592845574529923
f1_macro score: 0.7517289736510588
precision_micro score: 0.7592845574529923
precision_macro score: 0.7660439516711102
recall_micro score: 0.7592845574529923
recall_macro score: 0.7592841058725447


In [47]:
X_train, X_test, y_train, y_test = train_test_split(X_borderSmote, y_borderSmote, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [48]:
tuned_params = [{'n_neighbors': [2,3,4,5,6,7,8,9,10],'metric': ['euclidean', 'manhattan', 'jaccard']}]

clf_knn = GridSearchCV(knn, tuned_params, cv=cv, verbose = 2)
clf_knn.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s


[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=4; total time=   0.0s
[CV] END ...................

Traceback (most recent call last):
  File "c:\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\base.py", line 706, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\neighbors\_classification.py", line 254, in predict
    probabilities = self.predict_proba(X)
                    ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\neighbors\_classification.py", line 332, in predict_proba
    probabilities = ArgKminClassMode.compute(
                    ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python

[CV] END ....................metric=manhattan, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ...................

Traceback (most recent call last):
  File "c:\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\base.py", line 706, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\neighbors\_classification.py", line 254, in predict
    probabilities = self.predict_proba(X)
                    ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\neighbors\_classification.py", line 332, in predict_proba
    probabilities = ArgKminClassMode.compute(
                    ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python

[CV] END ....................metric=manhattan, n_neighbors=8; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=8; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=8; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=8; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ...................metric=manhattan, n_neighbors=10; total time=   0.0s
[CV] END ...................metric=manhattan, n_neighbors=10; total time=   0.0s
[CV] END ...................metric=manhattan, n_neighbors=10; total time=   0.0s
[CV] END ...................



[CV] END ......................metric=jaccard, n_neighbors=2; total time=   2.1s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   1.7s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   1.9s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   1.6s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   1.9s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   1.8s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   1.9s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   1.7s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   1.9s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   1.8s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   1.8s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   1.7s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   1.7s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   1.8s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   1.8s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   1.8s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   1.9s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   1.8s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   1.8s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   1.7s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   1.8s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   1.8s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   1.6s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   1.7s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   1.7s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   1.6s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   1.7s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   1.6s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   1.9s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   2.0s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   1.9s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   1.7s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   1.9s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   1.9s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   1.9s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   1.8s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   1.7s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   1.8s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   1.8s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   1.7s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   1.7s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   1.7s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   1.7s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   1.8s


 0.71614752 0.71130657 0.70795662        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.49695718 0.50105278 0.50750647 0.52376489 0.52041301 0.52364105
 0.52029094 0.51818038 0.50688451]


In [49]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_knn.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'metric': 'euclidean', 'n_neighbors': 3}


In [50]:
best_hypeparam_knn = KNeighborsClassifier(**clf_knn.best_params_)
best_hypeparam_knn.fit(X_train, y_train)

In [51]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_knn, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7644284871533025
f1_macro score: 0.7569397646451896
precision_micro score: 0.7644284871533025
precision_macro score: 0.7712758547986633
recall_micro score: 0.7644284871533025
recall_macro score: 0.7636980152622866


In [52]:
prediction_knn = best_hypeparam_knn.predict(X_test)
classification_metrics(prediction_knn, y_test)

Accuracy: 0.7710349962769918
F1 Score: 0.7668805293102157
Recall Score: 0.7732183784290091
Precision Score: 0.7809957435053269


# Naive Bayes

# Neural Network (MLP)

In [53]:
mlp = MLPClassifier()
mlp

In [54]:
evaluate_classification(mlp)

ADASYN OverSampling
f1_micro score: 0.5897453397014566
f1_macro score: 0.6172886363071676
precision_micro score: 0.6672573768761296
precision_macro score: 0.6518853136503827
recall_micro score: 0.6167585046543742
recall_macro score: 0.6247418122208181

Random OverSampling
f1_micro score: 0.6738342635654884
f1_macro score: 0.5918308099482146
precision_micro score: 0.6396774559731997
precision_macro score: 0.6727305217317031
recall_micro score: 0.6896573088864684
recall_macro score: 0.6545770578320076

SMOTE OverSampling
f1_micro score: 0.6428429315787604
f1_macro score: 0.6342972138202732
precision_micro score: 0.6487904815630327
precision_macro score: 0.670594261018308
recall_micro score: 0.6623856923622178
recall_macro score: 0.6482364964716945

Borderline SMOTE OverSampling
f1_micro score: 0.6949615818885921
f1_macro score: 0.7147301251174826
precision_micro score: 0.6469163692263432
precision_macro score: 0.6815758369968576
recall_micro score: 0.6810915908226337
recall_macro score: 

Karena Borderline SMOTE Oversampling menghasilkan nilai terbaik maka akan digunakan

In [55]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(mlp, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.6880788171363754
f1_macro score: 0.7086390164301617
precision_micro score: 0.7104166504190161
precision_macro score: 0.7143606746965666
recall_micro score: 0.7256840260898274
recall_macro score: 0.6977322487397053


In [56]:
X_train, X_test, y_train, y_test = train_test_split(X_borderSmote, y_borderSmote, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [57]:
param_grid = {
    'hidden_layer_sizes': [(100,), (50, 50), (30, 20, 10)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [10000],
    'random_state': [42]
}

tuned_mlp = GridSearchCV(mlp, param_grid, cv=cv)
tuned_mlp.fit(X_train, y_train)



In [None]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(tuned_mlp.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'activation': 'logistic', 'alpha': 1e-05, 'hidden_layer_sizes': 10, 'max_iter': 1000, 'random_state': 42, 'solver': 'adam'}


In [None]:
best_hypeparam_mlp = MLPClassifier(**tuned_mlp.best_params_)
best_hypeparam_mlp.fit(X_train, y_train)

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_mlp, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.676993063883709
f1_macro score: 0.7118221371248534
precision_micro score: 0.676993063883709
precision_macro score: 0.7119523723308918
recall_micro score: 0.676993063883709
recall_macro score: 0.7161506430083246


In [None]:
prediction_mlp = best_hypeparam_mlp.predict(X_test)
classification_metrics(prediction_mlp, y_test)

Accuracy: 0.6658428077113199
F1 Score: 0.7016426666936328
Recall Score: 0.7031492494782547
Precision Score: 0.7056713141602785


# Kesimpulan

Dari semua model classification, untuk kobinasi pre processing ini yang menghasilkan hasil paling bagus adalah Random Forest dengan data yang di Random Oversampling