In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
df = pd.read_csv('dataframe/UFC_Pre_kombinasi6.csv')
df.head()

Unnamed: 0,weight_class,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,...,R_age,gender,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,1,0,0.0,0.0,0.49375,0.44875,0.475,0.1775,0.0,0.125,...,27.0,0,False,True,False,False,False,True,False,False
1,6,1,0.0,0.25,0.473125,0.371875,0.0,0.20375,0.0,0.5,...,31.0,1,False,True,False,False,False,True,False,False
2,4,2,0.0,0.5,0.5,0.48,0.615,0.0,0.0,0.0,...,30.0,1,False,True,False,False,False,True,False,False
3,3,3,0.0,0.0,0.66,0.69,0.85,0.0,0.0,2.0,...,23.0,1,False,True,False,False,False,True,False,False
4,6,4,0.0,0.0,0.505312,0.439375,0.815937,0.067187,1.28125,0.640625,...,24.0,1,False,True,False,False,False,True,False,False


In [3]:
X = df.drop('Winner', axis=1).values
X

KeyError: "['Winner'] not found in axis"

In [13]:
y = df['Winner'].values
y

array(['Red', 'Blue', 'Blue', ..., 'Red', 'Blue', 'Red'], dtype=object)

In [14]:
counter_y = Counter(y)
print(counter_y)

Counter({'Red': 3581, 'Blue': 1730, 'Draw': 99})


In [15]:
# Over Sampling
adasyn = ADASYN()
randomOver = RandomOverSampler()
smote = SMOTE()
borderSmote = BorderlineSMOTE()
svmSmote = SVMSMOTE()

# Melakukan resampling
X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
X_randomOver, y_randomOver = randomOver.fit_resample(X, y)
X_smote, y_smote = smote.fit_resample(X, y)
X_borderSmote, y_borderSmote = borderSmote.fit_resample(X, y)
X_svmSmote, y_svmSmote = svmSmote.fit_resample(X, y)


# Under Sampling
rand_under = RandomUnderSampler(sampling_strategy='majority')
nearmiss = NearMiss()
nearmiss2 = NearMiss(version=2)
nearmiss3 = NearMiss(version=3)
tomek = TomekLinks()

# Melakukan resampling
X_rand_under, y_rand_under = rand_under.fit_resample(X, y)
X_nearmiss, y_nearmiss = nearmiss.fit_resample(X, y)
X_nearmiss2, y_nearmiss2 = nearmiss2.fit_resample(X, y)
X_nearmiss3, y_nearmiss3 = nearmiss3.fit_resample(X, y)
X_tomek, y_tomek = tomek.fit_resample(X, y)


In [16]:
def evaluate_classification(classifier):
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

  scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
  print("ADASYN OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_adasyn, y_adasyn, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Random OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_smote, y_smote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Borderline SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("SVM SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_svmSmote, y_svmSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Random Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_rand_under, y_rand_under , scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 1 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss, y_nearmiss, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 2 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss2, y_nearmiss2, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 3 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss3, y_nearmiss3, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Tomek Links Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_tomek, y_tomek, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

# Decision Tree

In [17]:
dt = DecisionTreeClassifier()
dt

In [18]:
evaluate_classification(dt)

ADASYN OverSampling
f1_micro score: 0.7291451441197947
f1_macro score: 0.7200341197241279
precision_micro score: 0.7237847749735626
precision_macro score: 0.7188883387286903
recall_micro score: 0.7269828717457445
recall_macro score: 0.7240079904731556

Random OverSampling
f1_micro score: 0.8408274430184062
f1_macro score: 0.8374244689471888
precision_micro score: 0.8378480138438649
precision_macro score: 0.8479633409116074
recall_micro score: 0.8408280495973616
recall_macro score: 0.8435252929000153

SMOTE OverSampling
f1_micro score: 0.7277326165303163
f1_macro score: 0.7260045329597176
precision_micro score: 0.7295944239796259
precision_macro score: 0.7221199831079089
recall_micro score: 0.7285703020676543
recall_macro score: 0.7262445166467979

Borderline SMOTE OverSampling
f1_micro score: 0.7653349225701963
f1_macro score: 0.7633956863221709
precision_micro score: 0.7680345888651168
precision_macro score: 0.7628798395865433
recall_micro score: 0.7664522843330188
recall_macro score:

Karena Random Oversampling menghasilkan score terbaik maka akan digunakan Random Oversampling

In [28]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(dt, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8371032648679002
f1_macro score: 0.8409763264945026
precision_micro score: 0.8459463628226025
precision_macro score: 0.8452647280330033
recall_micro score: 0.8403615903806976
recall_macro score: 0.842596661550169


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_randomOver, y_randomOver, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [23]:
param_grid_c = {'criterion': ['gini', 'entropy'],
               'min_samples_split': [2, 5, 10, 20, 50],
               'max_depth' : [None, 10, 20, 30, 50, 75, 100],
               'min_samples_leaf': [1, 2, 4, 6, 10]
               }

clf_dtc = GridSearchCV(estimator= DecisionTreeClassifier(), param_grid=param_grid_c, cv= 5)
clf_dtc.fit(X_train, y_train)

In [25]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_dtc.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [26]:
best_hypeparam_dtc = DecisionTreeClassifier(**clf_dtc.best_params_)
best_hypeparam_dtc.fit(X_train, y_train)

In [27]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_dtc, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8094834982202075
f1_macro score: 0.7989832746841123
precision_micro score: 0.8114703041897131
precision_macro score: 0.8075938325331344
recall_micro score: 0.8035268539954069
recall_macro score: 0.8029291544338806


In [29]:
def classification_metrics(prediction, y_test):
  accuracy = accuracy_score(y_test, prediction)
  f1 = f1_score(y_test, prediction, average="macro")
  recall = recall_score(y_test, prediction, average="macro")
  precision = precision_score(y_test, prediction, average="macro")

  print('Accuracy: ' + str(accuracy))
  print('F1 Score: ' + str(f1))
  print('Recall Score: ' + str(recall))
  print('Precision Score: ' + str(precision))

In [30]:
prediction = best_hypeparam_dtc.predict(X_test)
classification_metrics(prediction, y_test)

Accuracy: 0.819434102755026
F1 Score: 0.8166180400317762
Recall Score: 0.8210891803870418
Precision Score: 0.8221400977668715


In [32]:
dt.fit(X_train, y_train)
prediction = dt.predict(X_test)
classification_metrics(prediction, y_test)

Accuracy: 0.8261355174981385
F1 Score: 0.8241605351418898
Recall Score: 0.827825084701269
Precision Score: 0.8324064621048396


# Random Forest

In [24]:
rf = RandomForestClassifier()

In [33]:
evaluate_classification(rf)

ADASYN OverSampling
f1_micro score: 0.8458581398239099
f1_macro score: 0.8479201320401437
precision_micro score: 0.8451996704114683
precision_macro score: 0.8477988982219312
recall_micro score: 0.8490557943141113
recall_macro score: 0.8497707049027035

Random OverSampling
f1_micro score: 0.8855072256551703
f1_macro score: 0.8847958628189009
precision_micro score: 0.8871819034967544
precision_macro score: 0.888063294850844
recall_micro score: 0.8868096806535106
recall_macro score: 0.8851335873401744

SMOTE OverSampling
f1_micro score: 0.8532995728817612
f1_macro score: 0.8456895642893223
precision_micro score: 0.8470624681004459
precision_macro score: 0.8486156792645595
recall_micro score: 0.8537648189405145
recall_macro score: 0.8513452233468127

Borderline SMOTE OverSampling
f1_micro score: 0.8508797994476666
f1_macro score: 0.8530543754229962
precision_micro score: 0.8553478167057044
precision_macro score: 0.8515867102398327
recall_micro score: 0.8523699039785514
recall_macro score: 

Karena score nya paling tinggi maka akan digunakan Random Oversampling

In [34]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(rf, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8835514851219181
f1_macro score: 0.8842418887502397
precision_micro score: 0.8884844884762997
precision_macro score: 0.8891593896331014
recall_micro score: 0.8868097239805792
recall_macro score: 0.8831767217534263


## Hyperparameter Tuning

In [40]:
param_grid_c = {
                'criterion': ['gini', 'entropy'],
               'min_samples_split': [2, 5, 10],
               'max_depth' : [None, 10, 20, 30],
               'min_samples_leaf': [1, 2, 4]
}

clf_rfc = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid_c, cv=cv)
clf_rfc.fit(X_train, y_train)

In [41]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_rfc.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [42]:
best_hypeparam_rfc = RandomForestClassifier(**clf_rfc.best_params_)
best_hypeparam_rfc.fit(X_train, y_train)

In [43]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_rfc, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8530488283867271
f1_macro score: 0.8499636177852631
precision_micro score: 0.8506900450223572
precision_macro score: 0.853635271472327
recall_micro score: 0.8546606534171861
recall_macro score: 0.8520428692067993


In [44]:
prediction_rf = best_hypeparam_rfc.predict(X_test)
classification_metrics(prediction_rf, y_test)

Accuracy: 0.8700670141474312
F1 Score: 0.8709429515903011
Recall Score: 0.8710317557978327
Precision Score: 0.8711721762373035


# Logistic Regression

In [45]:
logistic = LogisticRegression(max_iter=2000)
logistic

In [46]:
evaluate_classification(logistic)

ADASYN OverSampling


f1_micro score: 0.5943762077062328
f1_macro score: 0.5886427421088528
precision_micro score: 0.5943762077062328
precision_macro score: 0.5901084576963718
recall_micro score: 0.5943762077062328
recall_macro score: 0.5932947339134451

Random OverSampling
f1_micro score: 0.5956426834013137
f1_macro score: 0.5923362050268545
precision_micro score: 0.5956426834013137
precision_macro score: 0.5927908772438208
recall_micro score: 0.5956426834013137
recall_macro score: 0.595639938809804

SMOTE OverSampling
f1_micro score: 0.6106309894255958
f1_macro score: 0.60594733345955
precision_micro score: 0.6106309894255958
precision_macro score: 0.6080929651416807
recall_micro score: 0.6106309894255958
recall_macro score: 0.6106281604762238

Borderline SMOTE OverSampling
f1_micro score: 0.7018521455130922
f1_macro score: 0.6941744433502268
precision_micro score: 0.7018521455130922
precision_macro score: 0.6935263316654183
recall_micro score: 0.7018521455130922
recall_macro score: 0.7018501203805428

SV

Karena Nearmiss 1 undersampling menghasilkan nilai terbaik maka akan digunakan

In [78]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(logistic, X_nearmiss, y_nearmiss, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7780225988700564
f1_macro score: 0.7725487592094781
precision_micro score: 0.7780225988700564
precision_macro score: 0.7937762181920103
recall_micro score: 0.7780225988700564
recall_macro score: 0.7780701754385964


In [80]:
X_train, X_test, y_train, y_test = train_test_split(X_nearmiss, y_nearmiss, test_size=0.25, random_state=42)

In [83]:
logistic.max_iter = 10000
logistic.fit(X_train, y_train)

In [84]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(logistic, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7837373737373736
f1_macro score: 0.7774610876155437
precision_micro score: 0.7837373737373736
precision_macro score: 0.8036729839361417
recall_micro score: 0.7837373737373736
recall_macro score: 0.7816666666666666


In [86]:
prediction_logreg = logistic.predict(X_test)
classification_metrics(prediction_logreg, y_test)

Accuracy: 0.8266666666666667
F1 Score: 0.8256621225565325
Recall Score: 0.8374741200828156
Precision Score: 0.8336338743070861


# Softmax Regression

In [87]:
softmax = LogisticRegression(max_iter=2000, multi_class='multinomial')
softmax

In [53]:
evaluate_classification(softmax)

ADASYN OverSampling


f1_micro score: 0.5943762077062328
f1_macro score: 0.5886427421088528
precision_micro score: 0.5943762077062328
precision_macro score: 0.5901084576963718
recall_micro score: 0.5943762077062328
recall_macro score: 0.5932947339134451

Random OverSampling
f1_micro score: 0.5956426834013137
f1_macro score: 0.5923362050268545
precision_micro score: 0.5956426834013137
precision_macro score: 0.5927908772438208
recall_micro score: 0.5956426834013137
recall_macro score: 0.595639938809804

SMOTE OverSampling
f1_micro score: 0.6106309894255958
f1_macro score: 0.60594733345955
precision_micro score: 0.6106309894255958
precision_macro score: 0.6080929651416807
recall_micro score: 0.6106309894255958
recall_macro score: 0.6106281604762238

Borderline SMOTE OverSampling
f1_micro score: 0.7018521455130922
f1_macro score: 0.6941744433502268
precision_micro score: 0.7018521455130922
precision_macro score: 0.6935263316654183
recall_micro score: 0.7018521455130922
recall_macro score: 0.7018501203805428

SV

Karena Nearmiss 1 undersampling menghasilkan nilai terbaik maka akan digunakan

In [88]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(softmax, X_nearmiss, y_nearmiss, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7780225988700564
f1_macro score: 0.7725487592094781
precision_micro score: 0.7780225988700564
precision_macro score: 0.7937762181920103
recall_micro score: 0.7780225988700564
recall_macro score: 0.7780701754385964


In [90]:
softmax.max_iter = 10000
softmax.fit(X_train, y_train)

In [91]:
prediction_softmax = softmax.predict(X_test)
classification_metrics(prediction_softmax, y_test)

Accuracy: 0.8266666666666667
F1 Score: 0.8256621225565325
Recall Score: 0.8374741200828156
Precision Score: 0.8336338743070861


# KNN

In [57]:
knn = KNeighborsClassifier()
knn

In [58]:
evaluate_classification(knn)

ADASYN OverSampling


f1_micro score: 0.7119350676978914
f1_macro score: 0.6816214848076101
precision_micro score: 0.7119350676978914
precision_macro score: 0.7327128436791346
recall_micro score: 0.7119350676978914
recall_macro score: 0.7127886254403915

Random OverSampling
f1_micro score: 0.7511875949404384
f1_macro score: 0.7438071546485114
precision_micro score: 0.7511875949404383
precision_macro score: 0.7465345482537147
recall_micro score: 0.7511875949404383
recall_macro score: 0.751188741627254

SMOTE OverSampling
f1_micro score: 0.7235410692947133
f1_macro score: 0.6936075861054538
precision_micro score: 0.7235410692947133
precision_macro score: 0.7427362245746432
recall_micro score: 0.7235410692947133
recall_macro score: 0.7235414734993988

Borderline SMOTE OverSampling
f1_micro score: 0.7503481329933026
f1_macro score: 0.7293807708222901
precision_micro score: 0.7503481329933026
precision_macro score: 0.7720019640686806
recall_micro score: 0.7503481329933026
recall_macro score: 0.7503469868503412



Karena Borderline SMOTE oversampling menghasilkan nilai terbaik maka akan digunakan

In [59]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(knn, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7503481329933026
f1_macro score: 0.7293807708222901
precision_micro score: 0.7503481329933026
precision_macro score: 0.7720019640686806
recall_micro score: 0.7503481329933026
recall_macro score: 0.7503469868503412


In [60]:
X_train, X_test, y_train, y_test = train_test_split(X_borderSmote, y_borderSmote, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [61]:
tuned_params = [{'n_neighbors': [2,3,4,5,6,7,8,9,10],'metric': ['euclidean', 'manhattan', 'jaccard']}]

clf_knn = GridSearchCV(knn, tuned_params, cv=cv, verbose = 2)
clf_knn.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.1s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.1s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.1s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=4; total time=   0.0s
[CV] END ...................

Traceback (most recent call last):
  File "c:\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\base.py", line 706, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\neighbors\_classification.py", line 254, in predict
    probabilities = self.predict_proba(X)
                    ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\neighbors\_classification.py", line 332, in predict_proba
    probabilities = ArgKminClassMode.compute(
                    ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python

[CV] END ....................metric=manhattan, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=4; total time=   0.0s
[CV] END ...................

Traceback (most recent call last):
  File "c:\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\base.py", line 706, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\neighbors\_classification.py", line 254, in predict
    probabilities = self.predict_proba(X)
                    ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\neighbors\_classification.py", line 332, in predict_proba
    probabilities = ArgKminClassMode.compute(
                    ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python

[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=5; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=6; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=6; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=6; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=6; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=6; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=7; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=7; total time=   0.0s
[CV] END ...................

Traceback (most recent call last):
  File "c:\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\base.py", line 706, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\neighbors\_classification.py", line 254, in predict
    probabilities = self.predict_proba(X)
                    ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python311\Lib\site-packages\sklearn\neighbors\_classification.py", line 332, in predict_proba
    probabilities = ArgKminClassMode.compute(
                    ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python

[CV] END ....................metric=manhattan, n_neighbors=8; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=8; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=9; total time=   0.0s
[CV] END ...................metric=manhattan, n_neighbors=10; total time=   0.0s
[CV] END ...................metric=manhattan, n_neighbors=10; total time=   0.0s
[CV] END ...................metric=manhattan, n_neighbors=10; total time=   0.0s
[CV] END ...................metric=manhattan, n_neighbors=10; total time=   0.0s
[CV] END ...................metric=manhattan, n_neighbors=10; total time=   0.0s
[CV] END ...................



[CV] END ......................metric=jaccard, n_neighbors=2; total time=   5.2s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   5.5s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   5.5s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   5.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   5.8s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   5.3s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   5.3s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   4.9s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   5.6s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   5.6s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   5.3s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   5.6s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   5.7s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   5.5s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   5.9s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   5.7s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   5.7s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   5.6s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   5.8s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   5.8s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   5.3s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   5.7s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   5.7s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   5.5s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   5.5s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   5.8s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   5.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   5.6s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   5.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   5.6s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   5.6s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   5.8s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   5.1s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   5.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   5.6s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   5.3s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   5.7s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   6.0s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   5.7s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   5.5s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   5.7s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   5.6s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   5.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   5.5s


 0.70448113 0.70932146 0.69914407        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.72136044 0.74829507 0.73600595 0.73973065 0.72756668 0.73439343
 0.72396667 0.7235953  0.71503128]


In [62]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_knn.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'metric': 'euclidean', 'n_neighbors': 3}


In [63]:
best_hypeparam_knn = KNeighborsClassifier(**clf_knn.best_params_)
best_hypeparam_knn.fit(X_train, y_train)

In [64]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_knn, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.757229761888259
f1_macro score: 0.738759865374033
precision_micro score: 0.757229761888259
precision_macro score: 0.7758261427626215
recall_micro score: 0.757229761888259
recall_macro score: 0.7563468643287601


In [65]:
prediction_knn = best_hypeparam_knn.predict(X_test)
classification_metrics(prediction_knn, y_test)

Accuracy: 0.7676842889054356
F1 Score: 0.7535829095526632
Recall Score: 0.7702716766513079
Precision Score: 0.7882238200472841


# Naive Bayes

# Neural Network (MLP)

In [66]:
mlp = MLPClassifier()
mlp

In [67]:
evaluate_classification(mlp)

ADASYN OverSampling


f1_micro score: 0.7114655411474828
f1_macro score: 0.6985929436926878
precision_micro score: 0.7314920692206682
precision_macro score: 0.7238296523504985
recall_micro score: 0.7267977325087429
recall_macro score: 0.7167432742827116

Random OverSampling
f1_micro score: 0.7017613753051309
f1_macro score: 0.722628285619605
precision_micro score: 0.7036244825664875
precision_macro score: 0.7160797405719476
recall_micro score: 0.7271710977259355
recall_macro score: 0.7181494121222038

SMOTE OverSampling
f1_micro score: 0.7158169795314264
f1_macro score: 0.6944347363112696
precision_micro score: 0.7187897363374589
precision_macro score: 0.7222313922964211
recall_micro score: 0.7270749116344444
recall_macro score: 0.68505748917614

Borderline SMOTE OverSampling
f1_micro score: 0.7232616963587065
f1_macro score: 0.7259535147178219
precision_micro score: 0.7252212930010321
precision_macro score: 0.742285213212533
recall_micro score: 0.731358745525397
recall_macro score: 0.7407488786558936

SVM 

Karena Borderline SMOTE Oversampling menghasilkan nilai terbaik maka akan digunakan

In [92]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(mlp, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7424347905959465
f1_macro score: 0.7203365151168023
precision_micro score: 0.7489539979185676
precision_macro score: 0.7506237500385182
recall_micro score: 0.7432739059265364
recall_macro score: 0.7254987286671913


In [94]:
X_train, X_test, y_train, y_test = train_test_split(X_borderSmote, y_borderSmote, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [95]:
param_grid = {
    'hidden_layer_sizes': [(100,), (50, 50), (30, 20, 10)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [10000],
    'random_state': [42]
}

tuned_mlp = GridSearchCV(mlp, param_grid, cv=cv)
tuned_mlp.fit(X_train, y_train)

In [71]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(tuned_mlp.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'activation': 'logistic', 'alpha': 1e-05, 'hidden_layer_sizes': 10, 'max_iter': 1000, 'random_state': 42, 'solver': 'adam'}


In [72]:
best_hypeparam_mlp = MLPClassifier(**tuned_mlp.best_params_)
best_hypeparam_mlp.fit(X_train, y_train)

In [73]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_mlp, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.676993063883709
f1_macro score: 0.7118221371248534
precision_micro score: 0.676993063883709
precision_macro score: 0.7119523723308918
recall_micro score: 0.676993063883709
recall_macro score: 0.7161506430083246


In [74]:
prediction_mlp = best_hypeparam_mlp.predict(X_test)
classification_metrics(prediction_mlp, y_test)

Accuracy: 0.6658428077113199
F1 Score: 0.7016426666936328
Recall Score: 0.7031492494782547
Precision Score: 0.7056713141602785


# Kesimpulan

Dari semua model classification, untuk kobinasi pre processing ini yang menghasilkan hasil paling bagus adalah Random Forest dengan data yang di Random Oversampling