In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [2]:
df = pd.read_csv('dataframe/UFC_kombinasi5.csv')
df.head()

Unnamed: 0,R_avg_opp_DISTANCE_landed,R_age,R_avg_opp_SIG_STR_pct,B_avg_DISTANCE_landed,R_avg_opp_CLINCH_att,B_avg_opp_LEG_landed,B_avg_BODY_landed,B_avg_opp_BODY_landed,B_avg_GROUND_att,R_avg_opp_HEAD_landed,...,R_avg_GROUND_landed,R_avg_opp_BODY_att,R_avg_CLINCH_att,R_avg_HEAD_landed,B_avg_CLINCH_att,B_age,R_avg_TD_landed,B_avg_BODY_att,R_avg_TOTAL_STR_landed,Winner
0,31.0,21.0,0.37,19.0,7.5,3.4375,3.5,8.433838,0.0,19.827877,...,0.455658,13.5,3.621492,40.0,12.0,26.0,1.0,6.25,61.234375,Red
1,38.40625,30.0,0.339062,46.8125,4.5625,2.09375,7.765625,2.070312,4.757812,27.65625,...,7.25,17.59375,14.28125,37.78125,5.671875,27.0,1.875,13.625,87.59375,Blue
2,15.0,31.0,0.373164,17.0,14.43969,0.25,5.25,2.125,22.75,15.5625,...,0.0,6.0,11.586592,21.25,7.0,28.0,1.625,7.375,36.0,Blue
3,33.409025,34.0,0.325567,8.875,6.983452,1.710938,4.546875,4.453125,1.648438,27.688076,...,4.844223,8.554874,8.778824,19.695154,11.867188,28.0,1.440809,5.429688,41.379938,Blue
4,43.398438,31.0,0.42,20.501221,23.234375,6.250977,5.452637,9.236328,7.906982,35.625,...,0.828125,24.078125,15.484375,15.8125,9.979004,34.0,1.171875,8.149902,72.429688,Red


In [3]:
X = df.drop('Winner', axis=1).values
X

array([[31.        , 21.        ,  0.37      , ...,  1.        ,
         6.25      , 61.234375  ],
       [38.40625   , 30.        ,  0.3390625 , ...,  1.875     ,
        13.625     , 87.59375   ],
       [15.        , 31.        ,  0.37316406, ...,  1.625     ,
         7.375     , 36.        ],
       ...,
       [16.        , 30.        ,  0.42      , ...,  1.78125   ,
         7.75      , 68.        ],
       [21.4609375 , 34.        ,  0.35289063, ...,  4.34375   ,
        15.3125    , 57.3046875 ],
       [12.22265625, 31.        ,  0.305     , ...,  0.53125   ,
         2.        , 40.7890625 ]])

In [4]:
y = df['Winner'].values
y

array(['Red', 'Blue', 'Blue', ..., 'Red', 'Blue', 'Red'], dtype=object)

In [5]:
counter_y = Counter(y)
print(counter_y)

Counter({'Red': 3581, 'Blue': 1730, 'Draw': 99})


In [5]:
# Over Sampling
adasyn = ADASYN()
randomOver = RandomOverSampler()
smote = SMOTE()
borderSmote = BorderlineSMOTE()
svmSmote = SVMSMOTE()

# Melakukan resampling
X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
X_randomOver, y_randomOver = randomOver.fit_resample(X, y)
X_smote, y_smote = smote.fit_resample(X, y)
X_borderSmote, y_borderSmote = borderSmote.fit_resample(X, y)
X_svmSmote, y_svmSmote = svmSmote.fit_resample(X, y)


# Under Sampling
rand_under = RandomUnderSampler(sampling_strategy='majority')
nearmiss = NearMiss()
nearmiss2 = NearMiss(version=2)
nearmiss3 = NearMiss(version=3)
tomek = TomekLinks()

# Melakukan resampling
X_rand_under, y_rand_under = rand_under.fit_resample(X, y)
X_nearmiss, y_nearmiss = nearmiss.fit_resample(X, y)
X_nearmiss2, y_nearmiss2 = nearmiss2.fit_resample(X, y)
X_nearmiss3, y_nearmiss3 = nearmiss3.fit_resample(X, y)
X_tomek, y_tomek = tomek.fit_resample(X, y)


In [6]:
def evaluate_classification(classifier):
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

  scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
  print("ADASYN OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_adasyn, y_adasyn, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Random OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_smote, y_smote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Borderline SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("SVM SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_svmSmote, y_svmSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Random Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_rand_under, y_rand_under , scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 1 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss, y_nearmiss, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 2 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss2, y_nearmiss2, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 3 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss3, y_nearmiss3, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Tomek Links Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_tomek, y_tomek, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

# Decision Tree

In [8]:
dt = DecisionTreeClassifier()
dt

In [9]:
evaluate_classification(dt)

ADASYN OverSampling
f1_micro score: 0.6948476048196209
f1_macro score: 0.6954048637470457
precision_micro score: 0.6981848437937048
precision_macro score: 0.6924732233168966
recall_micro score: 0.6992044351579851
recall_macro score: 0.6981131161662675

Random OverSampling
f1_micro score: 0.8308672432632909
f1_macro score: 0.8299394393998796
precision_micro score: 0.8342175521419601
precision_macro score: 0.8362050407370785
recall_micro score: 0.83598633637576
recall_macro score: 0.8340312028444611

SMOTE OverSampling
f1_micro score: 0.6864951261380938
f1_macro score: 0.6844464855559377
precision_micro score: 0.6916146091941772
precision_macro score: 0.6900479564026332
recall_micro score: 0.6894729955381785
recall_macro score: 0.6905853844775328

Borderline SMOTE OverSampling
f1_micro score: 0.7521165272834882
f1_macro score: 0.7525363712157828
precision_micro score: 0.753420845345763
precision_macro score: 0.7501261497970129
recall_micro score: 0.7539788113305482
recall_macro score: 0.

Karena Random Oversampling menghasilkan score terbaik maka akan digunakan Random Oversampling

In [10]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(dt, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8328212507138133
f1_macro score: 0.8346783786006471
precision_micro score: 0.8327290507126003
precision_macro score: 0.8356949459540125
recall_micro score: 0.8336594561759704
recall_macro score: 0.834031852146202


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_randomOver, y_randomOver, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [12]:
param_grid_c = {'criterion': ['gini', 'entropy'],
               'min_samples_split': [2, 5, 10],
               'max_depth' : [None, 10, 20, 30],
               'min_samples_leaf': [1, 2, 4]
               }

clf_dtc = GridSearchCV(estimator= DecisionTreeClassifier(), param_grid=param_grid_c, cv= 5)
clf_dtc.fit(X_train, y_train)

In [13]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_dtc.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [14]:
best_hypeparam_dtc = DecisionTreeClassifier(**clf_dtc.best_params_)
best_hypeparam_dtc.fit(X_train, y_train)

In [15]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_dtc, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8007960162222194
f1_macro score: 0.8007689907347034
precision_micro score: 0.8014169797283872
precision_macro score: 0.8052796539948337
recall_micro score: 0.805015841770212
recall_macro score: 0.8030546880189551


In [7]:
def classification_metrics(prediction, y_test):
  accuracy = accuracy_score(y_test, prediction)
  f1 = f1_score(y_test, prediction, average="macro")
  recall = recall_score(y_test, prediction, average="macro")
  precision = precision_score(y_test, prediction, average="macro")

  print('Accuracy: ' + str(accuracy))
  print('F1 Score: ' + str(f1))
  print('Recall Score: ' + str(recall))
  print('Precision Score: ' + str(precision))

In [17]:
prediction = best_hypeparam_dtc.predict(X_test)
classification_metrics(prediction, y_test)

Accuracy: 0.815711094564408
F1 Score: 0.814152097147263
Recall Score: 0.8173594533179108
Precision Score: 0.8172640164329823


# Random Forest

In [18]:
rf = RandomForestClassifier()

In [19]:
evaluate_classification(rf)

ADASYN OverSampling
f1_micro score: 0.8170197426058143
f1_macro score: 0.8138105889816396
precision_micro score: 0.8182245618829226
precision_macro score: 0.8209661509228724
recall_micro score: 0.8193357574945119
recall_macro score: 0.8173911394295834

Random OverSampling
f1_micro score: 0.8788030117511674
f1_macro score: 0.8806436353304699
precision_micro score: 0.8773145103218075
precision_macro score: 0.8786151918741709
recall_micro score: 0.8794557773612602
recall_macro score: 0.8812227130943382

SMOTE OverSampling
f1_micro score: 0.8210007599567769
f1_macro score: 0.8206326094891108
precision_micro score: 0.8230487004912422
precision_macro score: 0.8249702430346251
recall_micro score: 0.8180220673423957
recall_macro score: 0.8227695835898075

Borderline SMOTE OverSampling
f1_micro score: 0.8277944009296256
f1_macro score: 0.8296160452222134
precision_micro score: 0.8334737130344285
precision_macro score: 0.8307904398004462
recall_micro score: 0.8292845487875786
recall_macro score:

Karena score nya paling tinggi maka akan digunakan Random Oversampling

In [20]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(rf, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8767555044873845
f1_macro score: 0.8784798483852164
precision_micro score: 0.881689374383131
precision_macro score: 0.8796432466634583
recall_micro score: 0.8788968148539054
recall_macro score: 0.8798259351892975


## Hyperparameter Tuning

In [21]:
param_grid_c = {'criterion': ['gini', 'entropy'],
               'min_samples_split': [2, 5, 10],
               'max_depth' : [None, 10, 20, 30],
               'min_samples_leaf': [1, 2, 4]
               }

clf_rfc = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid_c, cv= 5)
clf_rfc.fit(X_train, y_train)

In [22]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_rfc.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5}


In [23]:
best_hypeparam_rfc = RandomForestClassifier(**clf_rfc.best_params_)
best_hypeparam_rfc.fit(X_train, y_train)

In [24]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_rfc, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8483305685324067
f1_macro score: 0.8506974510096198
precision_micro score: 0.8453528240246568
precision_macro score: 0.8421465691388242
recall_micro score: 0.8462213103770141
recall_macro score: 0.8448465185153774


In [25]:
prediction_rf = best_hypeparam_rfc.predict(X_test)
classification_metrics(prediction_rf, y_test)

Accuracy: 0.8637379002233805
F1 Score: 0.8645872030090107
Recall Score: 0.8646896200953248
Precision Score: 0.8645255783119815


# Logistic Regression

In [26]:
logistic = LogisticRegression(max_iter=2000)
logistic

In [27]:
evaluate_classification(logistic)

ADASYN OverSampling
f1_micro score: 0.44150884054029316
f1_macro score: 0.4379636112602743
precision_micro score: 0.44150884054029316
precision_macro score: 0.4388835907462026
recall_micro score: 0.44150884054029316
recall_macro score: 0.4413994538931245

Random OverSampling
f1_micro score: 0.44103086360378957
f1_macro score: 0.43806559371331594
precision_micro score: 0.44103086360378957
precision_macro score: 0.43824090919670367
recall_micro score: 0.44103086360378957
recall_macro score: 0.4410331429580629

SMOTE OverSampling
f1_micro score: 0.44317399370717664
f1_macro score: 0.4410358742632764
precision_micro score: 0.44317399370717664
precision_macro score: 0.4411661840696756
recall_micro score: 0.44317399370717664
recall_macro score: 0.44317311163574685

Borderline SMOTE OverSampling
f1_micro score: 0.5610160804081062
f1_macro score: 0.549766344554494
precision_micro score: 0.5610160804081062
precision_macro score: 0.5538113375592868
recall_micro score: 0.5610160804081062
recall_m

Karena Nearmiss 2 undersampling menghasilkan nilai terbaik maka akan digunakan

In [28]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(logistic, X_nearmiss2, y_nearmiss2, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.6667231638418079
f1_macro score: 0.6707289480403491
precision_micro score: 0.6667231638418079
precision_macro score: 0.6835299561194701
recall_micro score: 0.6667231638418079
recall_macro score: 0.6668421052631579


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_nearmiss2, y_nearmiss2, test_size=0.25, random_state=42)

In [30]:
logistic.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(logistic, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.693030303030303
f1_macro score: 0.6915311843964596
precision_micro score: 0.693030303030303
precision_macro score: 0.7139263124789441
recall_micro score: 0.693030303030303
recall_macro score: 0.6926190476190476


In [32]:
prediction_logreg = logistic.predict(X_test)
classification_metrics(prediction_logreg, y_test)

Accuracy: 0.5866666666666667
F1 Score: 0.5992869875222816
Recall Score: 0.5942028985507246
Precision Score: 0.6067019400352733


# Softmax Regression

In [33]:
softmax = LogisticRegression(max_iter=2000, multi_class='multinomial')
softmax

In [34]:
evaluate_classification(softmax)

ADASYN OverSampling
f1_micro score: 0.44150884054029316
f1_macro score: 0.4379636112602743
precision_micro score: 0.44150884054029316
precision_macro score: 0.4388835907462026
recall_micro score: 0.44150884054029316
recall_macro score: 0.4413994538931245

Random OverSampling
f1_micro score: 0.44103086360378957
f1_macro score: 0.43806559371331594
precision_micro score: 0.44103086360378957
precision_macro score: 0.43824090919670367
recall_micro score: 0.44103086360378957
recall_macro score: 0.4410331429580629

SMOTE OverSampling
f1_micro score: 0.44317399370717664
f1_macro score: 0.4410358742632764
precision_micro score: 0.44317399370717664
precision_macro score: 0.4411661840696756
recall_micro score: 0.44317399370717664
recall_macro score: 0.44317311163574685

Borderline SMOTE OverSampling
f1_micro score: 0.5610160804081062
f1_macro score: 0.549766344554494
precision_micro score: 0.5610160804081062
precision_macro score: 0.5538113375592868
recall_micro score: 0.5610160804081062
recall_m

Karena Nearmiss 2 undersampling menghasilkan nilai terbaik maka akan digunakan

In [35]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(softmax, X_nearmiss2, y_nearmiss2, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.6667231638418079
f1_macro score: 0.6707289480403491
precision_micro score: 0.6667231638418079
precision_macro score: 0.6835299561194701
recall_micro score: 0.6667231638418079
recall_macro score: 0.6668421052631579


In [36]:
softmax.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
prediction_softmax = softmax.predict(X_test)
classification_metrics(prediction_softmax, y_test)

Accuracy: 0.5866666666666667
F1 Score: 0.5992869875222816
Recall Score: 0.5942028985507246
Precision Score: 0.6067019400352733


# KNN

In [38]:
knn = KNeighborsClassifier()
knn

In [39]:
evaluate_classification(knn)

ADASYN OverSampling
f1_micro score: 0.7027245388959282
f1_macro score: 0.6747574201989857
precision_micro score: 0.7027245388959281
precision_macro score: 0.7067632405633725
recall_micro score: 0.7027245388959281
recall_macro score: 0.7015732770215718

Random OverSampling
f1_micro score: 0.7239132488108886
f1_macro score: 0.7160447806430184
precision_micro score: 0.7239132488108886
precision_macro score: 0.7154948913713597
recall_micro score: 0.7239132488108886
recall_macro score: 0.7239131338158944

SMOTE OverSampling
f1_micro score: 0.709206438748957
f1_macro score: 0.6836943537946286
precision_micro score: 0.709206438748957
precision_macro score: 0.7129270059464068
recall_micro score: 0.709206438748957
recall_macro score: 0.7092074882671175

Borderline SMOTE OverSampling
f1_micro score: 0.7502554130672705
f1_macro score: 0.7378912811918127
precision_micro score: 0.7502554130672705
precision_macro score: 0.7618493325896653
recall_micro score: 0.7502554130672705
recall_macro score: 0.

Karena Borderline SMOTE oversampling menghasilkan nilai terbaik maka akan digunakan

In [40]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(knn, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7502554130672705
f1_macro score: 0.7378912811918127
precision_micro score: 0.7502554130672705
precision_macro score: 0.7618493325896653
recall_micro score: 0.7502554130672705
recall_macro score: 0.7502560846066141


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_borderSmote, y_borderSmote, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [42]:
tuned_params = [{'n_neighbors': [2,3,4,5,6,7,8,9,10],'metric': ['euclidean', 'manhattan', 'jaccard']}]

clf_knn = GridSearchCV(knn, tuned_params, cv=cv, verbose = 2)
clf_knn.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=euclidean, 

Traceback (most recent call last):
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 706, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\neighbors\_classification.py", line 254, in predict
    probabilities = self.predict_proba(X)
                    ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\neighbo

[CV] END ....................metric=manhattan, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=manhattan, n_neighbors=4; total time=   0.0s
[CV] END ...................

Traceback (most recent call last):
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 706, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\neighbors\_classification.py", line 254, in predict
    probabilities = self.predict_proba(X)
                    ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\neighbo

[CV] END ...................metric=manhattan, n_neighbors=10; total time=   0.0s
[CV] END ...................metric=manhattan, n_neighbors=10; total time=   0.0s
[CV] END ......................metric=jaccard, n_neighbors=2; total time=   0.5s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=2; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=7; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=8; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END ......................metric=jaccard, n_neighbors=9; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s




[CV] END .....................metric=jaccard, n_neighbors=10; total time=   0.4s


 0.71366636 0.71180447 0.70373741        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.37855346 0.4001491  0.41690518 0.42000615 0.38934181 0.41416887
 0.40709406 0.41168587 0.38115969]


In [43]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_knn.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'metric': 'euclidean', 'n_neighbors': 3}


In [44]:
best_hypeparam_knn = KNeighborsClassifier(**clf_knn.best_params_)
best_hypeparam_knn.fit(X_train, y_train)

In [45]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_knn, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7516439398490219
f1_macro score: 0.7407029043437638
precision_micro score: 0.7516439398490219
precision_macro score: 0.7600064448561674
recall_micro score: 0.7516439398490219
recall_macro score: 0.7508280190725982


In [46]:
prediction_knn = best_hypeparam_knn.predict(X_test)
classification_metrics(prediction_knn, y_test)

Accuracy: 0.7568875651526433
F1 Score: 0.7478868615710721
Recall Score: 0.7593701600261925
Precision Score: 0.7710574040792736


# Naive Bayes

In [8]:
gaussNB = GaussianNB()

In [9]:
evaluate_classification(gaussNB)

ADASYN OverSampling
f1_micro score: 0.43733534759558185
f1_macro score: 0.4173087505455319
precision_micro score: 0.43733534759558185
precision_macro score: 0.4459452903294907
recall_micro score: 0.43733534759558185
recall_macro score: 0.4364360316472533

Random OverSampling
f1_micro score: 0.41319928804961464
f1_macro score: 0.4000780750380623
precision_micro score: 0.41319928804961464
precision_macro score: 0.41342026481985145
recall_micro score: 0.41319928804961464
recall_macro score: 0.4131960190011662

SMOTE OverSampling
f1_micro score: 0.4486642698132517
f1_macro score: 0.4324565589795911
precision_micro score: 0.4486642698132517
precision_macro score: 0.46066041778741446
recall_micro score: 0.4486642698132517
recall_macro score: 0.44865880232398075

Borderline SMOTE OverSampling
f1_micro score: 0.5418409497986592
f1_macro score: 0.5246216494481313
precision_micro score: 0.5418409497986592
precision_macro score: 0.5461309198872608
recall_micro score: 0.5418409497986592
recall_mac

Karena menghasilkan nilai paling baik maka Nearmiss 2 Undersampling akan digunakan

In [10]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(gaussNB, X_nearmiss2, y_nearmiss2, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7105084745762712
f1_macro score: 0.7061918433255358
precision_micro score: 0.7105084745762712
precision_macro score: 0.7124465554702912
recall_micro score: 0.7105084745762712
recall_macro score: 0.7110526315789474


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_nearmiss2, y_nearmiss2, test_size=0.25, random_state=42)

In [12]:
gaussNB.fit(X_train, y_train)

In [13]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(gaussNB, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7614141414141414
f1_macro score: 0.7572507030864466
precision_micro score: 0.7614141414141414
precision_macro score: 0.762990413725708
recall_micro score: 0.7614141414141414
recall_macro score: 0.7590079365079365


In [14]:
prediction_nb= gaussNB.predict(X_test)
classification_metrics(prediction_nb, y_test)

Accuracy: 0.6266666666666667
F1 Score: 0.6224567346318373
Recall Score: 0.6331090407177363
Precision Score: 0.6215538847117794


# Neural Network (MLP)

In [47]:
mlp = MLPClassifier()
mlp

In [48]:
evaluate_classification(mlp)

ADASYN OverSampling
f1_micro score: 0.6987378206524612
f1_macro score: 0.6778408627190603
precision_micro score: 0.7049500666622841
precision_macro score: 0.7086812205075079
recall_micro score: 0.7038374101949684
recall_macro score: 0.6920461740776032

Random OverSampling
f1_micro score: 0.7299633539656833
f1_macro score: 0.7100576526633232
precision_micro score: 0.7094852917601449
precision_macro score: 0.7222602337843136
recall_micro score: 0.7183289096396661
recall_macro score: 0.7162035846650513

SMOTE OverSampling
f1_micro score: 0.7184240992085879
f1_macro score: 0.699603897532815
precision_micro score: 0.7283807894711758
precision_macro score: 0.7156963850087695
recall_micro score: 0.7275457468850004
recall_macro score: 0.713757275426007

Borderline SMOTE OverSampling
f1_micro score: 0.735177073395187
f1_macro score: 0.7264958012019924
precision_micro score: 0.7188872222409973
precision_macro score: 0.7404945986853655
recall_micro score: 0.7384345323666197
recall_macro score: 0.

Karena SVM SMOTE Oversampling menghasilkan nilai terbaik maka akan digunakan

In [49]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(mlp, X_svmSmote, y_svmSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7141945138950939
f1_macro score: 0.7283947072543453
precision_micro score: 0.7065644772712328
precision_macro score: 0.7489291061227831
recall_micro score: 0.7012626437178866
recall_macro score: 0.7343986225004693


In [50]:
X_train, X_test, y_train, y_test = train_test_split(X_svmSmote, y_svmSmote, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [51]:
param_grid = {
    'hidden_layer_sizes': [(5), (10), (5, 10)],
    'alpha': [1e-5],
    'activation': ['identity', 'logistic', 'relu', 'tanh'],
    'solver': ['sgd', 'adam'],
    'max_iter': [1000],
    'random_state': [42]
}

tuned_mlp = GridSearchCV(mlp, param_grid, cv=cv)
tuned_mlp.fit(X_train, y_train)

In [52]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(tuned_mlp.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'activation': 'tanh', 'alpha': 1e-05, 'hidden_layer_sizes': 10, 'max_iter': 1000, 'random_state': 42, 'solver': 'adam'}


In [53]:
best_hypeparam_mlp = MLPClassifier(**tuned_mlp.best_params_)
best_hypeparam_mlp.fit(X_train, y_train)

In [54]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_mlp, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.657759207520954
f1_macro score: 0.6834676041416483
precision_micro score: 0.657759207520954
precision_macro score: 0.6870695340047452
recall_micro score: 0.657759207520954
recall_macro score: 0.6884414394033271


In [55]:
prediction_mlp = best_hypeparam_mlp.predict(X_test)
classification_metrics(prediction_mlp, y_test)

Accuracy: 0.6697753285290378
F1 Score: 0.6900808270486151
Recall Score: 0.6993403621688161
Precision Score: 0.6902213664342799


# Kesimpulan

Dari semua model classification, untuk kobinasi pre processing ini yang menghasilkan hasil paling bagus adalah Random Forest dengan data yang di Random Oversampling