# **Instalasi Cardea**

In [1]:
#! pip install cardea==0.1.2
#! pip install 'urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1' # Dijadikan komentar karena keberadaannya terdapat pada baris code asli milik Author, namun tidak memengaruhi baris code dari hasil modifikasi

# **Import Library** 

In [2]:
# Library
import pandas as pd
import numpy as np
import sklearn
# Cardea
from cardea import Cardea

# Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier

# Grid Searh & Random Search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.21.3.


In [3]:
# optional
import warnings
warnings.filterwarnings("ignore")

In [4]:
cd = Cardea()

In [5]:
#! curl -O https://dai-cardea.s3.amazonaws.com/kaggle.zip && unzip kaggle.zip

In [6]:
cd.load_entityset(data='kaggle')

# to view the loaded entityset
cd.es

Entityset: fhir
  Entities:
    Observation [Rows: 110527, Columns: 3]
    Appointment [Rows: 110527, Columns: 5]
    Coding [Rows: 3, Columns: 2]
    Reference [Rows: 6100, Columns: 1]
    Address [Rows: 81, Columns: 2]
    Appointment_Participant [Rows: 6100, Columns: 2]
    CodeableConcept [Rows: 4, Columns: 2]
    Patient [Rows: 6100, Columns: 4]
    Identifier [Rows: 227151, Columns: 1]
  Relationships:
    Observation.code -> CodeableConcept.object_id
    Observation.subject -> Reference.identifier
    Appointment.participant -> Appointment_Participant.object_id
    Appointment_Participant.actor -> Reference.identifier
    CodeableConcept.coding -> Coding.object_id
    Patient.address -> Address.object_id

In [7]:
cd.list_problems()

{'DiagnosisPrediction',
 'LengthOfStay',
 'MissedAppointment',
 'MortalityPrediction',
 'ProlongedLengthOfStay',
 'Readmission'}

In [8]:
# select problem
label_times = cd.select_problem('MissedAppointment')

In [9]:
# feature engineering
feature_matrix = cd.generate_features(label_times[:1000]) # takes a while for the full dataset
feature_matrix.head(5)

Built 13 features
Elapsed: 00:30 | Progress: 100%|██████████


Unnamed: 0,participant = 2680425062,participant = 4275143764,participant = 2615334244,participant = 2410824900,participant = 2406221984,participant = 1868414665,participant = 1692482157,participant = 1125465544,participant = 846537388,participant = 4121228070,participant is unknown,DAY(created) = 29,DAY(created) = 28,DAY(created) = 27,DAY(created) = 18,DAY(created) = 15,DAY(created) = 26,DAY(created) = 25,DAY(created) = 5,DAY(created) = 1,DAY(created) = 8,DAY(created) is unknown,DAY(start) = 29,DAY(start) is unknown,IS_WEEKEND(created),IS_WEEKEND(start),MONTH(created) = 4,MONTH(created) = 3,MONTH(created) = 2,MONTH(created) = 1,MONTH(created) is unknown,MONTH(start) = 4,MONTH(start) is unknown,WEEKDAY(created) = 4,WEEKDAY(created) = 2,WEEKDAY(created) = 1,WEEKDAY(created) = 3,WEEKDAY(created) = 0,WEEKDAY(created) is unknown,WEEKDAY(start) = 4,WEEKDAY(start) is unknown,YEAR(created) = 2016,YEAR(created) is unknown,YEAR(start) = 2016,YEAR(start) is unknown,Appointment_Participant.actor = 74200000000000,Appointment_Participant.actor = 713000000000000,Appointment_Participant.actor = 41400000000000,Appointment_Participant.actor = 28200000000000,Appointment_Participant.actor = 7270000000000,Appointment_Participant.actor = 7230000000000,Appointment_Participant.actor = 4920000000000,Appointment_Participant.actor = 3880000000000,Appointment_Participant.actor = 2760000000000,Appointment_Participant.actor = 735000000000000,Appointment_Participant.actor is unknown,Appointment_Participant.COUNT(Appointment),label
0,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,39,noshow
1,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,27,noshow
2,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,55,noshow
3,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,39,noshow
4,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,28,noshow


In [10]:
from sklearn import preprocessing
from collections import Counter
# shuffle the dataframe
feature_matrix = feature_matrix.sample(frac=1)

label = list(feature_matrix.pop('label'))
target = preprocessing.LabelEncoder()
target.fit(label) 
y= target.transform(label) #representing class dengan sklearn

# pop the target labels

X = feature_matrix.values

X_train, X_test, y_train, y_test = cd.train_test_split(X, y, test_size=0.5, shuffle=True)



print('Resampled dataset shape %s' % Counter(y_train))

Resampled dataset shape Counter({1: 425, 0: 75})


In [11]:
gnb = GaussianNB().fit(X_train, y_train)
y_test_pred = gnb.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.20      0.98      0.33        96
           1       0.89      0.04      0.08       404

    accuracy                           0.22       500
   macro avg       0.54      0.51      0.20       500
weighted avg       0.76      0.22      0.12       500



# **Pengujian Pipeline**

##**Modeling**

In [12]:
cd.select_pipeline('Random Forest')
cd.fit(X_train, y_train)
y_pred = cd.predict(X_test) # Dijadikan komentar karena keberadaannya terdapat pada baris code asli milik Author, namun tidak memengaruhi baris code dari hasil modifikasi

In [13]:
cd.evaluate(X_test, y_test, test_size=0.2, shuffle=True)

{'Accuracy': 0.7, 'Confusion Matrix': array([[ 3, 16],
        [14, 67]]), 'F1 Macro': 0.49186991869918706, 'Precision': 0.4918497519489724, 'Recall': 0.49252761533463285}

## **Modeling 1**

In [14]:
cd.select_pipeline('Logistic Regression')
cd.fit(X_train, y_train)
# y_pred = cd.predict(X_test) # Dijadikan komentar karena keberadaannya terdapat pada baris code asli milik Author, namun tidak memengaruhi baris code dari hasil modifikasi

In [15]:
cd.evaluate(X_test, y_test, test_size=0.2, shuffle=True)

{'Accuracy': 0.9, 'Confusion Matrix': array([[ 0, 10],
        [ 0, 90]]), 'F1 Macro': 0.4736842105263158, 'Precision': 0.45, 'Recall': 0.5}

## **Modeling 2**

In [16]:
cd.select_pipeline('Gaussian Naive Bayes')
cd.fit(X_train, y_train)
y_pred = cd.predict(X_test) # Dijadikan komentar karena keberadaannya terdapat pada baris code asli milik Author, namun tidak memengaruhi baris code dari hasil modifikasi

In [17]:
cd.evaluate(y_test, y_pred)

{'Accuracy': 0.98, 'Confusion Matrix': array([[98,  0],
        [ 2,  0]]), 'F1 Macro': 0.494949494949495, 'Precision': 0.49, 'Recall': 0.5}

## **Modeling 3**

In [18]:
cd.select_pipeline('Gradient Boosting')
cd.fit(X_train, y_train)
# y_pred = cd.predict(X_test) # Dijadikan komentar karena keberadaannya terdapat pada baris code asli milik Author, namun tidak memengaruhi baris code dari hasil modifikasi

In [19]:
cd.evaluate(X_test, y_test, test_size=0.2, shuffle=True)

{'Accuracy': 0.79, 'Confusion Matrix': array([[ 0, 21],
        [ 0, 79]]), 'F1 Macro': 0.44134078212290506, 'Precision': 0.395, 'Recall': 0.5}

## **Modeling 4**

In [20]:
cd.select_pipeline('K-Nearest Neightbors')
cd.fit(X_train, y_train)
# y_pred = cd.predict(X_test) # Dijadikan komentar karena keberadaannya terdapat pada baris code asli milik Author, namun tidak memengaruhi baris code dari hasil modifikasi

In [21]:
cd.evaluate(X_test, y_test, test_size=0.2, shuffle=True)

{'Accuracy': 0.81, 'Confusion Matrix': array([[ 2, 16],
        [ 3, 79]]), 'F1 Macro': 0.5332842053549497, 'Precision': 0.6157894736842106, 'Recall': 0.5372628726287263}

## **Modeling 5**

In [22]:
cd.select_pipeline('Multinomial Naive Bayes')
cd.fit(X_train, y_train)
# y_pred = cd.predict(X_test) # Dijadikan komentar karena keberadaannya terdapat pada baris code asli milik Author, namun tidak memengaruhi baris code dari hasil modifikasi

In [23]:
cd.evaluate(X_test, y_test, test_size=0.2, shuffle=True)

{'Accuracy': 0.69, 'Confusion Matrix': array([[ 4, 14],
        [17, 65]]), 'F1 Macro': 0.5062908106386368, 'Precision': 0.5066305003013865, 'Recall': 0.5074525745257452}

## **Modeling 6**

In [24]:
# modeling6
cd.select_pipeline('Stochastic Gradient Descent')
cd.fit(X_train, y_train)
# y_pred = cd.predict(X_test) # Dijadikan komentar karena keberadaannya terdapat pada baris code asli milik Author, namun tidak memengaruhi baris code dari hasil modifikasi

In [25]:
cd.evaluate(X_test, y_test, test_size=0.2, shuffle=True)

{'Accuracy': 0.72, 'Confusion Matrix': array([[ 4, 14],
        [14, 68]]), 'F1 Macro': 0.5257452574525745, 'Precision': 0.5257452574525745, 'Recall': 0.5257452574525745}

## **Modeling 7**

In [26]:
# modeling7
cd.select_pipeline('XGB')
cd.fit(X_train, y_train)
# y_pred = cd.predict(X_test) # Dijadikan komentar karena keberadaannya terdapat pada baris code asli milik Author, namun tidak memengaruhi baris code dari hasil modifikasi

In [27]:
cd.evaluate(X_test, y_test, test_size=0.2, shuffle=True)

{'Accuracy': 0.74, 'Confusion Matrix': array([[ 2, 21],
        [ 5, 72]]), 'F1 Macro': 0.4901960784313725, 'Precision': 0.5299539170506913, 'Recall': 0.5110107284020328}

# **Inisiasi Hyperparameter**

In [28]:
param_rf = {'criterion': ["gini","entropy"], 'n_estimators': [10, 100, 1000]} # Hyperparameter Random Forest
param_lr = {'C': np.logspace(-3,3,5,7), 'penalty': ["l1","l2"], 'fit_intercept': ["True", "False"]} # Hyperparameter Logistic Regression
param_gnb = {'var_smoothing': [1e-9]} # Hyperparameter Gaussian Naive Bayes
param_gb = {'learning_rate': [0.001, 0.01, 0.1, 0.3], 'n_estimators': [10, 100, 1000]} #Hyperparameter Gradient Boosting
param_knn = {'n_neighbors': [5], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']} # Hyperparameter K-Nearest Neighbors
param_mnb = {'alpha': [1.0, 0.01, 0.001], "fit_prior": ['True', 'False']} # Hyperparameter Multinomial Naive Bayes
param_sgd = {'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'], 'penalty': ['l2', 'l1', 'elasticnet']} # Hyperparameter Stochastic Gradient Descent
param_xgb = {'booster': ['gbtree', 'gblinear']} # Hyperparameter XGboost

# **Implementasi Grid Search**

## **Grid Search Random Forest**

In [29]:
gridsearch = GridSearchCV(RandomForestClassifier(), param_rf, scoring='accuracy')
gridsearch.fit(X_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
           

## **Grid Search Logistic Regression**

In [30]:
gridsearch1 = GridSearchCV(LogisticRegression(), param_lr, scoring='accuracy')
gridsearch1.fit(X_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.00000000e-03, 3.16227766e-02, 1.00000000e+00, 3.16227766e+01,
       1.00000000e+03]),
                         'fit_intercept': ['True', 'False'],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='ac

## **Grid Search Gaussian Naive Bayes**

In [31]:
gridsearch2 = GridSearchCV(GaussianNB(), param_gnb, scoring='accuracy')
gridsearch2.fit(X_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=GaussianNB(priors=None, var_smoothing=1e-09), iid='warn',
             n_jobs=None, param_grid={'var_smoothing': [1e-09]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

## **Grid Search Gradient Boosting**

In [32]:
gridsearch3 = GridSearchCV(GradientBoostingClassifier(), param_gb, scoring='accuracy')
gridsearch3.fit(X_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
                                                  pre

## **Grid Search K-Nearest Neightbors**

In [33]:
gridsearch4 = GridSearchCV(KNeighborsClassifier(), param_knn, scoring='accuracy')
gridsearch4.fit(X_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [5],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

## **Grid Search Multinomial Naive Bayes**

In [34]:
gridsearch5 = GridSearchCV(MultinomialNB(), param_mnb, scoring='accuracy')
gridsearch5.fit(X_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [1.0, 0.01, 0.001],
                         'fit_prior': ['True', 'False']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

## **Grid Search Stochastic Gradient Descent**

In [35]:
gridsearch6 = GridSearchCV(SGDClassifier(), param_sgd, scoring='accuracy')
gridsearch6.fit(X_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5,
                                     random_state=None, shuffle=True, tol=0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'loss': ['hinge', 'log', 'modified_huber',
                                  'squared_hinge', 'perceptron', 'huber',
                                  'epsilon_insensiti

## **Grid Search XGBoost**

In [36]:
gridsearch7 = GridSearchCV(XGBClassifier(), param_xgb, scoring='accuracy')
gridsearch7.fit(X_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=None,
             param_grid={'booster': ['gbtree', 'gblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='acc

# **Hasil Grid Search**

In [37]:
print (gridsearch.best_params_) # RF
print(gridsearch1.best_params_) # LR
print(gridsearch2.best_params_) # GNB
print(gridsearch3.best_params_) # GB
print(gridsearch4.best_params_) # KNN
print(gridsearch5.best_params_) # MNB
print(gridsearch6.best_params_) # SGD
print(gridsearch7.best_params_) # XGB

{'criterion': 'gini', 'n_estimators': 10}
{'C': 1.0, 'fit_intercept': 'True', 'penalty': 'l2'}
{'var_smoothing': 1e-09}
{'learning_rate': 0.001, 'n_estimators': 10}
{'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'uniform'}
{'alpha': 1.0, 'fit_prior': 'True'}
{'loss': 'log', 'penalty': 'l2'}
{'booster': 'gblinear'}


# **Implementasi Hyperparameter Berdasarkan Hasil Grid Search**

## **Random Forest**

In [38]:
rf = RandomForestClassifier(**gridsearch.best_params_).fit(X_train, y_train)
y_test_pred = rf.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.34      0.27      0.30        96
           1       0.83      0.88      0.86       404

    accuracy                           0.76       500
   macro avg       0.59      0.57      0.58       500
weighted avg       0.74      0.76      0.75       500



## **Logistic Regression**

In [39]:
lr = LogisticRegression(**gridsearch1.best_params_).fit(X_train, y_train)
y_test_pred = lr.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        96
           1       0.81      0.99      0.89       404

    accuracy                           0.80       500
   macro avg       0.40      0.50      0.45       500
weighted avg       0.65      0.80      0.72       500



## **Gaussian Naive Bayes**

In [40]:
gnb = GaussianNB().fit(X_train, y_train)
y_test_pred = gnb.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.20      0.98      0.33        96
           1       0.89      0.04      0.08       404

    accuracy                           0.22       500
   macro avg       0.54      0.51      0.20       500
weighted avg       0.76      0.22      0.12       500



## **Gradient Boosting**

In [41]:
gb = GradientBoostingClassifier(**gridsearch3.best_params_).fit(X_train, y_train)
y_test_pred = gb.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        96
           1       0.81      1.00      0.89       404

    accuracy                           0.81       500
   macro avg       0.40      0.50      0.45       500
weighted avg       0.65      0.81      0.72       500



## **K-Nearest Neighbors**

In [42]:
knn = KNeighborsClassifier(**gridsearch4.best_params_).fit(X_train, y_train)
y_test_pred = knn.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.38      0.05      0.09        96
           1       0.81      0.98      0.89       404

    accuracy                           0.80       500
   macro avg       0.60      0.52      0.49       500
weighted avg       0.73      0.80      0.74       500



## **Multinomial Naive Bayes**

In [43]:
mnb = MultinomialNB(**gridsearch5.best_params_).fit(X_train, y_train)
y_test_pred = mnb.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.29      0.05      0.09        96
           1       0.81      0.97      0.88       404

    accuracy                           0.79       500
   macro avg       0.55      0.51      0.49       500
weighted avg       0.71      0.79      0.73       500



## **Stochastic Gradient Descent**

In [44]:
sgd = SGDClassifier(**gridsearch6.best_params_).fit(X_train, y_train)
y_test_pred = sgd.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.20      0.01      0.02        96
           1       0.81      0.99      0.89       404

    accuracy                           0.80       500
   macro avg       0.50      0.50      0.45       500
weighted avg       0.69      0.80      0.72       500



## **XGBoost**

In [45]:
xgb = XGBClassifier(**gridsearch7.best_params_).fit(X_train, y_train)
y_test_pred = xgb.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        96
           1       0.81      1.00      0.89       404

    accuracy                           0.81       500
   macro avg       0.40      0.50      0.45       500
weighted avg       0.65      0.81      0.72       500



# **Implementasi Random Search**

## **Random Forest**

In [46]:
randomsearch = RandomizedSearchCV(RandomForestClassifier(), param_rf, scoring='accuracy')
randomsearch.fit(X_train, y_train)

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs

## **Logistic Regression**

In [47]:
randomsearch1 = RandomizedSearchCV(LogisticRegression(), param_lr, scoring='accuracy')
randomsearch1.fit(X_train, y_train)

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='warn', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='warn', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='warn', n_iter=10, n_jobs=None,
                   param_distributions={'C': array([1.00000000e-03, 3.16227766e-02, 1.00000000e+00, 3.16227766e+01,
       1.00000000e+03]),
                                        'fit_intercept': ['True', 'False'],
                                        'penalty':

## **Gaussian Naive Bayes**

In [48]:
randomsearch2 = RandomizedSearchCV(GaussianNB(), param_gnb, scoring='accuracy')
randomsearch2.fit(X_train, y_train)

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=GaussianNB(priors=None, var_smoothing=1e-09),
                   iid='warn', n_iter=10, n_jobs=None,
                   param_distributions={'var_smoothing': [1e-09]},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring='accuracy', verbose=0)

## **Gradient Boosting**

In [49]:
randomsearch3 = RandomizedSearchCV(GradientBoostingClassifier(), param_gb, scoring='accuracy')
randomsearch3.fit(X_train, y_train)

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                        init=None,
                                                        learning_rate=0.1,
                                                        loss='deviance',
                                                        max_depth=3,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                

## **K-Nearest Neighbors**

In [50]:
randomsearch4 = RandomizedSearchCV(KNeighborsClassifier(), param_knn, scoring='accuracy')
randomsearch4.fit(X_train, y_train)

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=KNeighborsClassifier(algorithm='auto',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None,
                                                  n_jobs=None, n_neighbors=5,
                                                  p=2, weights='uniform'),
                   iid='warn', n_iter=10, n_jobs=None,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'n_neighbors': [5],
                                        'weights': ['uniform', 'distance']},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring='accuracy', verbose=0)

## **Multinomial Naive Bayes**

In [51]:
randomsearch5 = RandomizedSearchCV(MultinomialNB(), param_mnb, scoring='accuracy')
randomsearch5.fit(X_train, y_train)

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                           fit_prior=True),
                   iid='warn', n_iter=10, n_jobs=None,
                   param_distributions={'alpha': [1.0, 0.01, 0.001],
                                        'fit_prior': ['True', 'False']},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring='accuracy', verbose=0)

## **Stochastic Stochastic Gradient Descent**

In [52]:
randomsearch6 = RandomizedSearchCV(SGDClassifier(), param_sgd, scoring='accuracy')
randomsearch6.fit(X_train, y_train)

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=SGDClassifier(alpha=0.0001, average=False,
                                           class_weight=None,
                                           early_stopping=False, epsilon=0.1,
                                           eta0=0.0, fit_intercept=True,
                                           l1_ratio=0.15,
                                           learning_rate='optimal',
                                           loss='hinge', max_iter=1000,
                                           n_iter_no_change=5, n_jobs=None,
                                           penalty='l2', power_t=0.5,
                                           random_state=None, shuffle=True,
                                           tol=0.0...
                                           verbose=0, warm_start=False),
                   iid='warn', n_iter=10, n_jobs=None,
                   param_distributions={'loss': ['

## **XGBoost**

In [53]:
randomsearch7 = RandomizedSearchCV(XGBClassifier(), param_xgb, scoring='accuracy')
randomsearch7.fit(X_train, y_train)

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=0, reg_alpha=0,
                                           reg_lambda=1, scale_pos_weight=1,
                                           seed=None, silent=None, subsample=1,
                                           verbosity=1),
     

# **Hasil Random Search**

In [54]:
print(randomsearch.best_params_) # RF
print(randomsearch1.best_params_) # LR
print(randomsearch2.best_params_) # GNB
print(randomsearch3.best_params_) # GB
print(randomsearch4.best_params_) # KNN
print(randomsearch5.best_params_) # MNB
print(randomsearch6.best_params_) # SGD
print(randomsearch7.best_params_) # XGB

{'n_estimators': 10, 'criterion': 'entropy'}
{'penalty': 'l2', 'fit_intercept': 'True', 'C': 1.0}
{'var_smoothing': 1e-09}
{'n_estimators': 100, 'learning_rate': 0.01}
{'weights': 'uniform', 'n_neighbors': 5, 'algorithm': 'auto'}
{'fit_prior': 'True', 'alpha': 1.0}
{'penalty': 'l2', 'loss': 'squared_hinge'}
{'booster': 'gblinear'}


# **Implementasi Hyperparameter Berdasarkan Hasil Random Search**

## **Random Forest**

In [55]:
rf = RandomForestClassifier(**randomsearch.best_params_).fit(X_train, y_train)
y_test_pred = rf.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.30      0.21      0.25        96
           1       0.82      0.88      0.85       404

    accuracy                           0.75       500
   macro avg       0.56      0.55      0.55       500
weighted avg       0.72      0.75      0.74       500



## **Logistic Regression**

In [56]:
lr = LogisticRegression(**randomsearch1.best_params_).fit(X_train, y_train)
y_test_pred = lr.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        96
           1       0.81      0.99      0.89       404

    accuracy                           0.80       500
   macro avg       0.40      0.50      0.45       500
weighted avg       0.65      0.80      0.72       500



## **Gaussian Naive Bayes**

In [57]:
gnb = GaussianNB().fit(X_train, y_train)
y_test_pred = gnb.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.20      0.98      0.33        96
           1       0.89      0.04      0.08       404

    accuracy                           0.22       500
   macro avg       0.54      0.51      0.20       500
weighted avg       0.76      0.22      0.12       500



## **Gradient Boosting**

In [58]:
gb = GradientBoostingClassifier(**randomsearch3.best_params_).fit(X_train, y_train)
y_test_pred = gb.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.50      0.01      0.02        96
           1       0.81      1.00      0.89       404

    accuracy                           0.81       500
   macro avg       0.65      0.50      0.46       500
weighted avg       0.75      0.81      0.73       500



## **K-Nearest Neighbors**

In [59]:
knn = KNeighborsClassifier(**randomsearch4.best_params_).fit(X_train, y_train)
y_test_pred = knn.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.38      0.05      0.09        96
           1       0.81      0.98      0.89       404

    accuracy                           0.80       500
   macro avg       0.60      0.52      0.49       500
weighted avg       0.73      0.80      0.74       500



## **Multinomial Naive Bayes**

In [60]:
mnb = MultinomialNB(**randomsearch5.best_params_).fit(X_train, y_train)
y_test_pred = mnb.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.29      0.05      0.09        96
           1       0.81      0.97      0.88       404

    accuracy                           0.79       500
   macro avg       0.55      0.51      0.49       500
weighted avg       0.71      0.79      0.73       500



## **Stochastic Gradient Descent**

In [61]:
sgd = SGDClassifier(**randomsearch6.best_params_).fit(X_train, y_train)
y_test_pred = sgd.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.22      0.02      0.04        96
           1       0.81      0.98      0.89       404

    accuracy                           0.80       500
   macro avg       0.52      0.50      0.46       500
weighted avg       0.70      0.80      0.72       500



## **XGBoost**

In [62]:
xgb = XGBClassifier(**randomsearch7.best_params_).fit(X_train, y_train)
y_test_pred = xgb.predict(X_test)
print(classification_report(y_test , y_test_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        96
           1       0.81      1.00      0.89       404

    accuracy                           0.81       500
   macro avg       0.40      0.50      0.45       500
weighted avg       0.65      0.81      0.72       500

