# Import Library

In [7]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Load Data

In [8]:
data = pd.read_csv('D:\Capstone Project\Deteksi_Cardiovascular\cardio.csv', sep=';')
data.head(10)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
5,8,21914,1,151,67.0,120,80,2,2,0,0,0,0
6,9,22113,1,157,93.0,130,80,3,1,0,0,1,0
7,12,22584,2,178,95.0,130,90,3,3,0,0,1,1
8,13,17668,1,158,71.0,110,70,1,1,0,0,1,0
9,14,19834,1,164,68.0,110,60,1,1,0,0,0,0


In [9]:
data.shape

(70000, 13)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [11]:
data = data.drop(columns=['id'])

In [12]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,70000.0,19468.865814,2467.251667,10798.0,17664.0,19703.0,21327.0,23713.0
gender,70000.0,1.349571,0.476838,1.0,1.0,1.0,2.0,2.0
height,70000.0,164.359229,8.210126,55.0,159.0,165.0,170.0,250.0
weight,70000.0,74.20569,14.395757,10.0,65.0,72.0,82.0,200.0
ap_hi,70000.0,128.817286,154.011419,-150.0,120.0,120.0,140.0,16020.0
ap_lo,70000.0,96.630414,188.47253,-70.0,80.0,80.0,90.0,11000.0
cholesterol,70000.0,1.366871,0.68025,1.0,1.0,1.0,2.0,3.0
gluc,70000.0,1.226457,0.57227,1.0,1.0,1.0,1.0,3.0
smoke,70000.0,0.088129,0.283484,0.0,0.0,0.0,0.0,1.0
alco,70000.0,0.053771,0.225568,0.0,0.0,0.0,0.0,1.0


# Data Preprocessing

In [13]:
data.isnull().sum()

age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [14]:
# sum of duplicated values in the DataFrame
data.duplicated(keep=False).sum()

48

In [15]:
data = data.drop_duplicates()

In [16]:
import pandas as pd

# Fungsi untuk menghapus outlier berdasarkan metode IQR
def drop_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)  # Kuartil 1 (25%)
    Q3 = df[column].quantile(0.75)  # Kuartil 3 (75%)
    IQR = Q3 - Q1  # Interquartile Range
    lower_bound = Q1 - 1.5 * IQR  # Batas bawah
    upper_bound = Q3 + 1.5 * IQR  # Batas atas

    # Hapus data yang berada di luar batas
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# List fitur yang akan dibersihkan dari outlier
columns_to_clean = ['height', 'weight', 'ap_hi', 'ap_lo']

# Loop untuk menghapus outlier pada setiap kolom
for col in columns_to_clean:
    data = drop_outliers_iqr(data, col)

# Cek jumlah data setelah penghapusan outlier
print("Jumlah data setelah penghapusan outlier:", len(data))

Jumlah data setelah penghapusan outlier: 62481


In [17]:
data["gender"] = data["gender"].replace({1: 0, 2: 1})

In [18]:
# Menghapus baris di mana weight memiliki nilai desimal
data = data[data['weight'].astype(int) == data['weight']]

# Mengonversi kolom weight menjadi integer
data['weight'] = data['weight'].astype('int64')

In [19]:
data['age'] = data['age'] / 365.25
data['age_years'] = data['age'].astype('int64')

In [20]:
#Calculate age
minAge=min(data.age_years)
maxAge=max(data.age_years)
meanAge=data.age_years.mean()
print(f"min = {minAge}, max = {maxAge}, mean = {meanAge:.2f}")

min = 29, max = 64, mean = 52.87


In [21]:
# Drop the original 'age' column
data.drop(columns=['age'], inplace=True)

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62319 entries, 0 to 69999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   gender       62319 non-null  int64
 1   height       62319 non-null  int64
 2   weight       62319 non-null  int64
 3   ap_hi        62319 non-null  int64
 4   ap_lo        62319 non-null  int64
 5   cholesterol  62319 non-null  int64
 6   gluc         62319 non-null  int64
 7   smoke        62319 non-null  int64
 8   alco         62319 non-null  int64
 9   active       62319 non-null  int64
 10  cardio       62319 non-null  int64
 11  age_years    62319 non-null  int64
dtypes: int64(12)
memory usage: 6.2 MB


In [44]:
data = data[(data["height"] <= 200) & (data["weight"] <= 150)]

# Mengatur ulang indeks setelah penghapusan
data = data.reset_index(drop=True)

# Menampilkan data dengan indeks yang telah diperbaiki
print(data)

       gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  alco  \
0           1     168      62    110     80            1     1      0     0   
1           0     156      85    140     90            3     1      0     0   
2           0     165      64    130     70            3     1      0     0   
3           1     169      82    150    100            1     1      0     0   
4           0     151      67    120     80            2     2      0     0   
...       ...     ...     ...    ...    ...          ...   ...    ...   ...   
62314       0     172      70    130     90            1     1      0     0   
62315       0     165      80    150     80            1     1      0     0   
62316       1     168      76    120     80            1     1      1     0   
62317       0     163      72    135     80            1     2      0     0   
62318       0     170      72    120     80            2     1      0     0   

       active  cardio  age_years  
0           1   

In [24]:
# Menghapus baris di mana nilai ap_hi kurang dari 50 atau lebih dari 245
data = data[(data['ap_hi'] >= 50) & (data['ap_hi'] <= 245)]

# Mengatur ulang indeks setelah penghapusan
data = data.reset_index(drop=True)

# Menampilkan data dengan indeks yang telah diperbaiki
print(data)

       gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  alco  \
0           1     168      62    110     80            1     1      0     0   
1           0     156      85    140     90            3     1      0     0   
2           0     165      64    130     70            3     1      0     0   
3           1     169      82    150    100            1     1      0     0   
4           0     151      67    120     80            2     2      0     0   
...       ...     ...     ...    ...    ...          ...   ...    ...   ...   
62314       0     172      70    130     90            1     1      0     0   
62315       0     165      80    150     80            1     1      0     0   
62316       1     168      76    120     80            1     1      1     0   
62317       0     163      72    135     80            1     2      0     0   
62318       0     170      72    120     80            2     1      0     0   

       active  cardio  age_years  
0           1   

In [25]:
# Menghapus nilai ap_lo yang tidak masuk akal
data = data[(data['ap_lo'] >= 50) & (data['ap_lo'] <= 200)]

# Mengatur ulang indeks
data = data.reset_index(drop=True)

# Menampilkan data dengan indeks yang telah diperbaiki
print(data)


       gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  alco  \
0           1     168      62    110     80            1     1      0     0   
1           0     156      85    140     90            3     1      0     0   
2           0     165      64    130     70            3     1      0     0   
3           1     169      82    150    100            1     1      0     0   
4           0     151      67    120     80            2     2      0     0   
...       ...     ...     ...    ...    ...          ...   ...    ...   ...   
62314       0     172      70    130     90            1     1      0     0   
62315       0     165      80    150     80            1     1      0     0   
62316       1     168      76    120     80            1     1      1     0   
62317       0     163      72    135     80            1     2      0     0   
62318       0     170      72    120     80            2     1      0     0   

       active  cardio  age_years  
0           1   

In [26]:
# Memeriksa nilai yang di luar dari 0 atau 1 pada fitur biner
binary_features = ['smoke', 'alco', 'active', 'cardio']
for feature in binary_features:
    outlier_indices = data[~data[feature].isin([0, 1])].index
    if len(outlier_indices) > 0:
        print(f"Outlier ditemukan pada fitur {feature}:")
        print(data.loc[outlier_indices])
    else:
        print(f"Tidak ada outlier pada fitur {feature}.")

Tidak ada outlier pada fitur smoke.
Tidak ada outlier pada fitur alco.
Tidak ada outlier pada fitur active.
Tidak ada outlier pada fitur cardio.


In [27]:
data.head(10)

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years
0,1,168,62,110,80,1,1,0,0,1,0,50
1,0,156,85,140,90,3,1,0,0,1,1,55
2,0,165,64,130,70,3,1,0,0,0,1,51
3,1,169,82,150,100,1,1,0,0,1,1,48
4,0,151,67,120,80,2,2,0,0,0,0,59
5,0,157,93,130,80,3,1,0,0,1,0,60
6,1,178,95,130,90,3,3,0,0,1,1,61
7,0,158,71,110,70,1,1,0,0,1,0,48
8,0,169,80,120,80,1,1,0,0,1,0,61
9,1,173,60,120,80,1,1,0,0,1,0,51


In [28]:
# Menentukan urutan kolom yang baru
new_order = ['gender','age_years', 'height','weight','ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']

# Mengatur ulang kolom DataFrame sesuai dengan urutan yang ditentukan
new_data = data[new_order]

# Creating New Features

In [29]:
#Body Mass Index-Indeks massa tubuh
new_data['bmi'] = round(new_data['weight'] / (new_data['height'] / 100) ** 2, 2)

#pulse_pressure
new_data['tekanan_denyut_nadi'] = new_data['ap_hi'] - new_data['ap_lo']

#MAP(Mean Arterial Pressure)
new_data['tekanan_arteri_ratarata'] = round(new_data['ap_lo'] + (new_data['ap_hi'] - new_data['ap_lo']) / 3, 2)

#Rasio antara tekanan darah sistolik dan diastolik
new_data['sys_dsys_ratio'] = round(new_data['ap_hi'] / new_data['ap_lo'], 2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['bmi'] = round(new_data['weight'] / (new_data['height'] / 100) ** 2, 2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['tekanan_denyut_nadi'] = new_data['ap_hi'] - new_data['ap_lo']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['tekanan_arteri_ratarata'] = round(new_data

# Oversampling with Adaptive Synthetic Sampling

In [None]:
import pandas as pd
from imblearn.over_sampling import ADASYN

# Kolom dengan kategori yang tidak seimbang
columns_to_up_sample = ['cholesterol', 'gluc', 'smoke', 'alco', 'active'] 

# Membuat salinan dari DataFrame asli untuk menjaga keutuhan data
new_data_copy = new_data.copy()

# Melakukan oversampling menggunakan ADASYN untuk setiap kolom yang tidak seimbang
for column in columns_to_up_sample:
    print(f"UP SAMPLING : {column}")
    adasyn = ADASYN(sampling_strategy='auto', random_state=42)
    
    # Menggunakan kolom yang sedang diproses sebagai variabel target (y)
    X = new_data_copy.drop(columns=column)  # Semua kolom kecuali yang sedang diproses
    y = new_data_copy[column]  # Kolom target yang ingin di-oversampling
    
    X_res, y_res = adasyn.fit_resample(X, y)
    X_res[column] = y_res  # Menambahkan kembali kolom yang telah di-oversampling

    new_data_copy = X_res.copy()  # Memperbarui DataFrame untuk iterasi berikutnya

UP SAMPLING : cholesterol
UP SAMPLING : gluc
UP SAMPLING : smoke
UP SAMPLING : alco
UP SAMPLING : active


In [31]:
print("New Dataset Shape: " , new_data_copy.shape)

new_data = new_data_copy

New Dataset Shape:  (1784585, 16)


In [32]:
new_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
gender,1784585.0,0.50984,0.499903,0.0,0.0,1.0,1.0,1.0
age_years,1784585.0,52.055611,6.622473,29.0,47.0,53.0,57.0,64.0
height,1784585.0,166.264971,7.557784,143.0,161.0,167.0,171.0,186.0
weight,1784585.0,75.350012,12.387385,40.0,66.0,74.0,84.0,107.0
ap_hi,1784585.0,128.20177,14.789247,90.0,120.0,120.0,140.0,170.0
ap_lo,1784585.0,82.609181,8.000887,65.0,80.0,80.0,90.0,105.0
cardio,1784585.0,0.3167,0.465189,0.0,0.0,0.0,1.0,1.0
bmi,1784585.0,27.311318,4.423235,13.52,24.053562,26.657648,30.106689,50.89
tekanan_denyut_nadi,1784585.0,45.553492,10.882209,-10.0,40.0,40.0,50.0,103.0
tekanan_arteri_ratarata,1784585.0,97.840051,9.448786,73.33,93.33,93.33,104.869419,126.0


# Split Feature and Target

In [None]:
features=new_data[['gender','age_years', 'bmi', 'tekanan_denyut_nadi', 'tekanan_arteri_ratarata',
                   'sys_dsys_ratio', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']]
target=new_data['cardio']

In [34]:
features

Unnamed: 0,gender,age_years,bmi,tekanan_denyut_nadi,tekanan_arteri_ratarata,sys_dsys_ratio,cholesterol,gluc,smoke,alco,active
0,1,50,21.970000,30,90.00,1.38,1,1,0,0,1
1,0,55,34.930000,50,106.67,1.56,3,1,0,0,1
2,0,51,23.510000,60,90.00,1.86,3,1,0,0,0
3,1,48,28.710000,50,116.67,1.50,1,1,0,0,1
4,0,59,29.380000,40,93.33,1.50,2,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1784580,0,54,32.119058,50,116.67,1.50,1,1,1,1,0
1784581,0,54,32.121554,50,116.67,1.50,1,1,1,1,0
1784582,0,54,32.120180,50,116.67,1.50,1,1,1,1,0
1784583,0,54,32.128318,50,116.67,1.50,1,1,1,1,0


In [35]:
target

0          0
1          1
2          1
3          1
4          0
          ..
1784580    1
1784581    1
1784582    1
1784583    1
1784584    1
Name: cardio, Length: 1784585, dtype: int64

In [36]:
# cek NaN
print(features.isna().values.any())

False


In [37]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(features, target, random_state=42, test_size=0.2)

# Undersampling

In [38]:
from imblearn.under_sampling import RandomUnderSampler

# Undersampling
undersample = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersample.fit_resample(x_train, y_train)

# Cek distribusi setelah undersampling
print("Distribusi kelas setelah undersampling:", np.bincount(y_train_resampled))

Distribusi kelas setelah undersampling: [452147 452147]


In [39]:
# Cek distribusi setelah undersampling
print("Distribusi kelas setelah undersampling:", np.bincount(y_train_resampled))

Distribusi kelas setelah undersampling: [452147 452147]


# Normalisasi

In [40]:
from sklearn.preprocessing import MinMaxScaler

# Normalisasi Data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(x_test)

# Model Training

## library

In [41]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier,ExtraTreesClassifier, AdaBoostClassifier, VotingClassifier, BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier

## Decision Tree

In [69]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Inisialisasi model Decision Tree
model_dt = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)

# Melatih model
model_dt.fit(X_train_scaled, y_train_resampled)

# Prediksi terhadap data uji
y_pred_dt = model_dt.predict(X_test_scaled)

# Evaluasi Model
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"\nDecision Tree Accuracy: {accuracy_dt:.2%}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_dt))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))



Decision Tree Accuracy: 78.61%

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.80      0.84    233262
           1       0.63      0.76      0.69    105727

    accuracy                           0.79    338989
   macro avg       0.76      0.78      0.76    338989
weighted avg       0.80      0.79      0.79    338989


Confusion Matrix:
[[185617  47645]
 [ 24851  80876]]


## Random Forest

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

model_rf = RandomForestClassifier(n_estimators=50, random_state=42)
model_rf.fit(X_train_scaled, y_train_resampled)

# Prediksi terhadap Data Uji
y_pred_rf = model_rf.predict(X_test_scaled)

# Evaluasi Model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"\nRandom Forest Accuracy: {accuracy_rf:.2%}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))


Random Forest Accuracy: 93.92%

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95    243886
           1       0.88      0.94      0.91    113031

    accuracy                           0.94    356917
   macro avg       0.92      0.94      0.93    356917
weighted avg       0.94      0.94      0.94    356917


Confusion Matrix:
[[228909  14977]
 [  6728 106303]]


## ExtraTreesClassifier

In [36]:
from sklearn.ensemble import ExtraTreesClassifier

# Inisialisasi dan latih model Extra Trees
model_et = ExtraTreesClassifier(
    n_estimators=50, 
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42
)

# Gunakan data yang sudah diresample & dinormalisasi untuk pelatihan
model_et.fit(X_train_scaled, y_train_resampled)

# Prediksi terhadap data uji yang sudah dinormalisasi
y_pred_et = model_et.predict(X_test_scaled)

# Hitung akurasi
accuracy_et = accuracy_score(y_test, y_pred_et)
print(f"Extra Trees Accuracy: {accuracy_et:.2%}")

# Tampilkan classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_et))

# Tampilkan confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_et))


Extra Trees Accuracy: 92.77%

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.92      0.95    233262
           1       0.85      0.94      0.89    105727

    accuracy                           0.93    338989
   macro avg       0.91      0.93      0.92    338989
weighted avg       0.93      0.93      0.93    338989


Confusion Matrix:
[[215420  17842]
 [  6677  99050]]


## GradientBoostingClassifier

In [37]:
from sklearn.ensemble import GradientBoostingClassifier

# Inisialisasi dan latih model Gradient Boosting
model_gb = GradientBoostingClassifier(
    n_estimators=50, 
    learning_rate=1.0, 
    max_depth=1, 
    random_state=0
)

# Gunakan data yang sudah diresample & dinormalisasi untuk pelatihan
model_gb.fit(X_train_scaled, y_train_resampled)

# Prediksi terhadap data uji yang sudah dinormalisasi
y_pred_gb = model_gb.predict(X_test_scaled)

# Hitung akurasi
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f"Gradient Boosting Accuracy: {accuracy_gb:.2%}")

# Tampilkan classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_gb))

# Tampilkan confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb))


Gradient Boosting Accuracy: 77.63%

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.78      0.83    233262
           1       0.61      0.76      0.68    105727

    accuracy                           0.78    338989
   macro avg       0.75      0.77      0.75    338989
weighted avg       0.80      0.78      0.78    338989


Confusion Matrix:
[[182706  50556]
 [ 25259  80468]]


## AdaBoostClassifier

In [39]:
from sklearn.ensemble import AdaBoostClassifier

# Inisialisasi dan latih model AdaBoost
model_ab = AdaBoostClassifier(n_estimators=50, random_state=0)
model_ab.fit(X_train_scaled, y_train_resampled)  # Gunakan data yang sudah diresample & dinormalisasi

# Prediksi terhadap data uji yang sudah dinormalisasi
y_pred_ab = model_ab.predict(X_test_scaled)

# Hitung akurasi
accuracy_ab = accuracy_score(y_test, y_pred_ab)
print(f"AdaBoost Accuracy: {accuracy_ab:.2%}")

# Tampilkan classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_ab))

# Tampilkan confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_ab))


AdaBoost Accuracy: 76.82%

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.78      0.82    233262
           1       0.60      0.74      0.67    105727

    accuracy                           0.77    338989
   macro avg       0.74      0.76      0.74    338989
weighted avg       0.79      0.77      0.77    338989


Confusion Matrix:
[[182144  51118]
 [ 27467  78260]]


## XGBClassifier

In [40]:
from xgboost import XGBClassifier

# Inisialisasi dan latih model XGBoost
model_xgb = XGBClassifier(
    objective='binary:logistic', 
    learning_rate=0.1, 
    max_depth=5, 
    n_estimators=50, 
    seed=42
)

model_xgb.fit(X_train_scaled, y_train_resampled)  # Gunakan data yang sudah di-resample & dinormalisasi

# Prediksi terhadap data uji yang sudah dinormalisasi
y_pred_xgb = model_xgb.predict(X_test_scaled)

# Hitung akurasi
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.2%}")

# Tampilkan classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb))

# Tampilkan confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))


XGBoost Accuracy: 79.24%

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.78      0.84    233262
           1       0.63      0.82      0.71    105727

    accuracy                           0.79    338989
   macro avg       0.77      0.80      0.77    338989
weighted avg       0.82      0.79      0.80    338989


Confusion Matrix:
[[181634  51628]
 [ 18753  86974]]


## LGBMClassifier

In [41]:
from lightgbm import LGBMClassifier

# Inisialisasi dan latih model LightGBM
model_lgbm = LGBMClassifier(n_estimators=50, learning_rate=0.1, random_state=42)
model_lgbm.fit(X_train_scaled, y_train_resampled)  # Gunakan data yang sudah di-resample & dinormalisasi

# Prediksi terhadap data uji yang sudah dinormalisasi
y_pred_lgbm = model_lgbm.predict(X_test_scaled)

# Hitung akurasi
accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)
print(f"LightGBM Accuracy: {accuracy_lgbm:.2%}")

# Tampilkan classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lgbm))

# Tampilkan confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_lgbm))


[LightGBM] [Info] Number of positive: 425020, number of negative: 425020
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.093497 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 896
[LightGBM] [Info] Number of data points in the train set: 850040, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




LightGBM Accuracy: 79.92%

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.78      0.84    233262
           1       0.63      0.84      0.72    105727

    accuracy                           0.80    338989
   macro avg       0.78      0.81      0.78    338989
weighted avg       0.83      0.80      0.81    338989


Confusion Matrix:
[[181834  51428]
 [ 16626  89101]]


## HistGradientBoostingClassifier

In [42]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Inisialisasi dan latih model HistGradientBoosting
model_hgb = HistGradientBoostingClassifier(max_iter=50, random_state=42)
model_hgb.fit(X_train_scaled, y_train_resampled)  # Gunakan data hasil resampling & normalisasi

# Prediksi terhadap data uji yang sudah dinormalisasi
y_pred_hgb = model_hgb.predict(X_test_scaled)

# Hitung akurasi
accuracy_hgb = accuracy_score(y_test, y_pred_hgb)
print(f"HistGradientBoosting Accuracy: {accuracy_hgb:.2%}")

# Tampilkan classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_hgb))

# Tampilkan confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_hgb))

HistGradientBoosting Accuracy: 79.81%

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.78      0.84    233262
           1       0.63      0.84      0.72    105727

    accuracy                           0.80    338989
   macro avg       0.77      0.81      0.78    338989
weighted avg       0.83      0.80      0.80    338989


Confusion Matrix:
[[181672  51590]
 [ 16836  88891]]


## MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier

# Inisialisasi dan latih model MLP
model_mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
model_mlp.fit(X_train_scaled, y_train_resampled)  # Gunakan data hasil resampling & normalisasi

# Prediksi terhadap data uji yang sudah dinormalisasi
y_pred_mlp = model_mlp.predict(X_test_scaled)

# Hitung akurasi
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
print(f"MLPClassifier Accuracy: {accuracy_mlp:.2%}")

# Tampilkan classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_mlp))

# Tampilkan confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_mlp))


MLPClassifier Accuracy: 81.06%

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.80      0.85    239162
           1       0.65      0.84      0.73    105981

    accuracy                           0.81    345143
   macro avg       0.78      0.82      0.79    345143
weighted avg       0.84      0.81      0.82    345143


Confusion Matrix:
[[190452  48710]
 [ 16652  89329]]


## VotingClassifier

In [43]:
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# Inisialisasi dan latih model VotingClassifier
model_vot = VotingClassifier(
    estimators=[
        ('et', ExtraTreesClassifier(n_estimators=50, max_depth=None, min_samples_split=5, min_samples_leaf=2, max_features='sqrt', random_state=42)),
        ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
        ('hgb', HistGradientBoostingClassifier(max_iter=50, random_state=42))
    ],
    voting='hard'  # Ubah menjadi 'hard' karena HGB tidak support predict_proba
)

model_vot.fit(X_train_scaled, y_train_resampled)

# Prediksi terhadap data uji yang sudah dinormalisasi
y_pred_vot = model_vot.predict(X_test_scaled)

# Hitung akurasi
accuracy_vot = accuracy_score(y_test, y_pred_vot)
print(f"VotingClassifier Accuracy: {accuracy_vot:.2%}")

# Tampilkan classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_vot))

# Tampilkan confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_vot))




VotingClassifier Accuracy: 92.79%

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.92      0.95    233262
           1       0.85      0.94      0.89    105727

    accuracy                           0.93    338989
   macro avg       0.91      0.93      0.92    338989
weighted avg       0.93      0.93      0.93    338989


Confusion Matrix:
[[215588  17674]
 [  6775  98952]]


## StackingClassifier

In [58]:
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

base_models = [
    ('et', ExtraTreesClassifier(n_estimators=50, max_depth=None, min_samples_split=5, min_samples_leaf=2, max_features='sqrt', random_state=42)),
    ('hgb', HistGradientBoostingClassifier(max_iter=50, random_state=42)),  # Tetap mempertahankan model boosting
    ('xgb', XGBClassifier(n_estimators=50, learning_rate=0.1, max_depth=5, random_state=42))  # Ganti RF dengan XGBoost
]

# Stacking classifier dengan Logistic Regression sebagai meta-model
stacking_model = StackingClassifier(
    estimators=base_models, 
    final_estimator=LogisticRegression(), 
    passthrough=True
)

# Latih model Stacking
stacking_model.fit(X_train_scaled, y_train_resampled)

# Prediksi terhadap data uji
y_pred_stacking = stacking_model.predict(X_test_scaled)

# Evaluasi model
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
print(f"StackingClassifier Accuracy: {accuracy_stacking:.2%}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_stacking))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_stacking))


StackingClassifier Accuracy: 93.62%

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.93      0.95    233262
           1       0.87      0.94      0.90    105727

    accuracy                           0.94    338989
   macro avg       0.92      0.94      0.93    338989
weighted avg       0.94      0.94      0.94    338989


Confusion Matrix:
[[217969  15293]
 [  6327  99400]]


## Naive Bayes

In [44]:
from sklearn.naive_bayes import GaussianNB

# Inisialisasi dan latih model Naïve Bayes
model_nb = GaussianNB()
model_nb.fit(X_train_scaled, y_train_resampled)

# Prediksi terhadap data uji yang sudah dinormalisasi
y_pred_nb = model_nb.predict(X_test_scaled)

# Hitung akurasi
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Naïve Bayes Accuracy: {accuracy_nb:.2%}")

# Tampilkan classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_nb))

# Tampilkan confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))


Naïve Bayes Accuracy: 74.56%

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.76      0.80    233262
           1       0.57      0.72      0.64    105727

    accuracy                           0.75    338989
   macro avg       0.72      0.74      0.72    338989
weighted avg       0.77      0.75      0.75    338989


Confusion Matrix:
[[176471  56791]
 [ 29445  76282]]


## KNeighborsClassifier

In [35]:
from sklearn.neighbors import KNeighborsClassifier

# Inisialisasi dan latih model KNN dengan 7 tetangga (nilai optimal dapat diuji lebih lanjut)
model_knn = KNeighborsClassifier(n_neighbors=10)
model_knn.fit(X_train_scaled, y_train_resampled)

# Prediksi terhadap data uji yang sudah dinormalisasi
y_pred_knn = model_knn.predict(X_test_scaled)

# Hitung akurasi
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"KNN Accuracy: {accuracy_knn:.2%}")

# Tampilkan classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_knn))

# Tampilkan confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))


KNN Accuracy: 92.46%

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.93      0.94    233262
           1       0.85      0.92      0.88    105727

    accuracy                           0.92    338989
   macro avg       0.91      0.92      0.91    338989
weighted avg       0.93      0.92      0.93    338989


Confusion Matrix:
[[215903  17359]
 [  8215  97512]]


## StackingClassifier2

In [35]:
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Definisi base models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=5, min_samples_leaf=2, random_state=42)),
    ('et', ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=5, min_samples_leaf=2, random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=10))
]

# Stacking classifier dengan Logistic Regression sebagai meta-model
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(), 
    passthrough=True
)

# Latih model Stacking
stacking_model.fit(X_train_scaled, y_train_resampled)

# Prediksi terhadap data uji
y_pred_stacking = stacking_model.predict(X_test_scaled)

# Evaluasi model
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
print(f"StackingClassifier Accuracy: {accuracy_stacking:.2%}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_stacking))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_stacking))

StackingClassifier Accuracy: 94.09%

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.96    233262
           1       0.87      0.95      0.91    105727

    accuracy                           0.94    338989
   macro avg       0.92      0.94      0.93    338989
weighted avg       0.94      0.94      0.94    338989


Confusion Matrix:
[[218945  14317]
 [  5728  99999]]


In [36]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Base models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42))
]

# Stacking classifier dengan Logistic Regression sebagai meta-classifier
stack_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    passthrough=True
)

# Latih model Stacking
stack_model.fit(X_train_scaled, y_train_resampled)

# Prediksi terhadap data uji
y_pred_stack = stack_model.predict(X_test_scaled)

# Evaluasi model
accuracy_stack = accuracy_score(y_test, y_pred_stacking)
print(f"StackingClassifier Accuracy: {accuracy_stack:.2%}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_stack))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_stack))

StackingClassifier Accuracy: 94.09%

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95    233262
           1       0.87      0.94      0.91    105727

    accuracy                           0.94    338989
   macro avg       0.92      0.94      0.93    338989
weighted avg       0.94      0.94      0.94    338989


Confusion Matrix:
[[218734  14528]
 [  6122  99605]]


## Deep Learning

In [42]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.callbacks import EarlyStopping


In [41]:
# Bangun Model Neural Network 
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),  # Layer input
    Dropout(0.3),  # Mengurangi overfitting
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer (sigmoid untuk klasifikasi biner)
])

# Kompilasi Model 
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Melatih Model
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(X_train_scaled, y_train_resampled,  # Gunakan y_train_resampled!
                    epochs=50, 
                    batch_size=32, 
                    validation_data=(X_test_scaled, y_test), 
                    callbacks=[early_stopping])

Epoch 1/50
[1m26564/26564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m411s[0m 15ms/step - accuracy: 0.7609 - loss: 0.5037 - val_accuracy: 0.7790 - val_loss: 0.4690
Epoch 2/50
[1m26564/26564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 12ms/step - accuracy: 0.7853 - loss: 0.4687 - val_accuracy: 0.7757 - val_loss: 0.4722
Epoch 3/50
[1m26564/26564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 12ms/step - accuracy: 0.7891 - loss: 0.4596 - val_accuracy: 0.7921 - val_loss: 0.4389
Epoch 4/50
[1m26564/26564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m374s[0m 12ms/step - accuracy: 0.7897 - loss: 0.4577 - val_accuracy: 0.7746 - val_loss: 0.4624
Epoch 5/50
[1m26564/26564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m345s[0m 13ms/step - accuracy: 0.7902 - loss: 0.4549 - val_accuracy: 0.7937 - val_loss: 0.4398
Epoch 6/50
[1m26564/26564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m290s[0m 11ms/step - accuracy: 0.7928 - loss: 0.4516 - val_accuracy: 0.7949 - val

In [44]:
# Evaluasi Model 
test_loss, test_acc = model.evaluate(X_test_scaled, y_test)
print(f"Test Accuracy: {test_acc:.2%}")

# Prediksi dan Evaluasi
y_pred_prob = model.predict(X_test_scaled)
y_pred = (y_pred_prob > 0.5).astype(int)  # Konversi probabilitas ke label biner

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)

[1m10594/10594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 16ms/step - accuracy: 0.7910 - loss: 0.4408
Test Accuracy: 79.21%
[1m10594/10594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 7ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.79      0.84    233262
           1       0.63      0.80      0.71    105727

    accuracy                           0.79    338989
   macro avg       0.76      0.79      0.77    338989
weighted avg       0.81      0.79      0.80    338989



## Stacking with DL

In [43]:
def build_mlp():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Bungkus model dalam KerasClassifier agar kompatibel dengan Stacking
mlp_clf = KerasClassifier(build_fn=build_mlp, epochs=50, batch_size=32, verbose=0)


In [None]:
# Model Base (Level 1)
xgb_clf = XGBClassifier(n_estimators=50, learning_rate=0.1, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=50, random_state=42)

# Model Meta (Level 2)
meta_clf = LogisticRegression()

# Stacking Classifier
stacked_modell = StackingClassifier(
    estimators=[('xgb', xgb_clf), ('rf', rf_clf), ('mlp', mlp_clf)],
    final_estimator=meta_clf
)

# Training Model
stacked_modell.fit(X_train_scaled, y_train_resampled)


  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)


In [48]:
# Prediksi
y_pred = stacked_modell.predict(X_test_scaled)

# Akurasi Model
accuracy = accuracy_score(y_test, y_pred)
print(f"Akurasi Stacked Model: {accuracy:.2%}")


Akurasi Stacked Model: 96.66%


In [51]:
# Model Base (Level 1)
xgb_clf = XGBClassifier(n_estimators=200, learning_rate=0.1, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)

# Model Meta (Level 2)
meta_clf = LogisticRegression()

# Stacking Classifier
stacked2_model = StackingClassifier(
    estimators=[('xgb', xgb_clf), ('rf', rf_clf), ('mlp', mlp_clf)],
    final_estimator=meta_clf
)

# Training Model
stacked2_model.fit(X_train_scaled, y_train)


  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [52]:
# Prediksi
y_pred = stacked2_model.predict(X_test_scaled)

# Akurasi Model
accuracy = accuracy_score(y_test, y_pred)
print(f"Akurasi Stacked Model: {accuracy:.2%}")


Akurasi Stacked Model: 96.68%


In [None]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Model Base (Level 1)
xgb_clf = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

# Model Meta (Level 2)
meta_clf = LogisticRegression()

# Stacking Classifier
stacked_model = StackingClassifier(
    estimators=[('xgb', xgb_clf), ('rf', rf_clf), ('mlp', mlp_clf)],
    final_estimator=meta_clf,
    passthrough=True  # Memastikan level-2 juga menerima input awal
)

# Training Model
stacked_model.fit(x_train_scaled, y_train_resampled)

# Evaluasi Model
accuracy = stacked_model.score(x_test_scaled, y_test)
print(f"Stacking Classifier Accuracy: {accuracy:.4f}")


## Stacking 3

In [35]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [36]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.callbacks import EarlyStopping


In [37]:
# Base models
rf = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=5, min_samples_leaf=2, random_state=42)
et = ExtraTreesClassifier(n_estimators=50, max_depth=None, min_samples_split=5, min_samples_leaf=2, random_state=42)
knn = KNeighborsClassifier(n_neighbors=7)

In [38]:
# Train base models
rf.fit(X_train_scaled, y_train_resampled)
et.fit(X_train_scaled, y_train_resampled)
knn.fit(X_train_scaled, y_train_resampled)

# Predict on training and test set
train_meta = np.column_stack([
    rf.predict_proba(X_train_scaled)[:, 1],
    et.predict_proba(X_train_scaled)[:, 1],
    knn.predict_proba(X_train_scaled)[:, 1]
])

test_meta = np.column_stack([
    rf.predict_proba(x_test)[:, 1],
    et.predict_proba(x_test)[:, 1],
    knn.predict_proba(x_test)[:, 1]
])



In [39]:
# Build Deep Learning Meta Model
def build_mlp():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(3,)),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

mlp = build_mlp()
mlp.fit(train_meta, y_train_resampled, epochs=20, batch_size=16, verbose=0, validation_data=(test_meta, y_test))

<keras.callbacks.History at 0x1700bde5cf0>

In [40]:
# Evaluate the model
y_pred = (mlp.predict(test_meta) > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
print(f'Final Stacking Model Accuracy: {accuracy:.2%}')

Final Stacking Model Accuracy: 40.10%


In [None]:
# Definisikan Model MLP
def build_mlp():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Bungkus MLP dalam KerasClassifier
mlp_clf = KerasClassifier(build_fn=build_mlp, epochs=100, batch_size=32, verbose=0)

# Evaluasi Model

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Simpan hasil akurasi model
model_scores = {
    "Random Forest": accuracy_rf,
    "Extra Trees": accuracy_et,
    "Gradient Boosting": accuracy_gb,
    "AdaBoost": accuracy_ab,
    "XGBoost": accuracy_xgb,
    "LightGBM": accuracy_lgbm,
    "HistGradientBoosting": accuracy_hgb,
    "MLP Classifier": accuracy_mlp,
    "KNN": accuracy_knn,
    "Voting Classifier": accuracy_vot,
    "Stacking Classifier": accuracy_stacking
}

# Konversi ke DataFrame
df_scores = pd.DataFrame(list(model_scores.items()), columns=["Model", "Accuracy"])

# Urutkan berdasarkan akurasi
df_scores = df_scores.sort_values(by="Accuracy", ascending=False)

# Visualisasi dalam Bar Plot
plt.figure(figsize=(12, 6))
sns.barplot(x="Accuracy", y="Model", data=df_scores, palette="viridis")

# Tambahkan label nilai akurasi di ujung bar
for index, value in enumerate(df_scores["Accuracy"]):
    plt.text(value + 0.01, index, f"{value:.2%}", va="center", fontsize=12)

plt.xlabel("Accuracy (%)")
plt.ylabel("Model")
plt.title("Model Performance Comparison")
plt.xlim(0, 1)  # Skala dari 0 hingga 100%
plt.grid(axis="x", linestyle="--", alpha=0.6)
plt.show()


# Testing Model

In [35]:
features

Unnamed: 0,gender,age_years,bmi,tekanan_denyut_nadi,tekanan_arteri_ratarata,sys_dsys_ratio,cholesterol,gluc,smoke,alco,active
0,1,50,21.970000,30,90.00,1.38,1,1,0,0,1
1,0,55,34.930000,50,106.67,1.56,3,1,0,0,1
2,0,51,23.510000,60,90.00,1.86,3,1,0,0,0
3,1,48,28.710000,50,116.67,1.50,1,1,0,0,1
4,0,47,23.010000,40,73.33,1.67,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1694938,1,59,29.836861,40,93.33,1.50,2,1,1,1,0
1694939,1,59,29.836745,40,93.33,1.50,2,1,1,1,0
1694940,1,59,29.837008,40,93.33,1.50,2,1,1,1,0
1694941,1,59,29.836915,40,93.33,1.50,2,1,1,1,0


In [36]:
new_data.head()

Unnamed: 0,gender,age_years,height,weight,ap_hi,ap_lo,cardio,bmi,tekanan_denyut_nadi,tekanan_arteri_ratarata,sys_dsys_ratio,cholesterol,gluc,smoke,alco,active
0,1,50,168,62,110,80,0,21.97,30,90.0,1.38,1,1,0,0,1
1,0,55,156,85,140,90,1,34.93,50,106.67,1.56,3,1,0,0,1
2,0,51,165,64,130,70,1,23.51,60,90.0,1.86,3,1,0,0,0
3,1,48,169,82,150,100,1,28.71,50,116.67,1.5,1,1,0,0,1
4,0,47,156,56,100,60,0,23.01,40,73.33,1.67,1,1,0,0,0


In [None]:
import joblib
import pandas as pd
import numpy as np
from scipy.stats import boxcox

# Load scaler & model
scaler = joblib.load("scaler.pkl")
rf_clf = joblib.load("stacking_model.pkl")
expected_columns = x_train.columns.tolist()
print("Kolom yang diharapkan:", expected_columns)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
# Fungsi untuk invers transformasi Box-Cox
def inverse_boxcox(age_box, lmbda=0.5):
    return (age_box * lmbda + 1) ** (1 / lmbda)

# Input user
gender = int(input("Jenis kelamin (0=Wanita, 1=Pria): "))
age_years = float(input("Usia: "))
bmi = float(input("Body Mass Index: "))
smoke = int(input("Merokok? (0=Tidak, 1=Ya): "))
alco = int(input("Konsumsi alkohol? (0=Tidak, 1=Ya): "))
active = int(input("Aktif secara fisik? (0=Tidak, 1=Ya): "))
tekanan_denyut_nadi = float(input("Tekanan denyut nadi: "))
tekanan_arteri_ratarata = float(input("Tekanan arteri rata-rata: "))
sys_dsys_ratio = float(input("Rasio sistolik/diastolik: "))

cholesterol = int(input("Kolesterol (1=normal, 2=above normal, 3=well above normal): "))
gluc = int(input("Glukosa (1=normal, 2=above normal, 3=well above normal): "))

# Transformasi Box-Cox untuk usia (jika > 0)
age_box = boxcox(age_years, lmbda=0.5) if age_years > 0 else age_years

# Buat DataFrame sesuai dengan fitur yang digunakan saat training
new_data = pd.DataFrame([[
    gender, age_box, bmi, tekanan_denyut_nadi, tekanan_arteri_ratarata, sys_dsys_ratio,
    smoke, alco, active, cholesterol, gluc
]], columns=[
    'gender', 'age_box', 'bmi', 'tekanan_denyut_nadi', 'tekanan_arteri_ratarata', 'sys_dsys_ratio',
    'smoke', 'alco', 'active', 'cholesterol', 'gluc'
])

# Sesuaikan fitur dengan model
new_data = new_data.reindex(columns=expected_columns, fill_value=0).astype(float)

# Transformasi dengan Scaler
try:
    new_data_scaled = scaler.transform(new_data)
    rf_pred = rf_clf.predict(new_data_scaled)
    print("\nHasil Prediksi:", "Berisiko" if rf_pred[0] == 1 else "Tidak Berisiko")
    
    # Mengembalikan usia ke nilai aslinya
    original_age = inverse_boxcox(age_box)
    print(f"Usia setelah inverse Box-Cox: {original_age:.2f} tahun")
except Exception as e:
    print("❌ ERROR:", e)



✅ Hasil Prediksi: 🚨 Berisiko
🔄 Usia setelah inverse Box-Cox: 55.00 tahun


In [40]:
print("Fitur yang diharapkan oleh scaler:", scaler.feature_names_in_)
print("Fitur yang ada di new_data:", new_data.columns.tolist())


Fitur yang diharapkan oleh scaler: ['gender' 'age_years' 'bmi' 'tekanan_denyut_nadi'
 'tekanan_arteri_ratarata' 'sys_dsys_ratio' 'cholesterol' 'gluc' 'smoke'
 'alco' 'active']
Fitur yang ada di new_data: ['gender', 'age_years', 'bmi', 'tekanan_denyut_nadi', 'tekanan_arteri_ratarata', 'sys_dsys_ratio', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']


In [41]:
rf_pred_proba = rf_clf.predict_proba(new_data_scaled)
print("Probabilitas:", rf_pred_proba)


Probabilitas: [[0.0682593 0.9317407]]


# Save Model

In [37]:
import joblib

In [None]:
import joblib
import tensorflow as tf

# Simpan Model Base
joblib.dump(xgb_clf, "xgb_model.pkl")
joblib.dump(rf_clf, "rf_model.pkl")
joblib.dump(meta_clf, "meta_model.pkl")

# Simpan Model MLP
mlp_clf.model_.save("mlp_model.h5")



In [None]:
# Simpan Scaler
import joblib
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [50]:
joblib.dump(model_rf, "random_forest_model.pkl")

['random_forest_model.pkl']

In [51]:
# Menyimpan model yang sudah dilatih
joblib.dump(model_et, "extra_trees_model.pkl")

['extra_trees_model.pkl']

In [52]:
joblib.dump(model_gb, "gradient_boosting_model.pkl")

['gradient_boosting_model.pkl']

In [53]:
joblib.dump(model_ab, "adaboost_model.pkl")

['adaboost_model.pkl']

In [54]:
joblib.dump(model_xgb, "xgboost_model.pkl")

['xgboost_model.pkl']

In [55]:
joblib.dump(model_lgbm, "lgbm_model.pkl")

['lgbm_model.pkl']

In [56]:
joblib.dump(model_hgb, "hgb_model.pkl")

['hgb_model.pkl']

In [60]:
joblib.dump(model_mlp, "mlp_model.pkl")

['mlp_model.pkl']

In [57]:
joblib.dump(model_vot, "voting_model.pkl")

['voting_model.pkl']

In [50]:
# Tambahkan fungsi ke namespace global
globals()['build_mlp'] = build_mlp

In [38]:
joblib.dump(stacking_model, "stacking_model.pkl")

['stacking_model.pkl']

In [39]:
joblib.dump(stack_model, "stack_model.pkl")

['stack_model.pkl']

In [55]:
joblib.dump(stacked2_model, "stacking3_model.pkl")

['stacking3_model.pkl']

In [50]:
joblib.dump(model, "deepLearn_model.pkl")

['deepLearn_model.pkl']