In [73]:
import pandas as pd

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [74]:
# Load Data

dbt = pd.read_csv('data/diabetes.csv')

dbt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [75]:
# Cek nama kolom
dbt.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [76]:
# Cek kolom null
dbt.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [77]:
# Pada kasus ini, agak tidak masuk akal jika beberapa parameter bernilai 0
# sebagai contoh adalah nilai 'Glucose', 'BloodPlessure' ataupun 'Insulin'.
# Sekecil apapun nilainya, setiap manusia yang hidup pasti miliki nilai-nilai tersebut

# Kita akan manipulasi nilai yang 0 dengan melakukan 'imputasi' atau mengganti nilainya dengan nilai sintetis
# Pada kasus ini, kita akan menggunakan nilai mean

# Cek kolom dengan nilai 0
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(dbt.loc[dbt[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


In [78]:
# Impute nilai 0 dengan mean
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

dbt[feature_columns] = fill_values.fit_transform(dbt[feature_columns])

## Split Data training dan testing

In [79]:
from sklearn.model_selection import train_test_split
X = dbt[feature_columns]
y = dbt.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=42)

#### Standarisasi Fitur

In [80]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Standarisasi pada fitur di X_train dan X_test
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

## Logistik Regression

In [81]:
from sklearn.linear_model import LogisticRegression # import model LogisticRegression

# Buat obyek LogisticRegression
# Model LogReg tanpa tunning hyperparameter
LogReg = LogisticRegression()

# Fit dengan data yang telah di standarisasi
LogReg.fit(X_train_std, y_train)

# Prediksi dengan data test
y_pred_lr = LogReg.predict(X_test_std)

# Evaluasi akurasi, confusion matrix  & classification_report testing data
acc_lr = accuracy_score(y_test, y_pred_lr)
conf_lr = confusion_matrix(y_test, y_pred_lr)
cr_lr = classification_report(y_test, y_pred_lr)

# Print hasil evaluasi
print("Test set accuracy: {:.2f}".format(acc_lr))
print(f"Test set accuracy: {acc_lr}")
print(f"Confusion matrix: \n {conf_lr}")
print(f"Classification report: \n {cr_lr}")

Test set accuracy: 0.74
Test set accuracy: 0.7359307359307359
Confusion matrix: 
 [[123  28]
 [ 33  47]]
Classification report: 
               precision    recall  f1-score   support

           0       0.79      0.81      0.80       151
           1       0.63      0.59      0.61        80

    accuracy                           0.74       231
   macro avg       0.71      0.70      0.70       231
weighted avg       0.73      0.74      0.73       231



## SVM Kernel Polynomial

In [82]:
from sklearn.svm import SVC

# Model SVM Polynomial tanpa tunnning hyperparameter
svm_poly = SVC(kernel='poly')

# Fit ke model
svm_poly.fit(X_train_std, y_train)

# Prediksi
y_pred_svm_poly = svm_poly.predict(X_test_std)

# Evaluasi akurasi testing data
acc_svm_poly = accuracy_score(y_test, y_pred_svm_poly)
conf_svm_poly = confusion_matrix(y_test, y_pred_svm_poly)
cr_svm_poly = classification_report(y_test, y_pred_svm_poly)

# Print hasil evaluasi
print("Test set accuracy: {:.2f}".format(acc_svm_poly))
print(f"Test set accuracy: {acc_svm_poly}")
print(f"Confusion matrix: \n {conf_svm_poly}")
print(f"Classification report: \n {cr_svm_poly}")

Test set accuracy: 0.70
Test set accuracy: 0.696969696969697
Confusion matrix: 
 [[133  18]
 [ 52  28]]
Classification report: 
               precision    recall  f1-score   support

           0       0.72      0.88      0.79       151
           1       0.61      0.35      0.44        80

    accuracy                           0.70       231
   macro avg       0.66      0.62      0.62       231
weighted avg       0.68      0.70      0.67       231



## Decission Tree

In [83]:
# Secara default, DecisionTreeClassifier dari scikit-learn akan menggunakan nilai "Gini" untuk kriteria
# Terdapat beberapa "hyperparamater" yang dapat digunakan. Silahka baca dokumentasi
# Pada kasus ini kita akan menggunakan parameter default
from sklearn.tree import DecisionTreeClassifier # import model decision Tree
dt = DecisionTreeClassifier()

# Sesuaikan dt ke set training
dt.fit(X_train_std, y_train)

# Memprediksi label set test
y_pred_dt = dt.predict(X_test_std)

conf_dt = confusion_matrix(y_test, y_pred_dt)
cr_dt= classification_report(y_test, y_pred_dt )

#  menghitung set accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")
print(f"Confusion matrix: \n {conf_dt}")
print(f"classification_report: \n {cr_dt}")


Test set accuracy: 0.71
Test set accuracy: 0.7142857142857143
Confusion matrix: 
 [[116  35]
 [ 31  49]]
classification_report: 
               precision    recall  f1-score   support

           0       0.79      0.77      0.78       151
           1       0.58      0.61      0.60        80

    accuracy                           0.71       231
   macro avg       0.69      0.69      0.69       231
weighted avg       0.72      0.71      0.72       231



## Training dengan Voting

In [84]:
from sklearn.ensemble import VotingClassifier # import model Voting
# Definisikan algoritma yang akan digunakan untuk voting
clf1 = LogisticRegression()
clf2 = SVC(kernel='poly')
clf3 = DecisionTreeClassifier()

# model hard voting
voting = VotingClassifier(estimators=[('LogistikRegression', clf1), ('SVM-POLY', clf2), ('DecisionTreeClassifier', clf3)], voting='hard')

# Fit model
voting.fit(X_train_std, y_train)

# Prediksi
y_pred_vt1 = voting.predict(X_test_std)

# Evaluasi akurasi testing data
acc_vt1 = accuracy_score(y_test, y_pred_vt1)
conf_vt1 = confusion_matrix(y_test, y_pred_vt1)
cr_vt1 = classification_report(y_test, y_pred_vt1)

# Print hasil evaluasi
print('Voting Hard')
print("Test set accuracy: {:.2f}".format(acc_vt1))
print(f"Test set accuracy: {acc_vt1}")
print(f"Confusion matrix: \n {conf_vt1}")
print(f"classification_report: \n {cr_vt1}")

Voting Hard
Test set accuracy: 0.74
Test set accuracy: 0.7359307359307359
Confusion matrix: 
 [[127  24]
 [ 37  43]]
classification_report: 
               precision    recall  f1-score   support

           0       0.77      0.84      0.81       151
           1       0.64      0.54      0.59        80

    accuracy                           0.74       231
   macro avg       0.71      0.69      0.70       231
weighted avg       0.73      0.74      0.73       231

