# *Tentukan Library yang ditentukan*

# fungsi library
- NumPy adalah library Python yang digunakan untuk bekerja dengan array
- Pandas digunakan untuk menganalisis data
- Melalui scikit-learn dapat mengimplementasikan berbagai model pembelajaran mesin untuk regresi, klasifikasi, pengelompokan, dan alat statistik untuk  menganalisisnya
- Pickle untuk menyimpan data codingan dalam format 'sav'

In [75]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import pickle

# METODE CRISP-DM

# '''1.Business Understanding'''
Tahap ini bertujuan untuk memahami masalah bisnis, yaitu bagaimana meningkatkan akurasi prediksi tentang suatu penyakit yaitu Hepatitis C agar dapat memberikan informasi yang lebih akurat bagi pasien dan membantu Dokter dalam pengambilan keputusan ketika mendiagnosa pasien tertentu


# '''2. Data Understanding'''
Konteks
Kumpulan data berisi nilai laboratorium donor darah dan pasien Hepatitis C dan nilai demografis seperti usia.

Isi
Semua atribut kecuali Kategori dan Jenis Kelamin adalah numerik.
Atribut 1 sampai 4 mengacu pada data pasien:
1) X (KTP/No. Pasien)
2) Kategori (diagnosis) (nilai: '0=Donor Darah', '0s=Suspek Donor Darah', '1=Hepatitis', '2=Fibrosis', '3=Sirosis')
3) Umur (dalam tahun)
4) Jenis Kelamin (f,m)
Atribut 5 sampai 14 mengacu pada data laboratorium:
5) ALB (Albumin Blood Test)
6) ALP (Alkaline phosphatase)
7) ALT (Alanine Transaminase)
8) AST (Aspartate Transaminase)
9) BIL (Bilirubin)
10) CHE (Acetylcholinesterase)
11) CHOL (Cholesterol)
12) CREA (Creatinine)
13) GGT (Gamma-Glutamyl Transferase)
14) PROT (Proteins)

Atribut target untuk klasifikasi adalah Kategori (2)

# *Load Dataset*

In [5]:
HepatitisC_Dataset = pd.read_csv('HepatitisCdata.csv') 

In [6]:
HepatitisC_Dataset.head(10)

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,2,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,3,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,4,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,5,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7
5,6,0=Blood Donor,32,m,41.6,43.3,18.5,19.7,12.3,9.92,6.05,111.0,91.0,74.0
6,7,0=Blood Donor,32,m,46.3,41.3,17.5,17.8,8.5,7.01,4.79,70.0,16.9,74.5
7,8,0=Blood Donor,32,m,42.2,41.9,35.8,31.1,16.1,5.82,4.6,109.0,21.5,67.1
8,9,0=Blood Donor,32,m,50.9,65.5,23.2,21.2,6.9,8.69,4.1,83.0,13.7,71.3
9,10,0=Blood Donor,32,m,42.4,86.3,20.3,20.0,35.2,5.46,4.45,81.0,15.9,69.9


# '''3.Data Preparation'''
# a. Melakukan pengecekan kembali pada kebenaran data

In [7]:
# Mengubah Nama-nama Kolom Yang di perlukan
HepatitisC_Dataset.columns = ['Patient_ID', 'Category', 'Age', 'Gender', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']

In [8]:
HepatitisC_Dataset.head(10)

Unnamed: 0,Patient_ID,Category,Age,Gender,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,2,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,3,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,4,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,5,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7
5,6,0=Blood Donor,32,m,41.6,43.3,18.5,19.7,12.3,9.92,6.05,111.0,91.0,74.0
6,7,0=Blood Donor,32,m,46.3,41.3,17.5,17.8,8.5,7.01,4.79,70.0,16.9,74.5
7,8,0=Blood Donor,32,m,42.2,41.9,35.8,31.1,16.1,5.82,4.6,109.0,21.5,67.1
8,9,0=Blood Donor,32,m,50.9,65.5,23.2,21.2,6.9,8.69,4.1,83.0,13.7,71.3
9,10,0=Blood Donor,32,m,42.4,86.3,20.3,20.0,35.2,5.46,4.45,81.0,15.9,69.9


In [80]:
# Menghapus Columns Patient ID
HepatitisC_Dataset = HepatitisC_Dataset.drop('Patient_ID', axis=1)

In [81]:
HepatitisC_Dataset.head(10)

Unnamed: 0,Category,Age,Gender,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,0,32,0,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,0,32,0,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,0,32,0,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,0,32,0,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,0,32,0,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7
5,0,32,0,41.6,43.3,18.5,19.7,12.3,9.92,6.05,111.0,91.0,74.0
6,0,32,0,46.3,41.3,17.5,17.8,8.5,7.01,4.79,70.0,16.9,74.5
7,0,32,0,42.2,41.9,35.8,31.1,16.1,5.82,4.6,109.0,21.5,67.1
8,0,32,0,50.9,65.5,23.2,21.2,6.9,8.69,4.1,83.0,13.7,71.3
9,0,32,0,42.4,86.3,20.3,20.0,35.2,5.46,4.45,81.0,15.9,69.9


In [82]:
# Mengganti Category ['0=Blood Donor', '0s=suspect Blood Donor'] menjadi '0'
# dan mengganti Category ['1=Hepatitis', '2=Fibrosis', '3=Cirrhosis'] menjadi '1'


HepatitisC_Dataset['Category'] = HepatitisC_Dataset['Category'].replace({'0=Blood Donor': 0, '0s=suspect Blood Donor': 0, '1=Hepatitis': 1, '2=Fibrosis': 1, '3=Cirrhosis': 1})

In [83]:
# Mengganti Gender ['m'] menjadi 0
# dan mengganti Gender ['f']  menjadi 1

HepatitisC_Dataset['Gender'] = HepatitisC_Dataset['Gender'].replace({'m': 0, 'f': 1})

In [84]:
HepatitisC_Dataset

Unnamed: 0,Category,Age,Gender,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,0,32,0,38.5,52.50000,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,0,32,0,38.5,70.30000,18.0,24.7,3.9,11.17,4.80,74.0,15.6,76.5
2,0,32,0,46.9,74.70000,36.2,52.6,6.1,8.84,5.20,86.0,33.2,79.3
3,0,32,0,43.2,52.00000,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,0,32,0,39.2,74.10000,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
610,1,62,1,32.0,416.60000,5.9,110.3,50.0,5.57,6.30,55.7,650.9,68.5
611,1,64,1,24.0,102.80000,2.9,44.4,20.0,1.54,3.02,63.0,35.9,71.3
612,1,64,1,29.0,87.30000,3.5,99.0,48.0,1.66,3.63,66.7,64.2,82.0
613,1,46,1,33.0,68.28392,39.0,62.0,20.0,3.56,4.20,52.0,50.0,71.0


# b.Memberlakukan data missing dan data inkonsistensi

In [85]:
# Handling Duplicate Values 
# Mencari data yang sama atau duplikat
# Menghapus data yang duplikat jika ditemukan
HepatitisC_Dataset = HepatitisC_Dataset.drop_duplicates()

In [86]:
# Handling Missing Values
# - Mencari terlebih dahulu kolom yang memiliki data null

print(HepatitisC_Dataset.isnull().sum()) #Menampilkan jumlah data null pada suatu kolom



Category    0
Age         0
Gender      0
ALB         0
ALP         0
ALT         0
AST         0
BIL         0
CHE         0
CHOL        0
CREA        0
GGT         0
PROT        0
dtype: int64


In [87]:
# - Mereplace kolom yang memiliki data kosong dengan rata-rata dari kolom tersebut (mean imputation)
HepatitisC_Dataset['ALB'].fillna(HepatitisC_Dataset['ALB'].mean(), inplace=True) #Mengisi data yang kosong di kolom tsb dengan rata-rata dari kolom tsb
HepatitisC_Dataset['ALP'].fillna(HepatitisC_Dataset['ALP'].mean(), inplace=True) #Mengisi data yang kosong di kolom tsb dengan rata-rata dari kolom tsb
HepatitisC_Dataset['ALT'].fillna(HepatitisC_Dataset['ALT'].mean(), inplace=True) #Mengisi data yang kosong di kolom tsb dengan rata-rata dari kolom tsb
HepatitisC_Dataset['CHOL'].fillna(HepatitisC_Dataset['CHOL'].mean(), inplace=True) #Mengisi data yang kosong di kolom tsb dengan rata-rata dari kolom tsb
HepatitisC_Dataset['PROT'].fillna(HepatitisC_Dataset['PROT'].mean(), inplace=True) #Mengisi data yang kosong di kolom tsb dengan rata-rata dari kolom tsb


print(HepatitisC_Dataset.isnull().sum()) #Menampilkan jumlah data null pada suatu kolom

Category    0
Age         0
Gender      0
ALB         0
ALP         0
ALT         0
AST         0
BIL         0
CHE         0
CHOL        0
CREA        0
GGT         0
PROT        0
dtype: int64


In [88]:
# Jumlah Dataset yang ada 
HepatitisC_Dataset.shape

(615, 13)

In [89]:
HepatitisC_Dataset['Category'].value_counts()

0    540
1     75
Name: Category, dtype: int64

# '''4.Modeling'''

In [90]:
# Memisahkan data dan Label
X = HepatitisC_Dataset.drop (columns='Category', axis=1)
Y = HepatitisC_Dataset['Category']

In [91]:
print(X)

     Age  Gender   ALB        ALP    ALT    AST   BIL    CHE  CHOL   CREA  \
0     32       0  38.5   52.50000    7.7   22.1   7.5   6.93  3.23  106.0   
1     32       0  38.5   70.30000   18.0   24.7   3.9  11.17  4.80   74.0   
2     32       0  46.9   74.70000   36.2   52.6   6.1   8.84  5.20   86.0   
3     32       0  43.2   52.00000   30.6   22.6  18.9   7.33  4.74   80.0   
4     32       0  39.2   74.10000   32.6   24.8   9.6   9.15  4.32   76.0   
..   ...     ...   ...        ...    ...    ...   ...    ...   ...    ...   
610   62       1  32.0  416.60000    5.9  110.3  50.0   5.57  6.30   55.7   
611   64       1  24.0  102.80000    2.9   44.4  20.0   1.54  3.02   63.0   
612   64       1  29.0   87.30000    3.5   99.0  48.0   1.66  3.63   66.7   
613   46       1  33.0   68.28392   39.0   62.0  20.0   3.56  4.20   52.0   
614   59       1  36.0   68.28392  100.0   80.0  12.0   9.07  5.30   67.0   

       GGT  PROT  
0     12.1  69.0  
1     15.6  76.5  
2     33.2  79.3  

In [92]:
print(Y)

0      0
1      0
2      0
3      0
4      0
      ..
610    1
611    1
612    1
613    1
614    1
Name: Category, Length: 615, dtype: int64


# *Standarisasi Data*

In [93]:
scaler = StandardScaler()

In [94]:
scaler.fit(X)

In [95]:
standarized_data = scaler.transform(X)

In [96]:
print(standarized_data)

[[-1.53361617 -0.79454373 -0.54064724 ...  0.49707027 -0.50228619
  -0.56437244]
 [-1.53361617 -0.79454373 -0.54064724 ... -0.14658961 -0.43820313
   0.82610164]
 [-1.53361617 -0.79454373  0.91485031 ...  0.09478284 -0.11595686
   1.34521196]
 ...
 [ 1.65143725  1.25858397 -2.18674566 ... -0.29342452  0.45163601
   1.84578263]
 [-0.1401553   1.25858397 -1.49365159 ... -0.58910578  0.19164186
  -0.19357936]
 [ 1.15377266  1.25858397 -0.97383104 ... -0.28739021 -0.1013093
  -0.74976899]]


In [97]:
X = standarized_data
Y = HepatitisC_Dataset['Category']

In [98]:
print(X)
print(Y)

[[-1.53361617 -0.79454373 -0.54064724 ...  0.49707027 -0.50228619
  -0.56437244]
 [-1.53361617 -0.79454373 -0.54064724 ... -0.14658961 -0.43820313
   0.82610164]
 [-1.53361617 -0.79454373  0.91485031 ...  0.09478284 -0.11595686
   1.34521196]
 ...
 [ 1.65143725  1.25858397 -2.18674566 ... -0.29342452  0.45163601
   1.84578263]
 [-0.1401553   1.25858397 -1.49365159 ... -0.58910578  0.19164186
  -0.19357936]
 [ 1.15377266  1.25858397 -0.97383104 ... -0.28739021 -0.1013093
  -0.74976899]]
0      0
1      0
2      0
3      0
4      0
      ..
610    1
611    1
612    1
613    1
614    1
Name: Category, Length: 615, dtype: int64


# *Memisahkan Data Training dan Data Testing*

In [99]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size= 0.2, stratify=Y, random_state=2)

In [100]:
print(X.shape, X_train.shape, X_test.shape)

(615, 12) (492, 12) (123, 12)


# *Membuat Data Latih Menggunakan Algoritma SVM*

In [101]:
classifier = svm.SVC(kernel='linear')

In [102]:
classifier.fit(X_train, Y_train)

# '''5.Evaluasi'''

# *Membuat Model Evaluasi Untuk Mengukur Tingkat akurasi*

In [103]:
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [104]:
print('Akurasi data training adalah = ',training_data_accuracy)

Akurasi data training adalah =  0.9715447154471545


In [106]:
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [107]:
print('Akurasi data testing adalah = ',test_data_accuracy)

Akurasi data testing adalah =  0.967479674796748


# *Membuat Model Prediksi*

In [109]:
input_data = (32, 0, 38.5, 52.5, 7.7, 22.1, 7.5, 6.93, 3.23, 106, 12.1, 69)

input_data_as_numpy_array = np.array(input_data)

input_data_reshape = input_data_as_numpy_array.reshape(1, -1)

std_data = scaler.transform(input_data_reshape)
print(std_data)

prediction = classifier.predict(std_data)
print(prediction)

if (prediction[0] == 0):
    print('Pasien tidak terdiagnosa Hepatitis C')
else :
    print('Pasien terdiagnosa Hepatitis C')

[[-1.53361617 -0.79454373 -0.54064724 -0.61600357 -0.81605387 -0.38369306
  -0.19823566 -0.57473353 -1.9046761   0.49707027 -0.50228619 -0.56437244]]
[0]
Pasien tidak terdiagnosa Hepatitis C




In [110]:
input_data = (23, 0, 47, 19.1, 38.9, 164.2, 17, 7.09, 3.2, 79.3, 90.4, 70.1)

input_data_as_numpy_array = np.array(input_data)

input_data_reshape = input_data_as_numpy_array.reshape(1, -1)

std_data = scaler.transform(input_data_reshape)
print(std_data)

prediction = classifier.predict(std_data)
print(prediction)

if (prediction[0] == 0):
    print('Pasien tidak terdiagnosa Hepatitis C')
else :
    print('Pasien terdiagnosa Hepatitis C')

[[-2.42941245 -0.79454373  0.93217766 -1.91951498  0.41092837  3.91406163
   0.28504907 -0.50213375 -1.93140091 -0.03998345  0.93134353 -0.36043624]]
[1]
Pasien terdiagnosa Hepatitis C




# '''6.Deployment'''

# *Simpan Model*

In [111]:
filename = 'hepatitisC_model.sav'
pickle.dump(classifier, open(filename, 'wb'))