<a href="https://colab.research.google.com/github/arifroska/PrakAPM/blob/main/Decision_Tree_ID3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np

class GadId3Classifier:
  """ID3 decision-tree classifier driven by information gain.

  Every feature is treated as categorical: a node gets one branch per distinct
  value of the chosen feature. The fitted tree is stored in ``self.tree`` as
  nested dicts of the form ``{feature: {value: subtree_or_class_label}}``; it
  is a bare class label when the training data contains a single class.
  """

  def fit(self, input, output):
    """Build the decision tree from training data and store it in ``self.tree``.

    Args:
      input: pandas DataFrame of features, one column per attribute.
      output: pandas Series of class labels. Its ``name`` is used as the target
        column; a feature column with the same name would be overwritten.
    """
    data = input.copy()
    data[output.name] = output
    self.tree = self.decision_tree(data, data, input.columns, output.name)

  def predict(self, input):
    """Return a list with one predicted class label per row of ``input``.

    Rows that cannot be routed to a leaf (e.g. a feature value never seen
    during training) are predicted as ``1.0``.
    """
    # One {column: value} dict per row.
    samples = input.to_dict(orient='records')
    return [self.make_prediction(sample, self.tree, 1.0) for sample in samples]

  def entropy(self, attribute_column):
    """Return the Shannon entropy (base 2) of the values in ``attribute_column``.

    An empty column yields 0.0.
    """
    _, counts = np.unique(attribute_column, return_counts=True)
    probabilities = counts / np.sum(counts)
    # np.unique only reports values that occur, so every probability is > 0
    # and log2 is always defined.
    return np.sum(-probabilities * np.log2(probabilities))

  def information_gain(self, data, feature_attribute_name, target_attribute_name):
    """Return the information gain obtained by splitting ``data`` on a feature.

    Args:
      data: DataFrame holding both the feature and the target column.
      feature_attribute_name: column to split on.
      target_attribute_name: column holding the class labels.

    Returns:
      Entropy of the target minus the size-weighted entropy of the target
      inside each subset produced by the split.
    """
    total_entropy = self.entropy(data[target_attribute_name])

    feature_column = data[feature_attribute_name]
    values, counts = np.unique(feature_column, return_counts=True)
    weights = counts / np.sum(counts)

    # Boolean-mask selection keeps exactly the rows counted above; it does not
    # discard rows that merely have a missing value in some unrelated column.
    weighted_entropy_list = [
      weight * self.entropy(data.loc[feature_column == value, target_attribute_name])
      for value, weight in zip(values, weights)
    ]

    return total_entropy - np.sum(weighted_entropy_list)

  def decision_tree(self, data, orginal_data, feature_attribute_names, target_attribute_name, parent_node_class=None):
    """Recursively build the ID3 tree for ``data``.

    Args:
      data: DataFrame subset reaching the current node (features + target).
      orginal_data: the full training DataFrame, used as a fallback when
        ``data`` is empty. (The parameter's spelling is kept so existing
        keyword callers keep working.)
      feature_attribute_names: features still available for splitting.
      target_attribute_name: name of the class-label column.
      parent_node_class: majority class of the parent node, returned when no
        features are left to split on.

    Returns:
      A class label for a leaf, otherwise ``{feature: {value: subtree}}``.
      A top-level call with mixed classes and no features returns the
      ``parent_node_class`` it was given (``None`` by default).
    """
    # Empty subset: fall back to the majority class of the full training data.
    # This has to be tested first, because the purity test below indexes into
    # the list of classes, which is empty for an empty subset.
    if len(data) == 0:
      classes, counts = np.unique(orginal_data[target_attribute_name], return_counts=True)
      return classes[np.argmax(counts)]

    unique_classes, class_counts = np.unique(data[target_attribute_name], return_counts=True)

    # Pure subset: the only class present is the answer.
    if len(unique_classes) == 1:
      return unique_classes[0]

    # Nothing left to split on: use the majority class of the parent node.
    if len(feature_attribute_names) == 0:
      return parent_node_class

    # Majority class of this node, handed down to the children.
    parent_node_class = unique_classes[np.argmax(class_counts)]

    # Split on the feature with the highest information gain
    # (the first one wins on ties).
    ig_values = [self.information_gain(data, feature, target_attribute_name) for feature in feature_attribute_names]
    best_feature = feature_attribute_names[np.argmax(ig_values)]

    # A feature is used at most once along any root-to-leaf path.
    remaining_features = [name for name in feature_attribute_names if name != best_feature]

    # One branch per distinct value of the chosen feature.
    branches = {}
    for value in np.unique(data[best_feature]):
      # Plain boolean indexing keeps column dtypes intact, so class labels are
      # not silently converted (e.g. int -> float) further down the tree.
      sub_data = data[data[best_feature] == value]
      branches[value] = self.decision_tree(sub_data, orginal_data, remaining_features, target_attribute_name, parent_node_class)

    return {best_feature: branches}

  def make_prediction(self, sample, tree, default=1):
    """Walk ``tree`` for one sample and return the predicted class label.

    Args:
      sample: dict mapping feature name to the sample's value.
      tree: tree (or subtree) as produced by ``decision_tree``.
      default: value returned when the sample cannot be routed to a leaf,
        i.e. its value for a split feature has no branch, or none of its
        keys is the split feature of the current node.

    Returns:
      The class label at the reached leaf, or ``default``.
    """
    # A tree that is already a leaf (e.g. trained on a single class).
    if not isinstance(tree, dict):
      return tree

    for attribute in sample:
      if attribute not in tree:
        continue

      try:
        result = tree[attribute][sample[attribute]]
      except (KeyError, TypeError):
        # No branch for this value (or the value is unhashable).
        return default

      # Internal node: keep descending with the same fallback value.
      if isinstance(result, dict):
        return self.make_prediction(sample, result, default)
      return result

    # None of the sample's attributes is tested at this node.
    return default

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Processed Cleveland heart-disease data from the UCI archive (file has no header row).
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# Names given to the 14 columns of the file; the last one is the label.
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'disease_present',
]

df = pd.read_csv(data_url, header=None)
df.columns = columns

# Collapse the label values 1-4 into 1 so the target becomes binary (0 / 1).
df['disease_present'] = df['disease_present'].replace([1, 2, 3, 4], 1)

# Missing values are written as "?"; turn them into NaN and drop those rows.
df = df.replace("?", np.nan).dropna()

# Split the frame into features (X) and label (y).
y = df["disease_present"]
X = df.drop(columns="disease_present")

# Hold out 33% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Create and fit the ID3 model.
model = GadId3Classifier()
model.fit(X_train, y_train)

# Accuracy on the held-out rows (shown as the cell's result).
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.5555555555555556

In [4]:
# Show the cleaned DataFrame.
print(df)

      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0    63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1    67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2    67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3    37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4    41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   
..    ...  ...  ...       ...    ...  ...      ...      ...    ...      ...   
297  57.0  0.0  4.0     140.0  241.0  0.0      0.0    123.0    1.0      0.2   
298  45.0  1.0  1.0     110.0  264.0  0.0      0.0    132.0    0.0      1.2   
299  68.0  1.0  4.0     144.0  193.0  1.0      0.0    141.0    0.0      3.4   
300  57.0  1.0  4.0     130.0  131.0  0.0      0.0    115.0    1.0      1.2   
301  57.0  0.0  2.0     130.0  236.0  0.0      2.0    174.0    0.0      0.0   

     slope   ca thal  disease_present  
0      3.0 

In [8]:
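# NOTE(review): the disabled snippet below would not run as written. export_graphviz
# expects a fitted scikit-learn decision-tree estimator, whereas GadId3Classifier keeps
# its tree as a nested dict, and feature_names should be a list of column names rather
# than the model object.
# from sklearn.tree import export_graphviz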
# from six import StringIO  
# from IPython.display import Image  
# import pydotplus

# dot_data = StringIO()
# export_graphviz(model, out_file=dot_data,  
#                 filled=True, rounded=True,
#                 special_characters=True,feature_names = model,class_names=['Tidak Sakit Dada','Sakit Dada'])
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# graph.write_png('Lung Cancer.png')
# Image(graph.create_png())