In [3]:
import numpy as np
import pandas as pd

# Data: read both CSV files, then wrap each result in a DataFrame
data_1 = pd.read_csv('tk4_data1.csv')
data_2 = pd.read_csv('tk4_data2.csv')

df_1, df_2 = pd.DataFrame(data_1), pd.DataFrame(data_2)

# Bin edges per column; the infinite outer edges leave the first and last
# bins open-ended
bins_ukuran = [-np.inf, 670, 830, np.inf]
bins_lantai = [-np.inf, 7, 11, np.inf]
bins_tarif_internet = [-np.inf, 35, 65, np.inf]
bins_harga_sewa = [-np.inf, 440, 560, np.inf]

# Category labels per column, ordered from the lowest bin to the highest
labels_ukuran = ["Kecil", "Sedang", "Besar"]
labels_lantai = ["Rendah", "Menengah", "Tinggi"]
labels_tarif_internet = ["Lambat", "Sedang", "Cepat"]
labels_harga_sewa = ["Murah", "Sedang", "Mahal"]

# Replace every numeric column by its category, applying the same edges and
# labels to both frames. pd.cut is used with its default right-closed bins,
# so a value equal to an inner edge lands in the lower category.
for frame in (df_1, df_2):
    for column, edges, names in (
        ("Ukuran", bins_ukuran, labels_ukuran),
        ("Lantai", bins_lantai, labels_lantai),
        ("Tarif Internet", bins_tarif_internet, labels_tarif_internet),
        ("Harga Sewa", bins_harga_sewa, labels_harga_sewa),
    ):
        frame[column] = pd.cut(frame[column], bins=edges, labels=names)

def calculate_entropy(df, target_column):
    """Return the base-2 Shannon entropy of the values in ``df[target_column]``.

    Args:
        df: Table that contains ``target_column``.
        target_column: Name of the column whose value distribution is measured.

    Returns:
        The entropy in bits; it is 0 when the column holds a single distinct
        value.
    """
    _, counts = np.unique(df[target_column], return_counts=True)
    # Relative frequency of every distinct value
    probabilities = counts / np.sum(counts)
    return np.sum(-probabilities * np.log2(probabilities))

def calculate_entropy_attribute(df, attribute, target_column='Kategori'):
    """Return the weighted entropy of ``target_column`` after partitioning by ``attribute``.

    ``df`` is partitioned by the distinct values of ``attribute``; the entropy
    of ``target_column`` inside each partition is weighted by that value's
    share of the rows and the weighted terms are summed.

    Args:
        df: Table that contains both ``attribute`` and ``target_column``.
        attribute: Name of the column to partition on.
        target_column: Name of the class column (defaults to ``'Kategori'``).

    Returns:
        The weighted entropy in bits. Subtracting it from the entropy of
        ``target_column`` over all of ``df`` gives the information gain.
    """
    values, counts = np.unique(df[attribute], return_counts=True)
    total = np.sum(counts)
    # Pick each partition with a boolean mask so that only the attribute value
    # decides membership; a missing value in some unrelated column must not
    # remove a row from its partition.
    return np.sum([
        (count / total) * calculate_entropy(df[df[attribute] == value], target_column)
        for value, count in zip(values, counts)
    ])

# Entropy of the target column over all of df_1, before any split
entropy_target = calculate_entropy(df_1, 'Kategori')

# Information gain per attribute: the target entropy minus the weighted
# entropy that remains once df_1 is partitioned by that attribute
information_gain = {
    attribute: entropy_target - calculate_entropy_attribute(df_1, attribute)
    for attribute in ("Ukuran", "Lantai", "Tarif Internet", "Tipe Bangunan", "Harga Sewa")
}

def split_data(df, attribute, value):
    """Partition ``df`` by whether ``attribute`` equals ``value``.

    Args:
        df: Table to partition.
        attribute: Name of the column to test.
        value: Value the column is compared against.

    Returns:
        A ``(matching, remaining)`` tuple: the rows where the column equals
        ``value`` and the rows where it does not.
    """
    column = df[attribute]
    matching = df[column == value]
    remaining = df[column != value]
    return matching, remaining

class DecisionNode:
    """Node of a binary decision tree that tests ``attribute == value``.

    A leaf node carries the predicted ``label``. An internal node carries the
    test (``attribute``, ``value``) and the two subtrees followed when the test
    is true or false. When ``data`` is passed, the node completes itself: a
    leaf derives its label from ``data`` and an internal node splits ``data``
    and builds both branches.

    Args:
        attribute: Column tested by an internal node.
        value: Value the column is compared against.
        true_branch: Subtree for rows that pass the test (overwritten when an
            internal node is given ``data``).
        false_branch: Subtree for rows that fail the test (overwritten when an
            internal node is given ``data``).
        is_leaf: Whether this node is a leaf.
        current_depth: Depth of this node in the tree.
        max_depth: Depth at which children are forced to be leaves.
        data: Optional rows that reach this node.
        target_column: Name of the class column used for leaf labels.
    """

    def __init__(self, attribute=None, value=None, true_branch=None, false_branch=None, is_leaf=False, current_depth=0, max_depth=3, data=None, target_column='Kategori'):
        self.attribute = attribute
        self.value = value
        self.true_branch = true_branch
        self.false_branch = false_branch
        self.is_leaf = is_leaf
        self.current_depth = current_depth
        self.max_depth = max_depth
        # Defined on every node so that reading ``label`` never raises
        # AttributeError; only leaves built from data get a real value.
        self.label = None

        if data is None:
            return

        if self.is_leaf:
            self.label = self._majority_label(data, target_column)
            return

        data_true, data_false = split_data(data, self.attribute, self.value)
        self.true_branch = self._grow_branch(data_true, data, target_column)
        self.false_branch = self._grow_branch(data_false, data, target_column)

    @staticmethod
    def _majority_label(data, target_column):
        """Return the most frequent target value in ``data``, or None if there is none."""
        modes = data[target_column].mode()
        # ``mode()`` is empty when ``data`` has no (non-missing) target values
        return modes[0] if len(modes) else None

    def _grow_branch(self, subset, parent_data, target_column):
        """Build the child node for one side of this node's split."""
        if len(subset) == 0:
            # No row reaches this side: predict the majority class of the
            # parent's rows instead of recursing into an empty frame.
            return DecisionNode(is_leaf=True, data=parent_data, target_column=target_column)

        if self.current_depth == self.max_depth:
            return DecisionNode(is_leaf=True, data=subset, target_column=target_column)

        # build_tree takes no target column argument, so the subtree is grown
        # on build_tree's own target column.
        return build_tree(subset, current_depth=self.current_depth + 1, max_depth=self.max_depth)

def build_tree(data, current_depth=0, max_depth=3):
    """Recursively build a decision tree for the ``'Kategori'`` column of ``data``.

    The attribute with the highest information gain is chosen, and the node
    tests whether that attribute equals its most frequent value in ``data``.

    Args:
        data: Rows that reach this point of the tree.
        current_depth: Depth of the node being built.
        max_depth: Depth at which a leaf is returned regardless of purity.

    Returns:
        A ``DecisionNode``: a leaf when ``data`` is pure, the depth limit is
        reached, or no useful split exists; otherwise an internal node whose
        branches are built from the split data.
    """
    entropy_s = calculate_entropy(data, 'Kategori')

    # Stop when all rows share one class or the depth budget is used up
    if entropy_s == 0 or current_depth == max_depth:
        return DecisionNode(is_leaf=True, data=data)

    gain = {col: entropy_s - calculate_entropy_attribute(data, col) for col in ["Ukuran", "Lantai", "Tarif Internet", "Tipe Bangunan", "Harga Sewa"]}
    best_attribute = max(gain, key=gain.get)

    # The split tests equality against the most frequent value of the attribute
    best_value = data[best_attribute].mode()[0]

    # If every row already has that value, the "false" side of the split would
    # be empty and there would be nothing to derive a label from; predict the
    # majority class of the current rows instead.
    if (data[best_attribute] == best_value).all():
        return DecisionNode(is_leaf=True, data=data)

    return DecisionNode(attribute=best_attribute, value=best_value, current_depth=current_depth, max_depth=max_depth, data=data)

def print_tree(node, spacing=""):
    """Print the tree rooted at ``node`` as indented text.

    Args:
        node: Root of the (sub)tree to print; anything that is not a
            ``DecisionNode`` is ignored.
        spacing: Indentation prefix for this level; each level below adds two
            spaces.
    """
    if not isinstance(node, DecisionNode):
        return

    if node.is_leaf:
        print(f"{spacing}-> Prediksi: {node.label}")
        return

    print(f'{spacing}{node.attribute} == {node.value}?')

    deeper = spacing + "  "
    branches = (
        (f'{spacing}--> True:', node.true_branch),
        (f'{spacing}--> False:', node.false_branch),
    )
    for heading, branch in branches:
        print(heading)
        print_tree(branch, deeper)

def predict(node, data):
    """Return the label predicted for one sample by the tree rooted at ``node``.

    Args:
        node: Root of the tree (or any subtree) to evaluate.
        data: One sample, indexable by attribute name.

    Returns:
        The ``label`` of the leaf reached by following the true branch whenever
        the sample's attribute equals the node's value and the false branch
        otherwise.
    """
    current = node
    while not current.is_leaf:
        if data[current.attribute] == current.value:
            current = current.true_branch
        else:
            current = current.false_branch
    return current.label

# Build the tree from the binned df_1
tree = build_tree(df_1)

# Print the resulting tree structure
print_tree(tree)

# Predict a label for every row of df_2 by walking the tree row by row
predictions = df_2.apply(lambda sample: predict(tree, sample), axis=1)

predictions

Harga Sewa == Murah?
--> True:
  Tipe Bangunan == C?
  --> True:
    Ukuran == Kecil?
    --> True:
      -> Prediksi: Biasa
    --> False:
      -> Prediksi: VIP
  --> False:
    -> Prediksi: Biasa
--> False:
  Harga Sewa == Mahal?
  --> True:
    -> Prediksi: Eksklusif
  --> False:
    -> Prediksi: VIP


0    Biasa
1    Biasa
dtype: object