<a href="https://colab.research.google.com/github/asrianda/jurnal/blob/main/Jurnal_C5_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Klasifikasi C5.0 Split data 80:20 Tanpa Menggunakan AdaBoot**

In [7]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from graphviz import Digraph

# --- Load data ---
data = pd.read_excel("dataset_31_credit-g.xlsx")  # Ganti dengan file Anda
# --- Pilih Kolom yang Relevan (sesuaikan jika berbeda) ---
data = data[['class',  'age', 'job', 'credit_amount', 'duration',
             'checking_status','purpose','savings_status','personal_status']].copy()

# --- Tangani missing value ---
data.fillna(data.median(numeric_only=True), inplace=True)
for col in data.select_dtypes(include='object'):
    data[col].fillna(data[col].mode()[0], inplace=True)

# --- Split data menjadi fitur dan target ---
X = data.drop(columns=['class'])
y = data['class']

# --- Encode fitur kategorikal ---
le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])

# --- Split 80:20 ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_data = pd.concat([X_train, y_train], axis=1)
test_data = X_test.copy()
test_data['class'] = y_test

# --- Entropy Function ---
def entropy(labels):
    counts = labels.value_counts()
    total = len(labels)
    ent = 0
    for count in counts:
        p = count / total
        ent -= p * math.log2(p)
    return ent

# --- Gain Ratio for Numeric ---
def gain_ratio_numeric(data, attr, target, threshold):
    total_entropy = entropy(data[target])
    left = data[data[attr] <= threshold]
    right = data[data[attr] > threshold]

    left_entropy = entropy(left[target]) if len(left) > 0 else 0
    right_entropy = entropy(right[target]) if len(right) > 0 else 0

    weighted_entropy = (len(left) / len(data)) * left_entropy + (len(right) / len(data)) * right_entropy
    info_gain = total_entropy - weighted_entropy

    split_info = 0
    for subset in [left, right]:
        p = len(subset) / len(data)
        if p > 0:
            split_info -= p * math.log2(p)

    if split_info == 0:
        return 0, None

    return info_gain / split_info, threshold

# --- Gain Ratio for Categorical ---
def gain_ratio_categorical(data, attr, target):
    total_entropy = entropy(data[target])
    subsets = data.groupby(attr)

    weighted_entropy = 0
    split_info = 0
    for key, subset in subsets:
        p = len(subset) / len(data)
        weighted_entropy += p * entropy(subset[target])
        if p > 0:
            split_info -= p * math.log2(p)

    info_gain = total_entropy - weighted_entropy
    if split_info == 0:
        return 0
    return info_gain / split_info

# --- Get thresholds for numeric attributes ---
def get_thresholds(column):
    unique_vals = sorted(column.unique())
    thresholds = []
    for i in range(len(unique_vals) - 1):
        thresholds.append((unique_vals[i] + unique_vals[i + 1]) / 2)
    return thresholds

# --- Find best attribute and threshold ---
def best_split(data, target, attributes):
    best_gain_ratio = -1
    best_attr = None
    best_threshold = None
    is_numeric = False

    for attr in attributes:
        if data[attr].dtype in ['int64', 'float64']:
            thresholds = get_thresholds(data[attr])
            for t in thresholds:
                gr, thr = gain_ratio_numeric(data, attr, target, t)
                if gr > best_gain_ratio:
                    best_gain_ratio = gr
                    best_attr = attr
                    best_threshold = thr
                    is_numeric = True
        else:
            gr = gain_ratio_categorical(data, attr, target)
            if gr > best_gain_ratio:
                best_gain_ratio = gr
                best_attr = attr
                best_threshold = None
                is_numeric = False

    return best_attr, best_threshold, best_gain_ratio, is_numeric

# --- Build decision tree ---
def build_tree(data, target, attributes, depth=0, max_depth=4):
    labels = data[target]
    if len(labels.unique()) == 1:
        return labels.iloc[0]
    if depth == max_depth or len(attributes) == 0:
        return labels.mode()[0]

    attr, threshold, gr, is_num = best_split(data, target, attributes)
    if attr is None or gr == 0:
        return labels.mode()[0]

    tree = {}
    if is_num:
        tree[f"{attr} <= {threshold:.2f}"] = {}
        left = data[data[attr] <= threshold]
        right = data[data[attr] > threshold]
        tree[f"{attr} <= {threshold:.2f}"]['Yes'] = build_tree(left, target, attributes, depth + 1, max_depth)
        tree[f"{attr} <= {threshold:.2f}"]['No'] = build_tree(right, target, attributes, depth + 1, max_depth)
    else:
        tree[f"{attr}"] = {}
        for val in data[attr].unique():
            subset = data[data[attr] == val]
            tree[f"{attr}"][val] = build_tree(subset, target, [a for a in attributes if a != attr], depth + 1, max_depth)
    return tree

# --- Predict with tree ---
def predict(tree, sample):
    if not isinstance(tree, dict):
        return tree
    root = next(iter(tree))
    if "<=" in root:
        attr, thresh = root.split(" <= ")
        thresh = float(thresh)
        if sample[attr] <= thresh:
            return predict(tree[root]['Yes'], sample)
        else:
            return predict(tree[root]['No'], sample)
    else:
        attr = root
        val = sample[attr]
        if val in tree[attr]:
            return predict(tree[attr][val], sample)
        else:
            return list(tree[attr].values())[0]

# --- Visualize tree ---
def visualize_tree(tree, dot=None, parent=None, edge_label=None):
    if dot is None:
        dot = Digraph()
        dot.attr('node', shape='ellipse', fontname='Arial')

    if not isinstance(tree, dict):
        node_id = str(id(tree)) + str(np.random.randint(1000))
        dot.node(node_id, label=str(tree), shape='box', style='filled', color='lightgrey')
        if parent is not None:
            dot.edge(parent, node_id, label=edge_label)
        return dot

    root = next(iter(tree))
    node_id = str(id(root)) + str(np.random.randint(1000))
    dot.node(node_id, label=root)
    if parent is not None:
        dot.edge(parent, node_id, label=edge_label)

    children = tree[root]
    for branch_label, subtree in children.items():
        visualize_tree(subtree, dot, node_id, str(branch_label))
    return dot

# --- Main ---
attributes = list(X.columns)
target = 'class'

tree = build_tree(train_data, target, attributes, max_depth=4)

# --- Prediksi dan Evaluasi ---
y_pred = []
y_true = []

for _, row in test_data.iterrows():
    sample = row.drop('class')
    pred = predict(tree, sample)
    y_pred.append(pred)
    y_true.append(row['class'])

print("=== Classification Report ===")
print(classification_report(y_true, y_pred))
print(f"Akurasi: {accuracy_score(y_true, y_pred):.4f}")

# --- Visualisasi pohon keputusan ---
#dot = visualize_tree(tree)
#dot.render('credit_tree', format='png', cleanup=True)
#print("Pohon keputusan tersimpan di file credit_tree.png")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


=== Classification Report ===
              precision    recall  f1-score   support

         bad       0.00      0.00      0.00        59
        good       0.70      1.00      0.83       141

    accuracy                           0.70       200
   macro avg       0.35      0.50      0.41       200
weighted avg       0.50      0.70      0.58       200

Akurasi: 0.7050


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### **Klasifikasi C5.0 Split data 70:30 Tanpa Menggunakan AdaBoot**

In [3]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from graphviz import Digraph

# --- Load data ---
data = pd.read_excel("dataset_31_credit-g.xlsx")  # Ganti dengan file Anda
# --- Pilih Kolom yang Relevan (sesuaikan jika berbeda) ---
data = data[['class',  'age', 'job', 'credit_amount', 'duration',
             'checking_status','purpose','savings_status','personal_status']].copy()

# --- Tangani missing value ---
data.fillna(data.median(numeric_only=True), inplace=True)
for col in data.select_dtypes(include='object'):
    data[col].fillna(data[col].mode()[0], inplace=True)

# --- Split data menjadi fitur dan target ---
X = data.drop(columns=['class'])
y = data['class']

# --- Encode fitur kategorikal ---
le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])

# --- Split 80:20 ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
train_data = pd.concat([X_train, y_train], axis=1)
test_data = X_test.copy()
test_data['class'] = y_test

# --- Entropy Function ---
def entropy(labels):
    counts = labels.value_counts()
    total = len(labels)
    ent = 0
    for count in counts:
        p = count / total
        ent -= p * math.log2(p)
    return ent

# --- Gain Ratio for Numeric ---
def gain_ratio_numeric(data, attr, target, threshold):
    total_entropy = entropy(data[target])
    left = data[data[attr] <= threshold]
    right = data[data[attr] > threshold]

    left_entropy = entropy(left[target]) if len(left) > 0 else 0
    right_entropy = entropy(right[target]) if len(right) > 0 else 0

    weighted_entropy = (len(left) / len(data)) * left_entropy + (len(right) / len(data)) * right_entropy
    info_gain = total_entropy - weighted_entropy

    split_info = 0
    for subset in [left, right]:
        p = len(subset) / len(data)
        if p > 0:
            split_info -= p * math.log2(p)

    if split_info == 0:
        return 0, None

    return info_gain / split_info, threshold

# --- Gain Ratio for Categorical ---
def gain_ratio_categorical(data, attr, target):
    total_entropy = entropy(data[target])
    subsets = data.groupby(attr)

    weighted_entropy = 0
    split_info = 0
    for key, subset in subsets:
        p = len(subset) / len(data)
        weighted_entropy += p * entropy(subset[target])
        if p > 0:
            split_info -= p * math.log2(p)

    info_gain = total_entropy - weighted_entropy
    if split_info == 0:
        return 0
    return info_gain / split_info

# --- Get thresholds for numeric attributes ---
def get_thresholds(column):
    unique_vals = sorted(column.unique())
    thresholds = []
    for i in range(len(unique_vals) - 1):
        thresholds.append((unique_vals[i] + unique_vals[i + 1]) / 2)
    return thresholds

# --- Find best attribute and threshold ---
def best_split(data, target, attributes):
    best_gain_ratio = -1
    best_attr = None
    best_threshold = None
    is_numeric = False

    for attr in attributes:
        if data[attr].dtype in ['int64', 'float64']:
            thresholds = get_thresholds(data[attr])
            for t in thresholds:
                gr, thr = gain_ratio_numeric(data, attr, target, t)
                if gr > best_gain_ratio:
                    best_gain_ratio = gr
                    best_attr = attr
                    best_threshold = thr
                    is_numeric = True
        else:
            gr = gain_ratio_categorical(data, attr, target)
            if gr > best_gain_ratio:
                best_gain_ratio = gr
                best_attr = attr
                best_threshold = None
                is_numeric = False

    return best_attr, best_threshold, best_gain_ratio, is_numeric

# --- Build decision tree ---
def build_tree(data, target, attributes, depth=0, max_depth=4):
    labels = data[target]
    if len(labels.unique()) == 1:
        return labels.iloc[0]
    if depth == max_depth or len(attributes) == 0:
        return labels.mode()[0]

    attr, threshold, gr, is_num = best_split(data, target, attributes)
    if attr is None or gr == 0:
        return labels.mode()[0]

    tree = {}
    if is_num:
        tree[f"{attr} <= {threshold:.2f}"] = {}
        left = data[data[attr] <= threshold]
        right = data[data[attr] > threshold]
        tree[f"{attr} <= {threshold:.2f}"]['Yes'] = build_tree(left, target, attributes, depth + 1, max_depth)
        tree[f"{attr} <= {threshold:.2f}"]['No'] = build_tree(right, target, attributes, depth + 1, max_depth)
    else:
        tree[f"{attr}"] = {}
        for val in data[attr].unique():
            subset = data[data[attr] == val]
            tree[f"{attr}"][val] = build_tree(subset, target, [a for a in attributes if a != attr], depth + 1, max_depth)
    return tree

# --- Predict with tree ---
def predict(tree, sample):
    if not isinstance(tree, dict):
        return tree
    root = next(iter(tree))
    if "<=" in root:
        attr, thresh = root.split(" <= ")
        thresh = float(thresh)
        if sample[attr] <= thresh:
            return predict(tree[root]['Yes'], sample)
        else:
            return predict(tree[root]['No'], sample)
    else:
        attr = root
        val = sample[attr]
        if val in tree[attr]:
            return predict(tree[attr][val], sample)
        else:
            return list(tree[attr].values())[0]

# --- Visualize tree ---
def visualize_tree(tree, dot=None, parent=None, edge_label=None):
    if dot is None:
        dot = Digraph()
        dot.attr('node', shape='ellipse', fontname='Arial')

    if not isinstance(tree, dict):
        node_id = str(id(tree)) + str(np.random.randint(1000))
        dot.node(node_id, label=str(tree), shape='box', style='filled', color='lightgrey')
        if parent is not None:
            dot.edge(parent, node_id, label=edge_label)
        return dot

    root = next(iter(tree))
    node_id = str(id(root)) + str(np.random.randint(1000))
    dot.node(node_id, label=root)
    if parent is not None:
        dot.edge(parent, node_id, label=edge_label)

    children = tree[root]
    for branch_label, subtree in children.items():
        visualize_tree(subtree, dot, node_id, str(branch_label))
    return dot

# --- Main ---
attributes = list(X.columns)
target = 'class'

tree = build_tree(train_data, target, attributes, max_depth=4)

# --- Prediksi dan Evaluasi ---
y_pred = []
y_true = []

for _, row in test_data.iterrows():
    sample = row.drop('class')
    pred = predict(tree, sample)
    y_pred.append(pred)
    y_true.append(row['class'])

print("=== Classification Report ===")
print(classification_report(y_true, y_pred))
print(f"Akurasi: {accuracy_score(y_true, y_pred):.4f}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


=== Classification Report ===
              precision    recall  f1-score   support

         bad       0.00      0.00      0.00        91
        good       0.70      1.00      0.82       209

    accuracy                           0.70       300
   macro avg       0.35      0.50      0.41       300
weighted avg       0.49      0.70      0.57       300

Akurasi: 0.6967


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## **Kedalaman Pohon Split Data 80:20 Tanpa Adaboots**

In [2]:
import pandas as pd
import numpy as np
import math
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# --- Load dataset ---
# Gantilah ini dengan pemanggilan dataset asli Anda
# Contoh:
data = pd.read_excel("dataset_31_credit-g.xlsx")
# --- Pilih Kolom yang Relevan (sesuaikan jika berbeda) ---
data = data[['class',  'age', 'job', 'credit_amount', 'duration',
             'checking_status','purpose','savings_status','personal_status']].copy()

# --- Preprocessing ---
# Pisahkan fitur dan target
X = data.drop(columns=['class'])
y = data['class']

# Ubah kolom numerik bertipe string ke float
for col in X.columns:
    if X[col].dtype == 'object':
        try:
            X[col] = X[col].astype(float)
        except:
            pass  # Tetap string jika tidak bisa dikonversi

# Encode fitur kategorikal
le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])

# Gabungkan kembali dengan target untuk imputasi nilai hilang
data_cleaned = X.copy()
data_cleaned['class'] = y.copy()

# Isi nilai kosong numerik dengan median
data_cleaned.fillna(data_cleaned.median(numeric_only=True), inplace=True)

# Isi nilai kosong kategorikal dengan modus
for col in data_cleaned.select_dtypes(include='object'):
    data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)

# Split dataset 80:20
X = data_cleaned.drop(columns=['class'])
y = data_cleaned['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_data = X_train.copy()
train_data['class'] = y_train
test_data = X_test.copy()
test_data['class'] = y_test

# --- Entropy Function ---
def entropy(labels):
    counts = labels.value_counts()
    total = len(labels)
    ent = 0
    for count in counts:
        p = count / total
        ent -= p * math.log2(p)
    return ent

# --- Gain Ratio for Numeric ---
def gain_ratio_numeric(data, attr, target, threshold):
    total_entropy = entropy(data[target])
    left = data[data[attr] <= threshold]
    right = data[data[attr] > threshold]

    left_entropy = entropy(left[target]) if len(left) > 0 else 0
    right_entropy = entropy(right[target]) if len(right) > 0 else 0

    weighted_entropy = (len(left) / len(data)) * left_entropy + (len(right) / len(data)) * right_entropy
    info_gain = total_entropy - weighted_entropy

    split_info = 0
    for subset in [left, right]:
        p = len(subset) / len(data)
        if p > 0:
            split_info -= p * math.log2(p)

    if split_info == 0:
        return 0, None

    return info_gain / split_info, threshold

# --- Gain Ratio for Categorical ---
def gain_ratio_categorical(data, attr, target):
    total_entropy = entropy(data[target])
    subsets = data.groupby(attr)

    weighted_entropy = 0
    split_info = 0
    for key, subset in subsets:
        p = len(subset) / len(data)
        weighted_entropy += p * entropy(subset[target])
        if p > 0:
            split_info -= p * math.log2(p)

    info_gain = total_entropy - weighted_entropy
    if split_info == 0:
        return 0
    return info_gain / split_info

# --- Get thresholds for numeric attributes ---
def get_thresholds(column_data):
    unique_vals = sorted(column_data.unique())
    thresholds = []
    for i in range(len(unique_vals) - 1):
        thresholds.append((unique_vals[i] + unique_vals[i + 1]) / 2)
    return thresholds

# --- Find best attribute and threshold ---
def best_split(data, target, attributes):
    best_gain_ratio = -1
    best_attr = None
    best_threshold = None
    is_numeric = False

    for attr in attributes:
        if data[attr].dtype in ['int64', 'float64']:
            thresholds = get_thresholds(data[attr])
            for t in thresholds:
                gr, thr = gain_ratio_numeric(data, attr, target, t)
                if gr > best_gain_ratio:
                    best_gain_ratio = gr
                    best_attr = attr
                    best_threshold = thr
                    is_numeric = True
        else:
            gr = gain_ratio_categorical(data, attr, target)
            if gr > best_gain_ratio:
                best_gain_ratio = gr
                best_attr = attr
                best_threshold = None
                is_numeric = False

    return best_attr, best_threshold, best_gain_ratio, is_numeric

# --- Build decision tree ---
def build_tree(data, target, attributes, depth=0, max_depth=4):
    labels = data[target]
    if len(labels.unique()) == 1:
        return labels.iloc[0]
    if depth == max_depth or len(attributes) == 0:
        return labels.mode()[0]

    attr, threshold, gr, is_num = best_split(data, target, attributes)
    if attr is None or gr == 0:
        return labels.mode()[0]

    tree = {}
    if is_num:
        tree[f"{attr} <= {threshold:.2f}"] = {}
        left = data[data[attr] <= threshold]
        right = data[data[attr] > threshold]
        tree[f"{attr} <= {threshold:.2f}"]['Yes'] = build_tree(left, target, attributes, depth+1, max_depth)
        tree[f"{attr} <= {threshold:.2f}"]['No'] = build_tree(right, target, attributes, depth+1, max_depth)
    else:
        tree[f"{attr}"] = {}
        for val in data[attr].unique():
            subset = data[data[attr] == val]
            tree[f"{attr}"][val] = build_tree(subset, target, [a for a in attributes if a != attr], depth+1, max_depth)
    return tree

# --- Predict with tree ---
def predict(tree, sample):
    if not isinstance(tree, dict):
        return tree
    root = next(iter(tree))
    if "<=" in root:
        attr, thresh = root.split(" <= ")
        thresh = float(thresh)
        if sample[attr] <= thresh:
            return predict(tree[root]['Yes'], sample)
        else:
            return predict(tree[root]['No'], sample)
    else:
        attr = root
        val = sample[attr]
        if val in tree[attr]:
            return predict(tree[attr][val], sample)
        else:
            return list(tree[attr].values())[0]

# --- Loop over depths 1 to 20 ---
print("max_depth | accuracy | time (seconds)")
for depth in range(1, 16):
    start_time = time.time()
    attributes = list(train_data.columns)
    attributes.remove('class')

    tree = build_tree(train_data.copy(), 'class', attributes, max_depth=depth)

    y_pred = []
    for _, row in test_data.iterrows():
        y_pred.append(predict(tree, row))

    acc = accuracy_score(test_data['class'], y_pred)
    elapsed = time.time() - start_time

    print(f"{depth:9d} | {acc:.4f}   | {elapsed:.4f}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)


max_depth | accuracy | time (seconds)
        1 | 0.7050   | 2.6369
        2 | 0.7050   | 2.5116
        3 | 0.7050   | 3.9678
        4 | 0.7050   | 6.2746
        5 | 0.7050   | 7.0622
        6 | 0.7050   | 9.3965
        7 | 0.6900   | 10.7387
        8 | 0.7000   | 17.6218
        9 | 0.7000   | 15.9473
       10 | 0.7000   | 15.5145
       11 | 0.7000   | 24.8951
       12 | 0.7000   | 23.3475
       13 | 0.7000   | 22.5093
       14 | 0.7000   | 21.1901
       15 | 0.7000   | 23.6150


### **Kedalaman Pohon Split Data 70:30 Tanpa Adaboots**

In [4]:
import pandas as pd
import numpy as np
import math
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# --- Load dataset ---
# Gantilah ini dengan pemanggilan dataset asli Anda
# Contoh:
data = pd.read_excel("dataset_31_credit-g.xlsx")
# --- Pilih Kolom yang Relevan (sesuaikan jika berbeda) ---
data = data[['class',  'age', 'job', 'credit_amount', 'duration',
             'checking_status','purpose','savings_status','personal_status']].copy()

# --- Preprocessing ---
# Pisahkan fitur dan target
X = data.drop(columns=['class'])
y = data['class']

# Ubah kolom numerik bertipe string ke float
for col in X.columns:
    if X[col].dtype == 'object':
        try:
            X[col] = X[col].astype(float)
        except:
            pass  # Tetap string jika tidak bisa dikonversi

# Encode fitur kategorikal
le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])

# Gabungkan kembali dengan target untuk imputasi nilai hilang
data_cleaned = X.copy()
data_cleaned['class'] = y.copy()

# Isi nilai kosong numerik dengan median
data_cleaned.fillna(data_cleaned.median(numeric_only=True), inplace=True)

# Isi nilai kosong kategorikal dengan modus
for col in data_cleaned.select_dtypes(include='object'):
    data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)

# Split dataset 80:20
X = data_cleaned.drop(columns=['class'])
y = data_cleaned['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
train_data = X_train.copy()
train_data['class'] = y_train
test_data = X_test.copy()
test_data['class'] = y_test

# --- Entropy Function ---
def entropy(labels):
    counts = labels.value_counts()
    total = len(labels)
    ent = 0
    for count in counts:
        p = count / total
        ent -= p * math.log2(p)
    return ent

# --- Gain Ratio for Numeric ---
def gain_ratio_numeric(data, attr, target, threshold):
    total_entropy = entropy(data[target])
    left = data[data[attr] <= threshold]
    right = data[data[attr] > threshold]

    left_entropy = entropy(left[target]) if len(left) > 0 else 0
    right_entropy = entropy(right[target]) if len(right) > 0 else 0

    weighted_entropy = (len(left) / len(data)) * left_entropy + (len(right) / len(data)) * right_entropy
    info_gain = total_entropy - weighted_entropy

    split_info = 0
    for subset in [left, right]:
        p = len(subset) / len(data)
        if p > 0:
            split_info -= p * math.log2(p)

    if split_info == 0:
        return 0, None

    return info_gain / split_info, threshold

# --- Gain Ratio for Categorical ---
def gain_ratio_categorical(data, attr, target):
    total_entropy = entropy(data[target])
    subsets = data.groupby(attr)

    weighted_entropy = 0
    split_info = 0
    for key, subset in subsets:
        p = len(subset) / len(data)
        weighted_entropy += p * entropy(subset[target])
        if p > 0:
            split_info -= p * math.log2(p)

    info_gain = total_entropy - weighted_entropy
    if split_info == 0:
        return 0
    return info_gain / split_info

# --- Get thresholds for numeric attributes ---
def get_thresholds(column_data):
    unique_vals = sorted(column_data.unique())
    thresholds = []
    for i in range(len(unique_vals) - 1):
        thresholds.append((unique_vals[i] + unique_vals[i + 1]) / 2)
    return thresholds

# --- Find best attribute and threshold ---
def best_split(data, target, attributes):
    best_gain_ratio = -1
    best_attr = None
    best_threshold = None
    is_numeric = False

    for attr in attributes:
        if data[attr].dtype in ['int64', 'float64']:
            thresholds = get_thresholds(data[attr])
            for t in thresholds:
                gr, thr = gain_ratio_numeric(data, attr, target, t)
                if gr > best_gain_ratio:
                    best_gain_ratio = gr
                    best_attr = attr
                    best_threshold = thr
                    is_numeric = True
        else:
            gr = gain_ratio_categorical(data, attr, target)
            if gr > best_gain_ratio:
                best_gain_ratio = gr
                best_attr = attr
                best_threshold = None
                is_numeric = False

    return best_attr, best_threshold, best_gain_ratio, is_numeric

# --- Build decision tree ---
def build_tree(data, target, attributes, depth=0, max_depth=4):
    labels = data[target]
    if len(labels.unique()) == 1:
        return labels.iloc[0]
    if depth == max_depth or len(attributes) == 0:
        return labels.mode()[0]

    attr, threshold, gr, is_num = best_split(data, target, attributes)
    if attr is None or gr == 0:
        return labels.mode()[0]

    tree = {}
    if is_num:
        tree[f"{attr} <= {threshold:.2f}"] = {}
        left = data[data[attr] <= threshold]
        right = data[data[attr] > threshold]
        tree[f"{attr} <= {threshold:.2f}"]['Yes'] = build_tree(left, target, attributes, depth+1, max_depth)
        tree[f"{attr} <= {threshold:.2f}"]['No'] = build_tree(right, target, attributes, depth+1, max_depth)
    else:
        tree[f"{attr}"] = {}
        for val in data[attr].unique():
            subset = data[data[attr] == val]
            tree[f"{attr}"][val] = build_tree(subset, target, [a for a in attributes if a != attr], depth+1, max_depth)
    return tree

# --- Predict with tree ---
def predict(tree, sample):
    if not isinstance(tree, dict):
        return tree
    root = next(iter(tree))
    if "<=" in root:
        attr, thresh = root.split(" <= ")
        thresh = float(thresh)
        if sample[attr] <= thresh:
            return predict(tree[root]['Yes'], sample)
        else:
            return predict(tree[root]['No'], sample)
    else:
        attr = root
        val = sample[attr]
        if val in tree[attr]:
            return predict(tree[attr][val], sample)
        else:
            return list(tree[attr].values())[0]

# --- Loop over depths 1 to 20 ---
print("max_depth | accuracy | time (seconds)")
for depth in range(1, 16):
    start_time = time.time()
    attributes = list(train_data.columns)
    attributes.remove('class')

    tree = build_tree(train_data.copy(), 'class', attributes, max_depth=depth)

    y_pred = []
    for _, row in test_data.iterrows():
        y_pred.append(predict(tree, row))

    acc = accuracy_score(test_data['class'], y_pred)
    elapsed = time.time() - start_time

    print(f"{depth:9d} | {acc:.4f}   | {elapsed:.4f}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)


max_depth | accuracy | time (seconds)
        1 | 0.6967   | 2.9022
        2 | 0.6967   | 4.9136
        3 | 0.6967   | 3.4993
        4 | 0.6967   | 4.9144
        5 | 0.6933   | 6.8248
        6 | 0.6800   | 8.0470
        7 | 0.6833   | 9.2358
        8 | 0.6833   | 10.6600
        9 | 0.6833   | 11.6949
       10 | 0.6733   | 12.7369
       11 | 0.6733   | 13.7568
       12 | 0.6733   | 14.5447
       13 | 0.6733   | 16.3008
       14 | 0.6667   | 19.6154
       15 | 0.6667   | 18.5216


In [6]:
#pip install pandas numpy scikit-learn graphviz

**Klasifikasi C5.0 Split data 80:20 Menggunakan AdaBoot**

In [10]:
import pandas as pd
import numpy as np
import math
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# ================================
# --- Load dan Preprocessing ---
# ================================
data = pd.read_excel("dataset_31_credit-g.xlsx")

# Pilih kolom relevan
data = data[['class', 'age', 'job', 'credit_amount', 'duration',
             'checking_status', 'purpose', 'savings_status', 'personal_status']].copy()

# Pisahkan fitur dan target
X = data.drop(columns=['class'])
y = data['class']

# Ubah kolom numerik bertipe string ke float
for col in X.columns:
    if X[col].dtype == 'object':
        try:
            X[col] = X[col].astype(float)
        except:
            pass  # Biarkan jika memang tetap string

# Encode fitur kategorikal
le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])

# Gabungkan kembali fitur dan target
data_cleaned = X.copy()
data_cleaned['class'] = y.copy()

# Isi nilai kosong numerik
data_cleaned.fillna(data_cleaned.median(numeric_only=True), inplace=True)

# Isi nilai kosong kategorikal
for col in data_cleaned.select_dtypes(include='object'):
    data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)

# Encode target (bad/good → 0/1)
label_encoder_target = LabelEncoder()
data_cleaned['class'] = label_encoder_target.fit_transform(data_cleaned['class'])

# Split 80:20
X = data_cleaned.drop(columns=['class'])
y = data_cleaned['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

train_data = X_train.copy()
train_data['class'] = y_train
test_data = X_test.copy()
test_data['class'] = y_test

# =========================================
# === Fungsi Algoritma Gain Ratio + AdaBoost ===
# =========================================

def entropy(data, target):
    values, counts = np.unique(data[target], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

def gain_ratio_numeric(data, attr, target, base_entropy):
    values = np.sort(data[attr].unique())
    thresholds = (values[:-1] + values[1:]) / 2
    best_gain = 0
    best_threshold = None
    for threshold in thresholds:
        left = data[data[attr] <= threshold]
        right = data[data[attr] > threshold]
        if len(left) == 0 or len(right) == 0:
            continue
        left_entropy = entropy(left, target)
        right_entropy = entropy(right, target)
        weighted_entropy = (len(left) * left_entropy + len(right) * right_entropy) / len(data)
        info_gain = base_entropy - weighted_entropy
        split_info = -((len(left) / len(data)) * np.log2(len(left) / len(data)) +
                       (len(right) / len(data)) * np.log2(len(right) / len(data)))
        if split_info == 0:
            continue
        gain_ratio = info_gain / split_info
        if gain_ratio > best_gain:
            best_gain = gain_ratio
            best_threshold = threshold
    return best_gain, best_threshold

def gain_ratio_categorical(data, attr, target, base_entropy):
    values = data[attr].unique()
    weighted_entropy = 0
    split_info = 0
    for val in values:
        subset = data[data[attr] == val]
        if len(subset) == 0:
            continue
        subset_entropy = entropy(subset, target)
        weight = len(subset) / len(data)
        weighted_entropy += weight * subset_entropy
        split_info -= weight * np.log2(weight)
    info_gain = base_entropy - weighted_entropy
    if split_info == 0:
        return 0
    return info_gain / split_info

def best_split(data, attributes, target):
    base_entropy = entropy(data, target)
    best_attr = None
    best_threshold = None
    best_gain_ratio = 0
    is_numeric = False
    for attr in attributes:
        if data[attr].dtype in [np.int64, np.float64]:
            gain, threshold = gain_ratio_numeric(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = threshold
                is_numeric = True
        else:
            gain = gain_ratio_categorical(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = None
                is_numeric = False
    return best_attr, best_threshold, is_numeric

def build_tree(data, target, attributes):
    labels = data[target].unique()
    if len(labels) == 1 or len(data) == 0:
        values, counts = np.unique(data[target], return_counts=True)
        if len(counts) == 0:
            return {'type': 'leaf', 'class': 0}
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    attr, threshold, is_numeric = best_split(data, attributes, target)
    if attr is None:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    node = {'type': 'node', 'attribute': attr, 'is_numeric': is_numeric}
    if is_numeric:
        node['threshold'] = threshold
        left_data = data[data[attr] <= threshold]
        right_data = data[data[attr] > threshold]
        node['left'] = build_tree(left_data, target, attributes)
        node['right'] = build_tree(right_data, target, attributes)
    else:
        node['branches'] = {}
        for val in data[attr].unique():
            subset = data[data[attr] == val]
            node['branches'][val] = build_tree(subset, target, attributes)
    return node

def predict(tree, sample):
    if tree['type'] == 'leaf':
        return tree['class']
    attr = tree['attribute']
    if tree['is_numeric']:
        if sample[attr] <= tree['threshold']:
            return predict(tree['left'], sample)
        else:
            return predict(tree['right'], sample)
    else:
        val = sample[attr]
        if val in tree['branches']:
            return predict(tree['branches'][val], sample)
        else:
            return 0

def convert_label(y):
    return np.where(y == 1, 1, -1)

def ada_boost_train(data, target, attributes, n_estimators=10):
    n = len(data)
    weights = np.ones(n) / n
    trees = []
    alphas = []
    y_true = convert_label(data[target].values)

    for m in range(n_estimators):
        sample_indices = np.random.choice(n, size=n, replace=True, p=weights)
        sample_data = data.iloc[sample_indices].reset_index(drop=True)
        tree = build_tree(sample_data, target, attributes)
        trees.append(tree)

        preds = np.array([1 if predict(tree, data.iloc[i]) == 1 else -1 for i in range(n)])
        miss = (preds != y_true).astype(int)
        error = np.sum(weights * miss)

        if error == 0:
            alpha = 1
            alphas.append(alpha)
            break
        if error >= 0.5:
            break

        alpha = 0.5 * np.log((1 - error) / error)
        alphas.append(alpha)
        weights *= np.exp(-alpha * y_true * preds)
        weights /= np.sum(weights)
    return trees, alphas

def ada_boost_predict(trees, alphas, sample):
    total = 0
    for tree, alpha in zip(trees, alphas):
        pred = predict(tree, sample)
        pred_mod = 1 if pred == 1 else -1
        total += alpha * pred_mod
    return 1 if total >= 0 else 0

# =====================================
# === Pelatihan & Evaluasi Model ===
# =====================================
start = time.time()
attributes = list(train_data.columns)
attributes.remove('class')
trees, alphas = ada_boost_train(train_data, 'class', attributes, n_estimators=10)
end = time.time()

y_pred = [ada_boost_predict(trees, alphas, row) for _, row in test_data.iterrows()]
y_true = test_data['class'].values

# Evaluasi
print(f"Akurasi: {accuracy_score(y_true, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_true, y_pred))
print(f"Waktu eksekusi: {end - start:.2f} detik")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)


Akurasi: 0.7350

Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.41      0.48        59
           1       0.78      0.87      0.82       141

    accuracy                           0.73       200
   macro avg       0.67      0.64      0.65       200
weighted avg       0.72      0.73      0.72       200

Waktu eksekusi: 240.35 detik


**Waktu di Perbaiki AdaBoot tidak banyak looping**

In [14]:
import pandas as pd
import numpy as np
import math
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# =============================
# --- Load dan Preprocessing ---
# =============================
data = pd.read_excel("dataset_31_credit-g.xlsx")

# Pilih kolom relevan
data = data[['class', 'age', 'job', 'credit_amount', 'duration',
             'checking_status', 'purpose', 'savings_status', 'personal_status']].copy()

# Pisahkan fitur dan target
X = data.drop(columns=['class'])
y = data['class']

# Ubah kolom numerik bertipe string ke float
for col in X.columns:
    if X[col].dtype == 'object':
        try:
            X[col] = X[col].astype(float)
        except:
            pass

# Encode fitur kategorikal
le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])

# Gabungkan kembali
data_cleaned = X.copy()
data_cleaned['class'] = y.copy()

# Isi nilai kosong numerik
data_cleaned.fillna(data_cleaned.median(numeric_only=True), inplace=True)

# Isi nilai kosong kategorikal
for col in data_cleaned.select_dtypes(include='object'):
    data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)

# Encode target
y_encoded = LabelEncoder().fit_transform(data_cleaned['class'])
data_cleaned['class'] = y_encoded

# Split 80:20
X = data_cleaned.drop(columns=['class'])
y = data_cleaned['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

train_data = X_train.copy()
train_data['class'] = y_train
test_data = X_test.copy()
test_data['class'] = y_test

# ===========================================
# === Fungsi Algoritma Gain Ratio + AdaBoost ===
# ===========================================
def entropy(data, target):
    values, counts = np.unique(data[target], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs + 1e-9))


def gain_ratio_numeric(data, attr, target, base_entropy):
    values = np.sort(data[attr].unique())
    if len(values) <= 1:
        return 0, None
    percentiles = np.percentile(values, [10, 30, 50, 70, 90])
    best_gain = 0
    best_threshold = None
    for threshold in percentiles:
        left = data[data[attr] <= threshold]
        right = data[data[attr] > threshold]
        if len(left) == 0 or len(right) == 0:
            continue
        left_entropy = entropy(left, target)
        right_entropy = entropy(right, target)
        weighted_entropy = (len(left) * left_entropy + len(right) * right_entropy) / len(data)
        info_gain = base_entropy - weighted_entropy
        split_info = -((len(left) / len(data)) * np.log2(len(left) / len(data) + 1e-9) +
                       (len(right) / len(data)) * np.log2(len(right) / len(data) + 1e-9))
        if split_info == 0:
            continue
        gain_ratio = info_gain / split_info
        if gain_ratio > best_gain:
            best_gain = gain_ratio
            best_threshold = threshold
    return best_gain, best_threshold


def gain_ratio_categorical(data, attr, target, base_entropy):
    values = data[attr].unique()
    weighted_entropy = 0
    split_info = 0
    for val in values:
        subset = data[data[attr] == val]
        if len(subset) == 0:
            continue
        subset_entropy = entropy(subset, target)
        weight = len(subset) / len(data)
        weighted_entropy += weight * subset_entropy
        split_info -= weight * np.log2(weight + 1e-9)
    info_gain = base_entropy - weighted_entropy
    if split_info == 0:
        return 0
    return info_gain / split_info


def best_split(data, attributes, target):
    base_entropy = entropy(data, target)
    best_attr = None
    best_threshold = None
    best_gain_ratio = 0
    is_numeric = False
    for attr in attributes:
        if data[attr].dtype in [np.int64, np.float64]:
            gain, threshold = gain_ratio_numeric(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = threshold
                is_numeric = True
        else:
            gain = gain_ratio_categorical(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = None
                is_numeric = False
    return best_attr, best_threshold, is_numeric


def build_tree(data, target, attributes):
    labels = data[target].unique()
    if len(labels) == 1 or len(data) == 0:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    attr, threshold, is_numeric = best_split(data, attributes, target)
    if attr is None:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    node = {'type': 'node', 'attribute': attr, 'is_numeric': is_numeric}
    if is_numeric:
        node['threshold'] = threshold
        node['left'] = build_tree(data[data[attr] <= threshold], target, attributes)
        node['right'] = build_tree(data[data[attr] > threshold], target, attributes)
    else:
        node['branches'] = {}
        for val in data[attr].unique():
            node['branches'][val] = build_tree(data[data[attr] == val], target, attributes)
    return node


def predict(tree, sample):
    if tree['type'] == 'leaf':
        return tree['class']
    attr = tree['attribute']
    if tree['is_numeric']:
        if sample[attr] <= tree['threshold']:
            return predict(tree['left'], sample)
        else:
            return predict(tree['right'], sample)
    else:
        val = sample[attr]
        if val in tree['branches']:
            return predict(tree['branches'][val], sample)
        else:
            return 0


def convert_label(y):
    return np.where(y == 1, 1, -1)


def ada_boost_train(data, target, attributes, n_estimators=10):
    n = len(data)
    weights = np.ones(n) / n
    trees = []
    alphas = []
    y_true = convert_label(data[target].values)

    for m in range(n_estimators):
        sample_indices = np.random.choice(n, size=n, replace=True, p=weights)
        sample_data = data.iloc[sample_indices].reset_index(drop=True)
        tree = build_tree(sample_data, target, attributes)
        trees.append(tree)

        preds = np.array([1 if predict(tree, row) == 1 else -1 for _, row in data.iterrows()])
        miss = (preds != y_true).astype(int)
        error = np.sum(weights * miss)

        if error == 0:
            alphas.append(1)
            break
        if error >= 0.5:
            break

        alpha = 0.5 * np.log((1 - error) / error)
        alphas.append(alpha)
        weights *= np.exp(-alpha * y_true * preds)
        weights /= np.sum(weights)
    return trees, alphas


def ada_boost_predict(trees, alphas, sample):
    total = 0
    for tree, alpha in zip(trees, alphas):
        pred = predict(tree, sample)
        pred_mod = 1 if pred == 1 else -1
        total += alpha * pred_mod
    return 1 if total >= 0 else 0

# ==========================
# === Pelatihan & Evaluasi ===
# ==========================
start = time.time()
attributes = list(train_data.columns)
attributes.remove('class')
trees, alphas = ada_boost_train(train_data, 'class', attributes, n_estimators=10)
end = time.time()

y_pred = [ada_boost_predict(trees, alphas, row) for _, row in test_data.iterrows()]
y_true = test_data['class'].values
#accuracy = accuracy_score(y_true, y_pred)
#report = classification_report(y_true, y_pred)
#execution_time = end - start

#@accuracy, report, execution_time
# Evaluasi
print(f"Akurasi: {accuracy_score(y_true, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_true, y_pred))
print(f"Waktu eksekusi: {end - start:.2f} detik")



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)


Akurasi: 0.7400

Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.59      0.57        59
           1       0.82      0.80      0.81       141

    accuracy                           0.74       200
   macro avg       0.69      0.70      0.69       200
weighted avg       0.75      0.74      0.74       200

Waktu eksekusi: 39.01 detik


### **Menggunakan Kedalaman Pohon**

In [16]:
import pandas as pd
import numpy as np
import math
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

# Load dan Preprocessing
data = pd.read_excel("dataset_31_credit-g.xlsx")
data = data[['class', 'age', 'job', 'credit_amount', 'duration',
             'checking_status', 'purpose', 'savings_status', 'personal_status']].copy()
X = data.drop(columns=['class'])
y = data['class']

# Ubah kolom numerik bertipe string ke float
for col in X.columns:
    if X[col].dtype == 'object':
        try:
            X[col] = X[col].astype(float)
        except:
            pass

# Encode fitur kategorikal
le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])

data_cleaned = X.copy()
data_cleaned['class'] = y.copy()
data_cleaned.fillna(data_cleaned.median(numeric_only=True), inplace=True)
for col in data_cleaned.select_dtypes(include='object'):
    data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)

y_encoded = LabelEncoder().fit_transform(data_cleaned['class'])
data_cleaned['class'] = y_encoded

X = data_cleaned.drop(columns=['class'])
y = data_cleaned['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

train_data = X_train.copy()
train_data['class'] = y_train
test_data = X_test.copy()
test_data['class'] = y_test

# Fungsi Gain Ratio dan Tree (dengan max_depth)
def entropy(data, target):
    values, counts = np.unique(data[target], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs + 1e-9))

def gain_ratio_numeric(data, attr, target, base_entropy):
    values = np.sort(data[attr].unique())
    if len(values) <= 1:
        return 0, None
    percentiles = np.percentile(values, [10, 30, 50, 70, 90])
    best_gain = 0
    best_threshold = None
    for threshold in percentiles:
        left = data[data[attr] <= threshold]
        right = data[data[attr] > threshold]
        if len(left) == 0 or len(right) == 0:
            continue
        left_entropy = entropy(left, target)
        right_entropy = entropy(right, target)
        weighted_entropy = (len(left) * left_entropy + len(right) * right_entropy) / len(data)
        info_gain = base_entropy - weighted_entropy
        split_info = -((len(left) / len(data)) * np.log2(len(left) / len(data) + 1e-9) +
                       (len(right) / len(data)) * np.log2(len(right) / len(data) + 1e-9))
        if split_info == 0:
            continue
        gain_ratio = info_gain / split_info
        if gain_ratio > best_gain:
            best_gain = gain_ratio
            best_threshold = threshold
    return best_gain, best_threshold

def gain_ratio_categorical(data, attr, target, base_entropy):
    values = data[attr].unique()
    weighted_entropy = 0
    split_info = 0
    for val in values:
        subset = data[data[attr] == val]
        if len(subset) == 0:
            continue
        subset_entropy = entropy(subset, target)
        weight = len(subset) / len(data)
        weighted_entropy += weight * subset_entropy
        split_info -= weight * np.log2(weight + 1e-9)
    info_gain = base_entropy - weighted_entropy
    if split_info == 0:
        return 0
    return info_gain / split_info

def best_split(data, attributes, target):
    base_entropy = entropy(data, target)
    best_attr = None
    best_threshold = None
    best_gain_ratio = 0
    is_numeric = False
    for attr in attributes:
        if data[attr].dtype in [np.int64, np.float64]:
            gain, threshold = gain_ratio_numeric(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = threshold
                is_numeric = True
        else:
            gain = gain_ratio_categorical(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = None
                is_numeric = False
    return best_attr, best_threshold, is_numeric

def build_tree(data, target, attributes, max_depth, current_depth=0):
    labels = data[target].unique()
    if len(labels) == 1 or len(data) == 0 or current_depth >= max_depth:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    attr, threshold, is_numeric = best_split(data, attributes, target)
    if attr is None:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    node = {'type': 'node', 'attribute': attr, 'is_numeric': is_numeric}
    if is_numeric:
        node['threshold'] = threshold
        node['left'] = build_tree(data[data[attr] <= threshold], target, attributes, max_depth, current_depth + 1)
        node['right'] = build_tree(data[data[attr] > threshold], target, attributes, max_depth, current_depth + 1)
    else:
        node['branches'] = {}
        for val in data[attr].unique():
            node['branches'][val] = build_tree(data[data[attr] == val], target, attributes, max_depth, current_depth + 1)
    return node

def predict(tree, sample):
    if tree['type'] == 'leaf':
        return tree['class']
    attr = tree['attribute']
    if tree['is_numeric']:
        if sample[attr] <= tree['threshold']:
            return predict(tree['left'], sample)
        else:
            return predict(tree['right'], sample)
    else:
        val = sample[attr]
        if val in tree['branches']:
            return predict(tree['branches'][val], sample)
        else:
            return 0

# Evaluasi untuk max_depth 1-15
results = []
attributes = list(train_data.columns)
attributes.remove('class')

for depth in range(1, 16):
    start_time = time.time()
    tree = build_tree(train_data, 'class', attributes, max_depth=depth)
    predictions = [predict(tree, row) for _, row in test_data.iterrows()]
    acc = accuracy_score(test_data['class'], predictions)
    exec_time = time.time() - start_time
    results.append((depth, acc, exec_time))

results_df = pd.DataFrame(results, columns=['max_depth', 'accuracy', 'execution_time'])
results_df


Unnamed: 0,max_depth,accuracy,execution_time
0,1,0.705,0.041679
1,2,0.73,0.114796
2,3,0.735,0.236587
3,4,0.735,0.430345
4,5,0.735,1.015578
5,6,0.745,0.957056
6,7,0.73,1.258426
7,8,0.73,1.598124
8,9,0.74,2.15377
9,10,0.73,2.820723


**Kedalaman Pohon dengan AdaBoot split Data 80:20**

In [19]:
import pandas as pd
import numpy as np
import math
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

# Load dan Preprocessing sama seperti sebelumnya...
data = pd.read_excel("dataset_31_credit-g.xlsx")
data = data[['class', 'age', 'job', 'credit_amount', 'duration',
             'checking_status', 'purpose', 'savings_status', 'personal_status']].copy()
X = data.drop(columns=['class'])
y = data['class']

for col in X.columns:
    if X[col].dtype == 'object':
        try:
            X[col] = X[col].astype(float)
        except:
            pass

le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])

data_cleaned = X.copy()
data_cleaned['class'] = y.copy()
data_cleaned.fillna(data_cleaned.median(numeric_only=True), inplace=True)
for col in data_cleaned.select_dtypes(include='object'):
    data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)

y_encoded = LabelEncoder().fit_transform(data_cleaned['class'])
data_cleaned['class'] = y_encoded

X = data_cleaned.drop(columns=['class'])
y = data_cleaned['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

train_data = X_train.copy()
train_data['class'] = y_train
test_data = X_test.copy()
test_data['class'] = y_test

attributes = list(train_data.columns)
attributes.remove('class')

# Fungsi entropy, gain_ratio_numeric, gain_ratio_categorical, best_split, build_tree, predict
# Sama seperti kode yang kamu sudah berikan, saya anggap sudah ada di sini

def entropy(data, target):
    values, counts = np.unique(data[target], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs + 1e-9))

def gain_ratio_numeric(data, attr, target, base_entropy):
    values = np.sort(data[attr].unique())
    if len(values) <= 1:
        return 0, None
    percentiles = np.percentile(values, [10, 30, 50, 70, 90])
    best_gain = 0
    best_threshold = None
    for threshold in percentiles:
        left = data[data[attr] <= threshold]
        right = data[data[attr] > threshold]
        if len(left) == 0 or len(right) == 0:
            continue
        left_entropy = entropy(left, target)
        right_entropy = entropy(right, target)
        weighted_entropy = (len(left) * left_entropy + len(right) * right_entropy) / len(data)
        info_gain = base_entropy - weighted_entropy
        split_info = -((len(left) / len(data)) * np.log2(len(left) / len(data) + 1e-9) +
                       (len(right) / len(data)) * np.log2(len(right) / len(data) + 1e-9))
        if split_info == 0:
            continue
        gain_ratio = info_gain / split_info
        if gain_ratio > best_gain:
            best_gain = gain_ratio
            best_threshold = threshold
    return best_gain, best_threshold

def gain_ratio_categorical(data, attr, target, base_entropy):
    values = data[attr].unique()
    weighted_entropy = 0
    split_info = 0
    for val in values:
        subset = data[data[attr] == val]
        if len(subset) == 0:
            continue
        subset_entropy = entropy(subset, target)
        weight = len(subset) / len(data)
        weighted_entropy += weight * subset_entropy
        split_info -= weight * np.log2(weight + 1e-9)
    info_gain = base_entropy - weighted_entropy
    if split_info == 0:
        return 0
    return info_gain / split_info

def best_split(data, attributes, target):
    base_entropy = entropy(data, target)
    best_attr = None
    best_threshold = None
    best_gain_ratio = 0
    is_numeric = False
    for attr in attributes:
        if data[attr].dtype in [np.int64, np.float64]:
            gain, threshold = gain_ratio_numeric(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = threshold
                is_numeric = True
        else:
            gain = gain_ratio_categorical(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = None
                is_numeric = False
    return best_attr, best_threshold, is_numeric

def build_tree(data, target, attributes, max_depth, current_depth=0):
    labels = data[target].unique()
    if len(labels) == 1 or len(data) == 0 or current_depth >= max_depth:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    attr, threshold, is_numeric = best_split(data, attributes, target)
    if attr is None:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    node = {'type': 'node', 'attribute': attr, 'is_numeric': is_numeric}
    if is_numeric:
        node['threshold'] = threshold
        node['left'] = build_tree(data[data[attr] <= threshold], target, attributes, max_depth, current_depth + 1)
        node['right'] = build_tree(data[data[attr] > threshold], target, attributes, max_depth, current_depth + 1)
    else:
        node['branches'] = {}
        for val in data[attr].unique():
            node['branches'][val] = build_tree(data[data[attr] == val], target, attributes, max_depth, current_depth + 1)
    return node

def predict(tree, sample):
    if tree['type'] == 'leaf':
        return tree['class']
    attr = tree['attribute']
    if tree['is_numeric']:
        if sample[attr] <= tree['threshold']:
            return predict(tree['left'], sample)
        else:
            return predict(tree['right'], sample)
    else:
        val = sample[attr]
        if val in tree['branches']:
            return predict(tree['branches'][val], sample)
        else:
            return 0

# Fungsi untuk membuat weighted sample dari data (berdasarkan bobot)
def weighted_sample(data, weights):
    n_samples = len(data)
    indices = np.random.choice(n_samples, size=n_samples, replace=True, p=weights)
    return data.iloc[indices].reset_index(drop=True)

# Implementasi AdaBoost
def adaBoost(train_data, test_data, attributes, target='class', n_estimators=10, max_depth=3):
    n_samples = len(train_data)
    weights = np.ones(n_samples) / n_samples  # bobot awal sama rata
    trees = []
    alphas = []

    start_time = time.time()

    for m in range(n_estimators):
        # Sampling data sesuai bobot
        sample_data = weighted_sample(train_data, weights)

        # Bangun pohon
        tree = build_tree(sample_data, target, attributes, max_depth=max_depth)

        # Prediksi pada data training asli untuk hitung error
        preds = np.array([predict(tree, row) for _, row in train_data.iterrows()])
        actual = train_data[target].values

        # Hitung error weighted
        incorrect = (preds != actual).astype(int)
        error = np.sum(weights * incorrect) / np.sum(weights)
        if error > 0.5:
            # Jika error lebih besar dari 0.5, skip learner ini
            continue
        if error == 0:
            error = 1e-10  # untuk menghindari pembagian dengan nol

        # Hitung alpha (bobot learner)
        alpha = 0.5 * np.log((1 - error) / error)

        # Update bobot sampel
        weights = weights * np.exp(-alpha * (1 - 2*incorrect))  # benar=0, salah=1 → -alpha*(1-2*1)=-alpha*(-1)=+alpha
        weights = weights / np.sum(weights)  # normalisasi

        trees.append(tree)
        alphas.append(alpha)

        print(f"Pohon ke-{m+1}, max_depth={max_depth}, error={error:.4f}, alpha={alpha:.4f}")

    # Prediksi ensemble pada test set dengan weighted voting
    def predict_ensemble(sample):
        class_votes = {}
        for tree, alpha in zip(trees, alphas):
            pred = predict(tree, sample)
            class_votes[pred] = class_votes.get(pred, 0) + alpha
        # Pilih kelas dengan total alpha terbesar
        return max(class_votes, key=class_votes.get)

    predictions = [predict_ensemble(row) for _, row in test_data.iterrows()]
    accuracy = accuracy_score(test_data[target], predictions)
    exec_time = time.time() - start_time

    return accuracy, exec_time, n_estimators, max_depth

# Jalankan AdaBoost dengan kedalaman pohon 1 sampai 15
results = []
for depth in range(1, 16):
    acc, t_exec, n_trees, max_d = adaBoost(train_data, test_data, attributes, target='class',
                                           n_estimators=10, max_depth=depth)
    results.append((max_d, acc, t_exec))

results_df = pd.DataFrame(results, columns=['max_depth', 'accuracy', 'execution_time'])
print(results_df)


Pohon ke-1, max_depth=1, error=0.3013, alpha=0.4207
Pohon ke-2, max_depth=1, error=0.3246, alpha=0.3663
Pohon ke-3, max_depth=1, error=0.4309, alpha=0.1391
Pohon ke-4, max_depth=1, error=0.4450, alpha=0.1105
Pohon ke-5, max_depth=1, error=0.4695, alpha=0.0611
Pohon ke-6, max_depth=1, error=0.4742, alpha=0.0517
Pohon ke-7, max_depth=1, error=0.4882, alpha=0.0236
Pohon ke-8, max_depth=1, error=0.4673, alpha=0.0655
Pohon ke-9, max_depth=1, error=0.4730, alpha=0.0540
Pohon ke-10, max_depth=1, error=0.4307, alpha=0.1395
Pohon ke-1, max_depth=2, error=0.2800, alpha=0.4722
Pohon ke-2, max_depth=2, error=0.3413, alpha=0.3288
Pohon ke-3, max_depth=2, error=0.4252, alpha=0.1506
Pohon ke-4, max_depth=2, error=0.4565, alpha=0.0873
Pohon ke-5, max_depth=2, error=0.4672, alpha=0.0656
Pohon ke-6, max_depth=2, error=0.4417, alpha=0.1172
Pohon ke-7, max_depth=2, error=0.4216, alpha=0.1581
Pohon ke-8, max_depth=2, error=0.4279, alpha=0.1453
Pohon ke-9, max_depth=2, error=0.4573, alpha=0.0855
Pohon ke-10

### **Kedalaman Pohon AdaBoots dengan split Data 70:30**

In [20]:
import pandas as pd
import numpy as np
import math
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

# Load dan Preprocessing sama seperti sebelumnya...
data = pd.read_excel("dataset_31_credit-g.xlsx")
data = data[['class', 'age', 'job', 'credit_amount', 'duration',
             'checking_status', 'purpose', 'savings_status', 'personal_status']].copy()
X = data.drop(columns=['class'])
y = data['class']

for col in X.columns:
    if X[col].dtype == 'object':
        try:
            X[col] = X[col].astype(float)
        except:
            pass

le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])

data_cleaned = X.copy()
data_cleaned['class'] = y.copy()
data_cleaned.fillna(data_cleaned.median(numeric_only=True), inplace=True)
for col in data_cleaned.select_dtypes(include='object'):
    data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)

y_encoded = LabelEncoder().fit_transform(data_cleaned['class'])
data_cleaned['class'] = y_encoded

X = data_cleaned.drop(columns=['class'])
y = data_cleaned['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

train_data = X_train.copy()
train_data['class'] = y_train
test_data = X_test.copy()
test_data['class'] = y_test

attributes = list(train_data.columns)
attributes.remove('class')

# Fungsi entropy, gain_ratio_numeric, gain_ratio_categorical, best_split, build_tree, predict
# Sama seperti kode yang kamu sudah berikan, saya anggap sudah ada di sini

def entropy(data, target):
    values, counts = np.unique(data[target], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs + 1e-9))

def gain_ratio_numeric(data, attr, target, base_entropy):
    values = np.sort(data[attr].unique())
    if len(values) <= 1:
        return 0, None
    percentiles = np.percentile(values, [10, 30, 50, 70, 90])
    best_gain = 0
    best_threshold = None
    for threshold in percentiles:
        left = data[data[attr] <= threshold]
        right = data[data[attr] > threshold]
        if len(left) == 0 or len(right) == 0:
            continue
        left_entropy = entropy(left, target)
        right_entropy = entropy(right, target)
        weighted_entropy = (len(left) * left_entropy + len(right) * right_entropy) / len(data)
        info_gain = base_entropy - weighted_entropy
        split_info = -((len(left) / len(data)) * np.log2(len(left) / len(data) + 1e-9) +
                       (len(right) / len(data)) * np.log2(len(right) / len(data) + 1e-9))
        if split_info == 0:
            continue
        gain_ratio = info_gain / split_info
        if gain_ratio > best_gain:
            best_gain = gain_ratio
            best_threshold = threshold
    return best_gain, best_threshold

def gain_ratio_categorical(data, attr, target, base_entropy):
    values = data[attr].unique()
    weighted_entropy = 0
    split_info = 0
    for val in values:
        subset = data[data[attr] == val]
        if len(subset) == 0:
            continue
        subset_entropy = entropy(subset, target)
        weight = len(subset) / len(data)
        weighted_entropy += weight * subset_entropy
        split_info -= weight * np.log2(weight + 1e-9)
    info_gain = base_entropy - weighted_entropy
    if split_info == 0:
        return 0
    return info_gain / split_info

def best_split(data, attributes, target):
    base_entropy = entropy(data, target)
    best_attr = None
    best_threshold = None
    best_gain_ratio = 0
    is_numeric = False
    for attr in attributes:
        if data[attr].dtype in [np.int64, np.float64]:
            gain, threshold = gain_ratio_numeric(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = threshold
                is_numeric = True
        else:
            gain = gain_ratio_categorical(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = None
                is_numeric = False
    return best_attr, best_threshold, is_numeric

def build_tree(data, target, attributes, max_depth, current_depth=0):
    labels = data[target].unique()
    if len(labels) == 1 or len(data) == 0 or current_depth >= max_depth:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    attr, threshold, is_numeric = best_split(data, attributes, target)
    if attr is None:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    node = {'type': 'node', 'attribute': attr, 'is_numeric': is_numeric}
    if is_numeric:
        node['threshold'] = threshold
        node['left'] = build_tree(data[data[attr] <= threshold], target, attributes, max_depth, current_depth + 1)
        node['right'] = build_tree(data[data[attr] > threshold], target, attributes, max_depth, current_depth + 1)
    else:
        node['branches'] = {}
        for val in data[attr].unique():
            node['branches'][val] = build_tree(data[data[attr] == val], target, attributes, max_depth, current_depth + 1)
    return node

def predict(tree, sample):
    if tree['type'] == 'leaf':
        return tree['class']
    attr = tree['attribute']
    if tree['is_numeric']:
        if sample[attr] <= tree['threshold']:
            return predict(tree['left'], sample)
        else:
            return predict(tree['right'], sample)
    else:
        val = sample[attr]
        if val in tree['branches']:
            return predict(tree['branches'][val], sample)
        else:
            return 0

# Fungsi untuk membuat weighted sample dari data (berdasarkan bobot)
def weighted_sample(data, weights):
    n_samples = len(data)
    indices = np.random.choice(n_samples, size=n_samples, replace=True, p=weights)
    return data.iloc[indices].reset_index(drop=True)

# Implementasi AdaBoost
def adaBoost(train_data, test_data, attributes, target='class', n_estimators=10, max_depth=3):
    n_samples = len(train_data)
    weights = np.ones(n_samples) / n_samples  # bobot awal sama rata
    trees = []
    alphas = []

    start_time = time.time()

    for m in range(n_estimators):
        # Sampling data sesuai bobot
        sample_data = weighted_sample(train_data, weights)

        # Bangun pohon
        tree = build_tree(sample_data, target, attributes, max_depth=max_depth)

        # Prediksi pada data training asli untuk hitung error
        preds = np.array([predict(tree, row) for _, row in train_data.iterrows()])
        actual = train_data[target].values

        # Hitung error weighted
        incorrect = (preds != actual).astype(int)
        error = np.sum(weights * incorrect) / np.sum(weights)
        if error > 0.5:
            # Jika error lebih besar dari 0.5, skip learner ini
            continue
        if error == 0:
            error = 1e-10  # untuk menghindari pembagian dengan nol

        # Hitung alpha (bobot learner)
        alpha = 0.5 * np.log((1 - error) / error)

        # Update bobot sampel
        weights = weights * np.exp(-alpha * (1 - 2*incorrect))  # benar=0, salah=1 → -alpha*(1-2*1)=-alpha*(-1)=+alpha
        weights = weights / np.sum(weights)  # normalisasi

        trees.append(tree)
        alphas.append(alpha)

        print(f"Pohon ke-{m+1}, max_depth={max_depth}, error={error:.4f}, alpha={alpha:.4f}")

    # Prediksi ensemble pada test set dengan weighted voting
    def predict_ensemble(sample):
        class_votes = {}
        for tree, alpha in zip(trees, alphas):
            pred = predict(tree, sample)
            class_votes[pred] = class_votes.get(pred, 0) + alpha
        # Pilih kelas dengan total alpha terbesar
        return max(class_votes, key=class_votes.get)

    predictions = [predict_ensemble(row) for _, row in test_data.iterrows()]
    accuracy = accuracy_score(test_data[target], predictions)
    exec_time = time.time() - start_time

    return accuracy, exec_time, n_estimators, max_depth

# Jalankan AdaBoost dengan kedalaman pohon 1 sampai 15
results = []
for depth in range(1, 16):
    acc, t_exec, n_trees, max_d = adaBoost(train_data, test_data, attributes, target='class',
                                           n_estimators=10, max_depth=depth)
    results.append((max_d, acc, t_exec))

results_df = pd.DataFrame(results, columns=['max_depth', 'accuracy', 'execution_time'])
print(results_df)


Pohon ke-1, max_depth=1, error=0.2986, alpha=0.4271
Pohon ke-2, max_depth=1, error=0.4655, alpha=0.0692
Pohon ke-3, max_depth=1, error=0.3321, alpha=0.3493
Pohon ke-4, max_depth=1, error=0.4264, alpha=0.1483
Pohon ke-5, max_depth=1, error=0.4504, alpha=0.0996
Pohon ke-6, max_depth=1, error=0.4623, alpha=0.0755
Pohon ke-7, max_depth=1, error=0.4362, alpha=0.1284
Pohon ke-8, max_depth=1, error=0.4567, alpha=0.0867
Pohon ke-9, max_depth=1, error=0.4785, alpha=0.0429
Pohon ke-10, max_depth=1, error=0.4824, alpha=0.0353
Pohon ke-1, max_depth=2, error=0.2800, alpha=0.4722
Pohon ke-2, max_depth=2, error=0.3410, alpha=0.3295
Pohon ke-3, max_depth=2, error=0.4370, alpha=0.1267
Pohon ke-4, max_depth=2, error=0.4440, alpha=0.1124
Pohon ke-5, max_depth=2, error=0.4421, alpha=0.1164
Pohon ke-6, max_depth=2, error=0.4093, alpha=0.1835
Pohon ke-7, max_depth=2, error=0.4623, alpha=0.0756
Pohon ke-8, max_depth=2, error=0.4167, alpha=0.1681
Pohon ke-9, max_depth=2, error=0.4625, alpha=0.0752
Pohon ke-10

**Menggunakan Adaboot dengan split Data 80:20 Gunakan Program Ini karena ada kedalaman Pohon**

In [26]:
import pandas as pd
import numpy as np
import math
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings("ignore")

# Load dan preprocessing data
data = pd.read_excel("dataset_31_credit-g.xlsx")
data = data[['class', 'age', 'job', 'credit_amount', 'duration',
             'checking_status', 'purpose', 'savings_status', 'personal_status']].copy()
X = data.drop(columns=['class'])
y = data['class']

for col in X.columns:
    if X[col].dtype == 'object':
        try:
            X[col] = X[col].astype(float)
        except:
            pass

le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])

data_cleaned = X.copy()
data_cleaned['class'] = y.copy()
data_cleaned.fillna(data_cleaned.median(numeric_only=True), inplace=True)
for col in data_cleaned.select_dtypes(include='object'):
    data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)

y_encoded = LabelEncoder().fit_transform(data_cleaned['class'])
data_cleaned['class'] = y_encoded

X = data_cleaned.drop(columns=['class'])
y = data_cleaned['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

train_data = X_train.copy()
train_data['class'] = y_train
test_data = X_test.copy()
test_data['class'] = y_test

attributes = list(train_data.columns)
attributes.remove('class')

# Fungsi entropy, gain_ratio_numeric, gain_ratio_categorical, best_split, build_tree, predict

def entropy(data, target):
    values, counts = np.unique(data[target], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs + 1e-9))

def gain_ratio_numeric(data, attr, target, base_entropy):
    values = np.sort(data[attr].unique())
    if len(values) <= 1:
        return 0, None
    percentiles = np.percentile(values, [10, 30, 50, 70, 90])
    best_gain = 0
    best_threshold = None
    for threshold in percentiles:
        left = data[data[attr] <= threshold]
        right = data[data[attr] > threshold]
        if len(left) == 0 or len(right) == 0:
            continue
        left_entropy = entropy(left, target)
        right_entropy = entropy(right, target)
        weighted_entropy = (len(left) * left_entropy + len(right) * right_entropy) / len(data)
        info_gain = base_entropy - weighted_entropy
        split_info = -((len(left) / len(data)) * np.log2(len(left) / len(data) + 1e-9) +
                       (len(right) / len(data)) * np.log2(len(right) / len(data) + 1e-9))
        if split_info == 0:
            continue
        gain_ratio = info_gain / split_info
        if gain_ratio > best_gain:
            best_gain = gain_ratio
            best_threshold = threshold
    return best_gain, best_threshold

def gain_ratio_categorical(data, attr, target, base_entropy):
    values = data[attr].unique()
    weighted_entropy = 0
    split_info = 0
    for val in values:
        subset = data[data[attr] == val]
        if len(subset) == 0:
            continue
        subset_entropy = entropy(subset, target)
        weight = len(subset) / len(data)
        weighted_entropy += weight * subset_entropy
        split_info -= weight * np.log2(weight + 1e-9)
    info_gain = base_entropy - weighted_entropy
    if split_info == 0:
        return 0
    return info_gain / split_info

def best_split(data, attributes, target):
    base_entropy = entropy(data, target)
    best_attr = None
    best_threshold = None
    best_gain_ratio = 0
    is_numeric = False
    for attr in attributes:
        if data[attr].dtype in [np.int64, np.float64]:
            gain, threshold = gain_ratio_numeric(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = threshold
                is_numeric = True
        else:
            gain = gain_ratio_categorical(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = None
                is_numeric = False
    return best_attr, best_threshold, is_numeric

def build_tree(data, target, attributes, max_depth, current_depth=0):
    labels = data[target].unique()
    if len(labels) == 1 or len(data) == 0 or current_depth >= max_depth:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    attr, threshold, is_numeric = best_split(data, attributes, target)
    if attr is None:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    node = {'type': 'node', 'attribute': attr, 'is_numeric': is_numeric}
    if is_numeric:
        node['threshold'] = threshold
        node['left'] = build_tree(data[data[attr] <= threshold], target, attributes, max_depth, current_depth + 1)
        node['right'] = build_tree(data[data[attr] > threshold], target, attributes, max_depth, current_depth + 1)
    else:
        node['branches'] = {}
        for val in data[attr].unique():
            node['branches'][val] = build_tree(data[data[attr] == val], target, attributes, max_depth, current_depth + 1)
    return node

def predict(tree, sample):
    if tree['type'] == 'leaf':
        return tree['class']
    attr = tree['attribute']
    if tree['is_numeric']:
        if sample[attr] <= tree['threshold']:
            return predict(tree['left'], sample)
        else:
            return predict(tree['right'], sample)
    else:
        val = sample[attr]
        if val in tree['branches']:
            return predict(tree['branches'][val], sample)
        else:
            # Jika nilai kategori tidak ada di cabang, pilih kelas mayoritas leaf terdekat
            return 0

def weighted_sample(data, weights):
    n_samples = len(data)
    indices = np.random.choice(n_samples, size=n_samples, replace=True, p=weights)
    return data.iloc[indices].reset_index(drop=True)

def adaBoost(train_data, test_data, attributes, target='class', n_estimators=10, max_depth=3):
    n_samples = len(train_data)
    weights = np.ones(n_samples) / n_samples
    trees = []
    alphas = []

    start_time = time.time()

    for m in range(n_estimators):
        sample_data = weighted_sample(train_data, weights)
        tree = build_tree(sample_data, target, attributes, max_depth=max_depth)
        preds = np.array([predict(tree, row) for _, row in train_data.iterrows()])
        actual = train_data[target].values
        incorrect = (preds != actual).astype(int)
        error = np.sum(weights * incorrect) / np.sum(weights)
        if error > 0.5:
            continue
        if error == 0:
            error = 1e-10
        alpha = 0.5 * np.log((1 - error) / error)
        weights = weights * np.exp(-alpha * (1 - 2*incorrect))
        weights = weights / np.sum(weights)
        trees.append(tree)
        alphas.append(alpha)
        print(f"Pohon ke-{m+1}, max_depth={max_depth}, error={error:.4f}, alpha={alpha:.4f}")

    def predict_ensemble(sample):
        class_votes = {}
        for tree, alpha in zip(trees, alphas):
            pred = predict(tree, sample)
            class_votes[pred] = class_votes.get(pred, 0) + alpha
        return max(class_votes, key=class_votes.get)

    predictions = [predict_ensemble(row) for _, row in test_data.iterrows()]
    accuracy = accuracy_score(test_data[target], predictions)
    exec_time = time.time() - start_time

    return accuracy, exec_time, n_estimators, max_depth, predictions

# Jalankan AdaBoost dan tampilkan laporan klasifikasi + akurasi per max_depth
results = []
for depth in range(1, 16):
    acc, t_exec, n_trees, max_d, preds = adaBoost(train_data, test_data, attributes, target='class',
                                                  n_estimators=10, max_depth=depth)
    print(f"\n=== max_depth = {max_d} ===")
    print(f"Akurasi: {acc:.4f} ({acc*100:.2f}%)")
    print("Laporan Klasifikasi:")
    print(classification_report(test_data['class'], preds))
    results.append((max_d, acc, t_exec))

results_df = pd.DataFrame(results, columns=['max_depth', 'accuracy', 'execution_time'])
print(results_df)


Pohon ke-1, max_depth=1, error=0.3013, alpha=0.4207
Pohon ke-2, max_depth=1, error=0.3246, alpha=0.3663
Pohon ke-3, max_depth=1, error=0.4171, alpha=0.1673
Pohon ke-4, max_depth=1, error=0.4390, alpha=0.1226
Pohon ke-5, max_depth=1, error=0.4712, alpha=0.0576
Pohon ke-6, max_depth=1, error=0.5000, alpha=0.0000
Pohon ke-7, max_depth=1, error=0.4658, alpha=0.0686
Pohon ke-9, max_depth=1, error=0.4716, alpha=0.0568
Pohon ke-10, max_depth=1, error=0.4493, alpha=0.1018

=== max_depth = 1 ===
Akurasi: 0.7150 (71.50%)
Laporan Klasifikasi:
              precision    recall  f1-score   support

           0       1.00      0.03      0.07        59
           1       0.71      1.00      0.83       141

    accuracy                           0.71       200
   macro avg       0.86      0.52      0.45       200
weighted avg       0.80      0.71      0.61       200

Pohon ke-1, max_depth=2, error=0.3013, alpha=0.4207
Pohon ke-2, max_depth=2, error=0.3322, alpha=0.3492
Pohon ke-3, max_depth=2, error=

**Menggunakan Adaboot dengan split Data 70:30 Gunakan Program Ini karena ada kedalaman Pohon**

In [27]:
import pandas as pd
import numpy as np
import math
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings("ignore")

# Load dan preprocessing data
data = pd.read_excel("dataset_31_credit-g.xlsx")
data = data[['class', 'age', 'job', 'credit_amount', 'duration',
             'checking_status', 'purpose', 'savings_status', 'personal_status']].copy()
X = data.drop(columns=['class'])
y = data['class']

for col in X.columns:
    if X[col].dtype == 'object':
        try:
            X[col] = X[col].astype(float)
        except:
            pass

le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])

data_cleaned = X.copy()
data_cleaned['class'] = y.copy()
data_cleaned.fillna(data_cleaned.median(numeric_only=True), inplace=True)
for col in data_cleaned.select_dtypes(include='object'):
    data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)

y_encoded = LabelEncoder().fit_transform(data_cleaned['class'])
data_cleaned['class'] = y_encoded

X = data_cleaned.drop(columns=['class'])
y = data_cleaned['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

train_data = X_train.copy()
train_data['class'] = y_train
test_data = X_test.copy()
test_data['class'] = y_test

attributes = list(train_data.columns)
attributes.remove('class')

# Fungsi entropy, gain_ratio_numeric, gain_ratio_categorical, best_split, build_tree, predict

def entropy(data, target):
    values, counts = np.unique(data[target], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs + 1e-9))

def gain_ratio_numeric(data, attr, target, base_entropy):
    values = np.sort(data[attr].unique())
    if len(values) <= 1:
        return 0, None
    percentiles = np.percentile(values, [10, 30, 50, 70, 90])
    best_gain = 0
    best_threshold = None
    for threshold in percentiles:
        left = data[data[attr] <= threshold]
        right = data[data[attr] > threshold]
        if len(left) == 0 or len(right) == 0:
            continue
        left_entropy = entropy(left, target)
        right_entropy = entropy(right, target)
        weighted_entropy = (len(left) * left_entropy + len(right) * right_entropy) / len(data)
        info_gain = base_entropy - weighted_entropy
        split_info = -((len(left) / len(data)) * np.log2(len(left) / len(data) + 1e-9) +
                       (len(right) / len(data)) * np.log2(len(right) / len(data) + 1e-9))
        if split_info == 0:
            continue
        gain_ratio = info_gain / split_info
        if gain_ratio > best_gain:
            best_gain = gain_ratio
            best_threshold = threshold
    return best_gain, best_threshold

def gain_ratio_categorical(data, attr, target, base_entropy):
    values = data[attr].unique()
    weighted_entropy = 0
    split_info = 0
    for val in values:
        subset = data[data[attr] == val]
        if len(subset) == 0:
            continue
        subset_entropy = entropy(subset, target)
        weight = len(subset) / len(data)
        weighted_entropy += weight * subset_entropy
        split_info -= weight * np.log2(weight + 1e-9)
    info_gain = base_entropy - weighted_entropy
    if split_info == 0:
        return 0
    return info_gain / split_info

def best_split(data, attributes, target):
    base_entropy = entropy(data, target)
    best_attr = None
    best_threshold = None
    best_gain_ratio = 0
    is_numeric = False
    for attr in attributes:
        if data[attr].dtype in [np.int64, np.float64]:
            gain, threshold = gain_ratio_numeric(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = threshold
                is_numeric = True
        else:
            gain = gain_ratio_categorical(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = None
                is_numeric = False
    return best_attr, best_threshold, is_numeric

def build_tree(data, target, attributes, max_depth, current_depth=0):
    labels = data[target].unique()
    if len(labels) == 1 or len(data) == 0 or current_depth >= max_depth:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    attr, threshold, is_numeric = best_split(data, attributes, target)
    if attr is None:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    node = {'type': 'node', 'attribute': attr, 'is_numeric': is_numeric}
    if is_numeric:
        node['threshold'] = threshold
        node['left'] = build_tree(data[data[attr] <= threshold], target, attributes, max_depth, current_depth + 1)
        node['right'] = build_tree(data[data[attr] > threshold], target, attributes, max_depth, current_depth + 1)
    else:
        node['branches'] = {}
        for val in data[attr].unique():
            node['branches'][val] = build_tree(data[data[attr] == val], target, attributes, max_depth, current_depth + 1)
    return node

def predict(tree, sample):
    if tree['type'] == 'leaf':
        return tree['class']
    attr = tree['attribute']
    if tree['is_numeric']:
        if sample[attr] <= tree['threshold']:
            return predict(tree['left'], sample)
        else:
            return predict(tree['right'], sample)
    else:
        val = sample[attr]
        if val in tree['branches']:
            return predict(tree['branches'][val], sample)
        else:
            # Jika nilai kategori tidak ada di cabang, pilih kelas mayoritas leaf terdekat
            return 0

def weighted_sample(data, weights):
    n_samples = len(data)
    indices = np.random.choice(n_samples, size=n_samples, replace=True, p=weights)
    return data.iloc[indices].reset_index(drop=True)

def adaBoost(train_data, test_data, attributes, target='class', n_estimators=10, max_depth=3):
    n_samples = len(train_data)
    weights = np.ones(n_samples) / n_samples
    trees = []
    alphas = []

    start_time = time.time()

    for m in range(n_estimators):
        sample_data = weighted_sample(train_data, weights)
        tree = build_tree(sample_data, target, attributes, max_depth=max_depth)
        preds = np.array([predict(tree, row) for _, row in train_data.iterrows()])
        actual = train_data[target].values
        incorrect = (preds != actual).astype(int)
        error = np.sum(weights * incorrect) / np.sum(weights)
        if error > 0.5:
            continue
        if error == 0:
            error = 1e-10
        alpha = 0.5 * np.log((1 - error) / error)
        weights = weights * np.exp(-alpha * (1 - 2*incorrect))
        weights = weights / np.sum(weights)
        trees.append(tree)
        alphas.append(alpha)
        print(f"Pohon ke-{m+1}, max_depth={max_depth}, error={error:.4f}, alpha={alpha:.4f}")

    def predict_ensemble(sample):
        class_votes = {}
        for tree, alpha in zip(trees, alphas):
            pred = predict(tree, sample)
            class_votes[pred] = class_votes.get(pred, 0) + alpha
        return max(class_votes, key=class_votes.get)

    predictions = [predict_ensemble(row) for _, row in test_data.iterrows()]
    accuracy = accuracy_score(test_data[target], predictions)
    exec_time = time.time() - start_time

    return accuracy, exec_time, n_estimators, max_depth, predictions

# Jalankan AdaBoost dan tampilkan laporan klasifikasi + akurasi per max_depth
results = []
for depth in range(1, 16):
    acc, t_exec, n_trees, max_d, preds = adaBoost(train_data, test_data, attributes, target='class',
                                                  n_estimators=10, max_depth=depth)
    print(f"\n=== max_depth = {max_d} ===")
    print(f"Akurasi: {acc:.4f} ({acc*100:.2f}%)")
    print("Laporan Klasifikasi:")
    print(classification_report(test_data['class'], preds))
    results.append((max_d, acc, t_exec))

results_df = pd.DataFrame(results, columns=['max_depth', 'accuracy', 'execution_time'])
print(results_df)


Pohon ke-1, max_depth=1, error=0.2986, alpha=0.4271
Pohon ke-2, max_depth=1, error=0.3252, alpha=0.3649
Pohon ke-3, max_depth=1, error=0.4543, alpha=0.0916
Pohon ke-4, max_depth=1, error=0.4650, alpha=0.0701
Pohon ke-5, max_depth=1, error=0.4396, alpha=0.1215
Pohon ke-6, max_depth=1, error=0.4468, alpha=0.1069
Pohon ke-7, max_depth=1, error=0.4289, alpha=0.1431
Pohon ke-8, max_depth=1, error=0.4801, alpha=0.0398
Pohon ke-9, max_depth=1, error=0.4720, alpha=0.0560
Pohon ke-10, max_depth=1, error=0.4362, alpha=0.1284

=== max_depth = 1 ===
Akurasi: 0.7100 (71.00%)
Laporan Klasifikasi:
              precision    recall  f1-score   support

           0       0.58      0.15      0.24        91
           1       0.72      0.95      0.82       209

    accuracy                           0.71       300
   macro avg       0.65      0.55      0.53       300
weighted avg       0.68      0.71      0.65       300

Pohon ke-1, max_depth=2, error=0.2686, alpha=0.5009
Pohon ke-2, max_depth=2, error=

**Jangan Gunakan Program Ini**

In [24]:
import pandas as pd
import numpy as np
import math
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings("ignore")

# Load dan preprocessing data
data = pd.read_excel("dataset_31_credit-g.xlsx")
data = data[['class', 'age', 'job', 'credit_amount', 'duration',
             'checking_status', 'purpose', 'savings_status', 'personal_status']].copy()

X = data.drop(columns=['class'])
y = data['class']

for col in X.columns:
    if X[col].dtype == 'object':
        try:
            X[col] = X[col].astype(float)
        except:
            pass

le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])

data_cleaned = X.copy()
data_cleaned['class'] = y.copy()
data_cleaned.fillna(data_cleaned.median(numeric_only=True), inplace=True)
for col in data_cleaned.select_dtypes(include='object'):
    data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)

y_encoded = LabelEncoder().fit_transform(data_cleaned['class'])
data_cleaned['class'] = y_encoded

X = data_cleaned.drop(columns=['class'])
y = data_cleaned['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

train_data = X_train.copy()
train_data['class'] = y_train
test_data = X_test.copy()
test_data['class'] = y_test

attributes = list(train_data.columns)
attributes.remove('class')

# Definisi fungsi entropy, gain_ratio_numeric, gain_ratio_categorical, best_split, build_tree, predict
def entropy(data, target):
    values, counts = np.unique(data[target], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs + 1e-9))

def gain_ratio_numeric(data, attr, target, base_entropy):
    values = np.sort(data[attr].unique())
    if len(values) <= 1:
        return 0, None
    percentiles = np.percentile(values, [10, 30, 50, 70, 90])
    best_gain = 0
    best_threshold = None
    for threshold in percentiles:
        left = data[data[attr] <= threshold]
        right = data[data[attr] > threshold]
        if len(left) == 0 or len(right) == 0:
            continue
        left_entropy = entropy(left, target)
        right_entropy = entropy(right, target)
        weighted_entropy = (len(left) * left_entropy + len(right) * right_entropy) / len(data)
        info_gain = base_entropy - weighted_entropy
        split_info = -((len(left) / len(data)) * np.log2(len(left) / len(data) + 1e-9) +
                       (len(right) / len(data)) * np.log2(len(right) / len(data) + 1e-9))
        if split_info == 0:
            continue
        gain_ratio = info_gain / split_info
        if gain_ratio > best_gain:
            best_gain = gain_ratio
            best_threshold = threshold
    return best_gain, best_threshold

def gain_ratio_categorical(data, attr, target, base_entropy):
    values = data[attr].unique()
    weighted_entropy = 0
    split_info = 0
    for val in values:
        subset = data[data[attr] == val]
        if len(subset) == 0:
            continue
        subset_entropy = entropy(subset, target)
        weight = len(subset) / len(data)
        weighted_entropy += weight * subset_entropy
        split_info -= weight * np.log2(weight + 1e-9)
    info_gain = base_entropy - weighted_entropy
    if split_info == 0:
        return 0
    return info_gain / split_info

def best_split(data, attributes, target):
    base_entropy = entropy(data, target)
    best_attr = None
    best_threshold = None
    best_gain_ratio = 0
    is_numeric = False
    for attr in attributes:
        if data[attr].dtype in [np.int64, np.float64]:
            gain, threshold = gain_ratio_numeric(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = threshold
                is_numeric = True
        else:
            gain = gain_ratio_categorical(data, attr, target, base_entropy)
            if gain > best_gain_ratio:
                best_gain_ratio = gain
                best_attr = attr
                best_threshold = None
                is_numeric = False
    return best_attr, best_threshold, is_numeric

def build_tree(data, target, attributes, max_depth, current_depth=0):
    labels = data[target].unique()
    if len(labels) == 1 or len(data) == 0 or current_depth >= max_depth:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    attr, threshold, is_numeric = best_split(data, attributes, target)
    if attr is None:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    node = {'type': 'node', 'attribute': attr, 'is_numeric': is_numeric}
    if is_numeric:
        node['threshold'] = threshold
        node['left'] = build_tree(data[data[attr] <= threshold], target, attributes, max_depth, current_depth + 1)
        node['right'] = build_tree(data[data[attr] > threshold], target, attributes, max_depth, current_depth + 1)
    else:
        node['branches'] = {}
        for val in data[attr].unique():
            node['branches'][val] = build_tree(data[data[attr] == val], target, attributes, max_depth, current_depth + 1)
    return node

def predict(tree, sample):
    if tree['type'] == 'leaf':
        return tree['class']
    attr = tree['attribute']
    if tree['is_numeric']:
        if sample[attr] <= tree['threshold']:
            return predict(tree['left'], sample)
        else:
            return predict(tree['right'], sample)
    else:
        val = sample[attr]
        if val in tree['branches']:
            return predict(tree['branches'][val], sample)
        else:
            # Jika cabang tidak ada, kembalikan kelas mayoritas
            return 0

def weighted_sample(data, weights):
    n_samples = len(data)
    indices = np.random.choice(n_samples, size=n_samples, replace=True, p=weights)
    return data.iloc[indices].reset_index(drop=True)

def adaBoost(train_data, test_data, attributes, target='class', n_estimators=10, max_depth=3):
    n_samples = len(train_data)
    weights = np.ones(n_samples) / n_samples
    trees = []
    alphas = []

    start_time = time.time()

    for m in range(n_estimators):
        sample_data = weighted_sample(train_data, weights)
        tree = build_tree(sample_data, target, attributes, max_depth=max_depth)
        preds = np.array([predict(tree, row) for _, row in train_data.iterrows()])
        actual = train_data[target].values
        incorrect = (preds != actual).astype(int)
        error = np.sum(weights * incorrect) / np.sum(weights)
        if error > 0.5:
            continue
        if error == 0:
            error = 1e-10
        alpha = 0.5 * np.log((1 - error) / error)
        weights = weights * np.exp(-alpha * (1 - 2*incorrect))
        weights = weights / np.sum(weights)
        trees.append(tree)
        alphas.append(alpha)

    def predict_ensemble(sample):
        class_votes = {}
        for tree, alpha in zip(trees, alphas):
            pred = predict(tree, sample)
            class_votes[pred] = class_votes.get(pred, 0) + alpha
        return max(class_votes, key=class_votes.get)

    predictions = [predict_ensemble(row) for _, row in test_data.iterrows()]
    accuracy = accuracy_score(test_data[target], predictions)
    exec_time = time.time() - start_time

    # Cetak laporan klasifikasi
    print("Classification Report:\n")
    print(classification_report(test_data[target], predictions))
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Execution time: {exec_time:.2f} seconds\n")

    return accuracy, exec_time

# Jalankan AdaBoost sekali dengan max_depth yang diinginkan, misal 3
accuracy, exec_time = adaBoost(train_data, test_data, attributes, target='class', n_estimators=10, max_depth=3)


Classification Report:

              precision    recall  f1-score   support

           0       0.69      0.15      0.25        59
           1       0.73      0.97      0.84       141

    accuracy                           0.73       200
   macro avg       0.71      0.56      0.54       200
weighted avg       0.72      0.73      0.66       200

Accuracy: 0.7300
Execution time: 2.39 seconds



### **Data Traning 80:20 Dengan AdaBoots**

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import time

# ===============================
# === Fungsi Gain Ratio & Split ===
# ===============================
def entropy(data, target):
    values, counts = np.unique(data[target], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

def info_gain(data, attr, target):
    total_entropy = entropy(data, target)
    values, counts = np.unique(data[attr], return_counts=True)
    weighted_entropy = np.sum([(counts[i] / np.sum(counts)) *
                              entropy(data[data[attr] == values[i]], target)
                              for i in range(len(values))])
    return total_entropy - weighted_entropy

def split_info(data, attr):
    values, counts = np.unique(data[attr], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

def gain_ratio(data, attr, target):
    si = split_info(data, attr)
    if si == 0:
        return 0
    return info_gain(data, attr, target) / si

def best_split(data, attributes, target):
    best_attr = None
    best_gain_ratio = -1
    best_threshold = None
    is_numeric = False
    for attr in attributes:
        if data[attr].dtype in [np.int64, np.float64]:
            thresholds = data[attr].sort_values().unique()
            for t in thresholds[:-1]:
                data_copy = data.copy()
                data_copy[attr] = data_copy[attr] <= t
                gr = gain_ratio(data_copy, attr, target)
                if gr > best_gain_ratio:
                    best_gain_ratio = gr
                    best_attr = attr
                    best_threshold = t
                    is_numeric = True
        else:
            gr = gain_ratio(data, attr, target)
            if gr > best_gain_ratio:
                best_gain_ratio = gr
                best_attr = attr
                is_numeric = False
    return best_attr, best_threshold, is_numeric

# ==========================
# === Pembangunan Tree ===
# ==========================
def build_tree(data, target, attributes, max_depth=None, current_depth=0):
    if len(data) == 0:
        return None  # Data kosong, leaf kosong

    labels = data[target].unique()
    if len(labels) == 1 or len(data) == 0:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    if max_depth is not None and current_depth >= max_depth:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}

    attr, threshold, is_numeric = best_split(data, attributes, target)
    if attr is None:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}

    node = {'type': 'node', 'attribute': attr, 'is_numeric': is_numeric}
    if is_numeric:
        node['threshold'] = threshold
        node['left'] = build_tree(data[data[attr] <= threshold], target, attributes, max_depth, current_depth + 1)
        node['right'] = build_tree(data[data[attr] > threshold], target, attributes, max_depth, current_depth + 1)
    else:
        node['branches'] = {}
        for val in data[attr].unique():
            node['branches'][val] = build_tree(data[data[attr] == val], target, attributes, max_depth, current_depth + 1)
    return node

def get_tree_depth(tree):
    if tree is None:
        return 0
    if tree['type'] == 'leaf':
        return 0
    if tree['is_numeric']:
        return 1 + max(get_tree_depth(tree['left']), get_tree_depth(tree['right']))
    else:
        return 1 + max(get_tree_depth(branch) for branch in tree['branches'].values())

# =======================
# === Prediksi & Boost ===
# =======================
def predict(tree, row):
    if tree is None:
        return 0  # default jika tree kosong
    if tree['type'] == 'leaf':
        return tree['class']
    attr = tree['attribute']
    if tree['is_numeric']:
        if row[attr] <= tree['threshold']:
            return predict(tree['left'], row)
        else:
            return predict(tree['right'], row)
    else:
        val = row[attr]
        if val in tree['branches']:
            return predict(tree['branches'][val], row)
        else:
            # default ke cabang pertama jika val tidak ada
            return list(tree['branches'].values())[0]['class']

def convert_label(y):
    return np.array([1 if label == 1 else -1 for label in y])

def ada_boost_train(data, target, attributes, n_estimators=10, max_depth=None):
    n = len(data)
    weights = np.ones(n) / n
    trees = []
    alphas = []
    depths = []
    y_true = convert_label(data[target].values)

    for m in range(n_estimators):
        sample_indices = np.random.choice(n, size=n, replace=True, p=weights)
        sample_data = data.iloc[sample_indices].reset_index(drop=True)
        tree = build_tree(sample_data, target, attributes, max_depth=max_depth)
        trees.append(tree)

        depth = get_tree_depth(tree)
        depths.append(depth)

        preds = np.array([1 if predict(tree, row) == 1 else -1 for _, row in data.iterrows()])
        miss = (preds != y_true).astype(int)
        error = np.sum(weights * miss)

        if error == 0:
            alphas.append(1)
            break
        if error >= 0.5:
            break

        alpha = 0.5 * np.log((1 - error) / error)
        alphas.append(alpha)
        weights *= np.exp(-alpha * y_true * preds)
        weights /= np.sum(weights)
    return trees, alphas, depths

def ada_boost_predict(trees, alphas, row):
    weighted_votes = sum(alpha * (1 if predict(tree, row) == 1 else -1)
                         for tree, alpha in zip(trees, alphas))
    return 1 if weighted_votes > 0 else 0

# ====================
# === Load Dataset ===
# ====================
df = pd.read_excel("dataset_31_credit-g.xlsx")

# Pilih kolom relevan
data = df[['class', 'age', 'job', 'credit_amount', 'duration',
           'checking_status', 'purpose', 'savings_status', 'personal_status']].copy()

# Encoding kolom bertipe objek
for col in data.columns:
    if data[col].dtype == object:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])

# Pisahkan fitur dan target
X = data.drop(columns=['class'])
y = data['class']

# Gabungkan kembali fitur dan target ke satu dataframe agar fungsi bisa jalan
data = pd.concat([X, y], axis=1)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# ================================
# === Pelatihan & Evaluasi ===
# ================================
start = time.time()
attributes = list(train_data.columns)
attributes.remove('class')

# Set kedalaman maksimum pohon di sini
max_tree_depth = 5

trees, alphas, depths = ada_boost_train(train_data, 'class', attributes,
                                        n_estimators=10, max_depth=max_tree_depth)
end = time.time()

y_pred = [ada_boost_predict(trees, alphas, row) for _, row in test_data.iterrows()]
y_true = test_data['class'].values

print(f"\nAkurasi: {accuracy_score(y_true, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_true, y_pred))
print(f"Waktu eksekusi: {end - start:.2f} detik")

# ==============================
# === Tampilkan Kedalaman ===
# ==============================
print("\nKedalaman masing-masing pohon:")
for i, d in enumerate(depths, start=1):
    print(f"Pohon ke-{i}: kedalaman = {d}")



Akurasi: 0.7400

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.12      0.21        59
           1       0.73      1.00      0.84       141

    accuracy                           0.74       200
   macro avg       0.87      0.56      0.53       200
weighted avg       0.81      0.74      0.66       200

Waktu eksekusi: 44.20 detik

Kedalaman masing-masing pohon:
Pohon ke-1: kedalaman = 5
Pohon ke-2: kedalaman = 5
Pohon ke-3: kedalaman = 5
Pohon ke-4: kedalaman = 5
Pohon ke-5: kedalaman = 5
Pohon ke-6: kedalaman = 5
Pohon ke-7: kedalaman = 5
Pohon ke-8: kedalaman = 5
Pohon ke-9: kedalaman = 5
Pohon ke-10: kedalaman = 5


In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import time

# ===============================
# === Fungsi Gain Ratio & Split ===
# ===============================
def entropy(data, target):
    values, counts = np.unique(data[target], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

def info_gain(data, attr, target):
    total_entropy = entropy(data, target)
    values, counts = np.unique(data[attr], return_counts=True)
    weighted_entropy = np.sum([(counts[i] / np.sum(counts)) *
                              entropy(data[data[attr] == values[i]], target)
                              for i in range(len(values))])
    return total_entropy - weighted_entropy

def split_info(data, attr):
    values, counts = np.unique(data[attr], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

def gain_ratio(data, attr, target):
    si = split_info(data, attr)
    if si == 0:
        return 0
    return info_gain(data, attr, target) / si

def best_split(data, attributes, target):
    best_attr = None
    best_gain_ratio = -1
    best_threshold = None
    is_numeric = False
    for attr in attributes:
        if data[attr].dtype in [np.int64, np.float64]:
            thresholds = data[attr].sort_values().unique()
            for t in thresholds[:-1]:
                data_copy = data.copy()
                data_copy[attr] = data_copy[attr] <= t
                gr = gain_ratio(data_copy, attr, target)
                if gr > best_gain_ratio:
                    best_gain_ratio = gr
                    best_attr = attr
                    best_threshold = t
                    is_numeric = True
        else:
            gr = gain_ratio(data, attr, target)
            if gr > best_gain_ratio:
                best_gain_ratio = gr
                best_attr = attr
                is_numeric = False
    return best_attr, best_threshold, is_numeric

# ==========================
# === Pembangunan Tree ===
# ==========================
def build_tree(data, target, attributes, max_depth=None, current_depth=0):
    if len(data) == 0:
        return None  # Data kosong, leaf kosong

    labels = data[target].unique()
    if len(labels) == 1 or len(data) == 0:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}
    if max_depth is not None and current_depth >= max_depth:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}

    attr, threshold, is_numeric = best_split(data, attributes, target)
    if attr is None:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}

    node = {'type': 'node', 'attribute': attr, 'is_numeric': is_numeric}
    if is_numeric:
        node['threshold'] = threshold
        node['left'] = build_tree(data[data[attr] <= threshold], target, attributes, max_depth, current_depth + 1)
        node['right'] = build_tree(data[data[attr] > threshold], target, attributes, max_depth, current_depth + 1)
    else:
        node['branches'] = {}
        for val in data[attr].unique():
            node['branches'][val] = build_tree(data[data[attr] == val], target, attributes, max_depth, current_depth + 1)
    return node

def get_tree_depth(tree):
    if tree is None:
        return 0
    if tree['type'] == 'leaf':
        return 0
    if tree['is_numeric']:
        return 1 + max(get_tree_depth(tree['left']), get_tree_depth(tree['right']))
    else:
        return 1 + max(get_tree_depth(branch) for branch in tree['branches'].values())

# =======================
# === Prediksi & Boost ===
# =======================
def predict(tree, row):
    if tree is None:
        return 0  # default jika tree kosong
    if tree['type'] == 'leaf':
        return tree['class']
    attr = tree['attribute']
    if tree['is_numeric']:
        if row[attr] <= tree['threshold']:
            return predict(tree['left'], row)
        else:
            return predict(tree['right'], row)
    else:
        val = row[attr]
        if val in tree['branches']:
            return predict(tree['branches'][val], row)
        else:
            # default ke cabang pertama jika val tidak ada
            return list(tree['branches'].values())[0]['class']

def convert_label(y):
    return np.array([1 if label == 1 else -1 for label in y])

def ada_boost_train(data, target, attributes, n_estimators=10, max_depth=None):
    n = len(data)
    weights = np.ones(n) / n
    trees = []
    alphas = []
    depths = []
    y_true = convert_label(data[target].values)

    for m in range(n_estimators):
        sample_indices = np.random.choice(n, size=n, replace=True, p=weights)
        sample_data = data.iloc[sample_indices].reset_index(drop=True)
        tree = build_tree(sample_data, target, attributes, max_depth=max_depth)
        trees.append(tree)

        depth = get_tree_depth(tree)
        depths.append(depth)

        preds = np.array([1 if predict(tree, row) == 1 else -1 for _, row in data.iterrows()])
        miss = (preds != y_true).astype(int)
        error = np.sum(weights * miss)

        if error == 0:
            alphas.append(1)
            break
        if error >= 0.5:
            break

        alpha = 0.5 * np.log((1 - error) / error)
        alphas.append(alpha)
        weights *= np.exp(-alpha * y_true * preds)
        weights /= np.sum(weights)
    return trees, alphas, depths

def ada_boost_predict(trees, alphas, row):
    weighted_votes = sum(alpha * (1 if predict(tree, row) == 1 else -1)
                         for tree, alpha in zip(trees, alphas))
    return 1 if weighted_votes > 0 else 0

# ====================
# === Load Dataset ===
# ====================
df = pd.read_excel("dataset_31_credit-g.xlsx")

# Pilih kolom relevan
data = df[['class', 'age', 'job', 'credit_amount', 'duration',
           'checking_status', 'purpose', 'savings_status', 'personal_status']].copy()

# Encoding kolom bertipe objek
for col in data.columns:
    if data[col].dtype == object:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])

# Pisahkan fitur dan target
X = data.drop(columns=['class'])
y = data['class']

# Gabungkan kembali fitur dan target ke satu dataframe agar fungsi bisa jalan
data = pd.concat([X, y], axis=1)

train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# ================================
# === Pelatihan & Evaluasi ===
# ================================
start = time.time()
attributes = list(train_data.columns)
attributes.remove('class')

# Set kedalaman maksimum pohon di sini
max_tree_depth = 5

trees, alphas, depths = ada_boost_train(train_data, 'class', attributes,
                                        n_estimators=10, max_depth=max_tree_depth)
end = time.time()

y_pred = [ada_boost_predict(trees, alphas, row) for _, row in test_data.iterrows()]
y_true = test_data['class'].values

print(f"\nAkurasi: {accuracy_score(y_true, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_true, y_pred))
print(f"Waktu eksekusi: {end - start:.2f} detik")

# ==============================
# === Tampilkan Kedalaman ===
# ==============================
print("\nKedalaman masing-masing pohon:")
for i, d in enumerate(depths, start=1):
    print(f"Pohon ke-{i}: kedalaman = {d}")



Akurasi: 0.7000

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.02      0.04        91
           1       0.70      1.00      0.82       209

    accuracy                           0.70       300
   macro avg       0.68      0.51      0.43       300
weighted avg       0.69      0.70      0.59       300

Waktu eksekusi: 12.85 detik

Kedalaman masing-masing pohon:
Pohon ke-1: kedalaman = 5
Pohon ke-2: kedalaman = 5
Pohon ke-3: kedalaman = 5


In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import time

# ===============================
# === Fungsi Entropy, Gain Ratio ===
# ===============================
def entropy(data, target):
    values, counts = np.unique(data[target], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs + 1e-9))  # +epsilon agar tidak log(0)

def info_gain(data, attr, target):
    total_entropy = entropy(data, target)
    values, counts = np.unique(data[attr], return_counts=True)
    weighted_entropy = 0
    for i, val in enumerate(values):
        subset = data[data[attr] == val]
        weighted_entropy += (counts[i] / counts.sum()) * entropy(subset, target)
    return total_entropy - weighted_entropy

def split_info(data, attr):
    values, counts = np.unique(data[attr], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs + 1e-9))

def gain_ratio(data, attr, target):
    si = split_info(data, attr)
    if si == 0:
        return 0
    return info_gain(data, attr, target) / si

# ===============================
# === Fungsi untuk cari split terbaik ===
# ===============================
def best_split(data, attributes, target):
    best_attr = None
    best_threshold = None
    best_gain_ratio = -1
    is_numeric = False

    for attr in attributes:
        if data[attr].dtype in [np.int64, np.float64]:
            # Coba split dengan threshold untuk atribut numerik
            values = np.sort(data[attr].unique())
            for t in values[:-1]:
                data_copy = data.copy()
                # Ubah atribut jadi boolean <= threshold
                data_copy[attr] = data_copy[attr] <= t
                gr = gain_ratio(data_copy, attr, target)
                if gr > best_gain_ratio:
                    best_gain_ratio = gr
                    best_attr = attr
                    best_threshold = t
                    is_numeric = True
        else:
            # Atribut kategorikal
            gr = gain_ratio(data, attr, target)
            if gr > best_gain_ratio:
                best_gain_ratio = gr
                best_attr = attr
                best_threshold = None
                is_numeric = False
    return best_attr, best_threshold, is_numeric

# ===============================
# === Fungsi untuk membangun pohon ===
# ===============================
def build_tree(data, target, attributes, max_depth=None, current_depth=0):
    if len(data) == 0:
        return None

    # Jika semua target sama atau kedalaman max tercapai, buat leaf
    unique_targets = data[target].unique()
    if len(unique_targets) == 1 or (max_depth is not None and current_depth >= max_depth):
        values, counts = np.unique(data[target], return_counts=True)
        return {'type':'leaf', 'class': values[np.argmax(counts)]}

    attr, threshold, is_numeric = best_split(data, attributes, target)
    if attr is None:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type':'leaf', 'class': values[np.argmax(counts)]}

    node = {'type':'node', 'attribute': attr, 'is_numeric': is_numeric}
    if is_numeric:
        node['threshold'] = threshold
        node['left'] = build_tree(data[data[attr] <= threshold], target, attributes, max_depth, current_depth + 1)
        node['right'] = build_tree(data[data[attr] > threshold], target, attributes, max_depth, current_depth + 1)
    else:
        node['branches'] = {}
        for val in data[attr].unique():
            subset = data[data[attr] == val]
            node['branches'][val] = build_tree(subset, target, attributes, max_depth, current_depth + 1)

    return node

def get_tree_depth(tree):
    if tree is None:
        return 0
    if tree['type'] == 'leaf':
        return 0
    if tree['is_numeric']:
        return 1 + max(get_tree_depth(tree['left']), get_tree_depth(tree['right']))
    else:
        return 1 + max(get_tree_depth(branch) for branch in tree['branches'].values())

# ===============================
# === Fungsi prediksi dari pohon ===
# ===============================
def predict(tree, row):
    if tree['type'] == 'leaf':
        return tree['class']
    attr = tree['attribute']
    if tree['is_numeric']:
        if row[attr] <= tree['threshold']:
            return predict(tree['left'], row)
        else:
            return predict(tree['right'], row)
    else:
        val = row[attr]
        if val in tree['branches']:
            return predict(tree['branches'][val], row)
        else:
            # fallback jika cabang tidak ada
            return list(tree['branches'].values())[0]['class']

# ===============================
# === Fungsi bantu untuk AdaBoost ===
# ===============================
def convert_label(y):
    # Ubah label ke -1 dan 1 (kelas positif = 1)
    return np.array([1 if label == 1 else -1 for label in y])

def ada_boost_train(data, target, attributes, n_estimators=10, max_depth=None):
    n = len(data)
    weights = np.ones(n) / n
    trees = []
    alphas = []
    depths = []
    y_true = convert_label(data[target].values)

    for m in range(n_estimators):
        # Sampling data dengan probabilitas bobot
        sample_indices = np.random.choice(n, size=n, replace=True, p=weights)
        sample_data = data.iloc[sample_indices].reset_index(drop=True)

        # Bangun pohon dengan data sampling
        tree = build_tree(sample_data, target, attributes, max_depth=max_depth)
        trees.append(tree)

        # Simpan kedalaman pohon
        depths.append(get_tree_depth(tree))

        # Prediksi seluruh data training dengan pohon baru
        preds = np.array([1 if predict(tree, row) == 1 else -1 for _, row in data.iterrows()])
        miss = (preds != y_true).astype(int)
        error = np.sum(weights * miss)

        # Jika error 0 atau >= 0.5, hentikan
        if error == 0:
            alphas.append(1)
            break
        if error >= 0.5:
            break

        # Hitung alpha (berat pohon)
        alpha = 0.5 * np.log((1 - error) / error)
        alphas.append(alpha)

        # Update bobot data
        weights *= np.exp(-alpha * y_true * preds)
        weights /= np.sum(weights)

    return trees, alphas, depths

def ada_boost_predict(trees, alphas, row):
    vote = 0
    for tree, alpha in zip(trees, alphas):
        pred = 1 if predict(tree, row) == 1 else -1
        vote += alpha * pred
    return 1 if vote > 0 else 0

# ===============================
# === Load dan siapkan dataset ===
# ===============================
df = pd.read_excel("dataset_31_credit-g.xlsx")

# Pilih kolom yang relevan (ubah sesuai dataset kamu)
data = df[['class', 'age', 'job', 'credit_amount', 'duration',
           'checking_status', 'purpose', 'savings_status', 'personal_status']].copy()

# Encode kolom bertipe objek
for col in data.columns:
    if data[col].dtype == object:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])

# Pisah fitur dan target
X = data.drop(columns=['class'])
y = data['class']

# Gabungkan lagi untuk kemudahan proses
data = pd.concat([X, y], axis=1)

# Bagi data train-test
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# ===============================
# === Pelatihan AdaBoost ===
# ===============================
attributes = list(train_data.columns)
attributes.remove('class')

max_depth = 5  # Atur kedalaman pohon maksimal
n_estimators = 10  # Jumlah weak learner

start = time.time()
trees, alphas, depths = ada_boost_train(train_data, 'class', attributes,
                                        n_estimators=n_estimators, max_depth=max_depth)
end = time.time()

# ===============================
# === Prediksi data test ===
# ===============================
y_pred = [ada_boost_predict(trees, alphas, row) for _, row in test_data.iterrows()]
y_true = test_data['class'].values

# ===============================
# === Tampilkan hasil ===
# ===============================
print(f"Akurasi: {accuracy_score(y_true, y_pred):.4f}\n")
print("Classification Report:\n", classification_report(y_true, y_pred))
print(f"Waktu eksekusi: {end - start:.2f} detik\n")

print("Kedalaman pohon tiap estimator:")
for i, d in enumerate(depths, 1):
    print(f"Pohon ke-{i}: kedalaman = {d}")


Akurasi: 0.7150

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.03      0.07        59
           1       0.71      1.00      0.83       141

    accuracy                           0.71       200
   macro avg       0.86      0.52      0.45       200
weighted avg       0.80      0.71      0.61       200

Waktu eksekusi: 45.03 detik

Kedalaman pohon tiap estimator:
Pohon ke-1: kedalaman = 5
Pohon ke-2: kedalaman = 5
Pohon ke-3: kedalaman = 5
Pohon ke-4: kedalaman = 5
Pohon ke-5: kedalaman = 5
Pohon ke-6: kedalaman = 5
Pohon ke-7: kedalaman = 5
Pohon ke-8: kedalaman = 5
Pohon ke-9: kedalaman = 5
Pohon ke-10: kedalaman = 5


In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import time

# ===============================
# === Fungsi Gain Ratio & Split ===
# ===============================
def entropy(data, target):
    values, counts = np.unique(data[target], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs + 1e-9))  # +1e-9 supaya tidak log(0)

def info_gain(data, attr, target):
    total_entropy = entropy(data, target)
    values, counts = np.unique(data[attr], return_counts=True)
    weighted_entropy = np.sum([(counts[i] / np.sum(counts)) *
                              entropy(data[data[attr] == values[i]], target)
                              for i in range(len(values))])
    return total_entropy - weighted_entropy

def split_info(data, attr):
    values, counts = np.unique(data[attr], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs + 1e-9))

def gain_ratio(data, attr, target):
    si = split_info(data, attr)
    if si == 0:
        return 0
    return info_gain(data, attr, target) / si

def best_split(data, attributes, target):
    best_attr = None
    best_gain_ratio = -1
    best_threshold = None
    is_numeric = False
    for attr in attributes:
        if data[attr].dtype in [np.int64, np.float64]:
            thresholds = np.unique(data[attr].sort_values())
            for t in thresholds[:-1]:
                data_copy = data.copy()
                data_copy[attr] = data_copy[attr] <= t
                gr = gain_ratio(data_copy, attr, target)
                if gr > best_gain_ratio:
                    best_gain_ratio = gr
                    best_attr = attr
                    best_threshold = t
                    is_numeric = True
        else:
            gr = gain_ratio(data, attr, target)
            if gr > best_gain_ratio:
                best_gain_ratio = gr
                best_attr = attr
                is_numeric = False
    return best_attr, best_threshold, is_numeric

# ==========================
# === Pembangunan Tree (Modifikasi) ===
# ==========================
def build_tree(data, target, attributes, max_depth=None, current_depth=0):
    if len(data) == 0:
        return None  # Data kosong, leaf kosong

    # Jika max_depth tercapai, buat leaf dengan kelas mayoritas
    if max_depth is not None and current_depth >= max_depth:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}

    attr, threshold, is_numeric = best_split(data, attributes, target)
    # Jika tidak ada split yang bisa dilakukan, buat leaf
    if attr is None:
        values, counts = np.unique(data[target], return_counts=True)
        return {'type': 'leaf', 'class': values[np.argmax(counts)]}

    node = {'type': 'node', 'attribute': attr, 'is_numeric': is_numeric}

    if is_numeric:
        node['threshold'] = threshold
        node['left'] = build_tree(data[data[attr] <= threshold], target, attributes, max_depth, current_depth + 1)
        node['right'] = build_tree(data[data[attr] > threshold], target, attributes, max_depth, current_depth + 1)
    else:
        node['branches'] = {}
        for val in data[attr].unique():
            node['branches'][val] = build_tree(data[data[attr] == val], target, attributes, max_depth, current_depth + 1)
    return node

def get_tree_depth(tree):
    if tree is None:
        return 0
    if tree['type'] == 'leaf':
        return 0
    if tree['is_numeric']:
        return 1 + max(get_tree_depth(tree['left']), get_tree_depth(tree['right']))
    else:
        return 1 + max(get_tree_depth(branch) for branch in tree['branches'].values())

# =======================
# === Prediksi & Boost ===
# =======================
def predict(tree, row):
    if tree is None:
        return 0  # default jika tree kosong
    if tree['type'] == 'leaf':
        return tree['class']
    attr = tree['attribute']
    if tree['is_numeric']:
        if row[attr] <= tree['threshold']:
            return predict(tree['left'], row)
        else:
            return predict(tree['right'], row)
    else:
        val = row[attr]
        if val in tree['branches']:
            return predict(tree['branches'][val], row)
        else:
            # default ke cabang pertama jika val tidak ada
            return list(tree['branches'].values())[0]['class']

def convert_label(y):
    return np.array([1 if label == 1 else -1 for label in y])

def ada_boost_train(data, target, attributes, n_estimators=10, max_depth=None):
    n = len(data)
    weights = np.ones(n) / n
    trees = []
    alphas = []
    depths = []
    y_true = convert_label(data[target].values)

    for m in range(n_estimators):
        sample_indices = np.random.choice(n, size=n, replace=True, p=weights)
        sample_data = data.iloc[sample_indices].reset_index(drop=True)
        tree = build_tree(sample_data, target, attributes, max_depth=max_depth)
        trees.append(tree)

        depth = get_tree_depth(tree)
        depths.append(depth)

        preds = np.array([1 if predict(tree, row) == 1 else -1 for _, row in data.iterrows()])
        miss = (preds != y_true).astype(int)
        error = np.sum(weights * miss)

        if error == 0:
            alphas.append(1)
            break
        if error >= 0.5:
            break

        alpha = 0.5 * np.log((1 - error) / (error + 1e-9))  # +1e-9 supaya tidak pembagian nol
        alphas.append(alpha)
        weights *= np.exp(-alpha * y_true * preds)
        weights /= np.sum(weights)
    return trees, alphas, depths

def ada_boost_predict(trees, alphas, row):
    weighted_votes = sum(alpha * (1 if predict(tree, row) == 1 else -1)
                         for tree, alpha in zip(trees, alphas))
    return 1 if weighted_votes > 0 else 0

# ====================
# === Load Dataset ===
# ====================
df = pd.read_excel("dataset_31_credit-g.xlsx")

# Pilih kolom relevan
data = df[['class', 'age', 'job', 'credit_amount', 'duration',
           'checking_status', 'purpose', 'savings_status', 'personal_status']].copy()

# Encoding kolom bertipe objek
for col in data.columns:
    if data[col].dtype == object:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])

# Pisahkan fitur dan target
X = data.drop(columns=['class'])
y = data['class']

# Gabungkan kembali fitur dan target ke satu dataframe agar fungsi bisa jalan
data = pd.concat([X, y], axis=1)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# ================================
# === Pelatihan & Evaluasi ===
# ================================
start = time.time()
attributes = list(train_data.columns)
attributes.remove('class')

max_tree_depth = 5  # Set kedalaman maksimum pohon

trees, alphas, depths = ada_boost_train(train_data, 'class', attributes,
                                        n_estimators=10, max_depth=max_tree_depth)
end = time.time()

y_pred = [ada_boost_predict(trees, alphas, row) for _, row in test_data.iterrows()]
y_true = test_data['class'].values

print(f"\nAkurasi: {accuracy_score(y_true, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_true, y_pred))
print(f"Waktu eksekusi: {end - start:.2f} detik")

# ==============================
# === Tampilkan Kedalaman Pohon ===
# ==============================
print("\nKedalaman masing-masing pohon:")
for i, d in enumerate(depths, start=1):
    print(f"Pohon ke-{i}: kedalaman = {d}")



Akurasi: 0.7150

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.03      0.07        59
           1       0.71      1.00      0.83       141

    accuracy                           0.71       200
   macro avg       0.86      0.52      0.45       200
weighted avg       0.80      0.71      0.61       200

Waktu eksekusi: 19.98 detik

Kedalaman masing-masing pohon:
Pohon ke-1: kedalaman = 5
Pohon ke-2: kedalaman = 5
Pohon ke-3: kedalaman = 5
Pohon ke-4: kedalaman = 5
