In [None]:
def check_purity(data):
    """Return True when every row of ``data`` carries the same label.

    The label is read from the last column of the 2-D array ``data``.
    An array with no rows has zero distinct labels and is reported as
    not pure.
    """
    distinct_labels = np.unique(data[:, -1])
    return distinct_labels.size == 1
def classify_data(data):
    """Return the most frequent label found in the last column of ``data``.

    ``np.unique`` yields the labels in sorted order and ``argmax`` picks the
    first maximum, so a tie is resolved in favour of the smallest label.
    """
    labels, frequencies = np.unique(data[:, -1], return_counts=True)
    return labels[np.argmax(frequencies)]
def get_potential_splits(data):
    """Map each feature column index to its sorted unique values.

    Every column of the 2-D array ``data`` except the last one (the label)
    is treated as a feature; the unique values of a column are the candidate
    split values for that column.
    """
    _, total_columns = data.shape
    feature_indices = range(total_columns - 1)  # last column holds the label
    return {index: np.unique(data[:, index]) for index in feature_indices}
def calculate_entropy(data):
    """Return the Shannon entropy (in bits) of the labels in ``data``.

    The labels are read from the last column of the 2-D array ``data``.
    The result is always a plain ``float``; an array with no rows has no
    uncertainty and yields ``0.0``.
    """
    _, counts = np.unique(data[:, -1], return_counts=True)
    if counts.size == 0:
        return 0.0
    probabilities = counts / counts.sum()
    # Every probability is strictly positive (it comes from an observed
    # count), so log2 never sees zero. Sum in NumPy rather than with the
    # builtin ``sum`` to keep the reduction vectorised.
    return float(np.sum(probabilities * -np.log2(probabilities)))

def calculate_overall_entropy(data_below, data_above):
    """Return the entropy of a split, weighted by partition size.

    Each partition's entropy (from ``calculate_entropy``) is multiplied by
    the fraction of rows that fell into it, and the two products are added.
    """
    total_rows = len(data_below) + len(data_above)
    weighted_entropies = [
        (len(partition) / total_rows) * calculate_entropy(partition)
        for partition in (data_below, data_above)
    ]
    return weighted_entropies[0] + weighted_entropies[1]
def determine_best_split(data, potential_splits):
    """Return the ``(column_index, value)`` split with the lowest entropy.

    Args:
        data: 2-D array whose last column holds the labels.
        potential_splits: mapping of feature column index to an iterable of
            candidate split values (as produced by ``get_potential_splits``).

    Returns:
        A ``(best_split_column, best_split_value)`` tuple.

    Raises:
        ValueError: if ``potential_splits`` offers no candidate value at all,
            so there is nothing to choose from.
    """
    # Start from +inf instead of a magic number so any finite entropy wins.
    lowest_entropy = float("inf")
    best_split_column = None
    best_split_value = None

    for column_index, candidate_values in potential_splits.items():
        for value in candidate_values:
            data_below, data_above = split_data(
                data, split_column=column_index, split_value=value
            )
            current_entropy = calculate_overall_entropy(data_below, data_above)
            # "<=" (not "<") means that among equally good candidates the
            # one visited last is kept.
            if current_entropy <= lowest_entropy:
                lowest_entropy = current_entropy
                best_split_column = column_index
                best_split_value = value

    if best_split_column is None:
        raise ValueError("no candidate split available in potential_splits")
    return best_split_column, best_split_value
def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` on one feature column.

    The feature's kind is looked up in the module-level ``FEATURE_TYPES``.
    A "continuous" feature sends rows with ``value <= split_value`` to the
    first partition and rows with ``value > split_value`` to the second.
    Any other feature kind sends rows equal to ``split_value`` to the first
    partition and rows not equal to it to the second.

    Returns:
        A ``(data_below, data_above)`` tuple of row subsets of ``data``.
    """
    feature_values = data[:, split_column]

    if FEATURE_TYPES[split_column] != "continuous":
        matching_rows = data[feature_values == split_value]
        other_rows = data[feature_values != split_value]
        return matching_rows, other_rows

    rows_at_or_below = data[feature_values <= split_value]
    rows_above = data[feature_values > split_value]
    return rows_at_or_below, rows_above
def decision_tree_algorithm(df, counter=0, min_samples=2, max_depth=5):
    """Recursively build a decision tree from labelled data.

    Args:
        df: on the top-level call (``counter == 0``) a DataFrame whose last
            column is the label; on recursive calls the 2-D array of rows
            belonging to the current node.
        counter: current recursion depth; callers normally leave it at 0.
        min_samples: a node with fewer rows than this becomes a leaf.
        max_depth: a node reached at this depth becomes a leaf.

    Returns:
        Either a leaf (the majority label of the node) or a dict of the form
        ``{question: [yes_answer, no_answer]}`` where each answer is again a
        leaf or a nested dict.

    Side effects:
        The top-level call stores the column headers and feature types in
        the module globals ``COLUMN_HEADERS`` and ``FEATURE_TYPES``.
    """
    global COLUMN_HEADERS, FEATURE_TYPES

    if counter == 0:
        # First call: remember the schema, then work on the raw array.
        COLUMN_HEADERS = df.columns
        FEATURE_TYPES = determine_type_of_feature(df)
        data = df.values
    else:
        data = df

    # Stop conditions: pure node, too few rows, or depth limit reached.
    stop_here = check_purity(data) or len(data) < min_samples or counter == max_depth
    if stop_here:
        return classify_data(data)

    next_depth = counter + 1
    candidate_splits = get_potential_splits(data)
    split_column, split_value = determine_best_split(data, candidate_splits)
    data_below, data_above = split_data(data, split_column, split_value)

    # A split that leaves one side empty separates nothing; make a leaf.
    if not len(data_below) or not len(data_above):
        return classify_data(data)

    feature_name = COLUMN_HEADERS[split_column]
    if FEATURE_TYPES[split_column] == "continuous":
        template = "{} <= {}"
    else:
        template = "{} = {}"
    question = template.format(feature_name, split_value)

    yes_answer = decision_tree_algorithm(data_below, next_depth, min_samples, max_depth)
    no_answer = decision_tree_algorithm(data_above, next_depth, min_samples, max_depth)

    # Identical answers on both sides make the question redundant.
    if yes_answer == no_answer:
        return yes_answer
    return {question: [yes_answer, no_answer]}
def predict_example(example, tree):
    """Classify one example by walking ``tree`` from the root to a leaf.

    Args:
        example: mapping-like object (e.g. a DataFrame row or a dict) that
            is indexed by feature name.
        tree: either a leaf value or a dict ``{question: [yes, no]}`` whose
            question has the form ``"<feature> <= <number>"`` or
            ``"<feature> = <value>"``.

    Returns:
        The leaf reached for ``example``.

    Raises:
        ValueError: if a question matches neither supported form, or the
            threshold of a ``<=`` question is not a number.
    """
    # The tree can be a bare leaf (e.g. the training data was already pure),
    # in which case it is the prediction itself.
    if not isinstance(tree, dict):
        return tree

    question = next(iter(tree))

    # Split on the operator with its surrounding spaces rather than on every
    # space, so feature names and categorical values may contain spaces.
    feature_name, operator, threshold = question.rpartition(" <= ")
    if operator:
        answer_is_yes = example[feature_name] <= float(threshold)
    else:
        feature_name, operator, category = question.partition(" = ")
        if not operator:
            raise ValueError("unrecognised question format: {!r}".format(question))
        # Categorical values are compared by their string form because the
        # question only stores the value as text.
        answer_is_yes = str(example[feature_name]) == category

    answer = tree[question][0] if answer_is_yes else tree[question][1]
    return predict_example(example, answer)

def decision_tree_predictions(test_df, tree):
    """Predict a label for every row of ``test_df`` using ``tree``.

    Each row is passed to ``predict_example`` together with ``tree``; the
    result is whatever ``DataFrame.apply`` assembles from the per-row
    predictions.
    """
    def predict_row(row):
        return predict_example(row, tree)

    return test_df.apply(predict_row, axis=1)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Toy training set: two numeric features plus a binary label per row.
data = dict(
    feature_1=[2.77, 1.72, 3.67, 3.96, 2.99, 7.49, 9.00, 7.44],
    feature_2=[1.78, 1.72, 2.81, 2.61, 2.14, 3.16, 3.29, 0.56],
    label=[0, 0, 0, 0, 0, 1, 1, 1],
)

# The label must stay the last column; the tree code reads it from there.
df = pd.DataFrame(data)

# Placeholder feature-type detector: replace it with real detection logic
# when the data contains categorical columns.
def determine_type_of_feature(df):
    """Label every column of ``df`` except one as "continuous".

    Returns a list with ``df.shape[1] - 1`` entries, i.e. one per column
    other than the label column.
    """
    feature_count = df.shape[1] - 1
    return ["continuous" for _ in range(feature_count)]

# Fit a tree of depth at most 3 on the sample data and show it.
tree = decision_tree_algorithm(df, max_depth=3)
print("Decision Tree:", tree)

# Classify a few unseen points with the fitted tree.
test_data = dict(
    feature_1=[4, 6, 1.5],
    feature_2=[2, 3, 1.5],
)
test_df = pd.DataFrame(test_data)
predictions = decision_tree_predictions(test_df, tree)
print("Predictions:", predictions)


Decision Tree: {'feature_1 <= 3.96': [0.0, 1.0]}
Predictions: 0    1.0
1    1.0
2    0.0
dtype: float64
