In [4]:
import numpy as np
import pandas as pd
class DecisionTree:
    """A single decision node of a binary decision tree.

    The node holds the column index (``feature_best``) and cut-off value
    (``threshold_best``) of its split, plus references to its two children.

    Attributes:
        threshold_best: Cut-off value of the split stored on this node.
        feature_best: Column index the split is applied to.
        left: Child node for the left branch, or ``None`` if absent.
        right: Child node for the right branch, or ``None`` if absent.
    """

    def __init__(self, threshold_best, feature_best):
        # Record the split first, then start with no children attached;
        # they are assigned after construction.
        self.threshold_best, self.feature_best = threshold_best, feature_best
        self.left = self.right = None

def entropy(labels):
    """Compute the Shannon entropy (base 2, i.e. in bits) of ``labels``.

    Args:
        labels: Sequence or array of class labels.

    Returns:
        ``0`` for an empty input; otherwise ``-sum(p * log2(p))`` over the
        relative frequency ``p`` of each distinct label.
    """
    total = len(labels)
    if not total:
        return 0

    class_counts = np.unique(labels, return_counts=True)[1]
    p = class_counts / total
    # Every label reported by np.unique occurs at least once, so p > 0 and
    # log2 is never evaluated at zero.
    return -np.sum(p * np.log2(p))

def find_best_split(data, target):
    """Search every column and every distinct value for the best binary split.

    Each distinct value ``v`` of each column is tried as the threshold of the
    test ``column <= v``. Candidates that leave one side empty are skipped.
    The winner is the candidate with the largest information gain (parent
    entropy minus the size-weighted entropy of the two sides); on equal gain
    the candidate found first is kept.

    Args:
        data: 2-D array indexed as ``data[:, column]``.
        target: Array of labels aligned with the rows of ``data``.

    Returns:
        ``(best_threshold, best_feature)``, or ``(None, None)`` when no
        candidate achieves a strictly positive gain.
    """
    n_features = data.shape[1]
    parent_entropy = entropy(target)
    n_samples = len(target)

    # (gain, threshold, feature index) of the best candidate seen so far.
    best = (0, None, None)

    for col_idx in range(n_features):
        column = data[:, col_idx]
        for candidate in np.unique(column):
            goes_left = column <= candidate
            left, right = target[goes_left], target[~goes_left]

            # A split that sends every sample to one side separates nothing.
            if not (len(left) and len(right)):
                continue

            children_entropy = (len(left) / n_samples * entropy(left) +
                                len(right) / n_samples * entropy(right))
            gain = parent_entropy - children_entropy

            if gain > best[0]:
                best = (gain, candidate, col_idx)

    return best[1], best[2]

def _majority_label(labels):
    """Return the most frequent value in ``labels``, or ``None`` if it is empty.

    Ties go to the value that sorts first, because ``np.unique`` returns its
    values sorted and ``np.argmax`` picks the first maximum.
    """
    if len(labels) == 0:
        return None
    values, counts = np.unique(labels, return_counts=True)
    winner = values[np.argmax(counts)]
    # Hand back a plain Python scalar (e.g. ``str``) rather than a NumPy one.
    return winner.item() if isinstance(winner, np.generic) else winner


def build_tree(data, target, depth=1, max_depth=5):
    """Recursively grow a decision tree from ``data`` / ``target``.

    Args:
        data: 2-D array of samples, indexed as ``data[:, feature]``.
        target: Array of labels aligned with the rows of ``data``.
        depth: Depth of the node being built (the root is 1).
        max_depth: Deepest level at which a split node may be created.

    Returns:
        A ``DecisionTree`` node, or ``None`` when no node is created: the
        depth limit is exceeded, ``target`` holds a single class, or
        ``find_best_split`` finds no usable split.

    Because a missing child is represented by ``None``, every node also gets
    ``left_label`` and ``right_label``: the majority class of the samples
    routed to each side. ``predict`` returns these when it runs off the tree,
    so a pure "Fraud" partition is predicted as "Fraud".
    """
    if depth > max_depth or len(np.unique(target)) == 1:
        return None

    threshold_best, feature_best = find_best_split(data, target)

    if threshold_best is None or feature_best is None:
        return None

    node = DecisionTree(threshold_best, feature_best)

    mask = data[:, feature_best] <= threshold_best
    data_left, target_left = data[mask], target[mask]
    data_right, target_right = data[~mask], target[~mask]

    # Remember what each side looks like so a missing child still has an answer.
    node.left_label = _majority_label(target_left)
    node.right_label = _majority_label(target_right)

    node.left = build_tree(data_left, target_left, depth + 1, max_depth)
    node.right = build_tree(data_right, target_right, depth + 1, max_depth)

    return node


def predict(tree, data):
    """Classify one sample by walking ``tree`` from the root.

    Args:
        tree: Root node as returned by ``build_tree``, or ``None``.
        data: 1-D sequence of feature values, indexed by ``feature_best``.

    Returns:
        ``None`` when ``tree`` is ``None``. Otherwise the label stored for the
        branch where the walk ends (``left_label`` / ``right_label``). Nodes
        without those attributes fall back to ``"Non-Fraud"``, the value that
        was previously returned for every missing child.
    """
    if tree is None:
        return None

    node = tree
    while True:
        if data[node.feature_best] <= node.threshold_best:
            child = node.left
            label = getattr(node, "left_label", "Non-Fraud")
        else:
            child = node.right
            label = getattr(node, "right_label", "Non-Fraud")

        if child is None:
            return label
        node = child



In [6]:
# Load the assignment dataset into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv("DecisionTreeAssignment.csv")

In [7]:
# Bare expression as the cell's last line: Jupyter renders the DataFrame
# (a large frame is shown as a truncated preview of its first and last rows).
df

Unnamed: 0,distance_from_last_transaction,ratio_to_median_purchase_price,used_pin_number,online_order,fraud
0,2.131956,6.358667,0,1,1
1,3.803057,1.872950,0,1,1
2,15.694986,0.855623,0,1,1
3,26.711462,4.603601,0,1,1
4,10.664474,4.886521,0,1,1
...,...,...,...,...,...
380753,6.094647,13.979082,0,0,0
380754,5.222036,0.690188,0,0,0
380755,9.312581,3.021285,0,1,0
380756,6.517671,0.358166,0,1,0


In [8]:
# Sample usage: a toy dataset of four samples (rows) with five numeric
# feature values each.
data = np.array([[2, 0.7, 1, 1, 0],
                 [5, 0.5, 0, 0, 1],
                 [1, 0.8, 1, 1, 0],
                 [3, 0.6, 0, 1, 1]])

# One class label per row of `data`, in the same order.
target = np.array(["Fraud", "Non-Fraud", "Fraud", "Non-Fraud"])

# Fit a tree on the toy data and show the split stored on its root node.
tree = build_tree(data, target)
print("Threshold Best:", tree.threshold_best)
print("Feature Best:", tree.feature_best)

# Sample prediction: classify one new point with the fitted tree.
new_data_point = np.array([4, 0.65, 1, 0, 1])
prediction = predict(tree, new_data_point)
print("Prediction:", prediction)


Threshold Best: 2.0
Feature Best: 0
Prediction: Non-Fraud
