In [20]:
import numpy as np

def gini(y):
    """Return the Gini impurity of the labels in ``y``.

    The impurity is ``1 - sum(p_k ** 2)`` where ``p_k`` is the fraction of
    entries of ``y`` equal to the k-th distinct label.

    Parameters
    ----------
    y : array-like
        Class labels; anything ``np.unique`` accepts.

    Returns
    -------
    float
        0.0 for an empty or single-class input, approaching 1.0 as the
        labels spread evenly over many classes.
    """
    _, counts = np.unique(y, return_counts=True)
    total = counts.sum()
    if total == 0:
        # No samples: report "pure" instead of the maximal impurity (1.0)
        # that the bare formula yields for an empty sum.
        return 0.0
    proportions = counts / total
    return 1 - (proportions ** 2).sum()
def entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    Labels are counted with ``np.unique`` (as ``gini`` does), so negative,
    floating-point and string labels are accepted in addition to
    non-negative integers.

    Parameters
    ----------
    y : array-like
        Class labels; anything ``np.unique`` accepts.

    Returns
    -------
    float
        ``-sum(p_k * log2(p_k))`` over the distinct labels; 0.0 for an empty
        or single-class input.
    """
    _, counts = np.unique(y, return_counts=True)
    total = counts.sum()
    if total == 0:
        return 0.0
    probabilities = counts / total
    # np.unique only reports labels that actually occur, so every probability
    # is strictly positive and log2 is finite. Adding 0.0 turns the -0.0 that
    # a pure node produces into a plain 0.0.
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0

def information_gain(parent, left_idxs, right_idxs):
    """Return the entropy reduction obtained by splitting ``parent`` in two.

    ``left_idxs`` and ``right_idxs`` are used to index ``parent`` and select
    the labels that fall on each side of the split. The result is the parent
    entropy minus the size-weighted average of the two child entropies.
    When either side is empty the function returns 0.
    """
    left_labels, right_labels = parent[left_idxs], parent[right_idxs]
    entropy_before = entropy(parent)

    total = len(parent)
    left_count = len(left_labels)
    right_count = len(right_labels)
    if not (left_count and right_count):
        # A split that leaves one side empty separates nothing.
        return 0

    left_share = left_count / total
    right_share = right_count / total
    entropy_after = left_share * entropy(left_labels) + right_share * entropy(right_labels)
    return entropy_before - entropy_after

In [21]:
def split(X_column, threshold):
    """Partition the positions of ``X_column`` around ``threshold``.

    Returns a pair ``(left, right)`` of index arrays: ``left`` holds the
    positions whose value is ``<= threshold`` and ``right`` the positions
    whose value is ``> threshold``. Positions that satisfy neither comparison
    (e.g. NaN) appear in neither array.
    """
    at_or_below = X_column <= threshold
    above = X_column > threshold
    return np.nonzero(at_or_below)[0], np.nonzero(above)[0]


In [22]:
def best_split(X, y):
    """Search every column of ``X`` for the threshold with the highest gain.

    Each distinct value of each column is tried as a threshold. Candidates
    that leave one side empty are skipped. The first candidate whose
    information gain strictly exceeds the best seen so far (starting from -1)
    is kept.

    Returns ``(feature_index, threshold)``, or ``(None, None)`` when no
    candidate produced two non-empty sides. The chosen split is also printed.
    """
    best_idx = best_thresh = None
    top_gain = -1

    n_features = X.shape[1]
    for feature_index in range(n_features):
        column = X[:, feature_index]
        for candidate in np.unique(column):
            left_rows, right_rows = split(column, candidate)
            if not len(left_rows) or not len(right_rows):
                continue
            candidate_gain = information_gain(y, left_rows, right_rows)
            if candidate_gain > top_gain:
                top_gain, best_idx, best_thresh = candidate_gain, feature_index, candidate

    # Diagnostic trace, emitted once per call.
    print(f"Best split: feature={best_idx}, threshold={best_thresh}, type={type(best_thresh)}")
    return best_idx, best_thresh


In [23]:
class Node:
    """A single vertex of the decision tree.

    ``build_tree`` creates leaves with only ``value`` set and internal nodes
    with ``feature``, ``threshold``, ``left`` and ``right`` set; ``predict``
    treats any node whose ``value`` is not None as a leaf.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule: column index and cut-off value.
        self.feature, self.threshold = feature, threshold
        # Child subtrees for the "<= threshold" and "> threshold" sides.
        self.left, self.right = left, right
        # Prediction stored at a leaf; None on internal nodes.
        self.value = value

def _majority_label(y):
    """Return the most frequent label in ``y`` (the smallest label wins ties)."""
    labels, counts = np.unique(y, return_counts=True)
    return labels[np.argmax(counts)]


def build_tree(X, y, depth=0, max_depth=3):
    """Recursively grow a decision tree on ``X`` / ``y``.

    Parameters
    ----------
    X : 2-D numpy array
        Feature matrix; rows are samples, columns are features.
    y : 1-D numpy array
        Labels aligned with the rows of ``X``. Must support integer-array
        indexing, because the rows are partitioned with index arrays.
    depth : int
        Depth of the node being built; callers normally leave this at 0.
    max_depth : int or None
        Maximum depth of the tree. ``None`` means no limit. The limit is
        forwarded to every recursive call so that a non-default value applies
        to the whole tree, not just the root.

    Returns
    -------
    Node
        A leaf holding the majority label when the labels are pure, the depth
        limit is reached, or no split separates the rows; otherwise an
        internal node with the chosen feature, threshold and two subtrees.

    Raises
    ------
    ValueError
        If ``y`` is empty.
    """
    if len(y) == 0:
        raise ValueError("cannot build a tree from an empty label array")

    depth_exhausted = max_depth is not None and depth >= max_depth
    if depth_exhausted or len(np.unique(y)) == 1:
        return Node(value=_majority_label(y))

    feature, threshold = best_split(X, y)
    if feature is None:
        # Every candidate threshold left one side empty: nothing to split on.
        return Node(value=_majority_label(y))

    left_idxs, right_idxs = split(X[:, feature], threshold)
    # Pass max_depth down explicitly; otherwise the children fall back to the
    # default limit and ignore the caller's setting.
    left = build_tree(X[left_idxs], y[left_idxs], depth + 1, max_depth)
    right = build_tree(X[right_idxs], y[right_idxs], depth + 1, max_depth)
    return Node(feature, threshold, left, right)


In [24]:
def predict(tree, x):
    """Return the label stored at the leaf that sample ``x`` reaches.

    Starting at ``tree``, the walk goes to ``left`` when
    ``x[node.feature] <= node.threshold`` and to ``right`` otherwise, until it
    hits a node whose ``value`` is not None; that value is returned.
    """
    node = tree
    while node.value is None:
        goes_left = x[node.feature] <= node.threshold
        node = node.left if goes_left else node.right
    return node.value


In [25]:
import pandas as pd
# Load the raw dataset; the relative path is resolved against the kernel's
# current working directory.
datas = pd.read_csv("data.csv")


In [26]:
# Encode the yes/no columns as 1/0 so they can go into a numeric feature matrix.
YES_NO_CODES = {"yes": 1, "no": 0}
for column in ("card", "owner", "selfemp"):
    # Series.map turns every value missing from the dict into NaN, so running
    # it a second time on the already-encoded 1/0 column would wipe the data.
    # Skipping numeric columns makes this cell safe to re-run.
    if not pd.api.types.is_numeric_dtype(datas[column]):
        datas[column] = datas[column].map(YES_NO_CODES)
print(datas.head())

   card  reports       age  income     share  expenditure  owner  selfemp  \
0     1        0  37.66667  4.5200  0.033270   124.983300      1        0   
1     1        0  33.25000  2.4200  0.005217     9.854167      0        0   
2     1        0  33.66667  4.5000  0.004156    15.000000      1        0   
3     1        0  30.50000  2.5400  0.065214   137.869200      0        0   
4     1        0  32.16667  9.7867  0.067051   546.503300      1        0   

   dependents  months  majorcards  active  
0           3      54           1      12  
1           3      34           1      13  
2           4      58           1       5  
3           0      25           1       7  
4           2      64           1       5  


In [27]:
# Target is the 'card' column; every other column is a predictor.
y = datas['card']
X = datas.drop(columns=['card'])

# Grow the tree on the full dataset (default depth limit).
tree = build_tree(X.to_numpy(), y.to_numpy())

# Predict the same rows the tree was fitted on, so these are training-set
# predictions rather than an estimate of out-of-sample performance.
predictions = [predict(tree, sample) for sample in X.to_numpy()]


Best split: feature=4, threshold=0.0, type=<class 'numpy.float64'>
Best split: feature=0, threshold=0.0, type=<class 'numpy.float64'>
Best split: feature=10, threshold=1.0, type=<class 'numpy.float64'>
Best split: feature=1, threshold=48.25, type=<class 'numpy.float64'>


In [28]:
# Sanity checks on the model inputs: column dtypes, missing-value counts for
# the predictors and the target, and a preview of the first rows.
print(X.dtypes, X.isnull().sum(), y.isnull().sum(), X.head(), sep="\n")


reports          int64
age            float64
income         float64
share          float64
expenditure    float64
owner            int64
selfemp          int64
dependents       int64
months           int64
majorcards       int64
active           int64
dtype: object
reports        0
age            0
income         0
share          0
expenditure    0
owner          0
selfemp        0
dependents     0
months         0
majorcards     0
active         0
dtype: int64
0
   reports       age  income     share  expenditure  owner  selfemp  \
0        0  37.66667  4.5200  0.033270   124.983300      1        0   
1        0  33.25000  2.4200  0.005217     9.854167      0        0   
2        0  33.66667  4.5000  0.004156    15.000000      1        0   
3        0  30.50000  2.5400  0.065214   137.869200      0        0   
4        0  32.16667  9.7867  0.067051   546.503300      1        0   

   dependents  months  majorcards  active  
0           3      54           1      12  
1           3   