In [155]:
import numpy as np

def import_data(path):
    """Load a whitespace-separated, three-column data file.

    Every non-blank line must contain exactly three fields: two feature
    values followed by a class label. Blank lines are skipped, and any run
    of whitespace (spaces or tabs) separates fields.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    data : numpy.ndarray
        String array with one row ``[x1, x2, y]`` per non-blank line.
    features : tuple of str
        The column names ``("x0", "x1", "y")``.

    Raises
    ------
    ValueError
        If a non-blank line does not hold exactly three fields.
    """
    rows = []
    with open(path) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) line, e.g. a trailing newline.
                continue
            x1, x2, y = fields
            # Values are deliberately kept as strings: the rest of this file
            # compares labels against '0'/'1' and builds tree keys by
            # concatenating the threshold text.
            rows.append([x1, x2, y])
    data = np.array(rows)
    features = ("x0", "x1", "y")
    return data, features

def calculate_entropy(data, ind):
    """Return the entropy, in bits, of the values found in one column.

    Parameters
    ----------
    data : numpy.ndarray
        Two-dimensional array; only column ``ind`` is read.
    ind : int
        Index of the column whose value distribution is measured.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct values of the column, where
        ``p`` is the fraction of rows holding that value. An empty column
        yields 0.
    """
    column = data[:, ind]
    _, counts = np.unique(column, return_counts=True)

    weighted_log_sum = 0
    for fraction in counts / len(column):
        weighted_log_sum += fraction * np.log(fraction)

    # Natural logs were used above; dividing by ln(2) converts to bits.
    return -weighted_log_sum / np.log(2)
        
def determine_candidate_numeric_split(data, ind):
    """List candidate thresholds for a numeric feature column.

    The rows are ordered by the numeric value of column ``ind``. Wherever
    two neighbouring rows in that order carry different labels (last
    column), the feature value of the later row becomes a candidate.

    Parameters
    ----------
    data : numpy.ndarray
        Two-dimensional array whose last column is the class label.
    ind : int
        Index of the feature column to examine.

    Returns
    -------
    list
        Candidate thresholds, taken unchanged from ``data`` (so they keep
        the array's element type) in ascending numeric order.
    """
    order = data[:, ind].astype(float).argsort()
    ordered = data[order]
    labels = ordered[:, -1]
    values = ordered[:, ind]
    return [
        values[pos]
        for pos in range(1, len(ordered))
        if labels[pos] != labels[pos - 1]
    ]
    
def split_data_set(data, ind, value):
    """
    Partition the rows of ``data`` around a numeric threshold.

    set 1 is >=
    set 2 is <

    The comparison is done on the numeric value of column ``ind``. Comparing
    the raw strings would order them lexicographically (``'10' < '9'``) and
    disagree with the float-sorted thresholds that
    ``determine_candidate_numeric_split`` produces.

    Parameters
    ----------
    data : numpy.ndarray
        Two-dimensional array; column ``ind`` must be convertible to float.
    ind : int
        Index of the feature column to compare.
    value : str or number
        Threshold; converted with ``float``.

    Returns
    -------
    (set1, set2) : tuple of numpy.ndarray
        Rows with column value ``>= value`` and rows with column value
        ``< value``. Rows are returned unchanged, in their original dtype.
    """
    column = data[:, ind].astype(float)
    threshold = float(value)
    set2 = data[column < threshold]
    set1 = data[column >= threshold]
    return set1, set2
            
    
def find_best_split(data):
    """Pick the feature/threshold pair with the highest information gain ratio.

    Every column except the last is treated as a feature, and the last column
    as the class label. For each feature, each threshold proposed by
    ``determine_candidate_numeric_split`` is tried. The resulting two-way
    split is scored as (label entropy - weighted entropy of the two parts)
    divided by the entropy of the split proportions.

    Parameters
    ----------
    data : numpy.ndarray
        Two-dimensional, non-empty array whose last column is the label.

    Returns
    -------
    (feature_index, threshold) : tuple
        The best-scoring pair, or ``(None, None)`` when no candidate achieves
        a gain ratio strictly greater than zero.
    """
    feature_count = len(data[0]) - 1
    total_rows = len(data)
    label_entropy = calculate_entropy(data, -1)

    top_ratio = 0
    top_feature = None
    top_threshold = None

    for feature in range(feature_count):
        for threshold in determine_candidate_numeric_split(data, feature):
            upper, lower = split_data_set(data, feature, threshold)

            # A split that leaves one side empty has zero split entropy and
            # cannot be scored, so it is skipped.
            if len(upper) == 0 or len(lower) == 0:
                continue

            weight_upper = float(len(upper)) / total_rows
            weight_lower = float(len(lower)) / total_rows

            remaining_entropy = weight_upper * calculate_entropy(upper, -1) + \
                                weight_lower * calculate_entropy(lower, -1)

            split_entropy = -weight_upper * np.log(weight_upper) \
                            - weight_lower * np.log(weight_lower)
            split_entropy = split_entropy / np.log(2)
            if split_entropy == 0:
                continue

            gain_ratio = (label_entropy - remaining_entropy) / split_entropy
            # Strict comparison: on ties the first candidate found wins.
            if gain_ratio > top_ratio:
                top_ratio = gain_ratio
                top_feature = feature
                top_threshold = threshold

    return top_feature, top_threshold
    
def determine_majority_class_label(class_labels):
    """Return whichever of the labels '0' and '1' occurs more often.

    Parameters
    ----------
    class_labels : numpy.ndarray
        Array of label strings; only entries equal to '0' or '1' are counted.

    Returns
    -------
    str
        '0' when '0' is strictly more frequent than '1', otherwise '1'
        (so ties, and inputs containing neither label, give '1').
    """
    zeros = np.count_nonzero(class_labels == '0')
    ones = np.count_nonzero(class_labels == '1')
    return '0' if zeros > ones else '1'

def create_tree(data, features):
    """Recursively build a decision tree as nested dictionaries.

    Parameters
    ----------
    data : numpy.ndarray
        Two-dimensional array whose last column is the class label.
    features : sequence of str
        Column names; ``features[i]`` names feature column ``i``.

    Returns
    -------
    str or dict
        A leaf is a class label string. An internal node is
        ``{feature_name: {'>= T': subtree, '< T': subtree}}`` where ``T`` is
        the text of the chosen threshold.
    """
    if len(data) == 0:
        return '1'  # no majority class

    best_feature_ind, threshold = find_best_split(data)
    # Identity test: index 0 is a valid feature and must not be mistaken for
    # "no split found".
    if best_feature_ind is None:
        return determine_majority_class_label(data[:, -1])

    feature_name = features[best_feature_ind]
    set1, set2 = split_data_set(data, best_feature_ind, threshold)

    # str() keeps the keys identical for string thresholds while also
    # accepting numeric ones, which plain '+' concatenation would reject.
    threshold_text = str(threshold)
    branches = {}
    branches['>= ' + threshold_text] = create_tree(set1, features)
    branches['< ' + threshold_text] = create_tree(set2, features)
    return {feature_name: branches}

# Load the D3leaves data set and build the decision tree from it.
data, features = import_data('./D3leaves.txt')
tree = create_tree(data, features)

In [151]:
# Scratch check on the D1 data set: m is the data array, l the column names.
m, l = import_data("./D1.txt")
#determine_candidate_numeric_split(m, 1)
# s1 receives the chosen feature index, s2 the chosen threshold.
s1, s2 = find_best_split(m)

In [156]:
import pydot


{'x1': {'>= 2': '1', '< 2': {'x0': {'>= 10': '1', '< 10': '0'}}}}

In [136]:
# Scratch check on the small test file: print the best (feature index, threshold).
m, l = import_data("./test.txt")
#m, l = import_data("./D1.txt")
s1, s2 = find_best_split(m)
print(s1, s2)

0 0.4


In [137]:
# NOTE(review): the D1 load is immediately overwritten by the test.txt load
# on the next line, so only test.txt is actually used by this cell.
m, l = import_data("./D1.txt")
m, l = import_data("./test.txt")
# Split on feature 0 at threshold '0.1'; the cell displays the (>=, <) pair.
split_data_set(m, 0, '0.1')
#determine_candidate_numeric_split(m, 1)

(array([['0.4', '0.5', '1'],
        ['0.1', '0.7', '0'],
        ['0.3', '0.3', '0'],
        ['0.3', '0.4', '0']], dtype='<U3'),
 array([], shape=(0, 3), dtype='<U3'))

In [138]:
# Count the rows of m whose third column equals the string '0'.
np.count_nonzero(m[:,2] == '0')

3

In [145]:
# Scratch check: the natural log of zero evaluates to -inf.
np.log(0)

  np.log(0)


-inf

In [147]:
# Ask NumPy to raise on every kind of floating-point error; the call returns
# the previous error-handling settings, which the cell displays.
np.seterr(all='raise')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}