The decision tree is built from scratch; however, I use some helper functions from other libraries: pandas for reading the data into a dataframe, random for shuffling the data before splitting, scipy.stats for the entropy function, and matplotlib for optionally plotting a chosen split.

In [1]:
import pandas as pd
import random
import scipy.stats
import sklearn.metrics
import matplotlib.pyplot as plt

Reading the data into a dataframe

In [2]:
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines="warn"` is its replacement: malformed rows are dropped and a
# warning is emitted for each one, which is what `error_bad_lines=False` did
# with the default `warn_bad_lines=True`.
original_data = pd.read_csv(
    "https://raw.githubusercontent.com/ccalantzis/decision_tree/master/HTRU_2.csv",
    on_bad_lines="warn",
)

Here I split the data into pulsars (data_1) and non-pulsars (data_0). I shuffle each group individually, take the first 90% of each shuffled group as training data, and keep the remaining 10% as testing data. Splitting the two classes separately keeps the proportion of positive samples the same in the training and testing data.

In [3]:
# Positional row numbers of the full dataset (not used in the rest of this cell).
labels = list(range(original_data.shape[0]))

# Boolean masks selecting the rows of each class.
data_0 = original_data.Class == 0
data_1 = original_data.Class == 1

# Index labels of the rows belonging to each class, in file order.
labels_0 = data_0[data_0].index.tolist()
labels_1 = data_1[data_1].index.tolist()

# Fixed seed so the split is reproducible; each class is shuffled on its own.
random.seed(503)
random.shuffle(labels_0)
random.shuffle(labels_1)

# The first 90% of every class goes to training, the rest to testing, so
# both sets keep the same class proportions.
split_point_0 = int(0.9*len(labels_0))
split_point_1 = int(0.9*len(labels_1))

train_labels = [*labels_0[:split_point_0], *labels_1[:split_point_1]]
test_labels = [*labels_0[split_point_0:], *labels_1[split_point_1:]]

# Mix the two classes together again inside each set.
random.shuffle(train_labels)
random.shuffle(test_labels)

# Independent copies, so columns added later do not touch original_data.
train_data = original_data.loc[train_labels].copy()
test_data = original_data.loc[test_labels].copy()

The class DecisionNode represents any node in the decision tree which has children

PredictionNode represents a node with no children i.e. it is a leaf node

In [4]:
class DecisionNode:
    """Internal tree node that tests one attribute against a threshold.

    Attributes:
        attribute: name of the column this node splits on.
        threshold: value the attribute is compared with.
        left_child: subtree for the "not above threshold" branch, or None
            until it is attached.
        right_child: subtree for the "above threshold" branch, or None
            until it is attached.
    """

    def __init__(self, attribute, threshold):
        self.attribute = attribute
        self.threshold = threshold
        # Start with explicit empty children so that inspecting a freshly
        # created node gives None instead of raising AttributeError.
        self.left_child = None
        self.right_child = None

    def addLeftChild(self, child):
        """Attach (or replace) the left child."""
        self.left_child = child

    def addRightChild(self, child):
        """Attach (or replace) the right child."""
        self.right_child = child

    def print_node(self):
        """Print the attribute and threshold of this node."""
        print("Attribute: ", self.attribute)
        print("Threshold: ", self.threshold)
        
class PredictionNode:
    """Leaf node that stores the class value predicted for its branch."""

    def __init__(self, value):
        self.value = value

    def prediction(self):
        """Return the stored prediction value."""
        return self.value

    def print_node(self):
        """Print the stored prediction value."""
        # Print the value itself; `self.prediction` without a call would
        # print the bound-method object rather than the prediction.
        print("Prediction: ", self.value)

split_column() creates a temporary column 'temp' based on the value of the provided column. If the value in the provided column is above the threshold value, the 'temp' value is 1, otherwise 0. This function is used in information_gain(), to calculate the information gain obtained from a particular variable split with a particular threshold value

In [5]:
def split_column(column_name, threshold, data):
    """Write a binary 'temp' column onto *data* in place.

    'temp' is 1 for rows whose *column_name* value is strictly greater than
    *threshold* and 0 for every other row. Any existing 'temp' column is
    overwritten. Nothing is returned.
    """
    is_above = data[column_name] > threshold
    data['temp'] = is_above.astype('int64')
    
def information_gain(data):
    """Return the information gain of the split stored in data['temp'].

    The gain is the entropy of data['Class'] minus the size-weighted
    entropies of the two partitions temp == 0 and temp == 1.

    Args:
        data: DataFrame with a 'Class' column and a binary 'temp' column.

    Returns:
        The information gain, or 0 when *data* is empty or contains a
        single class (no split can improve such a node).
    """
    total = data.shape[0]
    class_counts = data['Class'].value_counts()
    # An empty or pure node has nothing to gain. Checking the number of
    # distinct classes is the real purity test; comparing the modal label
    # with the row count is not, and fails outright on empty data.
    if total == 0 or len(class_counts) <= 1:
        return 0

    parent_entropy = scipy.stats.entropy(class_counts)

    weighted_child_entropy = 0.0
    for side in (0, 1):
        side_classes = data.loc[data['temp'] == side, 'Class']
        if side_classes.empty:
            # An empty partition has zero weight; skip it rather than ask
            # for the entropy of an empty distribution.
            continue
        side_entropy = scipy.stats.entropy(side_classes.value_counts())
        weighted_child_entropy += (side_classes.shape[0] / total) * side_entropy

    return parent_entropy - weighted_child_entropy

This takes one variable, tries 9 evenly spaced threshold values (the boundaries between 10 equal-width intervals spanning the variable's range) and finds which threshold value produces the largest information gain for the target variable

In [6]:
def max_info_gain_per_variable(column, data, n_intervals=10):
    """Find the best threshold for *column* among evenly spaced candidates.

    The range [min, max] of *column* is cut into *n_intervals* equal parts
    and each of the n_intervals - 1 interior boundaries is tried as a
    threshold.

    Args:
        column: name of the column to split on.
        data: DataFrame holding *column* and 'Class'. Its 'temp' column is
            (re)written for every candidate by split_column().
        n_intervals: number of equal-width intervals; the default of 10
            gives 9 candidate thresholds.

    Returns:
        A tuple (max_info_gain, threshold). Both are 0 when no candidate
        has a positive information gain.

    Raises:
        ValueError: if *n_intervals* is smaller than 2, because there would
            be no interior boundary to try.
    """
    if n_intervals < 2:
        raise ValueError("n_intervals must be at least 2")

    # The column minimum and the step are loop-invariant; compute them once.
    lowest = data[column].min()
    step = (data[column].max() - lowest)/n_intervals

    max_info_gain = 0
    max_info_gain_threshold = 0
    for i in range(1, n_intervals):
        threshold = lowest + i*step
        split_column(column, threshold, data)
        info_gain = information_gain(data)
        if info_gain > max_info_gain:
            max_info_gain = info_gain
            max_info_gain_threshold = threshold
    return (max_info_gain, max_info_gain_threshold)

This loops over all variables, finding the maximum information gain at any point in the decision tree. If the maximum information gain is below some minimum value, the function returns 0

In [7]:
def max_info_gain_overall(data, entropy, used_columns, min_value = 0.005, vis = False):
    """Pick the column/threshold pair with the highest information gain.

    Only the first eight columns of *data* are considered, and any column
    named in *used_columns* is skipped.

    Args:
        data: DataFrame to split; must contain a 'Class' column.
        entropy: accepted for the callers' sake; not read by this function.
        used_columns: column names that may not be chosen.
        min_value: smallest information gain that still justifies a split.
        vis: when true, show a scatter plot of the chosen column against
            Class with a vertical line at the chosen threshold.

    Returns:
        A DecisionNode for the best split, or 0 when the best information
        gain is below *min_value*.
    """
    best_gain, best_column, best_threshold = 0, '', 0

    for candidate in data.columns[:8]:
        if candidate in used_columns:
            continue
        gain, threshold = max_info_gain_per_variable(candidate, data)
        if gain > best_gain:
            best_gain, best_column, best_threshold = gain, candidate, threshold

    # Not worth splitting: signal this to the caller with the 0 sentinel.
    if best_gain < min_value:
        return 0

    best_node = DecisionNode(best_column, best_threshold)
    if vis:
        plt.scatter(data[best_column], data.Class)
        plt.axvline(x=best_threshold)
        plt.show()
    return best_node

build_subtree is a recursive function which builds the decision tree in pre-order.

In [8]:
def _grow_child(data_subset, parent_classes, level, used_columns):
    """Return the node for one branch, recursing when a split is worthwhile."""
    if data_subset.empty:
        # No sample reached this branch: fall back to the parent's majority class.
        return PredictionNode(parent_classes.mode()[0])

    entropy = scipy.stats.entropy(data_subset.Class.value_counts())
    child = max_info_gain_overall(data_subset, entropy, used_columns)
    if child == 0:
        # 0 is the "no useful split" sentinel: predict the majority class.
        return PredictionNode(data_subset.Class.mode()[0])

    # The attribute is unavailable only inside this child's own subtree, so
    # it is always removed again once that subtree is finished.
    used_columns.append(child.attribute)
    try:
        build_subtree(child, data_subset, level + 1, used_columns)
    finally:
        used_columns.pop()
    return child


def build_subtree(root, data, level, used_columns):
    """Recursively attach the left and right subtrees of *root*.

    Rows whose root attribute is strictly greater than the root threshold
    go to the right child; all remaining rows, including those equal to
    the threshold, go to the left child. This is the partition that
    split_column() scores and that make_prediction() follows.

    Args:
        root: DecisionNode whose children are to be built.
        data: DataFrame of the samples that reach *root*.
        level: depth of *root*; passed down incremented by one.
        used_columns: attributes already used on the path from the tree
            root to *root*. It is extended while a subtree is being built
            and holds the same entries again when this function returns.
    """
    goes_right = data[root.attribute] > root.threshold

    # left node, or false node
    left_subset = data.loc[~goes_right].copy(deep=True)
    root.addLeftChild(_grow_child(left_subset, data.Class, level, used_columns))

    # right node, or true node
    right_subset = data.loc[goes_right].copy(deep=True)
    root.addRightChild(_grow_child(right_subset, data.Class, level, used_columns))

check_subtree checks for redundant subtrees i.e. subtrees where every prediction node has the same value
This function is used by build_ID3_tree to prune the redundant subtrees

build_ID3_tree provides the root node and makes use of build_subtree to create the rest of the tree structure.

In [9]:
def check_subtree(node, values):
    """Collect the prediction values of every leaf below *node*.

    The values are appended to *values* (left subtree first, then right)
    and the same list is returned.
    """
    if type(node) is not PredictionNode:
        for branch in (node.left_child, node.right_child):
            check_subtree(branch, values)
        return values

    # Leaf reached: record its prediction.
    values.append(node.value)
    return values

def build_ID3_tree(data):
    """Build the decision tree for *data* and prune redundant subtrees.

    Args:
        data: training DataFrame; must contain a 'Class' column.

    Returns:
        The root DecisionNode of the pruned tree. When no split reaches the
        minimum information gain, a single PredictionNode holding the
        majority class is returned instead.
    """
    entropy = scipy.stats.entropy(data.Class.value_counts())
    used_columns = []
    root_node = max_info_gain_overall(data, entropy, used_columns)
    if root_node == 0:
        # 0 is the "no useful split" sentinel; it has no `.attribute`, so
        # the whole tree collapses to one leaf.
        return PredictionNode(data.Class.mode()[0])

    used_columns.append(root_node.attribute)
    build_subtree(root_node, data, 0, used_columns)

    # Pruning: replace every subtree whose leaves all predict the same
    # value with a single leaf. Only decision nodes are ever put on the
    # stack, because a child is pushed only when its leaves disagree.
    pending = [root_node]
    while pending:
        node = pending.pop()
        branches = (
            (node.right_child, node.addRightChild),
            (node.left_child, node.addLeftChild),
        )
        for child, attach in branches:
            leaf_values = set(check_subtree(child, []))
            if len(leaf_values) == 1:
                attach(PredictionNode(leaf_values.pop()))
            else:
                pending.append(child)

    return root_node
        
# Build (and prune) the decision tree from the training split.
ID3_tree = build_ID3_tree(train_data)

visualise_tree provides a visualisation of the tree structure using if-else statements

In [10]:
def visualise_tree(node, level):
    """Print the subtree rooted at *node* as nested IF / ELSE / THEN lines.

    *level* is the number of tab characters placed in front of this node's
    lines; children are printed one level deeper.
    """
    indent = level*"\t"
    branches = (
        ("IF ", " < ", node.left_child),
        ("ELSE ", " > ", node.right_child),
    )
    for keyword, comparison, child in branches:
        print(indent, keyword, node.attribute, comparison, node.threshold)
        if type(child) is PredictionNode:
            # Leaf: show the predicted value one level deeper.
            print((level+1)*"\t", "THEN ", child.value)
        else:
            visualise_tree(child, level + 1)
        
# Print the whole tree, starting at the root with no indentation.
visualise_tree(ID3_tree, 0)

 IF  Excess kurtosis of the integrated profile  <  1.1076487870999996
	 THEN  0
 ELSE  Excess kurtosis of the integrated profile  >  1.1076487870999996
	 IF  Skewness of the DM-SNR curve  <  100.04292962330001
		 THEN  1
	 ELSE  Skewness of the DM-SNR curve  >  100.04292962330001
		 IF  Mean of the DM-SNR curve  <  2.5133779262000004
			 THEN  0
		 ELSE  Mean of the DM-SNR curve  >  2.5133779262000004
			 THEN  1


In [11]:
def make_prediction(node, data_input):
    """Walk the tree from *node* and return the prediction for one sample.

    At every decision node the sample goes to the right child when its
    value for the node's attribute is strictly greater than the node's
    threshold, and to the left child otherwise. *data_input* is indexed by
    attribute name.
    """
    current = node
    while type(current) is not PredictionNode:
        goes_right = data_input[current.attribute] > current.threshold
        current = current.right_child if goes_right else current.left_child
    return current.prediction()

Finding the accuracy of the ID3_tree on the test data

In [12]:
# True class of every test row, in test_data's row order.
real_values = list(test_data.Class)

# Tree prediction for every test row, looked up by its index label.
pred_values = [make_prediction(ID3_tree, test_data.loc[i]) for i in test_labels]

# Accuracy = fraction of rows where the prediction equals the true class.
correct = sum(1 for r, p in zip(real_values, pred_values) if r == p) / float(len(pred_values))
print("Test set accuracy: ", correct)

Test set accuracy:  0.9743016759776536


In [13]:
# One row per test sample: the true class next to the predicted class.
df = pd.DataFrame({"real": real_values, "pred": pred_values})

# Cross-tabulate actual (rows) against predicted (columns) classes.
confusion_matrix = pd.crosstab(
    index=df['real'],
    columns=df['pred'],
    rownames=['Actual'],
    colnames=['Predicted'],
)
print (confusion_matrix)

Predicted     0    1
Actual              
0          1612   14
1            32  132
