The decision tree is built from scratch; however, I use some helper functions from other libraries: pandas for reading the data into a dataframe, random for shuffling the data before splitting, scipy.stats for the entropy function, sklearn.metrics for the F1 score, and matplotlib.pyplot for optionally plotting a chosen split.

In [1]:
import pandas as pd
import random
import scipy.stats
import sklearn.metrics
import matplotlib.pyplot as plt

Reading the data into a dataframe

In [2]:
# Each of the two curves contributes the same four summary statistics, in
# this order; the ninth and last column is the target label.
_CURVES = ("integrated profile", "DM-SNR curve")
_STATISTICS = ("Mean", "Standard deviation", "Excess kurtosis", "Skewness")

column_names = [
    statistic + " of the " + curve + "."
    for curve in _CURVES
    for statistic in _STATISTICS
]
column_names.append("Class")

# The column names are supplied explicitly through `names`.
original_data = pd.read_csv('HTRU_2.csv', names=column_names)

Here I split the data into the pulsars (data_1) and not pulsars (data_0). I then shuffle these individually, and take the first 90% of the shuffled data_0 and data_1, and use this for training data. The remaining 10% is for testing data. I split them individually so I have an equal proportion of positive samples in the training and testing data.

In [3]:
labels = list(range(original_data.shape[0]))

# Boolean masks for each class, and the row labels each mask selects.
data_0 = original_data.Class == 0
data_1 = original_data.Class == 1
labels_0 = data_0[data_0].index.tolist()
labels_1 = data_1[data_1].index.tolist()

# Fixed seed so the split is reproducible. Each class is shuffled on its own
# so that the 90/10 cut below is applied per class (stratified split).
random.seed(503)
for class_labels in (labels_0, labels_1):
    random.shuffle(class_labels)

split_point_0 = int(0.9 * len(labels_0))
split_point_1 = int(0.9 * len(labels_1))

# First 90% of each class goes to training, the remainder to testing.
train_labels = labels_0[:split_point_0] + labels_1[:split_point_1]
test_labels = labels_0[split_point_0:] + labels_1[split_point_1:]

# Mix the two classes together again inside each split.
for mixed_labels in (train_labels, test_labels):
    random.shuffle(mixed_labels)

# Deep copies, so later in-place changes to the splits do not touch original_data.
train_data = original_data.loc[train_labels].copy(deep=True)
test_data = original_data.loc[test_labels].copy(deep=True)

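split_column() creates a temporary column 'temp' in the given dataframe (the dataframe is modified in place) based on the value of the provided column. If the value in the provided column is above the threshold value, the 'temp' value is 1, otherwise 0. split_column() is called just before information_gain(), which reads the 'temp' column to calculate the information gain obtained from splitting on a particular variable at a particular threshold value. information_gain() returns 0 for any subset that holds fewer than 1% of the rows of the original dataset, so such small subsets are never split further.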

In [4]:
def split_column(column_name, threshold, data):
    """Flag, in place, which rows of ``data`` lie above ``threshold``.

    Writes (or overwrites) an integer column ``'temp'`` on ``data``: 1 where
    ``data[column_name] > threshold`` and 0 everywhere else. Nothing is
    returned; the result is read back from ``data['temp']``.
    """
    above_threshold = data[column_name] > threshold
    data['temp'] = above_threshold.astype('int64')
    
def information_gain(entropy, data):
    """Return the information gain of the split stored in ``data['temp']``.

    ``entropy`` is the entropy of ``data.Class`` before splitting. The gain is
    that entropy minus the size-weighted entropies of the two groups
    ``temp == 0`` and ``temp == 1``.

    Returns 0 without computing anything when ``data`` has fewer rows than 1%
    of the module-level ``original_data``.
    """
    total_rows = data.shape[0]
    if total_rows < 0.01 * original_data.shape[0]:
        return 0

    def weighted_entropy(flag):
        # Class labels of the rows on one side of the split.
        classes = data.loc[data['temp'] == flag, 'Class']
        return (len(classes) / total_rows) * scipy.stats.entropy(classes.value_counts())

    return entropy - (weighted_entropy(0) + weighted_entropy(1))

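This takes one variable, divides its range of values into 10 equal intervals, and evaluates the 9 interior boundaries as candidate threshold values. It returns the largest information gain found for the target variable together with the threshold that produced it.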

In [5]:
def max_info_gain_per_variable(column, data):
    """Find the best of nine evenly spaced thresholds for one column.

    The candidates are ``min + i * (max - min) / 10`` for ``i`` in 1..9. Each
    one is applied with ``split_column`` (which rewrites ``data['temp']``)
    and scored with ``information_gain``.

    Returns a tuple ``(best_gain, best_threshold)``; it is ``(0, 0)`` when no
    candidate has a gain strictly greater than zero.
    """
    current_entropy = scipy.stats.entropy(data.Class.value_counts())
    lowest = data[column].min()
    step = (data[column].max() - lowest) / 10

    best = (0, 0)
    for threshold in (lowest + i * step for i in range(1, 10)):
        split_column(column, threshold, data)
        info_gain = information_gain(current_entropy, data)
        if info_gain > best[0]:
            best = (info_gain, threshold)
    return best

This loops over all variables, finding the maximum information gain at any point in the decision tree. If the maximum information gain is below some minimum value, the function returns 0

In [6]:
def max_info_gain_overall(data, entropy, min_value = 0.005, vis = False):
    """Pick the column/threshold pair with the highest information gain.

    Parameters:
        data: dataframe whose first eight columns are the candidate features
            and which has a ``Class`` column.
        entropy: accepted for call compatibility; not used here, because
            ``max_info_gain_per_variable`` computes the entropy itself.
        min_value: smallest gain for which a split is still made.
        vis: when true, scatter-plot the chosen column against ``Class`` with
            a vertical line at the chosen threshold.

    Returns:
        A ``Node`` describing the best split, or the integer 0 when the best
        gain is below ``min_value`` (callers test the result with ``== 0``).
    """
    max_info_gain = 0
    max_info_gain_column = ''
    max_info_gain_threshold = 0
    # Only the first eight columns are scanned; 'Class' and the scratch
    # column 'temp' come after them and must not be used as features.
    for column in data.columns[:8]:
        info_gain, threshold = max_info_gain_per_variable(column, data)
        if info_gain > max_info_gain:
            max_info_gain = info_gain
            max_info_gain_column = column
            max_info_gain_threshold = threshold
    if max_info_gain < min_value:
        return 0
    max_info_node = Node(max_info_gain_column, max_info_gain_threshold, max_info_gain)
    if vis:
        plt.scatter(data[max_info_gain_column], data.Class)
        plt.axvline(x=max_info_gain_threshold)
        # Render only when something was drawn; previously show() ran on
        # every call, even with vis=False.
        plt.show()
    return max_info_node

The class Node represents any node in the decision tree which has children
PredictionNode represents a node with no children i.e. it is a leaf node
DecisionTree represents a collection of nodes, beginning with the root node. This class keeps track of how many nodes are in the tree

In [7]:
class Node:
    """Internal decision node: tests one column against a threshold.

    Attributes:
        column: name of the column this node splits on.
        threshold: value the column is compared with.
        info_gain: information gain achieved by this split.
        left_child / right_child: the attached subtrees, or None until
            addLeftChild / addRightChild has been called.
    """

    def __init__(self, column, threshold, info_gain):
        self.column = column
        self.threshold = threshold
        self.info_gain = info_gain
        # Start with no children so the attributes always exist; reading them
        # before attachment used to raise AttributeError.
        self.left_child = None
        self.right_child = None

    def addLeftChild(self, child):
        """Attach ``child`` as the left subtree."""
        self.left_child = child

    def addRightChild(self, child):
        """Attach ``child`` as the right subtree."""
        self.right_child = child

    def print_node(self):
        """Print the column, threshold and information gain of this node."""
        print("Column: ", self.column)
        print("Threshold: ", self.threshold)
        print("Info gain: ", self.info_gain)
        
class PredictionNode:
    """Leaf node holding the class value predicted for rows that reach it."""

    def __init__(self, value):
        self.value = value

    def prediction(self):
        """Return the stored class value."""
        return self.value

    def print_node(self):
        """Print the predicted class value."""
        # Call prediction(); printing the bare attribute would show the
        # bound-method repr instead of the value.
        print("Prediction: ", self.prediction())
        
class DecisionTree:
    """A tree identified by its root node, plus a running count of its nodes."""

    def __init__(self, root_node):
        self.root_node = root_node
        # The root itself is the first node.
        self.count = 1

    def addNode(self):
        """Record that one more node has been added to the tree."""
        self.count += 1

    def countNodes(self):
        """Return the number of nodes recorded so far."""
        # Must read the instance attribute; the bare name `count` is
        # undefined and raised NameError.
        return self.count

build_subtree is a recursive function which builds the decision tree in pre-order.
build_ID3_tree provides the root node and makes use of build_subtree to create the rest of the tree structure.
build_subtree also prints a visual representation of the tree structure

In [8]:
def build_subtree(root, data, tree, level):
    """Recursively grow the subtree below ``root`` and print it as it is built.

    Parameters:
        root: the ``Node`` whose children are to be created.
        data: the rows that reach ``root``.
        tree: the ``DecisionTree`` being built; ``addNode`` is called once
            for every child created.
        level: current depth, used only to indent the printed rules.

    Rows are partitioned the same way ``split_column`` and
    ``make_prediction`` route them: values strictly above the threshold go
    right and everything else goes left. Earlier the left branch used ``<``,
    so rows exactly equal to the threshold were dropped from both children.

    For each side the best further split is searched for. If none is worth
    making, the child becomes a ``PredictionNode`` holding the most frequent
    class of that side; otherwise the recursion continues one level deeper.
    """
    goes_right = data[root.column] > root.threshold
    branches = (
        ("IF ", " <= ", ~goes_right, root.addLeftChild),    # left / false branch
        ("ELSE ", " > ", goes_right, root.addRightChild),   # right / true branch
    )
    for keyword, operator, mask, attach in branches:
        # Deep copy: the split search adds a scratch 'temp' column in place.
        data_subset = data.loc[mask].copy(deep=True)
        entropy = scipy.stats.entropy(data_subset.Class.value_counts())
        child = max_info_gain_overall(data_subset, entropy)
        tree.addNode()
        print(level*"\t", keyword, root.column, operator, root.threshold)
        if child == 0:
            # max_info_gain_overall returns 0 when no split is worthwhile.
            majority_class = data_subset.Class.mode()[0]
            attach(PredictionNode(majority_class))
            print((level+1)*"\t", "THEN ", majority_class)
        else:
            attach(child)
            build_subtree(child, data_subset, tree, level + 1)

def build_ID3_tree(data):
    """Build a decision tree for ``data`` and return it as a ``DecisionTree``.

    The root split is chosen with ``max_info_gain_overall`` and the rest of
    the tree is grown with ``build_subtree``, which prints the rules as it
    goes. Note that the split search adds a scratch ``'temp'`` column to
    ``data`` in place.

    If no split reaches the minimum information gain, the tree consists of a
    single ``PredictionNode`` holding the most frequent class. Earlier this
    case crashed, because the integer 0 was passed on as the root node.

    Raises:
        ValueError: if ``data`` has no rows.
    """
    if data.shape[0] == 0:
        raise ValueError("cannot build a decision tree from an empty dataset")

    entropy = scipy.stats.entropy(data.Class.value_counts())
    root_node = max_info_gain_overall(data, entropy)
    if root_node == 0:
        # max_info_gain_overall returns 0 when no split is worthwhile.
        return DecisionTree(PredictionNode(data.Class.mode()[0]))

    decision_tree = DecisionTree(root_node)
    build_subtree(root_node, data, decision_tree, 0)

    return decision_tree
        
# Build the tree from the training split; the tree structure is printed while it is built.
ID3_tree = build_ID3_tree(train_data)

 IF  Excess kurtosis of the integrated profile.  <  1.1076487870999996
	 IF  Excess kurtosis of the integrated profile.  <  0.5105479293999999
		 THEN  0
	 ELSE  Excess kurtosis of the integrated profile.  >  0.5105479293999999
		 IF  Standard deviation of the DM-SNR curve.  <  27.827414752
			 THEN  0
		 ELSE  Standard deviation of the DM-SNR curve.  >  27.827414752
			 IF  Standard deviation of the integrated profile.  <  41.827887837999995
				 THEN  0
			 ELSE  Standard deviation of the integrated profile.  >  41.827887837999995
				 IF  Excess kurtosis of the integrated profile.  <  0.7474258104
					 THEN  0
				 ELSE  Excess kurtosis of the integrated profile.  >  0.7474258104
					 THEN  1
 ELSE  Excess kurtosis of the integrated profile.  >  1.1076487870999996
	 IF  Excess kurtosis of the integrated profile.  <  2.5004606028
		 IF  Excess kurtosis of the integrated profile.  <  1.5249815067
			 IF  Standard deviation of the DM-SNR curve.  <  17.5339579053
				 THEN  0
			 ELSE

In [9]:
def make_prediction(decision_tree, data_input):
    """Return the class predicted by ``decision_tree`` for one sample.

    ``data_input`` is indexed by column name (for example one dataframe row).
    Starting at the root, the walk goes right when the sample's value for the
    node's column is strictly above the node's threshold and left otherwise,
    until a ``PredictionNode`` is reached.
    """
    node = decision_tree.root_node
    # isinstance rather than an exact type comparison, so subclasses of
    # PredictionNode are also recognised as leaves.
    while not isinstance(node, PredictionNode):
        if data_input[node.column] > node.threshold:
            node = node.right_child
        else:
            node = node.left_child
    return node.prediction()

Finding the accuracy of the ID3_tree on the test data

In [10]:
# True labels, in the same row order as test_labels.
real_values = test_data.Class.tolist()

# One prediction per test row.
pred_values = [make_prediction(ID3_tree, test_data.loc[i]) for i in test_labels]

# Fraction of test rows whose prediction matches the true label.
matches = sum(1 for r, p in zip(real_values, pred_values) if r == p)
correct = matches / float(len(pred_values))
print("Test set accuracy: ", correct)

Test set accuracy:  0.9793296089385475


In [12]:
# F1 score of the predictions against the true test labels.
f1_score = sklearn.metrics.f1_score(y_true=real_values, y_pred=pred_values)
print("Test set F1 score: ", f1_score)

Test set F1 score:  0.8840125391849529
