## Programming

### Data read in and Train-Test split

The train-test split have been implemented in the `main` function

In [1]:
import scipy.io as sio
import numpy as np
from scipy import stats
from collections import deque

matfile = sio.loadmat('./Sogou_data/Sogou_webpage.mat')
label = matfile['doclabel']
feature = matfile['wordMat']

data = np.concatenate((feature,label), axis=1)
np.random.shuffle(data)
train_data = data[:int(0.6*data.shape[0]),:]
valid_data = data[int(0.6*data.shape[0]):int(0.8*data.shape[0]),:]
test_data = data[int(0.8*data.shape[0]):,:]

train = {'feature':None,'label':None}
valid = {'feature':None,'label':None}
test = {'feature':None,'label':None}

for pair in [[train_data,train],[valid_data,valid],[test_data,test]]:
    pair[1]['feature'] = pair[0][:,:-1]
    pair[1]['label'] = pair[0][:,-1]
labels = np.linspace(1,9,9)

### Impurity Function

Here, we implemented two functions to measure the impurity of dataset, which is 

$$\text{Information Entropy:   } E = -\sum_iP_i\log_2P_i \text{   Gini impurity:   } G = 1 - \sum_iP_i^2$$

In order to calc the $\lim_{P\to0}P\log_2P$ correctly, we calc the $\lim_{P\to0}P\log_2(P+eps) \approx \lim_{P\to0}P\log_2P$ instead, $eps = 1\times 10 ^{-6}$. In this way, the calculation could be implemented easily by operations to numpy array

In [2]:
def Impurity(samples, style='entropy'):
    eps = 1e-6
    P = np.zeros(labels.shape[0])
    for i in range(labels.shape[0]):
        P[i] = np.sum(samples == labels[i])
    P = P / samples.shape[0]
    if style == 'entropy':
        return -np.sum(P * np.log2(P + eps))
    elif style == 'gini':
        return 1 - np.sum(P * P)
    else:
        raise ('No impurity style called '+ style)

### Node Class

Here we implemented the `SelectFeature` function and `SplitNode` function. Based on these two function, we implemented the `GenerateTree` function. The stopping branching criteria could be described as the following:

- All of the features is the same OR
- The increasing of information gain (that is the decreasing of the impurity) provided by branching is smaller than the `thresh`


In [3]:
Q = deque()
class node:
    def __init__(self,data_feature,data_label,style='entropy',thresh=0.01, isleft=False, parent = None):
        self.thresh = thresh
        self.style = style
        self.data = {'feature': data_feature,'label': data_label}
        self.isleft = isleft
        self.parent = parent
        self.im = Impurity(self.data['label'],self.style)
        self.label = stats.mode(self.data['label'])
        
        self.l = None
        self.r = None
        self.feature = None
        self.valid_idx = None
        
    def SelectFeature(self):
        imp = np.zeros(self.data['feature'].shape[1])
        for f in range(self.data['feature'].shape[1]):
            label_left = self.data['label'][self.data['feature'][:,f] == 0]
            label_right = self.data['label'][self.data['feature'][:,f] == 1]
            if np.sum(label_left) == 0 or np.sum(label_right) == 0:
                imp[f] = np.infty
            else:
                imp[f] = label_left.shape[0] * Impurity(label_left,self.style) + label_right.shape[0] * Impurity(label_right,self.style)

        self.feature = np.argmin(imp)
        return imp[self.feature] / (label_left.shape[0] +  label_right.shape[0])
    
    def SplitNode(self):
        idx_left = self.data['feature'][:,self.feature] == 0
        idx_right = self.data['feature'][:,self.feature] == 1
        self.l = node(self.data['feature'][idx_left,:], self.data['label'][idx_left],self.style, self.thresh,True,self)
        self.r = node(self.data['feature'][idx_right,:], self.data['label'][idx_right],self.style, self.thresh,False,self)
        
    def GenerateTree(self):
        next_step_im = self.SelectFeature()
        if self.im - next_step_im > self.thresh:
            self.SplitNode()
            self.l.GenerateTree()
            self.r.GenerateTree()
            if self.l.l is None and self.l.r is None and self.r.l is None and self.r.r is None:
                Q.append(self)     
            return

### Building The tree

We can build up the tree from the root

In [4]:
def build(data,style,thres):
    Q.clear()
    root = node(data['feature'],data['label'],style,thres + 1e-6)# abvoid to provide a real 0
    root.GenerateTree()
    return root

### Predict

Here, we implement the `Decision` function as the following code. The `main_decision` is used to integrate the testing and output the result.

In [5]:
def Decision(GeneratedTree,XToBePredited):
    root = GeneratedTree
    while root.l is not None and root.r is not None:
        if XToBePredited[root.feature] == 0:
            root = root.l
        else:
            root = root.r
    return root.label[0][0]

def main_decision(GeneratedTree, data_set, message=''):
    predict = np.zeros_like(data_set['label'])
    for idx in range(predict.shape[0]):
        predict[idx] = Decision(GeneratedTree,data_set['feature'][idx,:])   
    acc = (predict == data_set['label'])
    print ('{}{:.1f}%'.format(message,sum(acc) / acc.shape[0] * 100))

### Precolate and Punning

We use the method described in the guide to punning the tree:

First of all, we percolate all of the samples of the validation samples to all of the leaf node of the decision tree. And for each node which has two leaf nodes, we are trying to punning its children and check if the validation samples in its children node been correct classified.

This loop is continuing until none of the node which have two leaf nodes satisified the criteria above. 

In [6]:
def Percolate(GeneratedTree, features, labels):
    root = GeneratedTree
    root.valid_idx = np.ones_like(labels,dtype=np.bool)
    queue = deque()
    queue.append(root)
    while queue:
        p = queue.popleft()
        if p.l is not None and p.r is not None:
            queue.append(p.l)
            queue.append(p.r)
            p.l.valid_idx = np.bitwise_and(features[:,p.feature] == 0, p.valid_idx)
            p.r.valid_idx = np.bitwise_and(features[:,p.feature] == 1, p.valid_idx)
    return GeneratedTree

def Punning(GeneratedTree, CrossValidationDataset):
    correct = 0
    GT = Percolate(GeneratedTree,CrossValidationDataset['feature'],CrossValidationDataset['label'])
    while Q:
        p = Q.popleft()
        error_child = np.sum(CrossValidationDataset['label'][p.l.valid_idx] != p.l.label) + \
            np.sum(CrossValidationDataset['label'][p.r.valid_idx] != p.r.label)
        error_parent = np.sum(CrossValidationDataset['label'][p.valid_idx] != p.label)
        if error_parent < error_child:
            correct += error_child - error_parent
            # punning:
            p.l = None
            p.r = None
            if (p.isleft and p.parent.r is None) or ((not p.isleft) and p.parent.l is None):
                Q.append(p.parent)
    print ('Punning: Correct about (less than) {:.1f}% of the samples on validation set'.format(
        correct/CrossValidationDataset['label'].shape[0] * 100))
    return GT

### Sum up and testing the hyper-parameter

We test the threshold from 1e-3 to 1e-1 using two impurity method. The result is that

- Using information entropy: `thres` = 1e-2 performs better: on testing dataset (after punning): 73.1%
- Using gini impurity: `thres` = 1e-3 performs better: on testing dataset (after punning): 73.2%

In [7]:
def main(style,thres):
    print ('----- Using {} impurity, thres = {} -----'.format(style,thres))
    T = build(train,style,thres)
    main_decision(T,valid,message='Using {} impurity, thres = {}, before punning, on validation data set: '.format(style,thres))
    main_decision(T,test,message='Using {} impurity, thres = {}, before punning, on test data set: '.format(style,thres))
    T = Punning(T,valid)
    main_decision(T,valid,message='Using {} impurity, thres = {}, after punning, on validation data set: '.format(style,thres))
    main_decision(T,test,message='Using {} impurity, thres = {}, after punning, on test data set: '.format(style,thres))
    print ('---------- END ----------')

for s in ['entropy','gini']:
    for t in [1e-1,1e-2,1e-3]:
        np.random.shuffle(data)
        train_data = data[:int(0.6*data.shape[0]),:]
        valid_data = data[int(0.6*data.shape[0]):int(0.8*data.shape[0]),:]
        test_data = data[int(0.8*data.shape[0]):,:]

        train = {'feature':None,'label':None}
        valid = {'feature':None,'label':None}
        test = {'feature':None,'label':None}

        for pair in [[train_data,train],[valid_data,valid],[test_data,test]]:
            pair[1]['feature'] = pair[0][:,:-1]
            pair[1]['label'] = pair[0][:,-1]
        main(s,t)

----- Using entropy impurity, thres = 0.1 -----
Using entropy impurity, thres = 0.1, before punning, on validation data set: 70.1%
Using entropy impurity, thres = 0.1, before punning, on test data set: 69.6%
Punning: Correct about (less than) 1.1% of the samples on validation set
Using entropy impurity, thres = 0.1, after punning, on validation data set: 71.1%
Using entropy impurity, thres = 0.1, after punning, on test data set: 69.7%
---------- END ----------
----- Using entropy impurity, thres = 0.01 -----
Using entropy impurity, thres = 0.01, before punning, on validation data set: 71.4%
Using entropy impurity, thres = 0.01, before punning, on test data set: 72.8%
Punning: Correct about (less than) 1.8% of the samples on validation set
Using entropy impurity, thres = 0.01, after punning, on validation data set: 73.1%
Using entropy impurity, thres = 0.01, after punning, on test data set: 73.2%
---------- END ----------
----- Using entropy impurity, thres = 0.001 -----
Using entropy i