### Data read in and Train-Test split

In [1]:
import scipy.io as sio
import numpy as np
from scipy import stats

matfile = sio.loadmat('./Sogou_data/Sogou_webpage.mat')
label = matfile['doclabel']
feature = matfile['wordMat']

data = np.concatenate((feature,label), axis=1)
np.random.shuffle(data)

train_data = data[:int(0.6*data.shape[0]),:]
valid_data = data[int(0.6*data.shape[0]):int(0.8*data.shape[0]),:]
test_data = data[int(0.8*data.shape[0]):,:]

train = {'feature':None,'label':None}
valid = {'feature':None,'label':None}
test = {'feature':None,'label':None}

for pair in [[train_data,train],[valid_data,valid],[test_data,test]]:
    pair[1]['feature'] = pair[0][:,:-1]
    pair[1]['label'] = pair[0][:,-1]
    
labels = np.linspace(1,9,9)

In [2]:
def Impurity(samples, style='entropy'):
    if samples.shape[0] == 0:
        return 0
    eps = 1e-6
    P = np.zeros(labels.shape[0])
    for i in range(labels.shape[0]):
        P[i] = np.sum(samples == labels[i])
    P = P / samples.shape[0]
    if style == 'entropy':
        return -np.sum(P * np.log2(P + eps))
    elif style == 'gini':
        return 1 - np.sum(P * P)
    else:
        raise ('No impurity style called '+ style)

In [3]:
from collections import deque
Q = deque()
class node:
    def __init__(self,style='entropy',thresh=0):
        self.thresh = thresh
        self.style = style
        self.im = 0
        self.l = None
        self.r = None
        self.parent = None
        self.feature = None
        self.data = {'feature':None,'label':None}
        self.isleft = False
        self.label = None
        self.valid_idx = []
    def SelectFeature(self):
        
        imp = np.zeros(self.data['feature'].shape[1])
        for f in range(self.data['feature'].shape[1]):

            idx_left = (self.data['feature'][:,f] == 0)
            idx_right = (self.data['feature'][:,f] == 1)
            
            

            label_left = self.data['label'][idx_left]
            label_right = self.data['label'][idx_right]
            
            imp[f] = label_left.shape[0] * Impurity(label_left,self.style) + label_right.shape[0] * Impurity(label_right,self.style)
        
        self.feature = np.argmin(imp)
        
    def SplitNode(self):
        self.l = node(self.style,self.thresh)
        self.r = node(self.style,self.thresh)
        
        idx_left = self.data['feature'][:,self.feature] == 0
        idx_right = self.data['feature'][:,self.feature] == 1
        
        
        if np.sum(idx_left) == 0 or np.sum(idx_right) == 0:
            self.l = None
            self.r = None
            return False
        
        
        self.l.data['label'] = self.data['label'][idx_left]
        self.r.data['label'] = self.data['label'][idx_right]
        
        self.l.data['feature'] = self.data['feature'][idx_left,:]
        self.r.data['feature'] = self.data['feature'][idx_right,:]
        
        self.l.node_impurity()
        self.r.node_impurity()
        self.l.parent = self
        self.r.parent = self
        
        self.l.isleft = True
        self.l.label = stats.mode(self.l.data['label'])
        self.r.label = stats.mode(self.r.data['label'])
        
        if self.im - (self.l.data['label'].shape[0] * self.l.im + self.r.data['label'].shape[0] * self.r.im) / \
        (self.l.data['label'].shape[0] + self.r.data['label'].shape[0]) < self.thresh:
            self.l = None
            self.r = None
            return False
        else:
            
            return True
    
    def node_impurity(self):
        self.im = Impurity(self.data['label'],self.style)
        return self.im
    
    def GenerateTree(self):
        if self.im < self.thresh or self.data['label'].shape[0] <= 1:
            return
        else:
            self.SelectFeature()
            if not self.SplitNode():
                return
            self.l.GenerateTree()
            self.r.GenerateTree()
            if self.l.l is None and self.l.r is None and self.r.l is None and self.r.r is None:
                Q.append(self)     
            return

In [4]:
def build(style,thres):
    root = node(style,thres)
    root.data = train
    root.node_impurity()
    root.label = stats.mode(root.data['label'])
    root.GenerateTree()
    return root

T = build('entropy',0.01)


<built-in method count of collections.deque object at 0x7fbe72411590>


In [5]:
def Decision(GeneratedTree,XToBePredited):
    root = GeneratedTree
    while root.l is not None and root.r is not None:
        if XToBePredited[root.feature] == 0:
            root = root.l
        else:
            root = root.r
    return root.label[0][0]

predict = np.zeros_like(test['label'])
for idx in range(predict.shape[0]):
    predict[idx] = Decision(T,test['feature'][idx,:])   
acc = (predict == test['label'])
print (sum(acc) / acc.shape[0])

0.7197916666666667


In [6]:
def Percolate(GeneratedTree, features, labels):
    root = GeneratedTree
    root.valid_idx = np.ones_like(labels,dtype=np.bool)
    queue = deque()
    queue.append(root)
    while queue:
        p = queue.popleft()
        if p.l is not None and p.r is not None:
            queue.append(p.l)
            queue.append(p.r)
            p.l.valid_idx = np.bitwise_and(features[:,p.feature] == 0, p.valid_idx)
            p.r.valid_idx = np.bitwise_and(features[:,p.feature] == 1, p.valid_idx)
    return GeneratedTree



In [18]:
def Punning(GeneratedTree, CrossValidationDataset):
    GT = Percolate(GeneratedTree,CrossValidationDataset['feature'],CrossValidationDataset['label'])
    while Q:
        p = Q.popleft()
        error_child = np.sum(CrossValidationDataset['label'][p.l.valid_idx] != p.l.label) + \
            np.sum(CrossValidationDataset['label'][p.r.valid_idx] != p.r.label)
        error_parent = np.sum(CrossValidationDataset['label'][p.valid_idx] != p.label)
        #print (error_parent,error_child)
        if error_parent < error_child:
            # punning:
            p.l = None
            p.r = None
            if (p.isleft and p.parent.r is None) or ((not p.isleft) and p.parent.l is None):
                Q.append(p.parent)
    return GT
#Punning (T,valid)

In [19]:
T = Punning(T,valid)
predict = np.zeros_like(test['label'])
for idx in range(predict.shape[0]):
    predict[idx] = Decision(T,test['feature'][idx,:])   
acc = (predict == test['label'])
print (sum(acc) / acc.shape[0])

0.7274305555555556
