In [12]:
from anytree import Node, RenderTree
from anytree.dotexport import RenderTreeGraph
import os.path;
import datetime;
import time;
import pandas;
import numpy as np;
import ast;
import math;
import sys;
# Output locations: LOG_DIR is the log root, LOG_IMAGE the sub-folder
# that saveTreeAsPNG writes its rendered trees into.
LOG_DIR = "log"
LOG_IMAGE = "%s/image" % LOG_DIR

In [2]:
def readCSVFile(file):
    """Read a comma-separated file with a header row into a DataFrame.

    Parameters
    ----------
    file : path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table. Cells equal to ``'?'`` are read as missing values,
        and whitespace following a delimiter is skipped.
    """
    # The separator is passed by keyword: recent pandas releases no longer
    # accept it as a second positional argument.
    return pandas.read_csv(
        file,
        sep=",",
        header=0,
        na_values='?',
        skipinitialspace=True,
    )
def readTrainData(dataset):
    """Split a loaded training frame into features and its two label views.

    Columns are selected purely by position:
      * columns 6 and onwards -> feature frame
      * column 4              -> label frame, cast to int
      * column 5              -> second label frame, left as-is
        (presumably a vector/encoded form of the label -- confirm against
        the CSV schema)

    Returns
    -------
    tuple of (features, int_labels, label_vectors), each a DataFrame.
    """
    # `.ix` was removed from pandas; `.iloc` is the positional equivalent of
    # what `.ix` did here for a frame with string column labels.
    features = dataset.iloc[:, 6:]
    int_labels = dataset.iloc[:, 4:5].astype(int)
    label_vectors = dataset.iloc[:, 5:6]
    return features, int_labels, label_vectors

def readTestData(dataset):
    """Split a loaded test frame into features and its two label views.

    Uses the same positional layout as the training data:
      * columns 6 and onwards -> feature frame
      * column 4              -> label frame, cast to int
      * column 5              -> second label frame, left as-is
        (presumably a vector/encoded form of the label -- confirm against
        the CSV schema)

    Returns
    -------
    tuple of (features, int_labels, label_vectors), each a DataFrame.
    """
    # `.ix` was removed from pandas; `.iloc` is the positional equivalent of
    # what `.ix` did here for a frame with string column labels.
    features = dataset.iloc[:, 6:]
    int_labels = dataset.iloc[:, 4:5].astype(int)
    label_vectors = dataset.iloc[:, 5:6]
    return features, int_labels, label_vectors

def getTimestamp():
    """Return the current local time as a 'DD-MM-YYYY-HH:MM:SS' string."""
    now = datetime.datetime.fromtimestamp(time.time())
    return "{:%d-%m-%Y-%H:%M:%S}".format(now)

def createDir(self,directory):
    """Create ``directory`` (including missing parents) unless the path exists.

    ``self`` is accepted but never used; it is kept so existing positional
    callers keep working.
    """
    if os.path.exists(directory):
        return
    os.makedirs(directory)
    
def printPlanerTree(root):
    """Print the tree under ``root`` as text, one node name per line.

    Each line is the prefix produced by ``RenderTree`` followed by the
    node's ``name``.
    """
    for row in RenderTree(root):
        prefix, _fill, current = row
        print("%s%s" % (prefix, current.name))

def saveTreeAsPNG(root,filename=None):
    """Render the tree under ``root`` to ``LOG_IMAGE/<filename>.png``.

    Parameters
    ----------
    root : tree root node handed to ``RenderTreeGraph``.
    filename : str, optional
        File name without extension. When omitted, a name of the form
        ``gener_<timestamp>`` is generated from ``getTimestamp()``.
    """
    if filename is None:
        filename = "gener_" + getTimestamp()
    # The image folder is not created anywhere else, so make sure it exists
    # before asking the renderer to write into it.
    if not os.path.isdir(LOG_IMAGE):
        os.makedirs(LOG_IMAGE)
    RenderTreeGraph(root).to_picture(LOG_IMAGE+"/"+filename+".png")
    print("Imaged Saved")

In [3]:
# Small hand-built demo tree: Udo is the root with children Marc and Dan.
udo = Node("Udo")
marc = Node("Marc", parent=udo)
dan = Node("Dan", parent=udo)

# Second level: Lian hangs under Marc; Jet, Jan and Joe hang under Dan.
lian = Node("Lian", parent=marc)
jet, jan, joe = (Node(label, parent=dan) for label in ("Jet", "Jan", "Joe"))

In [92]:
# data: all continuous data
# tree: binary
# feature repetition: allowed
class DecisionTree(object):
    """Information-gain helper for a binary decision tree on continuous data.

    Every feature is treated as continuous and is split in two at its mean
    (``<= mean`` versus ``> mean``). Class labels are looked up in
    ``output_col`` and are enumerated as the integers
    ``0 .. no_of_class - 1``.
    """

    # Frame the instance was built from (set in __init__).
    dataframe = None
    # Number of class labels; labels are enumerated as 0 .. no_of_class-1.
    no_of_class = 10
    # Codes used in split descriptors to say which side of a threshold is meant.
    operator = {"less": -1, "equal": 0, "greater": 1}
    # Name of the column holding the class label (set in __init__).
    output_col = None
    # Candidate feature column names, i.e. all columns except output_col.
    features = None
    # Features to exclude from selection when repetition_allowed is False.
    visited_feature = None
    # When True, every feature is a candidate on every call to getBestFeature.
    repetition_allowed = True
    # Gain assigned to features that are excluded from selection.
    minus_infinity = -9999
    # Verbose per-split tracing inside getGain.
    detail_log_enabled = False
    # Master switch for log().
    logging_enabled = True
    #-----------------------------------------

    def __init__(self, df, output_col):
        """Remember the frame and derive the feature list.

        Parameters
        ----------
        df : pandas.DataFrame containing the features and the label column.
        output_col : label of the class column; must be one of ``df``'s
            columns (otherwise ``list.remove`` raises ``ValueError``).
        """
        self.dataframe = df
        self.output_col = output_col
        self.features = list(self.dataframe.columns)
        self.features.remove(self.output_col)
        self.no_of_features = len(self.features)
        self.visited_feature = []

    def splitDataset(self, df, feature, value_dic):
        """Return the rows of ``df`` on one side of a threshold.

        ``value_dic`` is a dict with keys ``"val"`` (the threshold) and
        ``"op"`` (one of the codes in ``self.operator``):
          * "less"    -> rows where ``df[feature] <= val``
          * "greater" -> rows where ``df[feature] >  val``
          * "equal"   -> not supported; prints an error and returns None

        Any other ``op`` value also yields None.
        """
        val = value_dic["val"]
        op = value_dic["op"]
        if op == self.operator["less"]:
            return df.loc[df[feature] <= val]
        if op == self.operator["greater"]:
            return df.loc[df[feature] > val]
        if op == self.operator["equal"]:
            # Only continuous data is handled, so there is no equality split.
            print("Error: Equal not supported")
        return None

    def getEntropy(self, pci):
        """Return the entropy term ``-p * log2(p)`` for one class probability.

        A probability of zero (or below) contributes 0, matching the limit
        of ``p * log(p)`` as ``p -> 0`` and avoiding a math domain error.
        """
        if pci <= 0:
            return 0.0
        return -1 * pci * math.log(pci, 2)

    def getImpurity(self, pci):
        """Impurity contribution of one class; currently the entropy term."""
        return self.getEntropy(pci)

    def getPci(self, df, ci):
        """Return Pr(class == ci) = (# rows with label ci) / (# rows).

        An empty frame yields 0.0 instead of dividing by zero, so that an
        empty side of a split simply carries no impurity.
        """
        y = df[self.output_col]
        total = len(y)
        if total == 0:
            return 0.0
        no_of_ci = (y == ci).sum()
        return float(no_of_ci) / total

    def getNetImpurity(self, df):
        """Sum the impurity terms of all classes 0 .. no_of_class-1 in ``df``."""
        e = 0
        for i in range(self.no_of_class):
            pci = self.getPci(df, i)
            if pci != 0:
                e += self.getImpurity(pci)
        return e

    def getFeatureVal(self, df, feature):
        """Return the two split descriptors for a continuous feature.

        The threshold is the column mean; the result is a list holding the
        "less" descriptor followed by the "greater" descriptor.
        """
        mean = df[feature].mean()
        return [
            {"val": mean, "op": self.operator["less"]},
            {"val": mean, "op": self.operator["greater"]},
        ]

    def getGain(self, df, feature):
        """Return the information gain of splitting ``df`` on ``feature``.

        gain = H(S) - sum_v (|S_v| / |S|) * H(S_v), where the S_v are the two
        subsets produced by the mean split. Sizes are counted as non-null
        values of ``feature``. If the feature has no non-null values the gain
        is 0.0.
        """
        # H(S)
        imp_S = self.getNetImpurity(df)
        total_row = df[feature].count()
        if total_row == 0:
            # Nothing to split on; avoids a division by zero below.
            return 0.0

        net_Sf = 0
        for val_dic in self.getFeatureVal(df, feature):
            Sv = self.splitDataset(df, feature, val_dic)
            len_Sv = Sv[feature].count()
            ratio = float(len_Sv) / total_row
            imp_Sv = self.getNetImpurity(Sv)
            net_Sf += ratio * imp_Sv
            if self.detail_log_enabled:
                print("------[GAIN" + str(feature) + "]------------")
                print("val:", val_dic)
                print(Sv)
                print("len:", len_Sv)
                print("ratio:", ratio)
                print("imp_sv:", imp_Sv)
                print("net_sf:", net_Sf)
        if self.detail_log_enabled:
            print("imp_s:", imp_S, " net_sv:", net_Sf, "  diff:", imp_S - net_Sf)
        return float(imp_S - net_Sf)

    def getBestFeature(self, df):
        """Return the feature name with the largest gain on ``df``.

        When ``repetition_allowed`` is False, features listed in
        ``visited_feature`` receive ``minus_infinity`` as their gain so they
        are not chosen while any other candidate remains. Ties resolve to
        the earliest feature in ``self.features``.
        """
        gain_list = np.zeros(self.no_of_features)
        for i in range(self.no_of_features):
            f = self.features[i]
            if self.repetition_allowed or f not in self.visited_feature:
                g = self.getGain(df, f)
            else:
                g = self.minus_infinity
            gain_list[i] = g
            self.log("Gain_" + self.features[i] + ":", g)

        index = gain_list.argmax()
        return self.features[index]

    def log(self, text, data=None):
        """Print ``text`` (and ``data`` when given) if logging is enabled."""
        if not self.logging_enabled:
            return
        # Identity check: `data != None` is element-wise for numpy/pandas
        # objects and cannot be used as a condition.
        if data is not None:
            print(text, data)
        else:
            print(text)

In [93]:
# Toy 4x4 integer table; column 'A' is later used as the class label.
arr = np.array([
    [1, 2, 3, 4],
    [2, 6, 70, 8],
    [2, 208, 11, 12],
    [3, 198, 15, 16],
])
df = pandas.DataFrame(data=arr, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
0,1,2,3,4
1,2,6,70,8
2,2,208,11,12
3,3,198,15,16


In [94]:
# Build the helper on the toy frame with 'A' as the class column and report
# the feature with the highest gain.
dt = DecisionTree(df, 'A')

# Scratch checks kept from development:
#   pci = dt.getPci(df, 5);            print(pci)
#   e = dt.getNetImpurity(df);         print(e)
#   val_dic = {"val": 2, "op": -1}
#   dt.getFeatureVal(df, "B")
#   g = dt.getGain(df, "C");           print("Gain:", g)
#   print(dt.features)

f = dt.getBestFeature(df)
print(f)

AttributeError: DecisionTree instance has no attribute 'log'

In [28]:
# Load the train/test CSVs and split each into features, integer labels and
# the second label column.
# NOTE(review): `dir` shadows the builtin of the same name; the name is kept
# because other cells may refer to it.
dir = "data/"
trainFile = dir + "train.csv"
testFile = dir + "test.csv"
trained_dataset = readCSVFile(trainFile)
test_dataset = readCSVFile(testFile)
trained_data, trained_y, trained_y_vector = readTrainData(trained_dataset)
test_data, test_y, test_y_vector = readTestData(test_dataset)

# DataFrame.as_matrix() no longer exists in current pandas; `.values` yields
# the same ndarray. The label frames have a single column, so taking column 0
# flattens them to 1-D vectors.
mtx_train = trained_data.values
mtx_train_y = trained_y.values[:, 0].copy()

mtx_test = test_data.values
mtx_test_y = test_y.values[:, 0].copy()
print("train",np.shape(mtx_train),"test",np.shape(mtx_test))

('train', (801, 30), 'test', (200, 30))


Unnamed: 0,A,B,C,D
0,1,2,3,4
1,5,6,7,8
2,9,10,11,12


In [94]:
# Random 8x4 frame for experimenting with row filters. The generator is seeded
# so that re-running the notebook reproduces the same values.
np.random.seed(0)
df = pandas.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,-0.710662,-0.113527,1.524545,-0.263279
1,-0.727083,1.654355,0.838445,-0.661696
2,0.207603,-1.518234,-0.551141,-2.066694
3,-0.524525,-0.401079,-0.457789,1.326831
4,-2.989655,-0.5313,0.972248,0.464174
5,-0.296029,-0.4137,-1.055845,0.031617
6,0.37702,-0.780109,0.492799,1.166326
7,0.8964,0.585308,0.019814,1.682619


In [148]:
# Keep only the rows whose 'A' value is below 10, then average that column.
subsetdf = df.loc[df['A'].lt(10)]
subsetdf['A'].mean()

2.0

In [69]:
# Copy of the feature list without 'A'. Filtering (rather than list.remove)
# keeps the cell from raising ValueError when 'A' is already absent, which is
# the case whenever 'A' is the tree's output column.
a = [name for name in dt.features if name != 'A']
print(a)

['B', 'C', 'D']


Index([u'A', u'B', u'C', u'D'], dtype='object')