In [1]:
import pandas as pd
import numpy as np
import re 
from pprint import pprint

In [2]:
# split train and tst data into 80% and 20%
# We drop the index if already exist and re-assign from 0 

def train_test_split(dataset):
    trainLen = int(len(dataset)*0.8)
    training_data = dataset.iloc[:trainLen].reset_index(drop=True) 
    testing_data = dataset.iloc[trainLen:].reset_index(drop=True)
    return training_data,testing_data

In [3]:
def cleanTitanicData(train,test):
    #Copy original dataset in case we need it later when digging into interesting features
    # WARNING: Beware of actually copying the dataframe instead of just referencing it
    # "original_train = train" will create a reference to the train variable (changes in 'train' will apply to 'original_train')
    original_train = train.copy() # Using 'copy()' allows to clone the dataset, creating a different object with the same values

    # Feature engineering steps taken from Sina and Anisotropic, with minor changes to avoid warnings
    full_data = [train, test]

    # Feature that tells whether a passenger had a cabin on the Titanic
    train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
    test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

    # Create new feature FamilySize as a combination of SibSp and Parch
    for dataset in full_data:
        dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    # Create new feature IsAlone from FamilySize
    for dataset in full_data:
        dataset['IsAlone'] = 0
        dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    # Remove all NULLS in the Embarked column
    for dataset in full_data:
        dataset['Embarked'] = dataset['Embarked'].fillna('S')
    # Remove all NULLS in the Fare column
    for dataset in full_data:
        dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())

    # Remove all NULLS in the Age column
    for dataset in full_data:
        age_avg = dataset['Age'].mean()
        age_std = dataset['Age'].std()
        age_null_count = dataset['Age'].isnull().sum()
        age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
        # Next line has been improved to avoid warning
        dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
        dataset['Age'] = dataset['Age'].astype(int)

    # Define function to extract titles from passenger names
    def get_title(name):
        title_search = re.search(' ([A-Za-z]+)\.', name)
        # If the title exists, extract and return it.
        if title_search:
            return title_search.group(1)
        return ""

    for dataset in full_data:
        dataset['Title'] = dataset['Name'].apply(get_title)
    # Group all non-common titles into one single grouping "Rare"
    for dataset in full_data:
        dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

        dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

    for dataset in full_data:
        # Mapping Sex
        dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

        # Mapping titles
        title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
        dataset['Title'] = dataset['Title'].map(title_mapping)
        dataset['Title'] = dataset['Title'].fillna(0)

        # Mapping Embarked
        dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

        # Mapping Fare
        dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
        dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
        dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
        dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
        dataset['Fare'] = dataset['Fare'].astype(int)

        # Mapping Age
        dataset.loc[ dataset['Age'] <= 16, 'Age'] 					       = 0
        dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
        dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
        dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
        dataset.loc[ dataset['Age'] > 64, 'Age'] ;

    # Feature selection: remove variables no longer containing relevant information
    drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
    train = train.drop(drop_elements, axis = 1)
    test = test.drop(drop_elements, axis = 1)
    return train,test

In [4]:
def formatData(t,s):
    if not isinstance(t,dict) and not isinstance(t,list):
        print(str("\t"*s)+str(t))
    else:
        for key in t:
            print("\t"*s+str(key))
            if not isinstance(t,list):
                formatData(t[key],s+1)

# CART

In [5]:
nMin = 5

class Node:
    name = None
    n = 0
    ss = {}
    child = {}
    
    def __str__(self):
        if self.name is None:
            return 'Leaf'
        return self.name

In [6]:
def initDictionary(dic):
    for key in dic:
        if isinstance(dic[key], int):
            dic[key] = 0
        else:
            initDictionary(dic[key])

In [7]:
def initSufficientStatistics(data):
    dic = {}
    
    for attribute in data.columns:
        dic[attribute] = {}
    
    cc = np.unique(data['class'])
    
    for attribute in data.columns:
        tt = np.unique(data[attribute])
        
        for val in tt:
            dic[attribute][val] = {}
            
            for val2 in cc:
                dic[attribute][val][val2] = 0
    
    for val2 in cc:
        dic['class'][val2] = 0
    
    return dic

In [8]:
def newSufficientStatistics(ss, dropAttribute):
    newSS = ss.copy()
    #newSS = newSS.pop(dropAttribute)
    del newSS[dropAttribute]
    return newSS

In [9]:
def updateSufficientStatistics(query, ss):
    for key in query:
        if key != 'class':
            if key in ss:
                ss[key][query[key]][query['class']]+=1
                ss['class'][query['class']]+=1

In [10]:
def createNewNode(oldNode):
    newNode = Node()
    newNode.n = 0
    newNode.ss = newSufficientStatistics(oldNode.ss, oldNode.name)
    initDictionary(newNode.ss)
    newNode.child = {}
    return newNode

In [11]:
def info(dic):
    if 'class' in dic:
        s = 0
        for key in dic['class']:
            s += dic['class'][key]
        
        if s == 0:
            return 0,0

        entropy = 0
        
        for key in dic['class']:
            if dic['class'][key] != 0:
                entropy += ( (-dic['class'][key]/s) * np.log2(dic['class'][key]/s) )
        return s,entropy
    else:
        dd = {}
        dd['class'] = {}
        
        # {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0}
        for key in dic:
            if key not in dd['class']:
                dd['class'][ key ] = 0
            dd['class'][ key ] += dic[key]
            
        return info(dd)

In [12]:
def gain(dic,feature):
    ##Calculate the entropy of the dataset    
    s,infoTotal = info(dic)
    infoFeature = 0
    
    attribute = dic[feature]
    
    for val in attribute:
        sI,infoI = info(attribute[val])
        infoFeature += (sI/s*infoI)
    
    #Calculate the information gain of perticular feature or attribute
    gainV = infoTotal - infoFeature
    return gainV

In [13]:
def solveVFDT(query,root):
    #print(root.name)
    if root.name is not None:
        solveVFDT( query,root.child[ query[root.name] ] )
    else:
        updateSufficientStatistics(query,root.ss)
        root.n += 1
        
        if root.n % nMin == 0:
            
            cnt = 0
            for v in root.ss['class']:
                cnt += root.ss['class'][v] == 0

            if cnt < ( len( root.ss['class'] ) - 1 ):
                
                gainAll = {}

                for feature in query:
                    if feature == 'class':
                        continue
                    else:
                        if feature in root.ss:
                            gainAll[feature] = gain(root.ss,feature)
                
                maxGainFeature1 = max(gainAll, key=gainAll.get)
                maxGain1 = gainAll[ maxGainFeature1 ]
                gainAll.pop(maxGainFeature1)
                
                maxGainFeature2 = max(gainAll, key=gainAll.get)
                maxGain2 = gainAll[ maxGainFeature2 ]
                
                R = 0.5
                delta = 0.01
                epsilon = np.sqrt(R**2 * np.log(1/delta) / (2*root.n))
                
                #if (maxGain1-maxGain2)>epsilon :
                root.name = maxGainFeature1
                d = root.ss
                
                for childd in d[ maxGainFeature1 ]:
                    temp = createNewNode(root)
                    if temp.ss is not None:
                        root.child[ childd ] = createNewNode(root)
                    else:
                        #root.name = None
                        break
        #print(root.name)

In [14]:
def VFDT(data,className="Survived"):
    data.rename(columns={className: "class"},inplace=True)
    train,test = train_test_split(data)
    if className == 'Survived':
        train,test = cleanTitanicData(train,test)
    
    root = Node()
    root.ss = initSufficientStatistics(train)

    dic = train.to_dict('index')
    for key in dic:
        query = dic[key]
        solveVFDT(query,root)
    printTree(root,0)
#     formatData(tree,0)
#     print(testModel(test,tree))

In [15]:
#train = pd.read_csv('data/train.csv')
#VFDT(train)

In [16]:
def printTree(root,s):
    print(str("\t"*s)+str(root))
    if root.name is not None:
        for key in root.child:
            printTree( root.child[ key ] , s+1 )


In [17]:
# printing tree in visible format

def formatData(t,s):
    if not isinstance(t,dict) and not isinstance(t,list):
        print(str("\t"*s)+str(t))
    else:
        for key in t:
            print("\t"*s+str(key))
            if not isinstance(t,list):
                formatData(t[key],s+1)

In [18]:
#Import the dataset and define the feature as well as the target datasets / columns#
dataset = pd.read_csv('data/zoo.csv',
                      names=['animal_name','hair','feathers','eggs','milk',
                                                   'airbone','aquatic','predator','toothed','backbone',
                                                  'breathes','venomous','fins','legs','tail','domestic','catsize','class',])#Import all columns omitting the fist which consists the names of the animals

#We drop the animal names since this is not a good feature to split the data on
dataset=dataset.drop('animal_name',axis=1)

VFDT(dataset,className='class')

hair
	legs
		toothed
			Leaf
			eggs
				Leaf
				Leaf
		feathers
			Leaf
			eggs
				Leaf
				airbone
					Leaf
					Leaf
		Leaf
		aquatic
			Leaf
			Leaf
		Leaf
	aquatic
		legs
			Leaf
			Leaf
			milk
				Leaf
				Leaf
			Leaf
			Leaf
		Leaf
