Below is code for a basic classification tree.

In [96]:
import numpy as np

def gini(labels):
    """Return the Gini impurity of a collection of class labels.

    The impurity is one minus the sum of the squared class proportions.
    An empty collection is given an impurity of 1.
    """
    total = len(labels)
    if total == 0:
        return 1

    # Tally how often each distinct label occurs.
    tally = {}
    for label in labels:
        tally[label] = tally.get(label, 0) + 1

    squared = sum(count**2 for count in tally.values())
    return 1 - squared/float(total*total)

def split_dataset(data, labels, split_ind, split_value):
    """
    Partition a 2-d data set on a single feature column.

    Returns
    data1 -- rows whose value in column split_ind is strictly greater than split_value
    data2 -- all remaining rows
    labels1 -- the labels belonging to data1
    labels2 -- the labels belonging to data2

    When data holds no elements, data and labels are handed back
    unchanged in both positions.
    """
    if data.size == 0:
        return data, data, labels, labels
    above = data[:, split_ind] > split_value
    rest = ~above
    return data[above], data[rest], labels[above], labels[rest]

def split_1d(data, labels, split_value):
    """
    Partition a single feature column (and its labels) around a cutoff.

    Returns
    data1 -- the entries strictly greater than split_value
    data2 -- all remaining entries
    labels1 -- the labels belonging to data1
    labels2 -- the labels belonging to data2
    """
    above = data > split_value
    rest = ~above
    return data[above], data[rest], labels[above], labels[rest]


def information_gain(data, labels, split_value):
    """Return the reduction in Gini impurity obtained by cutting at split_value.

    The result is the impurity of labels minus the size-weighted average
    impurity of the two label groups produced by split_1d.
    """
    # Only the label partitions are needed; the data halves are discarded.
    _, _, upper, lower = split_1d(data, labels, split_value)
    weighted = len(upper)*gini(upper) + len(lower)*gini(lower)
    return gini(labels) - weighted / float(len(labels))

def best_1dsplit(data, labels):
    """Find the cutoff on a single feature column with the highest gain.

    Every element of data is tried as a cutoff. Returns a (gain, cutoff)
    pair; on ties the earliest candidate is kept, and (-inf, None) is
    returned when data has no elements.
    """
    winner = (-np.inf, None)
    for candidate in data:
        score = information_gain(data, labels, candidate)
        if score > winner[0]:
            winner = (score, candidate)
    return winner

def optimal_split(data, labels, mask=None):
    """
    Choose the feature and cutoff with the largest information gain.

    mask -- column indices to consider; every column of data when None

    Returns
    index -- the feature on which to split
    value -- the cutoff value to split on

    Both are None if no examined column yields a gain above -inf
    (for instance when mask is empty).
    """
    columns = np.arange(data.shape[1]) if mask is None else mask
    top_gain, top_index, top_value = -np.inf, None, None
    for col in columns:
        col_gain, col_value = best_1dsplit(data[:, col], labels)
        if col_gain > top_gain:
            top_gain, top_index, top_value = col_gain, col, col_value
    return top_index, top_value

def mode(arr):
    """Return the most frequent value in arr, or None when arr is empty.

    If several values share the highest count, the one met first while
    iterating over the internal count dictionary is returned.
    """
    counts = {}
    for item in arr:
        counts[item] = counts.get(item, 0) + 1

    if not counts:
        return None
    # max() keeps the first maximal key, matching a strict ">" scan.
    return max(counts, key=counts.get)

class Node:
    """A node of a binary classification tree.

    A trained node is either a leaf (label is set) or an internal node
    (feature, cutoff and both children are set). Rows whose value in
    column `feature` is strictly greater than `cutoff` are routed to
    rightchild; all other rows go to leftchild.
    """

    def __init__(self, data, labels):
        """Store the training rows and labels for this node; nothing is fitted yet."""
        self.rightchild = None
        self.leftchild = None
        self.data = data
        self.labels = labels
        self.feature = None   # column index used for the split (internal nodes)
        self.cutoff = None    # threshold applied to that column
        self.label = None     # predicted class (leaf nodes)

    def train(self, depth=1, tol=.2, num_vars=None):
        """Recursively grow the subtree rooted at this node.

        depth -- remaining number of levels that may still be split
        tol -- a node whose Gini impurity is below this becomes a leaf
        num_vars -- if given, the number of feature columns drawn at random
                    (without replacement) to consider at each split;
                    all columns are considered when None
        """
        if gini(self.labels) < tol or depth <= 0:
            self.label = mode(self.labels)
            return

        # Determine the best split, optionally on a random subset of columns.
        mask = None
        if num_vars is not None:
            mask = np.random.choice(np.arange(self.data.shape[1]), num_vars, replace=False)
        best_index, best_value = optimal_split(self.data, self.labels, mask)
        if best_index is None:
            # No candidate split was found (e.g. no columns to examine),
            # so there is nothing to index with: fall back to a leaf.
            self.label = mode(self.labels)
            return

        self.feature = best_index
        self.cutoff = best_value
        data1, data2, labels1, labels2 = split_dataset(self.data, self.labels, best_index, best_value)
        # A split that leaves every row on one side makes no progress.
        if len(labels1) == len(self.labels) or len(labels2) == len(self.labels):
            self.label = mode(self.labels)
            return

        self.rightchild = Node(data1, labels1)
        self.leftchild = Node(data2, labels2)
        self.rightchild.train(depth-1, tol, num_vars=num_vars)
        self.leftchild.train(depth-1, tol, num_vars=num_vars)

    def dump(self, indent=0):
        """Print the subtree, one node per line, indented one space per level."""
        # print(...) with a single argument behaves the same on Python 2 and 3.
        if self.label is not None:
            print(" " * indent + "Label: "+str(self.label))
            return
        print(" " * indent  + "Split on feature "+str(self.feature)+" cut " +str(self.cutoff))
        if self.rightchild is not None:
            self.rightchild.dump(indent+1)
        if self.leftchild is not None:
            self.leftchild.dump(indent+1)

    def predict_labels(self, data):
        """Return an array holding the predicted label for each row of data.

        The result is created with np.zeros, so labels are stored as floats.
        """
        npoints = data.shape[0]
        res = np.zeros(npoints)
        # range (rather than xrange) keeps this runnable on Python 2 and 3.
        for i in range(npoints):
            res[i] = self.predict_label(data[i,:])
        return res

    def predict_label(self, data):
        """Return the predicted label for a single feature vector."""
        if self.label is not None:
            return self.label
        if data[self.feature] > self.cutoff:
            return self.rightchild.predict_label(data)
        return self.leftchild.predict_label(data)


Now, we load in the Titanic dataset, converting the passenger class and sex categorical variables to dummy variables.

In [97]:
import pandas as pd

# Load the raw passenger table; the path is relative to the current working directory.
titanicData = pd.read_csv("titanic4real.csv")

def cleanupTitanic(titanicData, changePclass=False):
    """Build a feature matrix and a 0/1 label vector from the Titanic table.

    titanicData -- DataFrame providing the columns 'Survived', 'Sex',
                   'Fare', 'Age' and 'Pclass'; it is left unmodified
    changePclass -- when True, 'Pclass' is expanded into dummy columns
                    like 'Sex'; otherwise it is kept as a numeric column

    Returns: X -- features, y--labels
    X holds the numeric columns ('Fare', 'Age' and, if numeric, 'Pclass')
    followed by the dummy columns of each categorical feature. Missing
    numeric values are replaced by the column median. In y, every value
    other than 1 is set to 0.
    """
    categorical_features = ['Sex']
    num_features = ['Fare', 'Age']
    if changePclass:
        categorical_features.append('Pclass')
    else:
        num_features.append('Pclass')

    labelName = 'Survived'
    # Impute on a copy so the caller's DataFrame is not altered.
    frame = titanicData.copy()
    for feature in num_features:
        frame[feature] = frame[feature].fillna(frame[feature].median())

    # np.array(...) makes a writable copy, so the relabelling below cannot
    # write through to the underlying frame.
    y = np.array(frame[labelName].values)
    #all who did not survive died
    y[y!=1] = 0

    # .values is used because DataFrame.as_matrix() was removed from pandas.
    X = frame[num_features].values
    for feature in categorical_features:
        X = np.hstack([X, pd.get_dummies(frame[feature]).values])
    return X, y

# Build the feature matrix and labels; changePclass=True also turns Pclass into dummy columns.
features, labels = cleanupTitanic(titanicData, changePclass=True)

Now, we create training and test sets, with a random 60-40 training/testing split.

In [98]:
# Shuffle the row indices, then give the first 60% to training and the rest to testing.
npoints = len(labels)
ntrain = int(.6*npoints)
mask = np.arange(npoints)
np.random.shuffle(mask)
train_mask, test_mask = mask[:ntrain], mask[ntrain:]
# Index features and labels with the same index arrays so rows stay aligned.
trainf, testf = features[train_mask], features[test_mask]
trainl, testl = labels[train_mask], labels[test_mask]

Now, we train a tree of maximum depth 7 with a Gini impurity tolerance of 0.1. This should be sufficient for the Titanic data.

In [104]:
# Fit a single tree on the training split: at most 7 levels of splits,
# and a node becomes a leaf once its Gini impurity is below 0.1.
tree = Node(trainf, trainl)
tree.train(depth=7, tol=.1)
# Print the fitted tree structure.
tree.dump()

Split on feature 2 cut 0.0
 Split on feature 6 cut 0.0
  Split on feature 0 cut 17.4
   Split on feature 1 cut 0.1667
    Split on feature 0 cut 31.3875
     Label: 0.0
     Split on feature 0 cut 31.275
      Label: 1.0
      Split on feature 1 cut 18.0
       Label: 0.0
       Label: 0.0
    Label: 1.0
   Split on feature 0 cut 7.7375
    Split on feature 0 cut 14.5
     Split on feature 0 cut 15.55
      Split on feature 1 cut 19.0
       Label: 1.0
       Label: 1.0
      Split on feature 0 cut 15.5
       Label: 0.0
       Label: 1.0
     Split on feature 1 cut 19.0
      Split on feature 0 cut 7.8792
       Label: 0.0
       Label: 1.0
      Split on feature 0 cut 7.8792
       Label: 1.0
       Label: 0.0
    Split on feature 1 cut 18.5
     Label: 1.0
     Split on feature 1 cut 18.0
      Label: 0.0
      Label: 1.0
  Split on feature 0 cut 31.6792
   Label: 1.0
   Split on feature 0 cut 31.0
    Label: 0.0
    Split on feature 0 cut 12.65
     Split on feature 0 cut 13.0
    

Note that the initial split is on feature index 2, which corresponds to gender. Most of the remaining splits are on age and fare. This is unsurprising.

In [105]:
# Score the single tree on the held-out test rows.
predLabels = tree.predict_labels(testf)
# print(...) with a single argument behaves the same on Python 2 and 3.
print("Accuracy %f" %( np.sum(testl==predLabels) / float(len(predLabels))))

Accuracy 0.778626


In [60]:
class Forest:
    """An ensemble of Node trees that predicts by majority vote.

    Every tree is trained on the same full data set; the trees differ only
    through the random subset of num_vars feature columns drawn at each split.
    """

    def __init__(self, data, labels, tol=.1, max_depth=5, num_trees=100, num_vars=2):
        """Train num_trees trees immediately, printing a progress line per tree.

        data, labels -- training rows and their class labels
        tol -- Gini impurity below which a tree node becomes a leaf
        max_depth -- depth passed to each tree's train()
        num_trees -- number of trees in the ensemble
        num_vars -- number of feature columns sampled at each split
        """
        self.num_vars = num_vars
        self.data = data
        self.labels = labels
        self.tol = tol
        self.max_depth = max_depth
        self.num_trees = num_trees
        self.trees = []
        # range and single-argument print(...) run on both Python 2 and 3.
        for i in range(num_trees):
            n = Node(data, labels)
            n.train(max_depth, tol=tol, num_vars=num_vars)
            self.trees.append(n)
            print("Trained Tree %d" %i)

    def predict_label(self, point):
        """Return the label predicted most often by the trees for one feature vector."""
        votes = [n.predict_label(point) for n in self.trees]
        return mode(votes)

    def predict_labels(self, data):
        """Return an array with the majority-vote label for each row of data."""
        return np.array([self.predict_label(point) for point in data])
        

In [122]:
# Train a 10-tree forest (2 random features per split) and score it on the test rows.
forest = Forest(trainf, trainl, max_depth=7, num_trees=10, num_vars=2)
fpl = forest.predict_labels(testf)
# print(...) with a single argument behaves the same on Python 2 and 3.
print("Accuracy: %f" % (np.sum(fpl==testl)/float(len(testl))))

Trained Tree 0
Trained Tree 1
Trained Tree 2
Trained Tree 3
Trained Tree 4
Trained Tree 5
Trained Tree 6
Trained Tree 7
Trained Tree 8
Trained Tree 9
Accuracy: 0.784351


The forest is slightly more accurate than the single tree on this test split (0.784 vs. 0.779).