In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

Problem: Code the Random Forest Classifier (Bagging Classifier version 2)

Algorithm:

    for j = 1, ..., K (number for weak trees)
        construct a tree ($f_j$) but every time you look for a split you only consider p randomly sampled $p$ features.
        if no possible split:
            stop
    end
    average $f_j$

### MODIFIED VERSION OF DECISION TREE CLASSIFIER FROM PREVIOUS LAB
(add number of features to consider in the split)

In [2]:
#we modify the Node class object we used in the
#classification tree to accept an input num_feat
#which will be used to consider how many features 
#to use in the split.
class Node(object):
    """
    Creates a classification tree.
    """
    def __init__(self, X, y, feat_type, num_feat, cur_depth=1, max_depth=10, tol=0.2):
        """
        Parameters: 
        X: matrix(2d numpy array) in num_samples x num_features format
        y: labels (1d numpy array)
        feat_type: list of strings indicating the type of features (categorical or real),
                    must be have length equal to num_features
        num_feat: number of features to consider in splitting for every node
        """
        self.X = X
        self.y = y
        self.feat_type = feat_type
        self.cur_depth = cur_depth
        if max_depth<cur_depth:
            raise ValueError("Maximum depth should be greater than current_depth")
        self.max_depth = max_depth
        self.tol = tol
        self.num_feat = num_feat
        
        def gini_impurity(y, classes):
            """
            y: array(num_samples)
            classes: array of unique elements of classes
            """
            percent = np.array([np.mean(y==k) if np.any(y) else 0 for k in classes])
            gini = 1 - np.sum(percent**2)
            return gini
        
        #splitting of D to D1 and D2 given the index of the attribute p and value x
        def split(X, y, p, x, feat_type):
            """
            X: matrix(num_samples x num_features)
            y: array(num_samples,)
            p: (attribute) column index
            x: value of the attribute used as a basis of splitting
            feat_type: list of strings indicating if the feature is real or categorical
            """
            if feat_type=="categorical": 
                Xsplit1, ysplit1 = X[X[:,p]==x], y[X[:,p]==x]
                Xsplit2, ysplit2 = X[X[:,p]!=x], y[X[:,p]!=x]
            elif feat_type=="real":
                Xsplit1, ysplit1 = X[X[:,p]<=x], y[X[:,p]<=x]
                Xsplit2, ysplit2 = X[X[:,p]>x], y[X[:,p]>x]
            else:
                raise ValueError("Invalid feat_type")
            return (Xsplit1, Xsplit2), (ysplit1, ysplit2)
        
        #information gain
        def info_gain(X, y, p, x, feat_type):
            """
            X: matrix(num_samples x num_features)
            y: array(num_samples,)
            p: (attribute) column index
            x: value of the attribute used as a basis of splitting
            feat_type: list of strings indicating if the feature is real or categorical
            """
            classes = np.unique(y)
            #split 
            Xsplits, ysplits = split(X, y, p, x, feat_type)
            score = gini_impurity(y, classes) - np.sum([(len(ysplits[i])/len(y))*gini_impurity(ysplits[i], classes)
                                                        for i in range(2) if Xsplits[i].any()])
            return score
        
        def optimal_split(X, y, feat_type, num_feat):
            """
            X: matrix(num_samples x num_features)
            y: array(num_samples,)
            feat_type: list of strings indicating if the feature is real or categorical
            num_feat: number of features to consider in splitting
            """
            #maximize info gain
            m,n = X.shape
            best_ig = 0 #best info gain (find the maximum)
            best_j = 0 #best coordinate point index to create x
            best_i = 0 #best attribute index
            feats = np.random.choice(np.arange(n), num_feat, replace=False)
            for i in feats:
                for j in range(m):
                    x = X[j,i]
                    (X1, X2), (y1, y2) = split(X, y, i, x, feat_type[i])
                    ig = info_gain(X, y, i, x, feat_type[i])
                    if ig>best_ig:# and np.any(X1) and np.any(X2): #make sure X1 and X2 are nonempty
                        best_ig, best_j, best_i = ig, j, i

            return split(X, y, best_i, X[best_j, best_i], feat_type[best_i]), best_i, X[best_j, best_i]
        
        ((X1, X2), (y1, y2)), attr, xval = optimal_split(self.X, self.y,
                                                         self.feat_type, self.num_feat)
        
        self.attr = attr
        self.xval = xval
        classes = np.sort(np.unique(y))
         
        gin_imp1 = gini_impurity(y1, classes)
        gin_imp2 = gini_impurity(y2, classes)
        
        if  ((self.cur_depth == self.max_depth) |\
                (gin_imp1<tol) | (gin_imp2<tol)| (not np.any(X1)) | (not np.any(X2))):
            self.asg_label = np.argmax([np.sum(y==i) for i in classes])
        else:
            self.leftchild = Node(X1, y1, self.feat_type, num_feat=self.num_feat, cur_depth=self.cur_depth+1,
                                  max_depth=self.max_depth, tol=self.tol)
            self.rightchild = Node(X2, y2, self.feat_type, num_feat=self.num_feat, cur_depth=self.cur_depth+1,
                                   max_depth=self.max_depth, tol=self.tol)
    def print_tree(self):
        
        print("=="*(self.cur_depth-1)+">"+"Attr:{}, Xval:{}".format(self.attr, self.xval))
        
        if not hasattr(self, "asg_label"):
            self.leftchild.print_tree()
            self.rightchild.print_tree()
        else:
            print("=="*self.cur_depth+">"+"Decision: Assigned label:{}".format(self.asg_label))
        
        
    def predict(self, x):
        if hasattr(self, "asg_label"):
            return self.asg_label
        else:
            if self.feat_type[self.attr]=="categorical":
                if x[self.attr]==self.xval:
                    return self.leftchild.predict(x)
                else:
                    return self.rightchild.predict(x)
            elif self.feat_type[self.attr]=="real":
                if x[self.attr]<=self.xval:
                    return self.leftchild.predict(x)
                else:
                    return self.rightchild.predict(x)

In [3]:
#load the titanic dataset
data = pd.read_csv("titanic.csv")

#preprocessing
data.drop(["Name", "Boat", "Body", "Ticket", "Cabin", "home.dest", "Embarked"], axis=1, inplace=True)
data["Sex"] = [1 if i=="male"  else 0 for i in data["Sex"]]
data.dropna(inplace=True)
data.reset_index(inplace=True, drop=True)

X = np.array(data[["Pclass", "Sex", "Age"]])
feat_type = ["categorical", "categorical", "real"]
y = np.array(data["Survived"])

#train and test split
train_ind = np.random.choice(np.arange(len(X)), size=int(0.6*len(X)))
test_ind = np.array([i for i in np.arange(len(X)) if i not in train_ind ])

Xtrain, ytrain, Xtest, ytest = X[train_ind], y[train_ind], X[test_ind], y[test_ind]
print(Xtrain.shape, ytrain.shape)
print(Xtest.shape, ytest.shape)

(627, 3) (627,)
(567, 3) (567,)


In [4]:
#one weak tree using our Decision tree classifier
tree = Node(Xtrain, ytrain, feat_type, 3)

In [5]:
preds_train = np.array([tree.predict(i) for i in Xtrain])
acc_train = np.mean(preds_train==ytrain)

preds_test = np.array([tree.predict(i) for i in Xtest])
acc_test = np.mean(preds_test==ytest)

In [6]:
print(acc_train, acc_test)

0.791068580542 0.776014109347


In [7]:
#let's compare with sklearn's Decision Tree classifier
dt = DecisionTreeClassifier(max_features=3)
dt.fit(Xtrain, ytrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=3, max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [8]:
preds_train = dt.predict(Xtrain)
acc_train = np.mean(preds_train==ytrain)

preds_test = dt.predict(Xtest)
acc_test = np.mean(preds_test==ytest)

In [9]:
print(acc_train, acc_test)

0.897926634769 0.756613756614


### RANDOM FOREST CLASSIFIER THAT USES THE MODIFIED VERSION OF DECISION TREE CLASSIFIER

In [10]:
class Nodes():
    """
    Creates a random forest classifier
    """
    def __init__(self, X, y, feat_type, num_feat=None, num_trees=None, 
                 cur_depth=1, max_depth=10, tol=0.2):
        self.X = X
        self.y = y
        self.feat_type = feat_type
        if num_feat==None:
            #for classification, rule of thumb is sqrt
            #of the dimension of features
            self.num_feat = int(np.sqrt(X.shape[1])) 
        else:
            self.num_feat = num_feat
        if num_trees==None:
            #rule of thumb is range(100, 1000) trees
            self.num_trees = 500
        else:
            self.num_trees = num_trees
        
        #build weak classification trees
        self.trees = []
        for j in range(self.num_trees):
            tree = Node(self.X, self.y, self.feat_type, self.num_feat)
            self.trees.append(tree)
            
    def predict(self, x):
        self.preds = []
        for j in self.trees:
            self.preds.append(j.predict(x))
        return int(np.mean(self.preds)>=0.5)

In [11]:
forest = Nodes(Xtrain, ytrain, feat_type)

In [12]:
preds_train = np.array([forest.predict(i) for i in Xtrain])
acc_train = np.mean(preds_train==ytrain)

preds_test = np.array([forest.predict(i) for i in Xtest])
acc_test = np.mean(preds_test==ytest)

In [13]:
print(acc_train, acc_test)

0.800637958533 0.768959435626


In [14]:
#use sklearn's Random Forest Classifier
num_trees = 500
num_feat = int(np.sqrt(Xtrain.shape[1])) 
rf = RandomForestClassifier(n_estimators=num_trees, max_features=num_feat)
rf.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=1, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [15]:
preds_train = rf.predict(Xtrain)
acc_train = np.mean(preds_train==ytrain)

preds_test = rf.predict(Xtest)
acc_test = np.mean(preds_test==ytest)

In [16]:
print(acc_train, acc_test)

0.897926634769 0.749559082892
