In [1]:
from sklearn import metrics, linear_model, model_selection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import math

In [2]:
titanic = pd.read_csv('titanic.csv')

In [3]:
titanic.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [4]:
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [5]:
#create dummy variables for categorical attributes 
gender = pd.get_dummies(titanic['sex'])
embarkloc = pd.get_dummies(titanic['embarked'])
home = pd.get_dummies(titanic['home.dest'])[['New York, NY','London' ,'Montreal, PQ']]
dummies = pd.concat([titanic,gender,embarkloc,home],axis=1)
dummies.drop(['body'],axis=1,inplace=True)
dummies.groupby('survived').mean()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,female,male,C,Q,S,"New York, NY",London,"Montreal, PQ"
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,2.500618,30.545369,0.521632,0.328801,23.353831,0.156984,0.843016,0.148331,0.097651,0.754017,0.029666,0.011125,0.007417
1,1.962,28.918228,0.462,0.476,49.361184,0.678,0.322,0.3,0.088,0.608,0.08,0.01,0.008


In [6]:
summdf = dummies.drop(['name', 'home.dest','cabin', 'boat', 'embarked', 'sex', 'ticket'],axis=1)
summ = pd.DataFrame(index=summdf.columns)
summ['Mean'] = summdf.mean()
summ['Std Dev'] = summdf.std()
corrs = []
for col in summdf.columns:
    corrs.append(np.corrcoef(summdf[col], summdf['survived'])[0,1])
summ["Correlation"] = corrs
print(summ)
 

                   Mean    Std Dev  Correlation
pclass         2.294882   0.837836    -0.312469
survived       0.381971   0.486055     1.000000
age           29.881135  14.413500          NaN
sibsp          0.498854   1.041658    -0.027825
parch          0.385027   0.865560     0.082660
fare          33.295479  51.758668          NaN
female         0.355997   0.478997     0.528693
male           0.644003   0.478997    -0.528693
C              0.206264   0.404777     0.182123
Q              0.093965   0.291891    -0.016071
S              0.698243   0.459196    -0.154558
New York, NY   0.048892   0.215725     0.113408
London         0.010695   0.102902    -0.005313
Montreal, PQ   0.007639   0.087103     0.003256


In [7]:
dummiesbin = summdf.drop(['female', 'fare' ],axis=1)
dummiesclass = pd.get_dummies(summdf['pclass'])
dummiesclass.columns = ['class1', 'class2', 'class3']
ages = pd.cut(dummiesbin['age'],[0,10,30,50,70,90,110], right=False)
dummiesage = pd.get_dummies(ages)
data = pd.concat([dummiesbin,dummiesclass,dummiesage],axis=1)
data.drop(['age','parch','sibsp','pclass'], axis=1, inplace=True)

In [8]:
data.head()

Unnamed: 0,survived,male,C,Q,S,"New York, NY",London,"Montreal, PQ",class1,class2,class3,"[0, 10)","[10, 30)","[30, 50)","[50, 70)","[70, 90)","[90, 110)"
0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0
1,1,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0
2,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0
3,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0
4,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0


In [9]:
survived = (data['survived'] == 1).sum() / (1.0*len(data))
died = (data['survived'] == 0).sum() / (1.0*len(data))
entropy = -1 * survived * math.log(survived, 2) - died * math.log(died,2)

print survived
print died
print entropy

0.3819709702062643
0.6180290297937356
0.959422170862815


In [23]:
data.columns

Index([    u'survived',         u'male',            u'C',            u'Q',
                  u'S', u'New York, NY',       u'London', u'Montreal, PQ',
             u'class1',       u'class2',       u'class3',         [0, 10),
              [10, 30),        [30, 50),        [50, 70),        [70, 90),
             [90, 110)],
      dtype='object')

In [24]:
#splitting on 0-10 age range variable
df1 = data[data[data.columns[11]] == 1]
df0 = data[data[data.columns[11]] == 0]
#splititng into survived/died for rows in/not in 0-10 range
survivedIf1 = df1['survived'].sum() / (1.0*len(df1))
survivedIf0 = df0['survived'].sum() / (1.0*len(df0))
diedIf1 = (df1['survived'] == 0).sum() / (1.0*len(df1))
diedIf0 = (df0['survived'] == 0).sum() / (1.0*len(df0))

print survivedIf1, survivedIf0, diedIf1, diedIf0

#calculating entropy for either split
yEntropy = -1 * survivedIf1 * math.log(survivedIf1, 2) - diedIf1 * math.log(diedIf1,2)
nEntropy = -1 * survivedIf0 * math.log(survivedIf0, 2) - diedIf0 * math.log(diedIf0,2)

print yEntropy, nEntropy

0.6097560975609756 0.36674816625916873 0.3902439024390244 0.6332518337408313
0.9649567669505688 0.9481424850561753


In [11]:
def growTree(data):
    survived = (data['survived'] == 1).sum() / (1.0*len(data))
    died = (data['survived'] == 0).sum() / (1.0*len(data))
    #print(survived)
    #print(died)
    entropy = -1 * survived * math.log(survived, 2) - died * math.log(died,2)
    #print(entropy)
    minEntropy = entropy
    minIndex = -1
    #calcualting entropy for each feature
    for i in range(1, data.shape[1]):
        df1 = data[data[data.columns[i]] == 1]
        df0 = data[data[data.columns[i]] == 0]
        if not (len(df0) == 0 or len(df1) == 0):
            survivedIf1 = df1['survived'].sum() / (1.0*len(df1))
            survivedIf0 = df0['survived'].sum() / (1.0*len(df0))
            diedIf1 = (df1['survived'] == 0).sum() / (1.0*len(df1))
            diedIf0 = (df0['survived'] == 0).sum() / (1.0*len(df0))
            #check for empty splits/terminal conditions and recurse
            if survivedIf0 == 0.0:
                return [data.columns[i], [0.0], growTree(data.drop([data.columns[i]], axis=1))]
            if survivedIf1 == 0.0:
                return [data.columns[i], growTree(data.drop([data.columns[i]], axis=1)), [0.0] ]
            if survivedIf0 == 1.0:
                return [data.columns[i], [1.0], growTree(data.drop([data.columns[i]], axis=1))]
            if survivedIf1 == 1.0:
                return [data.columns[i], growTree(data.drop([data.columns[i]], axis=1)), [1.0] ]
            yEntropy = -1 * survivedIf1 * math.log(survivedIf1, 2) - diedIf1 * math.log(diedIf1,2)
            nEntropy = -1 * survivedIf0 * math.log(survivedIf0, 2) - diedIf0 * math.log(diedIf0,2)
            #checking if entropy less than current min
            weightedAvg = (yEntropy * len(df1) + nEntropy * len(df0)) / (1.0*(len(df1) + len(df0)))
            if weightedAvg < minEntropy:
                minEntropy = weightedAvg
                minIndex = i
    #splitting on best feature and removing said feature from list
    dataSplit1 = data[data[data.columns[minIndex]] == 1].drop([data.columns[minIndex]], axis=1)
    dataSplit0 = data[data[data.columns[minIndex]] == 0].drop([data.columns[minIndex]], axis=1)
    #terminating in case best split returns an empty subset
    if len(dataSplit0) == 0:
        return [dataSplit1['survived'].sum() / (1.0*len(dataSplit1))]
    if len(dataSplit1) == 0:
        return [dataSplit0['survived'].sum() / (1.0*len(dataSplit0))]
        
    return [data.columns[minIndex], growTree(dataSplit0), growTree(dataSplit1)]



In [31]:
#splitting data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(data, data['survived'], test_size=.2)
tree = growTree(X_train)

In [32]:
print "level 1:", tree[0]
print "level 2", tree[1][0], tree[2][0] 

level 1: male
level 2 Montreal, PQ Montreal, PQ


In [33]:
import copy

def test(tree, testData):
    predictions = np.empty(X_test.shape[0])
    for i in range(testData.shape[0]):
        predictionTree = copy.deepcopy(tree);
        while(len(predictionTree) > 2):
            path = testData.iloc[i][predictionTree[0]]
            predictionTree = predictionTree[path + 1]
        if predictionTree[0] > .5: 
            predictions[i] = 1
        else:
            predictions[i] = 0
    return predictions

predictions = test(tree, X_test)
predictions


array([0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
       0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0.,
       0., 1., 1., 1., 0., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 0.,
       1., 0., 1., 0., 0.

In [34]:
np.array(y_test.tolist())

array([0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0])

In [35]:
tn, fp, fn, tp = metrics.confusion_matrix(y_test, predictions).ravel()

print "Accuracy ", (tp+tn) / (1.0*(tp+fp+fn+tn))
print "Precision ", tp/(1.0*(tp + fp))
print "Recall ", tp/(1.0*(tp+fn))

Accuracy  0.7748091603053435
Precision  0.7183098591549296
Recall  0.5666666666666667


In [26]:
#random forest generation
def growForestTrees(minimum, maximum, step):
    for B in range(minimum,maximum,step):
        trees = {}
        for i in range(B):
            #select random sample from 
            selectedData = X_train.sample(n=int(len(X_train)**(1/2.0)), replace=True)
            trees[i] = growTree(selectedData)
        predictions = np.empty(X_test.shape[0])
        #generate predictions for each test example, i, using each tree, j - select resultant value according to simple majority
        for i in range(X_test.shape[0]):
            innerPredictions = np.empty(B)
            for j in range(B):
                predictionTree = copy.deepcopy(trees[list(trees.keys())[j]]);
                while(len(predictionTree) > 2):
                    path = X_test.iloc[i][predictionTree[0]]
                    predictionTree = predictionTree[path + 1]
                if predictionTree[0] > .5: 
                    innerPredictions[j] = 1
                else:
                    innerPredictions[j] = 0
            if(innerPredictions.sum() > (B/2)):
                predictions[i] = 1
            else:
                predictions[i] = 0
        #for each b compute scores
        tn, fp, fn, tp = metrics.confusion_matrix(y_test, predictions).ravel()

        print "B: ", B
        print "Accuracy: ", (tp+tn) / (1.0*(tp+fp+fn+tn))
        print "Precision: ", tp/(1.0*(tp + fp))
        print "Recall: ", tp/(1.0*(tp+fn))
growForestTrees(1,52,5)

        
        

B:  1
Accuracy:  0.5610687022900763
Precision:  0.4556213017751479
Recall:  0.77
B:  6
Accuracy:  0.7595419847328244
Precision:  0.6796116504854369
Recall:  0.7
B:  11
Accuracy:  0.7786259541984732
Precision:  0.7386363636363636
Recall:  0.65
B:  16
Accuracy:  0.7938931297709924
Precision:  0.8484848484848485
Recall:  0.56
B:  21
Accuracy:  0.7938931297709924
Precision:  0.7346938775510204
Recall:  0.72
B:  26
Accuracy:  0.8091603053435115
Precision:  0.8289473684210527
Recall:  0.63
B:  31
Accuracy:  0.7824427480916031
Precision:  0.7171717171717171
Recall:  0.71
B:  36
Accuracy:  0.7748091603053435
Precision:  0.7887323943661971
Recall:  0.56
B:  41
Accuracy:  0.7748091603053435
Precision:  0.7157894736842105
Recall:  0.68
B:  46
Accuracy:  0.7748091603053435
Precision:  0.7469879518072289
Recall:  0.62
B:  51
Accuracy:  0.7900763358778626
Precision:  0.8082191780821918
Recall:  0.59


In [None]:
#computing beyond B = 50 very slow
#we can see a gradual increase in accuracy/precision/recall with larger forest size

In [30]:
#bagging

for B in range(1,20,4):
    trees = {}
    for i in range(B):
        #select random samples fromdataset to build trees
        selectedData = X_train.sample(n=len(X_train), replace=True)
        trees[i] = growTree(selectedData)
    predictions = np.empty(X_test.shape[0])
    allAgree = 0;
    for i in range(X_test.shape[0]):
        innerPredictions = np.empty(B)
        for j in range(B):
            predictionTree = copy.deepcopy(trees[list(trees.keys())[j]]);
            #make decisions for indivisual trees
            while(len(predictionTree) > 2):
                path = X_test.iloc[i][predictionTree[0]]
                predictionTree = predictionTree[path + 1]
            if predictionTree[0] > .5: 
                innerPredictions[j] = 1
            else:
                innerPredictions[j] = 0
        #make decision base don majority
        if(innerPredictions.sum() > (B/2)):
            predictions[i] = 1
        else:
            predictions[i] = 0
        if(innerPredictions.sum() == 0 or innerPredictions.sum() == B):
            allAgree += 1

    tn, fp, fn, tp = metrics.confusion_matrix(y_test, predictions).ravel()
    print "Accuracy: ", (tp+tn) / (1.0*(tp+fp+fn+tn))
    print "Precision: ", tp/(1.0*(tp + fp))
    print "Recall: ", tp/(1.0*(tp+fn))
    

Accuracy:  0.7938931297709924
Precision:  0.8108108108108109
Recall:  0.6
Accuracy:  0.7709923664122137
Precision:  0.7631578947368421
Recall:  0.58
Accuracy:  0.7824427480916031
Precision:  0.8208955223880597
Recall:  0.55
Accuracy:  0.7900763358778626
Precision:  0.8
Recall:  0.6
Accuracy:  0.7862595419847328
Precision:  0.8055555555555556
Recall:  0.58
