In [None]:
# example decision tree

#                _SEX_
#              /       \
#             f         m
#            /           \
#         _PCLASS_        _AGE_
#         /      \       /     \
#    1 or 2       3     >13     <=13
#       /          \   /         \
#    die         live  die      _PCLASS_
#                              /        \
#                             1        2 or 3
#                            /            \
#                           die            live

# sex = root node
# sex, pclass, age = internal nodes (features to split)
# live, die = leaf nodes (predictions)


# building the tree

# start by choosing feature with most predictive power as root node. 
#Looking for the split that gives us the most information gain. 
#The more homogeneity (purity) the better: ideally we want all the passengers who died on one side, and all who survived on the other.

# gini impurity 

# a measure of how mixed (impure) the set is. with 2 target classes the value is between 0 and 0.5. 
# 0.5 means completely impure, members of set 50/50% mixed classes. 0 means completely pure, 100% members of set the same class. 
# Therefore, we want lower values (more purity)

# equation (when there are 2 target classes)  =  2 * p * (1 - p)
#                                             =  2 * % lived * % died

# entropy

# another measure of purity. value between 0 and 1.
# 1 is completely impure (50% survived and 50% didn’t survive) 0 is completely pure (100% the same class).
# So we want lower values

# equation (when there are 2 target classes)  =  -[p * log2(p) + (1 - p) * log2(1 - p)]
#                                             =  -(%live * log2(%live) + %die * log2(%die))

# It usually doesn't make a difference whether we use gini impurity or entropy to find the best splits.
# We can always cross validate to compare a Decision Tree with gini impurity and a Decision Tree with entropy to see which performs better.

# information gain

# measure of reduction in impurity after dataset is split. 
# found by comparing impurity before and after the split. 
# Can be calculated using either gini impurity or entropy as our impurity measure. 
# We want higher values (more information gained).

# equation (when the split produces 2 subsets) = S_score - (A_size/S_size) * A_score - (B_size/S_size) * B_score
#                                              = S_score - weighted_A_score - weighted_B_score 
# S is the original set
# A and B are the subsets after the split
# score can be gini impurity or entropy


In [3]:
# *first let's look at the unsplit dataset, then try splitting by age 30 and by sex to see which makes a better root node*

# look at unsplit dataset
import math
import pandas as pd

# Titanic passenger table; 'Survived' holds the 0/1 target values.
df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')

# Bucket the rows by the 'Survived' value (0 or 1), then count rows per bucket.
# Any column could be counted here; we only care about how many rows land in each group.
groupbysurvival = df.groupby('Survived')
survival_counts = groupbysurvival['Survived'].count()
unsplit_die, unsplit_live = survival_counts  # group 0 (did not survive) comes first, then group 1 (survived)

print('Unsplit Dataset:')
print()
print('did not survive: ', unsplit_die)
print('survived: ', unsplit_live)
print()

# gini impurity
#                       =  2 * % lived * % died
# unsplit gini impurity =  2 * (342 / (342 + 545)) * (545 / (342 + 545))
#                       =  2 * .3856 * .6144
#                       =  .4738

# gini impurity with python
# Dividing the per-class counts by their total gives the fraction of passengers in each class.
survival_shares = survival_counts / survival_counts.sum()
unsplit_die_pc, unsplit_live_pc = survival_shares
unsplit_gini = 2 * unsplit_live_pc * unsplit_die_pc
print('gini impurity score: ', unsplit_gini)

# entropy
#                 =  -(%live * log2(%live) + %die * log2(%die))
# unsplit entropy =  -[(.3856 * log2(.3856)) + (.6144 * log2(.6144))]
#                 =  .9619

# entropy with python
# Each class contributes share * log2(share); the sum is negated so the score is positive.
unsplit_live_term = unsplit_live_pc * math.log2(unsplit_live_pc)
unsplit_die_term = unsplit_die_pc * math.log2(unsplit_die_pc)
unsplit_entropy = -(unsplit_live_term + unsplit_die_term)
print('entropy score: ', unsplit_entropy)


Unsplit Dataset:

did not survive:  545
survived:  342

gini impurity score:  0.4738112457404905
entropy score:  0.9618806789594467


In [5]:
# split by age 30

#              _AGE_
#             /     \
#           >30     <=30 
#           /         \
#      live: 145   live: 197
#       die: 217    die: 328

# roughly 40% of passengers survive on either side, so this split tells us very little.

# split with python
df['Under30'] = df['Age'].le(30)  # boolean column: True when Age <= 30, False otherwise
splitbyage30 = df.groupby(['Under30', 'Survived'])
# Grouping on both columns gives 4 groups, in this order:
#   'Under30' False, 'Survived' 0
#   'Under30' False, 'Survived' 1
#   'Under30' True,  'Survived' 0
#   'Under30' True,  'Survived' 1
# Counting any single column gives the number of rows per group; the 4 counts unpack in that same order.
age30_counts = splitbyage30['Survived'].count()
over30_die, over30_live, under30_die, under30_live = age30_counts

print('Split by Age 30: ')
print()
print('Above 30, did not survive: ', over30_die)
print('Above 30, survived: ', over30_live)
print()
print('30 or younger, did not survive: ', under30_die)
print('30 or younger survived: ', under30_live)
print()

# Set sizes: the whole dataset and each side of the split (reused below for the weights).
unsplit_size = unsplit_live + unsplit_die
over30_size = over30_live + over30_die
under30_size = under30_live + under30_die

# gini impurity
#                    = 2 * % lived * % died
# >30 gini impurity  = 2 * (145 / (145 + 217)) * (217 / (145 + 217))
#                    = 2 * .4006 * .5994
#                    = .4802
# <=30 gini impurity = 2 * (197 / (197 + 328)) * (328 / (197 + 328))
#                    = 2 * .3752 * .6248
#                    = .4689

# both gini values sit close to the .5 maximum = impure subsets

# gini impurity with python
over30_live_pc = over30_live / over30_size
over30_die_pc = over30_die / over30_size
over30_gini = 2 * over30_live_pc * over30_die_pc

under30_live_pc = under30_live / under30_size
under30_die_pc = under30_die / under30_size
under30_gini = 2 * under30_live_pc * under30_die_pc

print('gini impurity scores:')
print(over30_gini)
print(under30_gini)
print()

# entropy
#              =  -(%live * log2(%live) + %die * log2(%die))
# >30 entropy  =  -[(.4006 * log2(.4006)) + (.5994 * log2(.5994))]
#              =  0.9713
# <=30 entropy =  -[(.3752 * log2(.3752)) + (.6248 * log2(.6248))]
#              =  0.9546

# both entropy values sit close to the 1.0 maximum = impure subsets

# entropy with python
over30_entropy = -(
    over30_live_pc * math.log2(over30_live_pc)
    + over30_die_pc * math.log2(over30_die_pc)
)
under30_entropy = -(
    under30_live_pc * math.log2(under30_live_pc)
    + under30_die_pc * math.log2(under30_die_pc)
)
print('entropy scores:')
print(over30_entropy)
print(under30_entropy)
print()

# information gain

# gini impurity is the impurity measure used here

# information gain =  S_gini - (A_size/S_size) * A_gini - (B_size/S_size) * B_gini
#                  =  .4738 - (362/887) * .4802 - (525/887) * .4689
#                  =  .0003

# a value this close to zero means the split taught us almost nothing

# information gain with python
# Each subset's gini is weighted by the fraction of all passengers that fall in that subset.
weighted_over30_gini = (over30_size / unsplit_size) * over30_gini
weighted_under30_gini = (under30_size / unsplit_size) * under30_gini
infogain = unsplit_gini - weighted_over30_gini - weighted_under30_gini
print('information gain:')
print(infogain)
print()
print()

Split by Age 30: 

Above 30, did not survive:  217
Above 30, survived:  145

30 or younger, did not survive:  328
30 or younger survived:  197

gini impurity scores:
0.4802203839931625
0.4688689342403628

entropy scores:
0.971272860860505
0.9546092964627468

information gain:
0.0003095890531000789




In [7]:

# split by sex

#              _SEX_
#             /     \
#            f       m 
#           /         \
#      live: 233   live: 109
#       die: 81     die: 464

# female side majority survive, male side majority don't survive. Useful split!

# split with python
splitbysex = df.groupby(['Sex', 'Survived'])
# Grouping on both columns gives 4 groups, in this order:
#   'Sex' 'female', 'Survived' 0
#   'Sex' 'female', 'Survived' 1
#   'Sex' 'male',   'Survived' 0
#   'Sex' 'male',   'Survived' 1
# Counting any single column gives the number of rows per group; the 4 counts unpack in that same order.
sex_counts = splitbysex['Survived'].count()
female_die, female_live, male_die, male_live = sex_counts

print('Split by Sex: ')
print()
print('Male, did not survive: ', male_die)
print('Male, survived: ', male_live)
print()
print('Female, did not survive ', female_die)
print('Female, survived ', female_live)
print()

# Set sizes. The total is derived here from the unsplit counts so this cell does not
# rely on the age-30 cell having been run first (that was the only place it was defined).
unsplit_size = unsplit_live + unsplit_die
male_size = male_live + male_die
female_size = female_live + female_die

# gini impurity
#                      = 2 * % lived * % died
# female gini impurity = 2 * (233 / (233 + 81)) * (81 / (233 + 81))
#                      = 2 * .7420 * .2580
#                      = .3828
# male gini impurity   = 2 * (109 / (109 + 464)) * (464 / (109 + 464))
#                      = 2 * .1902 * .8098
#                      = .3081

# both gini values lower = purer subsets!

# gini impurity with python
male_live_pc = male_live / male_size
male_die_pc = male_die / male_size
male_gini = 2 * male_live_pc * male_die_pc

female_live_pc = female_live / female_size
female_die_pc = female_die / female_size
female_gini = 2 * female_live_pc * female_die_pc

print('gini impurity scores:')
print(male_gini)
print(female_gini)
print()

# entropy
#                =  -(%live * log2(%live) + %die * log2(%die))
# female entropy =  -[(.7420 * log2(.7420)) + (.2580 * log2(.2580))]
#                =  0.8237
# male entropy   =  -[(.1902 * log2(.1902)) + (.8098 * log2(.8098))]
#                =  0.7019

# both entropy values lower = purer subsets!

# entropy with python
male_entropy = -(
    male_live_pc * math.log2(male_live_pc)
    + male_die_pc * math.log2(male_die_pc)
)
female_entropy = -(
    female_live_pc * math.log2(female_live_pc)
    + female_die_pc * math.log2(female_die_pc)
)
print('entropy scores:')
print(male_entropy)
print(female_entropy)
print()

# information gain

# gini impurity is the impurity measure used here

# information gain =  S_gini - (A_size/S_size) * A_gini - (B_size/S_size) * B_gini
#                  =  .4738 - (573/887) * .3081 - (314/887) * .3828
#                  =  .1393

# higher information gain value means this is the better split!

# information gain with python
# Each subset's gini is weighted by the fraction of all passengers that fall in that subset.
weighted_male_gini = (male_size / unsplit_size) * male_gini
weighted_female_gini = (female_size / unsplit_size) * female_gini
infogain = unsplit_gini - weighted_male_gini - weighted_female_gini
print('information gain:')
print(infogain)

# code and comments by github.com/alandavidgrunberg


Split by Sex: 

Male, did not survive:  464
Male, survived:  109

Female, did not survive  81
Female, survived  233

gini impurity scores:
0.30808122340700944
0.3828350034484157

entropy scores:
0.7019458258949879
0.8236550739295191

information gain:
0.1392670156446405
