# Decision Trees
Problem: <b>Titanic Survivor Prediction</b> Kaggle Challenge
    
## Learning Goals
* How to pre-process data?
    * Dropping features that are not useful
    * Filling the missing values (Data Imputation)
* Creating a Binary Decision Tree from Scratch

In [2]:
import numpy as np
import pandas as pd

In [3]:
data=pd.read_csv("titanic.csv")

In [4]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
data.shape

(891, 12)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## columns to drop 
* **PassengerId**, **Name**, **Ticket**, **Embarked**: assumed not to affect survival
* **Cabin**: most values are NaN (only 204 of 891 entries are non-null)

In [8]:
col_to_drop=['PassengerId','Name','Cabin', 'Embarked','Ticket']

In [9]:
data_clean=data.drop(columns=col_to_drop)

In [10]:
data_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
le=LabelEncoder()

In [13]:
data_clean['Sex']=le.fit_transform(data_clean['Sex'])

In [14]:
data_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [15]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int32
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.3 KB


In [16]:
# Impute only the Age column with its mean. A frame-wide fillna would also
# write the mean age into any other column that happens to contain NaN.
data_clean=data_clean.fillna(value={'Age':data_clean['Age'].mean()})

In [17]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int32
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.3 KB


In [18]:
data_clean.loc[1]


Survived     1.0000
Pclass       1.0000
Sex          0.0000
Age         38.0000
SibSp        1.0000
Parch        0.0000
Fare        71.2833
Name: 1, dtype: float64

In [19]:
data_clean.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [21]:
input_cols =[ 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
output_cols=['Survived']

In [22]:
x=data_clean[input_cols]
y=data_clean[output_cols]

## Entropy

In [23]:
def Entropy(col):
    """Return the Shannon entropy, in bits, of the values in ``col``.

    ``col`` must expose ``.shape`` (e.g. a NumPy array or pandas Series).
    Every distinct value with relative frequency ``p`` contributes
    ``-p * log2(p)``. A column with no elements has no distinct values and
    therefore yields zero.
    """
    # np.unique returns (distinct values, occurrences); only occurrences matter.
    _,freqs=np.unique(col,return_counts=True)
    total=float(col.shape[0])

    # Seed the sum with 0.0 so an empty column still produces a float.
    weighted_logs=sum(((c/total)*np.log2(c/total) for c in freqs),0.0)
    return -weighted_logs

In [30]:
def divide_data(X_data,fkey,fval):
    """Split ``X_data`` into two frames by thresholding column ``fkey``.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Rows to partition.
    fkey : label
        Column whose values are compared against the threshold.
    fval : scalar
        Threshold value.

    Returns
    -------
    (x_left, x_right) : tuple of pandas.DataFrame
        ``x_right`` holds the rows where ``X_data[fkey] >= fval``;
        ``x_left`` holds every other row (including rows where the value is
        NaN, since the comparison is False for them). Both keep the columns
        and index labels of ``X_data``; either may be empty.
    """
    # One vectorised comparison replaces growing two frames row by row with
    # DataFrame.append, which was quadratic and is gone from pandas >= 2.0.
    goes_right=X_data[fkey]>=fval
    x_right=X_data[goes_right]
    x_left=X_data[~goes_right]

    return x_left,x_right

In [24]:
def information_gain(X_data,fkey,fval,target_col='Survived'):
    """Return the entropy reduction from splitting ``X_data`` on ``fkey``.

    The frame is partitioned with ``divide_data(X_data, fkey, fval)`` and the
    gain is the entropy of ``target_col`` in the whole frame minus the
    size-weighted entropies of ``target_col`` in the two partitions.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Rows containing both the feature column and ``target_col``.
    fkey : label
        Feature column to split on.
    fval : scalar
        Split threshold passed through to ``divide_data``.
    target_col : label, default 'Survived'
        Column holding the class labels.

    Returns
    -------
    float
        The information gain; ``0.0`` when ``X_data`` has no rows, since
        there is nothing to split and the weights would divide by zero.
    """
    n_rows=X_data.shape[0]
    if n_rows==0:
        return 0.0

    x_left,x_right=divide_data(X_data,fkey,fval)
    # Fraction of rows that landed on each side of the threshold.
    l=x_left.shape[0]/n_rows
    r=x_right.shape[0]/n_rows

    parent_entropy=Entropy(X_data[target_col])
    children_entropy=l*Entropy(x_left[target_col])+r*Entropy(x_right[target_col])
    return parent_entropy-children_entropy

In [25]:
class DecisionTree():
    """Node of a binary decision tree that splits on a feature's mean.

    Each node records the feature (``fkey``) and threshold (``fval``) it
    splits on, its ``left``/``right`` children (``None`` where absent), its
    own ``depth``, the ``max_depth`` limit, and a ``target`` label
    ('Survived' or 'Dead') returned when prediction cannot descend further.
    """

    def __init__(self,depth=0,max_depth=5):
        self.depth     = depth
        self.max_depth = max_depth
        # Split description and label are filled in by train().
        self.fkey      = None
        self.fval      = None
        self.target    = None
        # Children stay None unless train() decides to split this node.
        self.left      = None
        self.right     = None

    @staticmethod
    def _majority_label(X_train):
        """Return 'Survived' if the mean of the 'Survived' column is >= .5, else 'Dead'."""
        return 'Survived' if X_train['Survived'].mean()>=.5 else 'Dead'

    def train(self,X_train):
        """Fit this node, and recursively its children, on ``X_train``.

        ``X_train`` is a DataFrame holding the six feature columns listed
        below plus the 'Survived' column. The feature whose mean-threshold
        split has the largest information gain becomes this node's split.
        The node stops without children when either side of the split is
        empty or ``depth`` has reached ``max_depth``; otherwise a child is
        trained on each side. Every node ends up with a ``target`` label.
        """
        features=[ 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
        # Score each candidate feature using its mean as the threshold.
        gains=[information_gain(X_train,name,X_train[name].mean()) for name in features]

        best=features[np.argmax(gains)]
        self.fkey=best
        self.fval=X_train[best].mean()
        print("Making tree with feature", self.fkey, "depth -" , self.depth)

        parts=divide_data(X_train,self.fkey,self.fval)
        data_left,data_right=(part.reset_index(drop=True) for part in parts)

        # Leaf node or early stopping: nothing to split, or depth limit hit.
        is_leaf=(data_left.shape[0]==0
                 or data_right.shape[0]==0
                 or self.depth>=self.max_depth)
        if is_leaf:
            self.target=self._majority_label(X_train)
            return

        # Recurse: left child first, then right child, one level deeper.
        child_depth=self.depth+1
        self.left=DecisionTree(depth=child_depth,max_depth=self.max_depth)
        self.left.train(data_left)

        self.right=DecisionTree(depth=child_depth,max_depth=self.max_depth)
        self.right.train(data_right)

        # Internal nodes carry a label too.
        self.target=self._majority_label(X_train)
        return

    def predict(self,test):
        """Return the label for one sample ``test`` (indexable by feature name).

        Follows the right child when ``test[self.fkey] >= self.fval`` and the
        left child otherwise; when the chosen child does not exist, this
        node's own ``target`` is returned.
        """
        branch=self.right if test[self.fkey]>=self.fval else self.left
        if branch is None:
            return self.target
        return branch.predict(test)
            
            
        

In [26]:
split=int(.7*x.shape[0])

In [27]:
train_data  =data_clean[:split]
test_data = data_clean[split:]

In [28]:
dt=DecisionTree()

In [31]:
dt.train(train_data)

Making tree with feature Sex depth - 0
Making tree with feature Pclass depth - 1
Making tree with feature Age depth - 2
Making tree with feature SibSp depth - 3
Making tree with feature Pclass depth - 4
Making tree with feature Pclass depth - 5
Making tree with feature Age depth - 5
Making tree with feature SibSp depth - 4
Making tree with feature Parch depth - 5
Making tree with feature Pclass depth - 5
Making tree with feature SibSp depth - 3
Making tree with feature Fare depth - 4
Making tree with feature Parch depth - 5
Making tree with feature Pclass depth - 5
Making tree with feature Pclass depth - 4
Making tree with feature Pclass depth - 5
Making tree with feature Pclass depth - 5
Making tree with feature Parch depth - 2
Making tree with feature SibSp depth - 3
Making tree with feature Fare depth - 4
Making tree with feature Age depth - 5
Making tree with feature Age depth - 5
Making tree with feature Fare depth - 4
Making tree with feature Age depth - 5
Making tree with featur

In [None]:
##class

In [32]:
# Print the information gain of a mean-threshold split on every column of
# train_data. NOTE: train_data.columns also contains the target 'Survived',
# so that row is the gain of splitting the label on itself, not a feature score.
for col in train_data.columns:
    
    ig = information_gain(train_data,col,train_data[col].mean())
    print(col , ig)

Survived 0.9678665597403109
Pclass 0.068482078178491
Sex 0.2300577479064686
Age 0.0009644555250772902
SibSp 0.008738245653774845
Parch 0.012402189794205293
Fare 0.034030223184305014


## Predict

In [33]:
# Ask the trained tree for a label for each test row, in positional order.
y_pred=[dt.predict(test_data.iloc[row]) for row in range(test_data.shape[0])]

In [34]:
# Fit the encoder on the fixed label set rather than on the predictions.
# Fitting on y_pred would encode 'Survived' as 0 whenever the tree predicts
# only that class, which silently corrupts the accuracy computed below.
y_pred=le.fit(['Dead','Survived']).transform(y_pred)

In [35]:
le.classes_

array(['Dead', 'Survived'], dtype='<U8')

In [36]:
(y_pred==test_data.Survived.values).mean()

0.8171641791044776

## From sklearn 

In [37]:
from sklearn.tree import DecisionTreeClassifier as DT

In [38]:
dt=DT(criterion='entropy',max_depth=5)

In [39]:
dt.fit(train_data[input_cols],train_data[output_cols])

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [40]:
y_pred=dt.predict(test_data[input_cols])

In [41]:
(y_pred==test_data.Survived.values).mean()

0.8283582089552238