In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the passenger table from a CSV path given relative to the working directory.
data = pd.read_csv("dataset/Train.csv")

In [3]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3.0,0.0,"O'Donoghue, Ms. Bridget",female,,0.0,0.0,364856,7.75,,Q,,,
1,2.0,0.0,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0.0,0.0,250655,26.0,,S,,,
2,2.0,1.0,"Smith, Miss. Marion Elsie",female,40.0,0.0,0.0,31418,13.0,,S,9,,
3,3.0,1.0,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1.0,1.0,363291,20.525,,S,C D,,"Strood, Kent, England Detroit, MI"
4,3.0,1.0,"McCoy, Miss. Agnes",female,,2.0,0.0,367226,23.25,,Q,16,,


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1009 non-null   float64
 1   survived   1009 non-null   float64
 2   name       1009 non-null   object 
 3   sex        1009 non-null   object 
 4   age        812 non-null    float64
 5   sibsp      1009 non-null   float64
 6   parch      1009 non-null   float64
 7   ticket     1009 non-null   object 
 8   fare       1008 non-null   float64
 9   cabin      229 non-null    object 
 10  embarked   1008 non-null   object 
 11  boat       374 non-null    object 
 12  body       98 non-null     float64
 13  home.dest  582 non-null    object 
dtypes: float64(7), object(7)
memory usage: 110.5+ KB


 ## Dropping irrelevant columns

In [5]:
# Columns that are not used as model inputs further down in the notebook.
columns_to_drop = [
    "name",
    "ticket",
    "cabin",
    "home.dest",
    "boat",
    "body",
    "embarked",
]

# `drop` returns a new frame; the raw `data` frame is left untouched.
clean_data = data.drop(labels = columns_to_drop, axis = "columns")

In [6]:
clean_data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,female,,0.0,0.0,7.75
1,2.0,0.0,male,39.0,0.0,0.0,26.0
2,2.0,1.0,female,40.0,0.0,0.0,13.0
3,3.0,1.0,female,31.0,1.0,1.0,20.525
4,3.0,1.0,female,,2.0,0.0,23.25


## Data Preprocessing

In [7]:
# The 'sex' feature holds strings ('female' / 'male'); it has to be turned into
# numbers before it can be compared against a numeric split threshold.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [8]:
# Overwrite 'sex' with integer codes; in the output below female -> 0 and male -> 1.
clean_data["sex"] = le.fit_transform(clean_data["sex"])
clean_data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,,0.0,0.0,7.75
1,2.0,0.0,1,39.0,0.0,0.0,26.0
2,2.0,1.0,0,40.0,0.0,0.0,13.0
3,3.0,1.0,0,31.0,1.0,1.0,20.525
4,3.0,1.0,0,,2.0,0.0,23.25


In [9]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1009 non-null   float64
 1   survived  1009 non-null   float64
 2   sex       1009 non-null   int32  
 3   age       812 non-null    float64
 4   sibsp     1009 non-null   float64
 5   parch     1009 non-null   float64
 6   fare      1008 non-null   float64
dtypes: float64(6), int32(1)
memory usage: 51.4 KB


The ```age``` feature still has missing (NaN) values, and the ```fare``` feature has one missing value. Both are filled with the mean of their column.

In [10]:
# Impute the missing 'age' and 'fare' entries with the mean of each column.
# The result is reassigned instead of using `inplace = True`, so the statement
# reads as a plain transformation and stays chainable.
clean_data = clean_data.fillna(
    value = {
        "age"  : clean_data["age"].mean(),
        "fare" : clean_data["fare"].mean(),
    }
)

```Now IT IS CLEAN_DATA, indeed```

In [11]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1009 non-null   float64
 1   survived  1009 non-null   float64
 2   sex       1009 non-null   int32  
 3   age       1009 non-null   float64
 4   sibsp     1009 non-null   float64
 5   parch     1009 non-null   float64
 6   fare      1009 non-null   float64
dtypes: float64(6), int32(1)
memory usage: 51.4 KB


## Separate ```Input Data``` from ```Labels```

In [12]:
# Names of the input columns and of the label column.
x_features = ["pclass", "sex", "age", "sibsp", "parch", "fare"]
y_feature  = ["survived"]

# Selecting with a list of names keeps both results two-dimensional (DataFrames),
# so Y has shape (n_rows, 1) rather than being a Series.
X = clean_data.loc[:, x_features]
Y = clean_data.loc[:, y_feature]

In [13]:
X.shape, Y.shape

((1009, 6), (1009, 1))

In [14]:
type(X), type(Y)

(pandas.core.frame.DataFrame, pandas.core.frame.DataFrame)

## Entropy and Information Gain

### Entropy : 
$H(s) = -\sum_{c} P_{c}\log_{2} P_{c}$, where $P_{c}$ is the fraction of samples in $s$ that belong to class $c$.

### Information Gain :
$IG(s, c) = H(s) - \sum_{v \in values(c)} P_{c}(v)\, H(s_{c}(v))$ 
$\equiv$ entropy(parent) - weighted average of entropy(children)


In [15]:
def Entropy(col):
    """Return the Shannon entropy, in bits, of the values in ``col``.

    Parameters
    ----------
    col : array-like with a ``shape`` attribute (e.g. numpy array, pandas Series)
        The observations whose distribution is measured.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of every
        distinct value in ``col``; ``0.0`` when ``col`` has no elements.
    """
    _, counts = np.unique(col, return_counts = True)
    total = float(col.shape[0])

    # Relative frequency of each distinct value.
    probabilities = (count / total for count in counts)

    # Sum the per-value contributions one after another, starting from 0.0.
    return sum((-1.0 * p * np.log2(p) for p in probabilities), 0.0)

```divide_data``` splits the data into a left and a right part according to a threshold on one feature.

In [16]:
def divide_data(X_data, fkey, fval):
    """Split ``X_data`` in two by thresholding the column ``fkey`` at ``fval``.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Rows to split. Any index is accepted; it does not have to be 0..n-1.
    fkey : column label
        Column whose values are compared against the threshold.
    fval : scalar
        Threshold value.

    Returns
    -------
    (x_left, x_right) : tuple of pandas.DataFrame
        ``x_right`` holds the rows where ``X_data[fkey] > fval``; ``x_left``
        holds every other row (including rows where the value is NaN, because
        a comparison with NaN is False). Row order, index labels, columns and
        column dtypes of ``X_data`` are kept.
    """
    # One vectorised comparison replaces the row-by-row ``DataFrame.append``
    # loop: ``append`` no longer exists in pandas >= 2.0, copied the frame on
    # every row, and upcast integer columns to float.
    goes_right = X_data[fkey] > fval

    # Copies, so callers can modify the parts without touching X_data.
    x_left = X_data.loc[~goes_right].copy()
    x_right = X_data.loc[goes_right].copy()

    return x_left, x_right

In [17]:
 # Quick check on the first 10 rows: a threshold of 0.5 on 'sex' separates the codes 0 and 1.
 x_left, x_right = divide_data(clean_data[:10], "sex", 0.5)

In [18]:
x_left

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0.0,29.838978,0.0,0.0,7.75
2,2.0,1.0,0.0,40.0,0.0,0.0,13.0
3,3.0,1.0,0.0,31.0,1.0,1.0,20.525
4,3.0,1.0,0.0,29.838978,2.0,0.0,23.25
7,1.0,1.0,0.0,49.0,0.0,0.0,25.9292


In [19]:
x_right

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
1,2.0,0.0,1.0,39.0,0.0,0.0,26.0
5,2.0,0.0,1.0,16.0,0.0,0.0,26.0
6,2.0,0.0,1.0,43.0,0.0,1.0,21.0
8,1.0,0.0,1.0,48.0,0.0,0.0,50.4958
9,2.0,0.0,1.0,29.838978,0.0,0.0,12.875


In [20]:
def information_gain(x_data, fkey, fval, target = "survived"):
    """Information gain obtained by splitting ``x_data`` on ``fkey`` at ``fval``.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Rows at the current node; must contain ``fkey`` and ``target``.
    fkey : column label
        Feature to split on.
    fval : scalar
        Threshold; rows with ``x_data[fkey] > fval`` go right, the rest left.
    target : column label, default "survived"
        Column holding the class labels whose entropy is measured.

    Returns
    -------
    float or int
        ``H(parent) - (w_left * H(left) + w_right * H(right))`` where the
        weights are the fractions of rows on each side. If either side is
        empty (which includes an empty ``x_data``), the sentinel ``-10000000``
        is returned so that this split never wins a max-gain selection.
    """
    left, right = divide_data(x_data, fkey, fval)

    n_left = left.shape[0]
    n_right = right.shape[0]

    # Check for a degenerate split *before* dividing by the row count, so an
    # empty x_data yields the sentinel instead of a ZeroDivisionError.
    if n_left == 0 or n_right == 0:
        return -10000000

    # Fraction of the parent's rows that landed on each side.
    n_total = float(x_data.shape[0])
    l = n_left / n_total
    r = n_right / n_total

    i_gain = Entropy(x_data[target]) - (l * Entropy(left[target]) + r * Entropy(right[target]))

    return i_gain
    
    

In [21]:
# Testing: print the information gain of each input feature when it is split at its mean.

for feature_name in X.columns:
    print(feature_name)
    split_value = clean_data[feature_name].mean()
    print(information_gain(clean_data, feature_name, split_value))

pclass
0.055456910002982474
sex
0.19274737190850932
age
0.0010525742338489685
sibsp
0.006492394392888956
parch
0.01975608012294816
fare
0.04242793401428169


In [22]:
class DecisionTree:
    """One node of a binary decision tree; child nodes are DecisionTree objects too.

    Attributes
    ----------
    left, right : DecisionTree or None
        Children for rows with feature value <= / > the threshold; None on a leaf.
    fkey, fval : feature name and threshold chosen for this node's split.
    max_depth : depth at which splitting stops.
    depth : depth of this node (the root is 0).
    target : "Survived" or "Dead", the majority label of the rows seen by this node.
    """

    def __init__(self, depth = 0, max_depth = 5):
        """Create an untrained node at ``depth`` in a tree limited to ``max_depth``."""
        # Children and split rule are filled in by train().
        self.left  = None
        self.right = None
        self.fkey  = None
        self.fval  = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None

    def train(self, x_train):
        """Fit this node, and recursively its children, on ``x_train``.

        ``x_train`` is a DataFrame holding the six feature columns listed
        below plus a ``survived`` column. Every candidate feature is split at
        its mean and the one with the largest information gain is kept.
        Recursion stops when a side of the split is empty or when
        ``max_depth`` is reached. Each call prints the selected feature.
        """
        features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']

        # Gain of each feature when thresholded at its own mean.
        info_gain = [
            information_gain(x_train, name, x_train[name].mean())
            for name in features
        ]

        self.fkey  = features[np.argmax(info_gain)]
        self.fval = x_train[self.fkey].mean()

        print("Making Tree : Selected Feature is ", self.fkey)

        # Partition the rows with the chosen rule; children get a fresh 0..n-1 index.
        left_data, right_data = divide_data(x_train, self.fkey, self.fval)
        left_data  = left_data.reset_index(drop = True)
        right_data = right_data.reset_index(drop = True)

        # Majority label of the rows at this node (ties count as "Survived").
        majority = "Survived" if x_train.survived.mean() >= 0.5 else "Dead"

        # Leaf: the split left one side empty, or the depth limit is reached.
        one_side_empty = left_data.shape[0] == 0 or right_data.shape[0] == 0
        if one_side_empty or self.depth == self.max_depth:
            self.target = majority
            return

        # Internal node: grow one level deeper on each side.
        self.left = DecisionTree(depth = self.depth + 1, max_depth = self.max_depth)
        self.left.train(left_data)

        self.right = DecisionTree(depth = self.depth + 1, max_depth = self.max_depth)
        self.right.train(right_data)

        # Internal nodes carry a label as well.
        self.target = majority

        return

In [23]:
# Root node with the constructor defaults (depth 0, max_depth 5).
dt = DecisionTree()

In [None]:
# Grow the tree on the full cleaned table; every node prints the feature it splits on.
dt.train(clean_data)

Making Tree : Selected Feature is  sex
Making Tree : Selected Feature is  pclass
Making Tree : Selected Feature is  pclass
Making Tree : Selected Feature is  fare
Making Tree : Selected Feature is  sibsp
Making Tree : Selected Feature is  age
Making Tree : Selected Feature is  age
Making Tree : Selected Feature is  sibsp
Making Tree : Selected Feature is  age
Making Tree : Selected Feature is  sibsp
Making Tree : Selected Feature is  parch
Making Tree : Selected Feature is  fare
Making Tree : Selected Feature is  sibsp
Making Tree : Selected Feature is  fare
Making Tree : Selected Feature is  age
Making Tree : Selected Feature is  age
Making Tree : Selected Feature is  parch
Making Tree : Selected Feature is  parch
Making Tree : Selected Feature is  sibsp
Making Tree : Selected Feature is  fare
Making Tree : Selected Feature is  fare
Making Tree : Selected Feature is  fare
Making Tree : Selected Feature is  age
Making Tree : Selected Feature is  sibsp
Making Tree : Selected Feature is 