In [40]:
import numpy as np
import pandas as pd

In [41]:
def load_csv(src):
    """Load a passenger CSV and return a cleaned frame ready for the tree.

    Processing steps, in order:

    1. Read the CSV with ``pandas.read_csv``.
    2. Replace the strings ``"male"`` / ``"female"`` with ``0`` / ``1``
       wherever they occur in the frame.
    3. Fill missing values in every numeric column with that column's own
       mean.
    4. Drop the ``PassengerId`` column and rename the remaining columns
       positionally to ``output, Pclass, Sex, Age, SibSp, Parch, Fare``.
    5. Print the first rows as a quick visual check.

    Parameters
    ----------
    src : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the seven columns listed above.

    Raises
    ------
    KeyError
        If the file has no ``PassengerId`` column.
    ValueError
        If the file does not have exactly seven columns left after
        ``PassengerId`` is dropped (raised by the positional rename).
    """
    df = pd.read_csv(src)

    # Encode the sex labels numerically so the column can be averaged/split.
    df = df.replace(to_replace="male", value=0).replace(to_replace="female", value=1)

    # Impute per column. Filling the whole frame with the Age mean (the
    # previous behaviour) wrote an age into any other column that happened
    # to have a gap, e.g. a missing Fare.
    df = df.fillna(df.mean(numeric_only=True))

    df = df.drop(columns=['PassengerId'])
    # Positional rename: the first remaining column is treated as the label.
    df.columns = ['output', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
    print(df.head())
    return df

In [42]:
# Load and clean both splits; each call also prints a preview of its frame.
train_df = load_csv(src="../data/train.csv")
test_df = load_csv(src="../data/test.csv")

   output  Pclass  Sex   Age  SibSp  Parch     Fare
0       0       3    0  22.0      1      0   7.2500
1       1       1    1  38.0      1      0  71.2833
2       1       3    1  26.0      0      0   7.9250
3       1       1    1  35.0      1      0  53.1000
4       0       3    0  35.0      0      0   8.0500
   output  Pclass  Sex   Age  SibSp  Parch     Fare
0       0       3    0  27.0      1      0  14.4542
1       1       1    0  42.0      1      0  52.5542
2       1       3    0  20.0      1      1  15.7417
3       0       3    0  21.0      0      0   7.8542
4       0       3    0  21.0      0      0  16.1000


In [43]:
# Entropy of a column
def entropy(column):
    """Return the base-2 Shannon entropy of the values in ``column``.

    ``column`` must expose ``.shape[0]`` (a NumPy array or pandas Series).
    Each distinct value is treated as one class; the result is
    ``-sum(p * log2(p))`` over the class frequencies ``p``.
    """
    # Only the per-class counts matter, not the class labels themselves.
    _, class_counts = np.unique(column, return_counts=True)
    result = 0.0
    for n_in_class in class_counts:
        # Relative frequency of this class within the column
        fraction = n_in_class / column.shape[0]
        result -= fraction * np.log2(fraction)
    return result

In [44]:
def divide_data(x_data, title_name, mean_val):
    """Split ``x_data`` into two frames around a threshold on one column.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Rows to split.
    title_name : str
        Name of the column whose values decide the side of each row.
    mean_val : scalar
        Threshold. Rows with ``x_data[title_name] >= mean_val`` go right;
        all other rows (including rows where the value is NaN) go left.

    Returns
    -------
    (x_right, x_left) : tuple of pandas.DataFrame
        Both frames keep the columns, dtypes and index labels of ``x_data``.
        Either one may be empty.
    """
    # A boolean mask replaces the former row-by-row DataFrame.append loop:
    # append was removed in pandas 2.0, was quadratic in the number of rows,
    # and the .loc[i] lookup only worked with a 0..n-1 index.
    goes_right = x_data[title_name] >= mean_val
    x_right = x_data[goes_right]
    x_left = x_data[~goes_right]
    return x_right, x_left

In [45]:
# Information gain from splitting the data on one feature at a threshold
def info_gain(x_data, title_name, mean_val):
    """Information gain of splitting ``x_data`` on ``title_name`` at ``mean_val``.

    The data is partitioned with ``divide_data``; the gain is the entropy of
    ``x_data.output`` minus the size-weighted entropies of the ``output``
    column on the two sides.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Must contain ``title_name`` and an ``output`` column.
    title_name : str
        Column to split on.
    mean_val : scalar
        Split threshold passed through to ``divide_data``.

    Returns
    -------
    float or int
        The information gain, or the sentinel ``-99999`` when the split
        leaves one side empty (so it never wins an ``argmax``).
    """
    right, left = divide_data(x_data, title_name, mean_val)
    n_left, n_right = left.shape[0], right.shape[0]

    # Check for a degenerate split before dividing: with an empty x_data the
    # fractions below would otherwise raise ZeroDivisionError.
    if n_left == 0 or n_right == 0:
        return -99999

    n_total = x_data.shape[0]
    # Fraction of the rows that fall on each side of the split
    left_weight = n_left / n_total
    right_weight = n_right / n_total
    children_entropy = left_weight * entropy(left.output) + right_weight * entropy(right.output)
    return entropy(x_data.output) - children_entropy

In [46]:
class DecisionTree:
    """Binary decision tree that splits each node at the mean of one feature.

    Every node stores the majority label of the rows it was trained on in
    ``target``. Internal nodes additionally store the split column
    (``title_name``), the threshold (``mean_val``) and two child trees.
    Rows whose value is ``>= mean_val`` belong to the right child, all other
    rows to the left child.
    """

    # Columns considered as split candidates at every node.
    FEATURES = ('Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare')

    def __init__(self, depth=0, max_depth=5):
        """Create an untrained node.

        Parameters
        ----------
        depth : int
            Depth of this node (0 for the root).
        max_depth : int
            Nodes at this depth or deeper are never split.
        """
        # Left Node
        self.left = None
        # Right Node
        self.right = None
        # Column this node splits on
        self.title_name = None
        # Threshold of the split (mean of the split column)
        self.mean_val = None
        self.depth = depth
        self.max_depth = max_depth
        # Output value (majority label of the training rows at this node)
        self.target = None

    @staticmethod
    def _majority_label(x_train):
        """Return 1 if at least half of ``x_train.output`` is 1, else 0."""
        return 1 if x_train.output.mean() >= 0.5 else 0

    def train_modal(self, x_train):
        """Fit this node, and recursively its children, on ``x_train``.

        ``x_train`` must contain the ``FEATURES`` columns and a binary
        ``output`` column, and is expected to have a 0..n-1 index. The node
        becomes a leaf when ``max_depth`` is reached or when the best split
        leaves one side empty.
        """
        # Forget any structure left over from a previous fit.
        self.left = None
        self.right = None
        self.title_name = None
        self.mean_val = None

        self.target = self._majority_label(x_train)

        # Depth-limited leaf: skip the split search, its result is unused.
        if self.depth >= self.max_depth:
            return

        # Information gain for all features, each split at its own mean
        features = list(self.FEATURES)
        info_gains = [
            info_gain(x_train, name, x_train[name].mean())
            for name in features
        ]
        # title_name - title name of highest info gain feature
        self.title_name = features[np.argmax(info_gains)]
        # Value on basis of which data will be split:
        # >= mean_val goes to the right node, everything else to the left
        self.mean_val = x_train[self.title_name].mean()

        data_right, data_left = divide_data(x_train, self.title_name, self.mean_val)
        # No usable split (one side empty): this node stays a leaf.
        if data_left.shape[0] == 0 or data_right.shape[0] == 0:
            return

        # Children get a fresh 0..n-1 index before being trained on.
        data_right = data_right.reset_index(drop=True)
        data_left = data_left.reset_index(drop=True)

        self.left = DecisionTree(self.depth + 1, self.max_depth)
        self.left.train_modal(data_left)
        self.right = DecisionTree(self.depth + 1, self.max_depth)
        self.right.train_modal(data_right)

    def predict(self, test):
        """Predict the label (0 or 1) for one row.

        Parameters
        ----------
        test : mapping (e.g. a pandas Series for one row)
            Must provide the feature columns used by the fitted tree.

        Raises
        ------
        ValueError
            If the tree has not been trained.
        """
        if self.target is None:
            raise ValueError("DecisionTree must be trained before calling predict")
        # Leaf: answer with the stored majority label.
        if self.left is None or self.right is None:
            return self.target
        # Route with the same rule used when the training data was divided,
        # so a value equal to the threshold goes right and any other value
        # (including NaN) goes left instead of falling through to None.
        if test[self.title_name] >= self.mean_val:
            return self.right.predict(test)
        return self.left.predict(test)

In [47]:
# Create the root node (default depth limit) and fit it on the training split
dt = DecisionTree()
dt.train_modal(x_train=train_df)

In [48]:
# To find Accuracy, loss and f1_score
def acc(y_pred, y_test):
    """Print accuracy, absolute-error loss and F1 score for binary labels.

    Parameters
    ----------
    y_pred : sequence of numbers
        Predicted labels (1 is the positive class, 0 the negative class).
    y_test : sequence of numbers
        True labels, compared pairwise with ``y_pred``.

    Nothing is returned; three lines are printed:
    accuracy (share of exact matches), loss (sum of ``|pred - true|``) and
    F1 score. Any ratio whose denominator is zero (no samples, no positive
    predictions, no positive labels) is reported as ``0.0`` instead of
    raising ``ZeroDivisionError``.
    """
    loss = 0
    correct = 0
    # Confusion-matrix counts needed for precision and recall
    truep = 0
    falsep = 0
    falsen = 0
    for i, j in zip(y_pred, y_test):
        loss = loss + abs(i - j)
        if i == j:
            correct += 1
        if i == 1 and j == 1:
            truep += 1
        elif i == 1 and j == 0:
            falsep += 1
        elif i == 0 and j == 1:
            falsen += 1

    n_samples = len(y_pred)
    accuracy = correct / n_samples if n_samples else 0.0
    recall = truep / (truep + falsen) if (truep + falsen) else 0.0
    precision = truep / (truep + falsep) if (truep + falsep) else 0.0
    if precision + recall:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0.0

    print('Accuracy = ', accuracy)
    print('Loss = ', loss)
    print('F1_Score = ', f1_score)

In [49]:
# Metrics on the training split: predict row by row, then score
y_pred = np.array(
    [dt.predict(train_df.loc[row_label]) for row_label in range(len(train_df))]
)
acc(y_pred, train_df['output'].to_numpy())

Accuracy =  0.8306451612903226
Loss =  105
F1_Score =  0.7712418300653595


In [50]:
# Metrics on the held-out split: predict row by row, then score
y_pred = np.array(
    [dt.predict(test_df.loc[row_label]) for row_label in range(len(test_df))]
)
acc(y_pred, test_df['output'].to_numpy())

Accuracy =  0.8302583025830258
Loss =  46
F1_Score =  0.7386363636363636
