In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('/home/aakash/Desktop/train.csv')
data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,male,22.0,1,0,7.25
1,2,1,1,female,38.0,1,0,71.2833
2,3,1,3,female,26.0,0,0,7.925
3,4,1,1,female,35.0,1,0,53.1
4,5,0,3,male,35.0,0,0,8.05
5,6,0,3,male,,0,0,8.4583
6,7,0,1,male,54.0,0,0,51.8625
7,8,0,3,male,2.0,3,1,21.075
8,9,1,3,female,27.0,0,2,11.1333
9,10,1,2,female,14.0,1,0,30.0708


In [3]:
data['Age'] = data['Age'].fillna(data['Age'].median())
data = pd.get_dummies(data,columns = ['Sex','Pclass'])
data = data.drop(['PassengerId'],axis = 1)
x = 'Survived'

In [4]:
#making decision tree using ID3
#IG --> Information Gain
#IG(S, A) = Entropy(S) - ∑((|Sᵥ| / |S|) * Entropy(Sᵥ))
#Entropy(S) = - ∑ pᵢ * log₂(pᵢ) ; i = 1 to n

In [5]:
def entropy(feature):
    p = feature.value_counts()/feature.shape[0]
    return np.sum(-p*np.log2(p))

def info_gain(target,feature,split): # x = target variable # y = data variable
    temp = feature < split
    a = sum(temp)
    b = temp.shape[0] - a
    if(a == 0 or b == 0): 
         return 0
    return entropy(target) - (a/(a+b))*entropy(target[temp]) - (b/(a+b))*entropy(target[~temp])

def split_for_max_info_gain(feature,target):
    optimal_split = 0
    max_gain = 0
    for split in feature.sort_values().unique():
        gain = info_gain(target,feature,split)
        if gain > max_gain:
            optimal_split = split
            max_gain = gain
    return optimal_split,max_gain

In [6]:
data.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3
0,0,22.0,1,0,7.25,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,1,26.0,0,0,7.925,1,0,0,0,1
3,1,35.0,1,0,53.1,1,0,1,0,0
4,0,35.0,0,0,8.05,0,1,0,0,1


In [7]:
def get_best_split(target_col_name, data): #feature having maximum Information gain
    max_info_gain = 0
    split_value = 0
    col_split = data.drop(target_col_name,axis = 1).columns[1]
    for i in data.drop(target_col_name,axis = 1).columns:
        split,gain = split_for_max_info_gain(data[i],data[target_col_name])
        if gain > max_info_gain:
            max_info_gain = gain
            split_value = split
            col_split = i
            
    return split_value,col_split,max_info_gain


def make_split(col_split, split_value, data):
    data1 = data[data[col_split] < split_value]
    data2 = data[data[col_split] >= split_value]

    return(data1,data2)

def make_prediction(data):
    pred = data.value_counts().idxmax()

    return pred

In [30]:
def train_tree(x,data,max_depth = None,min_samples_split = None,min_information_gain = 1e-20,counter = 0):
    if max_depth == None or counter < max_depth:  # Check for depth conditions
        depth = True
    else:
        depth = False
    
    
    if min_samples_split == None or data.shape[0] > min_samples_split: # Check for sample conditions
        sample_size = True
    else:
        sample_size = False
        
        
    if depth & sample_size:
        split_value,col_split,max_info_gain = get_best_split(x,data)

        if max_info_gain is not None and max_info_gain >= min_information_gain:
            counter += 1
            
            left,right = make_split(col_split, split_value, data)
            #storing tree
            split_type = "<="
            question =   "{} {} {}".format(col_split,split_type,split_value)
            subtree = {question: []}

            yes_answer = train_tree(x,left,max_depth,min_samples_split,min_information_gain, counter)
            no_answer = train_tree(x,right,max_depth,min_samples_split,min_information_gain, counter)
            
            if yes_answer == no_answer:
                subtree = yes_answer
            else:
                subtree[question].append(yes_answer)
                subtree[question].append(no_answer)

        else:
            pred = make_prediction(data[x])
            return pred
    else:
        pred = make_prediction(data[x])
        return pred

    return subtree

In [31]:
def prediction(data_point,decision_tree):
    question = list(decision_tree.keys())[0]
    if question.split()[1] == '<=':
        if data_point[question.split()[0]] <= float(question.split()[2]):
            answer = decision_tree[question][0]
        else:
            answer = decision_tree[question][1]
    else:
        if data_point[question.split()[0]] in (question.split()[2]):
            answer = decision_tree[question][0]
        else:
            answer = decision_tree[question][1]
    if not isinstance(answer, dict):
        return answer
    else:
        residual_tree = answer
        return prediction(data_point, answer)

In [32]:
max_depth = 10
min_samples_split = 10
min_information_gain  = 1e-5

decision_tree = train_tree(x,data,max_depth,min_samples_split,min_information_gain)
decision_tree

{'Sex_female <= 1': [{'Age <= 15.0': [{'SibSp <= 3': [1, 0]},
    {'Pclass_1 <= 1': [{'Parch <= 2': [{'Fare <= 7.775': [{'Age <= 30.0': [{'Age <= 29.0': [0,
              1]},
            0]},
          {'Fare <= 7.8': [{'Age <= 27.0': [0, 1]},
            {'Age <= 29.0': [{'Fare <= 8.05': [0,
                {'Age <= 20.0': [{'Fare <= 8.1583': [1, 0]}, 0]}]},
              {'Age <= 32.5': [{'Age <= 32.0': [0, 1]}, 0]}]}]}]},
        0]},
      {'Age <= 61.0': [{'Fare <= 135.6333': [{'Parch <= 2': [{'Fare <= 27.7208': [{'Fare <= 26.2875': [0,
                1]},
              {'Fare <= 30.5': [0,
                {'Fare <= 30.6958': [1, {'Age <= 60.0': [0, 1]}]}]}]},
            1]},
          0]},
        0]}]}]},
  {'Pclass_3 <= 1': [{'Age <= 28.0': [{'Age <= 25.0': [1, 0]},
      {'Fare <= 32.5': [{'Fare <= 28.7125': [{'Age <= 38.0': [1,
            {'Age <= 40.0': [0, 1]}]},
          0]},
        1]}]},
    {'Fare <= 25.4667': [{'Age <= 37.0': [{'Fare <= 8.05': [{'Age <= 15.0': [0

In [39]:
train_pred = []

for i in range(len(data)):
  obs_pred = prediction(data.iloc[i,:],decision_tree)
  train_pred.append(obs_pred)


accuracy = (sum(train_pred == data[x])/len(data))*100
print("TRAIN SET")
print("======================")
print(f"Accuracy = {accuracy:0.2f}%")

tp = np.sum(train_pred & data[x])
precision = tp/np.sum(train_pred) # precision=tp/(tp+fp)
recall = tp/np.sum(data[x]) #recall=tp/(tp+fn)
f1 = 2*tp/(np.sum(data[x])+np.sum(train_pred)) #f1=2*precision*recall/(precision+recall)

print(f"Precision = {precision:.2f}") 
print(f"Recall = {recall:.2f}") 
print(f"F1 Score = {f1:.2f}") 
print(f"Loss = {len(data)-sum(train_pred == data[x]):0.2f}")

TRAIN SET
Accuracy = 63.39%
Precision = 0.68
Recall = 0.13
F1 Score = 0.22
Loss = 227.00


In [34]:
test_data = pd.read_csv('/home/aakash/Desktop/test.csv')
test_data['Age'] = test_data['Age'].fillna(data['Age'].median())
test_data = pd.get_dummies(test_data,columns = ['Sex','Pclass'])
test_data = test_data.drop(['PassengerId'],axis = 1)
x_test = 'Survived'

In [40]:
test_pred = []
for i in range(len(test_data)):
    test_obs_pred = prediction(test_data.iloc[i,:],decision_tree)
    test_pred.append(test_obs_pred)

accuracy = (sum(test_pred == test_data[x])/len(data))*100
print("TRAIN SET")
print("======================")
print(f"Accuracy = {accuracy:0.2f}%")

tp = np.sum(test_pred & test_data[x])
precision = tp/np.sum(test_pred) # precision=tp/(tp+fp)
recall = tp/np.sum(test_data[x]) #recall=tp/(tp+fn)
f1 = 2*tp/(np.sum(test_data[x])+np.sum(test_pred)) #f1=2*precision*recall/(precision+recall)

print(f"Precision = {precision:.2f}") 
print(f"Recall = {recall:.2f}") 
print(f"F1 Score = {f1:.2f}") 
print(f"Loss = {len(test_data)-sum(test_pred == test_data[x]):0.2f}")

TRAIN SET
Accuracy = 29.52%
Precision = 0.71
Recall = 0.17
F1 Score = 0.28
Loss = 88.00
