In [1]:
import pandas as pd
import numpy as np
import copy

### Get data

In [2]:
# Read the training CSV and encode the two categorical columns as integers
# in one chained expression: Sex -> female 0 / male 1, Embarked -> S 0 / C 1 / Q 2.
# Values not listed (including missing ones) are left as they are by `replace`.
dataset = pd.read_csv('train_titanic.csv').assign(
    Sex=lambda frame: frame['Sex'].replace(['female', 'male'], [0, 1]),
    Embarked=lambda frame: frame['Embarked'].replace(['S', 'C', 'Q'], [0, 1, 2]),
)

# Preview the first rows to confirm the encoding.
dataset.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1.0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,0.0


### Split data into training and validation datasets

In [3]:
# Feature matrix and target column used by the tree.
targets = dataset['Survived']
numeric_datapoints = dataset[['Pclass', 'Age', 'SibSp', 'Fare', 'Sex', 'Embarked']]

# Sequential 80/20 split: the first 80 % of rows train, the remainder validates.
training_dataset_size = int(len(numeric_datapoints) * 0.8)
validation_dataset_size = len(numeric_datapoints) - training_dataset_size

training_points = numeric_datapoints.iloc[:training_dataset_size]
training_labels = targets.iloc[:training_dataset_size]

# The validation rows are re-indexed from 0 so they can be addressed by position.
validation_points = numeric_datapoints.iloc[training_dataset_size:].reset_index(drop=True)
validation_labels = targets.iloc[training_dataset_size:].reset_index(drop=True)

### Decision trees for classification

In [4]:
import decision_trees

# Build the classifier and fit it on the training split.
# max_depth=3 is forwarded to `train` (presumably caps the tree depth -- confirm in decision_trees).
m_tree = decision_trees.ClassificationDecisionTree()
m_tree.train(
    training_points,
    training_labels,
    max_depth=3,
)

In [5]:
def check_accuracy(tree, data, labels):
    """Return the percentage of rows in ``data`` that ``tree`` classifies correctly.

    Args:
        tree: object exposing ``predict(one_row_dataframe)``; the result is
            converted with ``int`` before being compared to the label.
        data: DataFrame of feature rows.
        labels: Series of expected classes, aligned with ``data`` by position.

    Returns:
        The accuracy in percent, truncated (not rounded) to an ``int``.

    Raises:
        ValueError: if ``data`` has no rows.
    """
    total = data.shape[0]
    if total == 0:
        raise ValueError('Cannot compute accuracy on an empty dataset')

    # Rows are addressed by position (iloc), so the result does not depend on
    # the frames carrying a 0..n-1 index.
    positives = sum(
        1
        for row in range(total)
        if labels.iloc[row] == int(tree.predict(data.iloc[[row]]))
    )
    return int((positives / total) * 100)

def calculate_error(prunned_subtree, test_data, test_labels):
    """Return the error (100 minus accuracy, in percent) of the tree containing ``prunned_subtree``.

    The node's ``parent`` links are followed until a node whose ``parent`` is
    ``None`` is reached; that top-most node is the one evaluated with
    ``check_accuracy`` on ``test_data`` / ``test_labels``.
    """
    root = prunned_subtree
    while True:
        ancestor = root.parent
        if ancestor is None:
            break
        root = ancestor

    accuracy = check_accuracy(root, test_data, test_labels)
    return 100 - accuracy
    
def reduced_error_pruning(tree, best_tree, best_error, test_data, test_labels):
    """Find the single subtree prune that gives the lowest error on the test data.

    Every node below (and including) ``tree`` that has a right child is tried
    as a prune candidate: a deep copy of the node is turned into a leaf and
    the resulting error is measured with ``calculate_error``. The best
    candidate seen so far is threaded through the recursion and returned.

    Args:
        tree: node to start from; must expose ``left``, ``right`` and ``label``.
        best_tree: best candidate found so far (returned unchanged if nothing
            beats ``best_error``).
        best_error: error of ``best_tree`` in percent; a candidate must be
            strictly lower to replace it.
        test_data: DataFrame used to score the candidates.
        test_labels: Series of expected classes for ``test_data``.

    Returns:
        Tuple ``(best_tree, best_error)``. When a candidate wins, ``best_tree``
        is the pruned *node* of the copied tree, not its root.
    """
    # Recursion anchor: a node without a right child is treated as a leaf.
    if tree.right is None:
        return best_tree, best_error

    # Prune a copy so the tree being searched is never modified.
    # NOTE(review): deepcopy presumably also copies the ancestors reachable via
    # `.parent`, which is what lets calculate_error score the whole pruned tree
    # -- confirm the node class does not customise copying.
    pruned_tree = copy.deepcopy(tree)
    pruned_tree.right = None
    pruned_tree.left = None
    # NOTE(review): the new leaf is labelled with the dominating class of *all*
    # test_labels, not only of the samples that reach this node -- confirm intent.
    pruned_tree.label = str(decision_trees.ClassificationDecisionTree.dominatingClass(test_labels))
    pruned_tree_error = calculate_error(pruned_tree, test_data, test_labels)

    if pruned_tree_error < best_error:
        best_tree = pruned_tree
        best_error = pruned_tree_error

    # Recurse into the children and keep whatever they find; previously these
    # results were discarded and the function fell through returning None.
    for child in (tree.right, tree.left):
        if child is not None:
            best_tree, best_error = reduced_error_pruning(
                child, best_tree, best_error, test_data, test_labels
            )

    return best_tree, best_error

In [6]:
# Score the trained tree on the held-out validation split and report it.
validation_accuracy = check_accuracy(m_tree, validation_points, validation_labels)
print('Accuracy: {} %'.format(validation_accuracy))

Accuracy: 86 %


### Test the algorithm on the test set

In [7]:
# Load the test set and encode the categorical columns the same way as the
# training data (Sex -> female 0 / male 1, Embarked -> S 0 / C 1 / Q 2).
test = pd.read_csv('test_titanic.csv')

test['Sex'] = test['Sex'].replace(['female', 'male'], [0, 1])
test['Embarked'] = test['Embarked'].replace(['S', 'C', 'Q'], [0, 1, 2])
test_important = test[['Pclass', 'Age', 'SibSp', 'Fare', 'Sex', 'Embarked']]

# Predict one row at a time (the tree is given a one-row DataFrame) and collect
# the results in a list. The answer frame is then built once: DataFrame.append
# no longer exists in pandas >= 2.0 and was quadratic when used inside a loop.
predictions = [
    int(m_tree.predict(test_important.iloc[[row]]))
    for row in range(test.shape[0])
]

m_answer = pd.DataFrame({
    'PassengerId': test['PassengerId'].to_numpy(),
    'Survived': predictions,
}).astype('int32')

In [None]:
# Save the predictions as the submission file; index=False leaves the row index out of the CSV.
m_answer.to_csv(
    'titanic-dt.csv',
    index=False,
)