In [18]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Load the raw training and test splits.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Features we are not going to use; the same set is removed from both splits.
unused_columns = ['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

# Preview the first 5 rows of the training data.
print(train.head(5))

# Non-null count per column: a count lower than the number of rows
# means that column has missing values.
for frame in (train, test):
    print(frame.count())

   PassengerId  Survived  Pclass     Sex   Age     Fare
0            1         0       3    male  22.0   7.2500
1            2         1       1  female  38.0  71.2833
2            3         1       3  female  26.0   7.9250
3            4         1       1  female  35.0  53.1000
4            5         0       3    male  35.0   8.0500
PassengerId    891
Survived       891
Pclass         891
Sex            891
Age            714
Fare           891
dtype: int64
PassengerId    418
Pclass         418
Sex            418
Age            332
Fare           417
dtype: int64


In [19]:
data = [train, test]

# Convert all columns to numerical values.
# Encode Sex as male -> 0, female -> 1 so the decision tree can be built.
# NOTE: this overwrites 'Sex' in place, so re-running this cell without
# reloading the data would map the already-encoded values to NaN.
genders = {"male": 0, "female": 1}
for dataset in data:
    dataset['Sex'] = dataset['Sex'].map(genders)

# Imputation values come from the training split only and are reused for the
# test split, so no test-set statistics are used during preprocessing.
age_mean = train['Age'].mean()
fare_mean = train['Fare'].mean()

# Fill missing ages with the average training age.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is deprecated and does not update the
# parent DataFrame when pandas Copy-on-Write is enabled.
train['Age'] = train['Age'].fillna(age_mean)
test['Age'] = test['Age'].fillna(age_mean)

# Fill the missing fare in the test dataset with the average training fare.
test['Fare'] = test['Fare'].fillna(fare_mean)

# Feature column names used for training, in the same order as the predictor
# columns of `train` (i.e. with 'PassengerId' and 'Survived' removed).
features = ['Pclass', 'Sex', 'Age', 'Fare']


In [21]:
# Hold out part of the training data as a validation set for model evaluation.
from sklearn.model_selection import train_test_split

# Target is the 'Survived' label; predictors are every remaining column
# except the passenger identifier.
target = train["Survived"]
predictors = train.drop(columns=['Survived', 'PassengerId'])

# 32% of the rows go to validation; the fixed seed keeps the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    predictors,
    target,
    test_size=0.32,
    random_state=0,
)

### Decision tree implementation and evaluation

In [7]:
# Shallow decision tree: depth capped at 4, all other hyperparameters default.
# DecisionTreeClassifier and accuracy_score are imported in the first cell.
# random_state is fixed so that tie-breaking between equally good splits,
# and therefore the reported accuracy, is the same on every run.
decisiontree = DecisionTreeClassifier(max_depth=4, random_state=0)
decisiontree.fit(x_train, y_train)
y_pred = decisiontree.predict(x_val)

# Validation accuracy as a percentage with 2 decimals.
# accuracy_score expects its arguments in (y_true, y_pred) order.
acc_decisiontree = round(accuracy_score(y_val, y_pred) * 100, 2)
print(acc_decisiontree)


80.77


In [16]:
# Decision tree with default hyperparameters (grown until the leaves are pure).
# DecisionTreeClassifier and accuracy_score are imported in the first cell.
# random_state is fixed because an unrestricted tree is sensitive to how ties
# between candidate splits are broken; without it the score changes per run.
decisiontree_clf = DecisionTreeClassifier(random_state=0)
decisiontree_clf.fit(x_train, y_train)
y_pred = decisiontree_clf.predict(x_val)

# Evaluate accuracy on the validation set (percentage, 2 decimals).
# accuracy_score expects its arguments in (y_true, y_pred) order.
acc_decisiontree = round(accuracy_score(y_val, y_pred) * 100, 2)
print(acc_decisiontree)


79.72


In [17]:
# Decision tree with custom hyperparameters: depth capped at 4 and at most
# 10 leaf nodes. random_state is fixed so the fitted tree and the reported
# accuracy are reproducible across runs.
decisiontree_clf2 = DecisionTreeClassifier(max_depth=4, max_leaf_nodes=10, random_state=0)
decisiontree_clf2.fit(x_train, y_train)
y_pred = decisiontree_clf2.predict(x_val)

# Evaluate accuracy on the validation set (percentage, 2 decimals).
# accuracy_score expects its arguments in (y_true, y_pred) order.
acc_decisiontree = round(accuracy_score(y_val, y_pred) * 100, 2)
print(acc_decisiontree)

81.12


### decision tree visualization

In [26]:
from sklearn.tree import export_graphviz

# Write the fitted default-hyperparameter tree to a Graphviz .dot file.
# feature_names must follow the column order the model was fitted on, so it is
# taken from x_train instead of a hand-maintained list (a mismatched list
# silently mislabels every split node).
# class_names must be given in ascending order of the target classes:
# 0 -> 'Did not Survive', 1 -> 'Survived'.
export_graphviz(decisiontree_clf, out_file="decision_tree.dot",
                feature_names=list(x_train.columns),
                class_names=['Did not Survive', 'Survived'],
                rounded=True,
                filled=True)
# The call only writes the file; render it with Graphviz, e.g.:
#   dot -Tpng decision_tree.dot -o decision_tree.png