In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import pandas as pd
from sklearn import ensemble, tree, model_selection

In [2]:
# Load the Titanic training and test sets from their CSV files
train_csv, test_csv = "datasets/titanic/train.csv", "datasets/titanic/test.csv"
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [3]:
# Fill missing Age/Fare with the column median and encode Sex and Embarked as integers
def clean_data(data):
    """Impute and integer-encode the Titanic columns of ``data`` in place.

    * ``Fare`` and ``Age``: missing values are replaced with the median of
      the values present in that column.
    * ``Sex``: "male" -> 0, "female" -> 1.
    * ``Embarked``: missing values are treated as "S", then
      "S" -> 0, "C" -> 1, "Q" -> 2.

    Values that are not one of the labels above are left unchanged, so
    calling the function again on an already cleaned frame changes nothing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with "Fare", "Age", "Sex" and "Embarked" columns.

    Returns
    -------
    None
        The frame is modified in place.
    """
    # Series.median() skips NaN by default, so no explicit dropna() is needed.
    for column in ("Fare", "Age"):
        data[column] = data[column].fillna(data[column].median())

    sex_codes = {"male": 0, "female": 1}
    embarked_codes = {"S": 0, "C": 1, "Q": 2}

    # Build each encoded column as a whole and assign it back instead of
    # writing ints into a string column through .loc masks: that leaves a
    # mixed object-dtype column, and a dedicated string dtype does not accept
    # integer values. map() infers an integer dtype when every label is known.
    data["Sex"] = data["Sex"].map(lambda label: sex_codes.get(label, label))
    data["Embarked"] = data["Embarked"].fillna("S").map(
        lambda label: embarked_codes.get(label, label))

# To create submission file
def write_prediction(prediction, name, passenger_ids=None):
    """Write predictions to a CSV file with PassengerId and Survived columns.

    Parameters
    ----------
    prediction : array-like
        One predicted value per passenger, in the same order as the ids.
    name : str
        Path of the CSV file to write. Its parent directory is created if it
        does not exist yet.
    passenger_ids : array-like, optional
        Passenger ids used as the row index. Defaults to the "PassengerId"
        column of the notebook-level ``df_test`` frame.

    Returns
    -------
    None
    """
    if passenger_ids is None:
        # Default preserved from the original behavior: ids of the test set.
        passenger_ids = df_test["PassengerId"]
    index = np.array(passenger_ids).astype(int)

    # to_csv does not create missing directories, so make sure the target
    # folder exists. dirname is "" for a bare file name, in which case there
    # is nothing to create.
    out_dir = os.path.dirname(name)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    solution = pd.DataFrame(prediction, index, columns=["Survived"])
    solution.to_csv(name, index_label=["PassengerId"])

In [4]:
# Clean the train and test frames (clean_data modifies each one in place)
for frame in (df_train, df_test):
    clean_data(frame)

# Columns used as model inputs
feature_columns = ["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]

# Survival labels the model is trained to predict
target = df_train["Survived"].values

# Training feature matrix, columns in the order listed above
features = df_train[feature_columns].values

In [5]:
# Configure the decision tree and fit it on the training features
decision_tree = tree.DecisionTreeClassifier(max_depth=7, min_samples_split=2, random_state=1)
decision_tree.fit(features, target)

# Importance of each feature, in the same order as the feature columns
tree_importances = decision_tree.feature_importances_
print(tree_importances)

# Accuracy measured on the same data the tree was fit on (training accuracy)
tree_train_accuracy = decision_tree.score(features, target)
print(tree_train_accuracy)

# Predict on the test set and write the CSV submission file
tree_feature_columns = ["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]
test_features_tree = df_test[tree_feature_columns].values
prediction_tree = decision_tree.predict(test_features_tree)
write_prediction(prediction_tree, "results/decision_tree.csv")

[0.16289604 0.13683043 0.47976487 0.14661755 0.04973834 0.01414005
 0.01001273]
0.8787878787878788
