In [103]:
import numpy as np
import pandas as pd

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix

url = "https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv"
titanic_train = pd.read_csv(url)

print(titanic_train.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [104]:
print(titanic_train.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [105]:
titanic_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Fill the Age and Cabin column

In [106]:
titanic_train["Age"] = titanic_train["Age"].fillna(titanic_train["Age"].median())
titanic_train["Embarked"] = titanic_train["Embarked"].fillna("S")

In [107]:
#Convert sex and Embarked classes to dummy variables using OHE and drop target and unhelpful columns
candidate_train_predictors = titanic_train.drop(['PassengerId', 'Survived', 'Name', 'Ticket'], axis = 1)

categorical_cols = [cname for cname in candidate_train_predictors.columns if candidate_train_predictors[cname].nunique()<10 and candidate_train_predictors[cname].dtype == "object"]
numeric_cols = [cname for cname in candidate_train_predictors.columns if candidate_train_predictors[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numeric_cols
train_predictors = candidate_train_predictors[my_cols]

dummy_encoded_train_predictors = pd.get_dummies(train_predictors)

In [108]:
#Create the target and features numpy arrays: target, features_one
y_target = titanic_train["Survived"].values
x_features_one = dummy_encoded_train_predictors.values
print(x_features_one)



[[ 3. 22.  1. ...  0.  0.  1.]
 [ 1. 38.  1. ...  1.  0.  0.]
 [ 3. 26.  0. ...  0.  0.  1.]
 ...
 [ 3. 28.  1. ...  0.  0.  1.]
 [ 1. 26.  0. ...  1.  0.  0.]
 [ 3. 32.  0. ...  0.  1.  0.]]


In [109]:
# Split the data into training and testing sets - 3:4
x_train, x_validation, y_train, y_validation = train_test_split(x_features_one, y_target, test_size =.25, random_state = 1)

In [110]:
tree_one = tree.DecisionTreeClassifier()
tree_one = tree_one.fit(x_features_one, y_target)

In [111]:
#Look at the score of the included features
tree_one_accuracy = round(tree_one.score(x_features_one, y_target), 4)
print("Accuracy: %0.4f" % (tree_one_accuracy))

Accuracy: 0.9798


In [112]:
# Predict y_target given validation set
predictions = tree_one.predict(x_validation)
# Look at the confusion matrix
confusion_matrix(y_validation,predictions)

array([[128,   0],
       [  8,  87]], dtype=int64)

In [113]:
accuracy_score(y_validation, predictions)

0.9641255605381166

In [114]:
# Overfitting and additional features
# Create the target and features numpy arrays: target, features_two
y_target = titanic_train["Survived"].values
x_features_two = dummy_encoded_train_predictors.values

In [115]:
tree_two = tree.DecisionTreeClassifier()
tree_two = tree_two.fit(x_features_two, y_target)

tree_two_accuracy = round(tree_two.score(x_features_two, y_target), 4)
print("Accuracy: %0.4f" % (tree_two_accuracy))

Accuracy: 0.9798


In [116]:
max_depth = 10
min_samples_split = 5
tree_two = tree.DecisionTreeClassifier(max_depth = max_depth, min_samples_split = min_samples_split, random_state = 1)
tree_two = tree_two.fit(x_features_two, y_target)

# Look at the score of the included features
tree_two_accuracy = round(tree_two.score(x_features_two, y_target), 4)
print("Accuracy: %0.4f" % (tree_two_accuracy))

Accuracy: 0.9091


In [125]:
# Feature Engineering
titanic_train['family_size'] = 1 + titanic_train['SibSp'] + titanic_train['Parch']

candidate_train_predictors = titanic_train.drop(['PassengerId','Survived','Name','Ticket','Cabin','SibSp','Parch'], axis = 1)
categorical_cols = [cname for cname in candidate_train_predictors.columns if candidate_train_predictors[cname].nunique() < 10 and candidate_train_predictors[cname].dtype == "object"]
numeric_cols = [cname for cname in candidate_train_predictors.columns if candidate_train_predictors[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numeric_cols
train_predictors = candidate_train_predictors[my_cols]

dummy_encoded_train_predictors = pd.get_dummies(train_predictors)

# Create the target and features numpy arrays: target, features_three
y_target = titanic_train["Survived"].values
x_features_three = dummy_encoded_train_predictors.values


In [126]:
tree_three = tree.DecisionTreeClassifier()
tree_three = tree_three.fit(x_features_three, y_target)

In [127]:
# Look at the score of the included features
tree_three_accuracy = round(tree_three.score(x_features_three, y_target), 4)
print("Accuracy: %0.4f" % (tree_three_accuracy))

Accuracy: 0.9798


In [129]:
#Control overfitting by setting "max_depth" to 10 and "min_samples_split" to 5
max_depth = 10
min_samples_split = 5
tree_three = tree.DecisionTreeClassifier(max_depth = max_depth, min_samples_split = min_samples_split, random_state = 1)
tree_three = tree_three.fit(x_features_three, y_target)

In [130]:
print(tree_three.score(x_features_three, y_target))

0.9090909090909091
