We will be using a decision tree to make predictions about the Titanic data set from
Kaggle. This data set provides information on the Titanic passengers and can be used to
predict whether a passenger survived or not.

You use only Pclass, Sex, Age, SibSp (Siblings aboard), Parch (Parents/children aboard),
and Fare to predict whether a passenger survived.

### Importing Library

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.cross_validation import KFold



In [2]:
titanic_ds=pd.read_csv("https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv")
titanic_ds.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [3]:
titanic_ds.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic_ds.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
titanic_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


#### As mentioned we wil be using following attribute to predict survived or not

Pclass
Sex
Age
SibSp
Parch

In [6]:
selected_features=titanic_ds[['Pclass','Sex','Age','SibSp','Parch','Fare','Survived']]

In [7]:
selected_features.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,3,male,22.0,1,0,7.25,0
1,1,female,38.0,1,0,71.2833,1
2,3,female,26.0,0,0,7.925,1
3,1,female,35.0,1,0,53.1,1
4,3,male,35.0,0,0,8.05,0


### Data Preprocessing

In [8]:
#There are 177 records where there is not value for age
selected_features['Age'].isnull().sum()

177

In [9]:
# filling the null values with median of age
selected_features['Age']=selected_features['Age'].fillna(selected_features['Age'].median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
#now there is no null values. Again trying to get a info of the dataset
selected_features['Age'].isnull().sum()

0

In [11]:
#Converting  Categorical Values into Numeric Values
selected_features.Sex = selected_features.Sex.map({'female':0,'male':1})
selected_features.Sex.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


array([1, 0], dtype=int64)

In [12]:
#Normalizing Age and Fare
age_scaler = StandardScaler()
age_scaler.fit(pd.DataFrame(selected_features['Age']))
selected_features.Age = age_scaler.transform(selected_features[['Age']])
selected_features.Age.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


array([-0.56573646,  0.66386103, -0.25833709,  0.4333115 , -0.1046374 ,
        1.89345853, -2.10273333, -0.18148724, -1.18053521, -1.94903364,
        2.2008579 , -0.71943615,  0.74071088,  1.97030837,  0.12591213,
        0.35646166, -1.10368536, -1.64163427, -0.79628599,  0.81756072,
        2.81565665,  0.97126041, -0.6425863 , -0.87313583, -2.02588348,
       -1.71848411,  1.50920931, -0.02778756,  2.7388068 , -0.06621248,
       -1.8721838 , -1.41108474,  1.20180994, -0.94998568,  0.20276197,
       -1.02683552, -0.33518693, -2.19264764,  0.04906229,  0.27961182,
       -0.48888662, -0.41203677,  1.27865978,  2.27770774,  3.19990586,
        0.58701119,  1.35550962, -1.14211029,  3.16148094,  0.24118689,
       -1.33423489, -1.56478442,  0.54858627,  1.662909  ,  2.00873329,
        0.85598564,  1.12496009, -2.17958317,  2.43140743,  2.04715821,
        1.58605915,  0.51016135,  1.24023486, -0.68101123,  2.50825727,
        0.89441056,  1.73975884,  2.58510712, -0.4504617 , -2.18

In [20]:
fare_scaler = StandardScaler()
fare_scaler.fit(pd.DataFrame(selected_features['Fare']))
selected_features['Fare'] = fare_scaler.transform(selected_features[['Fare']])
selected_features['Fare'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


array([-5.02445171e-01,  7.86845294e-01, -4.88854258e-01,  4.20730236e-01,
       -4.86337422e-01, -4.78116429e-01,  3.95813561e-01, -2.24083121e-01,
       -4.24256141e-01, -4.29555021e-02, -3.12172378e-01, -1.13845709e-01,
       -1.87093118e-02, -4.90279793e-01, -3.26266659e-01, -6.19988892e-02,
       -3.86670720e-01, -2.85997284e-01, -5.02948539e-01, -1.24919787e-01,
       -4.86756223e-01,  6.63597416e-02, -1.64441595e-02,  4.64700108e+00,
       -4.89776426e-01, -4.89442190e-01, -9.02720170e-02,  2.30172882e+00,
       -4.92377828e-01, -4.37007438e-01,  1.00606170e+00,  3.98582080e-01,
       -5.02863973e-01, -4.22073541e-01, -4.57645492e-01, -2.25593223e-01,
        1.88762532e-01, -3.36334002e-01, -2.11917743e-01, -2.90024222e-01,
        1.50673744e-01, -4.91371093e-01,  8.96496787e-01,  5.99510151e-01,
       -8.96840841e-02,  2.95895176e-01,  9.62353321e-01,  1.03232136e+00,
       -8.66638810e-02, -3.41452240e-01, -4.84156835e-01, -4.74004926e-01,
        8.31477855e-01, -

In [14]:
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Survived    891 non-null int64
dtypes: float64(2), int64(5)
memory usage: 48.8 KB


### Independent and target variable splitting

In [15]:
# Independent and target variable splitting
X=selected_features.drop(['Survived'],axis=1) # Independent Variables
Y=selected_features.drop(['Pclass','Sex','Age','SibSp','Parch','Fare'],axis=1)  # Dependent Variable

In [16]:
# Train and test data splitting

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.20,random_state=101)
print("X_train Shape : ",X_train.shape)
print("X_test Shape : ",X_test.shape)
print("Y_train Shape : ",Y_train.shape)
print("Y_test.shape : ",Y_test.shape)

X_train Shape :  (712, 6)
X_test Shape :  (179, 6)
Y_train Shape :  (712, 1)
Y_test.shape :  (179, 1)


### Applying Decision Tree Classifier Model on the dataset

In [17]:

decision_tree = DecisionTreeClassifier() 
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_test, Y_test) * 100, 2)

In [18]:
print("Decision Tree Accuracy Score: ", acc_decision_tree)

Decision Tree Accuracy Score:  78.21


### Decision Tree with K fold cross validation

In [19]:

kfold = KFold(len(X),n_folds=10,shuffle=True,random_state=0)
#If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples
dt = DecisionTreeClassifier()
fold_accuracy = []
for train_fold, valid_fold in kfold:
    train = X.iloc[train_fold] # Extract training data with cv indices
    valid = X.iloc[valid_fold] # Extract validation data with cv indices
    
    train_y = Y.iloc[train_fold]
    valid_y = Y.iloc[valid_fold]
    
    model = dt.fit(train,train_y)
    valid_acc = model.score(X = valid, y = valid_y)
    fold_accuracy.append(valid_acc)    

print("Decision Tree Classifier Accuracy per fold: ", fold_accuracy, "\n")
print("Decision Tree Classifier Average accuracy: ", sum(fold_accuracy)/len(fold_accuracy))

Decision Tree Classifier Accuracy per fold:  [0.7777777777777778, 0.7191011235955056, 0.8314606741573034, 0.7640449438202247, 0.7865168539325843, 0.7752808988764045, 0.7640449438202247, 0.7752808988764045, 0.7415730337078652, 0.7752808988764045] 

Decision Tree Classifier Average accuracy:  0.77103620474407
