In [14]:
##Loading modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score,RandomizedSearchCV

In [2]:
#Loading data
url='https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv'
titanic = pd.read_csv(url)

In [3]:
#Reading data
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic_actual = titanic[['Sex','Pclass','Age','SibSp','Parch','Fare','Survived']]

In [5]:
#Defining dependent(y) and independent(X) sets
X = titanic_actual.iloc[:,0:5]
y = titanic_actual.loc[:,'Survived']
X.Sex.unique()
X.iloc[:,2:3]

Unnamed: 0,Age
0,22.0
1,38.0
2,26.0
3,35.0
4,35.0
...,...
886,27.0
887,19.0
888,
889,26.0


In [6]:
imputer = SimpleImputer(missing_values=np.nan , strategy = 'mean' , verbose = 0)
imputer = imputer.fit(X.iloc[:,2:3])
X.iloc[:,2:3] = imputer.transform(X.iloc[:,2:3])
X.Age.unique()

array([22.        , 38.        , 26.        , 35.        , 29.69911765,
       54.        ,  2.        , 27.        , 14.        ,  4.        ,
       58.        , 20.        , 39.        , 55.        , 31.        ,
       34.        , 15.        , 28.        ,  8.        , 19.        ,
       40.        , 66.        , 42.        , 21.        , 18.        ,
        3.        ,  7.        , 49.        , 29.        , 65.        ,
       28.5       ,  5.        , 11.        , 45.        , 17.        ,
       32.        , 16.        , 25.        ,  0.83      , 30.        ,
       33.        , 23.        , 24.        , 46.        , 59.        ,
       71.        , 37.        , 47.        , 14.5       , 70.5       ,
       32.5       , 12.        ,  9.        , 36.5       , 51.        ,
       55.5       , 40.5       , 44.        ,  1.        , 61.        ,
       56.        , 50.        , 36.        , 45.5       , 20.5       ,
       62.        , 41.        , 52.        , 63.        , 23.5 

In [7]:
#Preprocessing
ct = ColumnTransformer([('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=np.float) 

X[0]


array([ 0.,  1.,  3., 22.,  1.,  0.])

In [8]:

#Creating training and test data sets
X_train,X_test,y_train,y_test = train_test_split(X , y , test_size = 0.2)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(712, 6) (179, 6) (712,) (179,)


In [9]:
#Create DecisonTreeClassifier Object

clf = DecisionTreeClassifier()
#Train Decision Tree Classifier

clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [10]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8156424581005587


In [11]:
cm = confusion_matrix(y_test,y_pred)
cm

array([[101,  20],
       [ 13,  45]], dtype=int64)

In [12]:
accuracies = cross_val_score(estimator = clf , X = X_train , y = y_train , cv = 10)
accuracies.mean()

0.779501453163425

In [13]:
clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [15]:
random_param = {
    "criterion" : ['entropy','gini'],
    "min_samples_leaf" : [1,2,3,4,5],
    'min_samples_split': [4,5,6,7,8],
    'max_features' : ['auto','log2']
}

In [16]:
random_search = RandomizedSearchCV(estimator = clf , param_distributions = random_param , n_iter = 20)
random_search.fit(X_train,y_train)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=DecisionTreeClassifier(class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort=False,
                                                    random_state=None,
                                                    splitter='b

In [18]:
random_search.best_params_

{'min_samples_split': 4,
 'min_samples_leaf': 3,
 'max_features': 'log2',
 'criterion': 'gini'}

In [20]:
#Create DecisonTreeClassifier Object

clf = DecisionTreeClassifier(criterion = 'gini' , min_samples_leaf = 3 , max_features ='log2' , min_samples_split = 4)
#Train Decision Tree Classifier

clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [21]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8100558659217877


In [22]:
cm = confusion_matrix(y_test,y_pred)
cm

array([[103,  18],
       [ 16,  42]], dtype=int64)

In [23]:
accuracies = cross_val_score(estimator = clf , X = X_train , y = y_train , cv = 10)
accuracies.mean()

0.8105700871898055