In [43]:
import numpy as np 
import pandas as pd 
import seaborn as sb 
import matplotlib.pyplot as plt 
import sklearn 
from pandas import Series, DataFrame 
from pylab import rcParams 
from sklearn import preprocessing 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score,classification_report 

### Import data

In [3]:
Url= "https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv "
titanic_df = pd.read_csv(Url)

In [4]:
titanic_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
titanic_df.shape

(891, 12)

In [5]:
titanic_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## As per instructions, we only select below features

In [10]:
titanic_df = titanic_df[['Survived','Pclass','Sex','Age','SibSp','Parch','Fare']]
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


## Fill missing values in Age

In [15]:
titanic_df['Age'].fillna(0,inplace=True)
titanic_df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

## Using get_dummies to handle categorical variables

In [17]:
new_df = pd.get_dummies(titanic_df,columns=['Pclass','Sex'])
new_df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,0,22.0,1,0,7.25,0,0,1,0,1
1,1,38.0,1,0,71.2833,1,0,0,1,0
2,1,26.0,0,0,7.925,0,0,1,1,0
3,1,35.0,1,0,53.1,1,0,0,1,0
4,0,35.0,0,0,8.05,0,0,1,0,1


### Define input parameters for machine learning model

In [23]:
#X = np.array(titanic_df1.drop(['Survived'], 1).astype(float))
#Y = np.array(titanic_df1['Survived'])
X = new_df.drop(['Survived'],axis=1)
y = new_df['Survived']

In [24]:
X.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,22.0,1,0,7.25,0,0,1,0,1
1,38.0,1,0,71.2833,1,0,0,1,0
2,26.0,0,0,7.925,0,0,1,1,0
3,35.0,1,0,53.1,1,0,0,1,0
4,35.0,0,0,8.05,0,0,1,0,1


In [25]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

## Apply train test split on the above data and use Decision Tree Classifier

In [41]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.25, random_state = 42)
for i in range(2,21):
    tree = DecisionTreeClassifier(min_samples_split=i)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    print("Accuracy with min_samples_split " + str(i) + " is " + str(accuracy_score(y_test,y_pred)*100))

Accuracy with min_samples_split 2 is 75.33632286995515
Accuracy with min_samples_split 3 is 78.02690582959642
Accuracy with min_samples_split 4 is 79.82062780269058
Accuracy with min_samples_split 5 is 78.47533632286996
Accuracy with min_samples_split 6 is 76.68161434977578
Accuracy with min_samples_split 7 is 78.47533632286996
Accuracy with min_samples_split 8 is 77.13004484304933
Accuracy with min_samples_split 9 is 78.9237668161435
Accuracy with min_samples_split 10 is 78.47533632286996
Accuracy with min_samples_split 11 is 78.9237668161435
Accuracy with min_samples_split 12 is 80.26905829596413
Accuracy with min_samples_split 13 is 82.0627802690583
Accuracy with min_samples_split 14 is 82.0627802690583
Accuracy with min_samples_split 15 is 83.40807174887892
Accuracy with min_samples_split 16 is 83.40807174887892
Accuracy with min_samples_split 17 is 83.40807174887892
Accuracy with min_samples_split 18 is 83.40807174887892
Accuracy with min_samples_split 19 is 83.40807174887892
Accu

## We use min_samples_split as 15 since it is achieving greater accuracy

In [45]:
final_tree = DecisionTreeClassifier(min_samples_split=15)
final_tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
print("Accuracy with min_samples_split " + str(i) + " is " + str(accuracy_score(y_test,y_pred)*100))

Accuracy with min_samples_split 20 is 83.40807174887892
