# Decisiontrees
Build a decision tree model to classify whether a passenger survives the titanic or not.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
df=pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
from sklearn.impute import SimpleImputer

In [5]:
imp=SimpleImputer(missing_values=np.nan,strategy="mean")

In [6]:
Age=imp.fit_transform(df[['Age']])

In [7]:
df['Age']=Age

In [8]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
df.drop(['PassengerId','Name','Ticket','Cabin','Embarked'],axis=1,inplace=True)

In [10]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.000000,1,0,7.2500
1,1,1,female,38.000000,1,0,71.2833
2,1,3,female,26.000000,0,0,7.9250
3,1,1,female,35.000000,1,0,53.1000
4,0,3,male,35.000000,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000
887,1,1,female,19.000000,0,0,30.0000
888,0,3,female,29.699118,1,2,23.4500
889,1,1,male,26.000000,0,0,30.0000


In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
lb=LabelEncoder()

In [13]:
sex=lb.fit_transform(df.Sex)

In [14]:
df['Sex']=sex

In [15]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.000000,1,0,7.2500
1,1,1,0,38.000000,1,0,71.2833
2,1,3,0,26.000000,0,0,7.9250
3,1,1,0,35.000000,1,0,53.1000
4,0,3,1,35.000000,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000
887,1,1,0,19.000000,0,0,30.0000
888,0,3,0,29.699118,1,2,23.4500
889,1,1,1,26.000000,0,0,30.0000


In [16]:
X=df.drop('Survived',axis=1)

In [17]:
y=df['Survived']

In [18]:
from sklearn.model_selection import train_test_split


In [20]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.3)

In [21]:
from sklearn import tree
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [22]:
dt=tree.DecisionTreeClassifier()

In [23]:
dt.fit(X_train,y_train)

DecisionTreeClassifier()

In [26]:
pred=dt.predict(X_test)

In [27]:
confusion_matrix(y_test,pred)

array([[146,  26],
       [ 34,  62]])

In [28]:
accuracy_score(y_test,pred)

0.7761194029850746

In [29]:
dt.get_depth()

23

In [30]:
max_depth=[x for x in range(1,30)]

In [32]:
accuracy=[]

In [35]:
for i in max_depth:
    dt=tree.DecisionTreeClassifier(criterion='gini',max_depth=i)
    dt.fit(X_train,y_train)
    pred=dt.predict(X_test)
    accuracy.append(accuracy_score(y_test,pred))

In [42]:
accuracy.index(max(accuracy))

3

In [43]:
dt=tree.DecisionTreeClassifier(criterion='gini',max_depth=3)
dt.fit(X_train,y_train)
pred=dt.predict(X_test)

In [44]:
confusion_matrix(y_test,pred)

array([[159,  13],
       [ 38,  58]])