# Titanic Dataset : Predicting Survivors

### Loading dataset

In [1]:
import pandas as pd
import numpy as np


In [2]:
train = pd.read_csv("./titanic/train.csv")
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Shape and Describe

In [3]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
train.shape

(891, 12)

## How to see the Number of people that survived ? ( Hint: value_counts()  )

In [5]:
train["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

### Lets see the number of males and Females that survived

In [6]:
train["Survived"][train["Sex"] == 'male'].value_counts() # 1 : survived, 0: Died

0    468
1    109
Name: Survived, dtype: int64

In [7]:
train["Survived"][train["Sex"] == 'female'].value_counts() # 1 : survived, 0: Died

1    233
0     81
Name: Survived, dtype: int64

## Hmmm, ok but can you normalize these...sure

In [8]:
males = train["Survived"][train["Sex"] == 'male'].value_counts(normalize = True)
males

0    0.811092
1    0.188908
Name: Survived, dtype: float64

In [9]:
females = train["Survived"][train["Sex"] == 'female'].value_counts(normalize = True)
females

1    0.742038
0    0.257962
Name: Survived, dtype: float64

### So  the  rate  for  males  surviving   is  ~ 19 %  and   for  females  it  is  ~ 74%

## What About Age ? Remember : Women and Children first !

In [10]:
children = train["Survived"][train["Age"] < 18].value_counts(normalize = True)
children

1    0.539823
0    0.460177
Name: Survived, dtype: float64

### So about ~ 54 % children survived.

In [11]:
adults = train["Survived"][train["Age"] >= 18].value_counts(normalize = True)
adults

0    0.618968
1    0.381032
Name: Survived, dtype: float64

### And for the adults it was close to 62 %

# Lets get predicting ..... summon the tree

In [12]:
from sklearn import tree

### Hang on a minute .... we need to convert Sex and Class to integers and not to forget imputing all the missing values

In [13]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
# 1. C = Cherbourg ,  2. Q = Queenstown,   3.  S = Southampton
# Sex male = 0 , female = 1

In [15]:
train["Sex_int"] = 0
train["Sex_int"][train["Sex"] == "male"] = 0

train["Sex_int"][train["Sex"] == "female"] = 1

train[["Sex_int","Sex"]].head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,Sex_int,Sex
0,0,male
1,1,female
2,1,female
3,1,female
4,0,male


In [16]:
train["Embarked"] = train["Embarked"].fillna("S")

train["Embarked_int"] = 0
train["Embarked_int"][train["Embarked"] == 'C'] = 1
train["Embarked_int"][train["Embarked"] == 'Q'] = 2
train["Embarked_int"][train["Embarked"] == 'S'] = 3
train[["Embarked_int","Embarked"]].tail()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Embarked_int,Embarked
886,3,S
887,3,S
888,3,S
889,1,C
890,2,Q


## features

In [17]:
features_tree_one = train[["Pclass","Sex_int","Age","Embarked_int"]]

In [18]:
features_tree_one.values #  values converts pandas dataframe to array structre....perfect for inputs to train our tree

array([[  3.,   0.,  22.,   3.],
       [  1.,   1.,  38.,   1.],
       [  3.,   1.,  26.,   3.],
       ..., 
       [  3.,   1.,  nan,   3.],
       [  1.,   0.,  26.,   1.],
       [  3.,   0.,  32.,   2.]])

## Target

In [19]:
target = train["Survived"].values

## Training our first tree