In [39]:
import pandas as pd
import numpy as np
import math

### Assumptions
- women, children and elderly survive 
- rich probably survive - Pclass 1 & 2
    - higher chances of survival if rich and woman/child
- families stay together

## Problem - what sorts of people were more likely to survive?

In [9]:
# Read the train and test CSV files from the working directory into DataFrames.
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [10]:
# Show the column index of each split side by side (train first, then test).
tuple(frame.columns for frame in (train, test))

(Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'),
 Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
        'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'))

In [None]:
# Scratch predicate: True when x lies strictly between 10 and 20, else False.
lambda x: bool(10 < x < 20)

In [67]:
# Age rounded down to a whole number of years; missing ages stay NaN.
# np.floor works on the whole column at once and always yields a float column.
# The previous per-row int() cast was undone anyway as soon as a single NaN was
# present, because NaN forces the column to float.
train['Age_floor'] = np.floor(train['Age'])

In [74]:
# Surname is everything in Name before the first comma
# (the whole string when there is no comma).
train['Surname'] = train['Name'].apply(lambda full_name: full_name.partition(',')[0])

In [83]:
# Title is the text between the comma and the first period of Name
# ("Braund, Mr. Owen Harris" -> "Mr"). The raw slice keeps the space that
# follows the comma, so strip it; otherwise every title is stored with a
# leading blank (" Mr", " Miss", ...), which breaks equality checks like
# train.Title == 'Mr'.
train['Title'] = train.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())

In [93]:
# Rows that paid the highest fare. Series.max() skips NaN and runs vectorized;
# the builtin max() iterates element by element and its result depends on where
# a NaN happens to sit in the column.
train[train.Fare == train.Fare.max()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_floor,Surname,Title
258,259,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C,35.0,Ward,Miss
679,680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,B51 B53 B55,C,36.0,Cardeza,Mr
737,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C,35.0,Lesurer,Mr


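Survival counts split at the mean fare of each Pclass. Note that the counts cover **all** passengers (each above/below group sums to 891), not only the passengers of that class:

- Avg Fare for Pclass 1 = 84.154687
    - above this: survived - 51, died - 15
    - below this: survived - 291, died - 534
- Avg Fare for Pclass 2 = 20.662183
    - above this: survived - 195, died - 172
    - below this: survived - 147, died - 377
- Avg Fare for Pclass 3 = 13.675550
    - above this: survived - 234, died - 224
    - below this: survived - 108, died - 325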

In [107]:
# Summary statistics of Fare restricted to third-class passengers.
train.loc[train['Pclass'] == 3, 'Fare'].describe()

count    491.000000
mean      13.675550
std       11.778142
min        0.000000
25%        7.750000
50%        8.050000
75%       15.500000
max       69.550000
Name: Fare, dtype: float64

In [116]:
# Survived / died counts for every passenger whose fare is at most the cutoff.
# The threshold is applied to the whole frame; there is no Pclass filter here.
fare_cutoff = 20.662183
train.loc[train['Fare'] <= fare_cutoff, 'Survived'].value_counts()

0    377
1    147
Name: Survived, dtype: int64

In [113]:
# Does travelling with family change the odds? For each surname, tally the
# survivors (sum) next to the number of passengers sharing that name (count).
survived_by_surname = train.groupby('Surname')['Survived']
survived_by_surname.agg(['sum', 'count'])

Unnamed: 0_level_0,sum,count
Surname,Unnamed: 1_level_1,Unnamed: 2_level_1
Abbing,0,1
Abbott,1,2
Abelson,1,2
Adahl,0,1
Adams,0,1
Ahlin,0,1
Aks,1,1
Albimona,1,1
Alexander,0,1
Alhomaki,0,1


In [92]:
# All first-class passengers.
train.query('Pclass == 1')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_floor,Surname,Title
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,Cumings,Mrs
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,35.0,Futrelle,Mrs
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,54.0,McCarthy,Mr
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S,58.0,Bonnell,Miss
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5000,A6,S,28.0,Sloper,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S,47.0,Beckwith,Mrs
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S,33.0,Carlsson,Mr
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C,56.0,Potter,Mrs
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,19.0,Graham,Miss


In [87]:
# Mean age per title. Select Age *before* reducing: averaging the whole frame
# first wastes work on every other column and, on pandas >= 2.0, raises
# TypeError because string columns (Name, Sex, Ticket, ...) cannot be averaged.
train.groupby('Title')['Age'].mean()

Title
 Capt            70.000000
 Col             58.000000
 Don             40.000000
 Dr              42.000000
 Jonkheer        38.000000
 Lady            48.000000
 Major           48.500000
 Master           4.574167
 Miss            21.773973
 Mlle            24.000000
 Mme             24.000000
 Mr              32.368090
 Mrs             35.898148
 Ms              28.000000
 Rev             43.166667
 Sir             49.000000
 the Countess    33.000000
Name: Age, dtype: float64

In [81]:
# Passengers whose extracted surname is exactly 'Sandstrom'.
train.loc[train['Surname'].eq('Sandstrom')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_floor,Surname
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S,4.0,Sandstrom
394,395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengt...",female,24.0,0,2,PP 9549,16.7,G6,S,24.0,Sandstrom


In [58]:
# Number of passengers per sex.
train['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [59]:
# Survivors per distinct age (rows with a missing Age are dropped by groupby).
# Pick the Survived column explicitly: the first positional argument of
# GroupBy.sum is `numeric_only`, not a column name, so sum('Survived') only
# worked because a non-empty string is truthy, and it summed every numeric
# column before they were thrown away.
train.groupby('Age')['Survived'].sum().reset_index()[['Survived', 'Age']]

Unnamed: 0,Survived,Age
0,1,0.42
1,1,0.67
2,2,0.75
3,2,0.83
4,1,0.92
...,...,...
83,0,70.00
84,0,70.50
85,0,71.00
86,0,74.00


In [51]:
# Survivors per passenger class.
# Pick the Survived column explicitly: the first positional argument of
# GroupBy.sum is `numeric_only`, not a column name, so sum('Survived') only
# worked because a non-empty string is truthy, and it summed every numeric
# column before they were thrown away.
train.groupby('Pclass')['Survived'].sum().reset_index()[['Survived', 'Pclass']]

Unnamed: 0,Survived,Pclass
0,136,1
1,87,2
2,119,3


In [50]:
# Survivors per sex.
# Pick the Survived column explicitly: the first positional argument of
# GroupBy.sum is `numeric_only`, not a column name, so sum('Survived') only
# worked because a non-empty string is truthy, and it summed every numeric
# column before they were thrown away.
train.groupby('Sex')['Survived'].sum().reset_index()[['Survived', 'Sex']]

Unnamed: 0,Survived,Sex
0,233,female
1,109,male


In [38]:
# Passengers older than 75 (rows with a missing Age never match).
train.loc[train['Age'].gt(75)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S


In [15]:
# Passengers whose full name contains 'Braund'.
train.loc[train['Name'].str.contains('Braund')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
477,478,0,3,"Braund, Mr. Lewis Richard",male,29.0,1,0,3460,7.0458,,S


In [16]:
# Passengers whose full name contains 'Cumings'.
has_cumings = train['Name'].str.contains('Cumings')
train[has_cumings]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


## Example

In [2]:
# Load the example submission file from the working directory.
example_submission_path = 'gender_submission.csv'
gender_submission = pd.read_csv(example_submission_path)

In [4]:
# How many rows of the example submission predict each Survived value.
gender_submission['Survived'].value_counts()

0    266
1    152
Name: Survived, dtype: int64

In [8]:
# Summary statistics of the example submission's numeric columns.
gender_submission.describe()

Unnamed: 0,PassengerId,Survived
count,418.0,418.0
mean,1100.5,0.363636
std,120.810458,0.481622
min,892.0,0.0
25%,996.25,0.0
50%,1100.5,0.0
75%,1204.75,1.0
max,1309.0,1.0
