In [3]:
from collections import OrderedDict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Read the two CSV splits (train first, then test) from the data directory.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./../data/train.csv", "./../data/test.csv")
)

# Stack both splits into one frame. Each split keeps its own row labels, so the
# index of data_all contains repeated values (one per split).
data_all = pd.concat([data_train, data_test])


# After concatenation, train and test rows are easy to tell apart: Survived is null for test rows.

In [4]:
# Share of each Survived value within every (Pclass, Sex) group of the training split.
print("===== survived by class and sex")
print(
    data_train
    .groupby(["Pclass", "Sex"])["Survived"]
    .value_counts(normalize=True)
)

===== survived by class and sex
Pclass  Sex     Survived
1       female  1           0.968085
                0           0.031915
        male    0           0.631148
                1           0.368852
2       female  1           0.921053
                0           0.078947
        male    0           0.842593
                1           0.157407
3       female  0           0.500000
                1           0.500000
        male    0           0.864553
                1           0.135447
dtype: float64


In [5]:
# Columns summarised with describe() for every split/sex combination below.
describe_fields = ["Age", "Fare", "Pclass", "SibSp", "Parch"]

# (heading, source frame, value of the Sex column), in the order they are printed.
summary_sections = [
    ("===== train: males", data_train, "male"),
    ("===== test: males", data_test, "male"),
    ("===== train: females", data_train, "female"),
    ("===== test: females", data_test, "female"),
]

for heading, frame, sex in summary_sections:
    print(heading)
    # Keep only rows of the requested sex, then summarise the selected columns.
    print(frame.loc[frame["Sex"] == sex, describe_fields].describe())

===== train: males
              Age        Fare      Pclass       SibSp       Parch
count  453.000000  577.000000  577.000000  577.000000  577.000000
mean    30.726645   25.523893    2.389948    0.429809    0.235702
std     14.678201   43.138263    0.813580    1.061811    0.612294
min      0.420000    0.000000    1.000000    0.000000    0.000000
25%     21.000000    7.895800    2.000000    0.000000    0.000000
50%     29.000000   10.500000    3.000000    0.000000    0.000000
75%     39.000000   26.550000    3.000000    0.000000    0.000000
max     80.000000  512.329200    3.000000    8.000000    5.000000
===== test: males
              Age        Fare      Pclass       SibSp       Parch
count  205.000000  265.000000  266.000000  266.000000  266.000000
mean    30.272732   27.527877    2.334586    0.379699    0.274436
std     13.389528   41.079423    0.808497    0.843735    0.883745
min      0.330000    0.000000    1.000000    0.000000    0.000000
25%     22.000000    7.854200    2.0000

In [6]:
# Show the first 50 rows of the combined frame, ordered alphabetically by Name.
data_all.sort_values(by="Name").head(50)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
845,42.0,,S,7.55,"Abbing, Mr. Anthony",0,846,3,male,0,0.0,C.A. 5547
392,13.0,,S,20.25,"Abbott, Master. Eugene Joseph",2,1284,3,male,0,,C.A. 2673
746,16.0,,S,20.25,"Abbott, Mr. Rossmore Edward",1,747,3,male,1,0.0,C.A. 2673
279,35.0,,S,20.25,"Abbott, Mrs. Stanton (Rosa Hunt)",1,280,3,female,1,1.0,C.A. 2673
345,16.0,,S,7.65,"Abelseth, Miss. Karen Marie",0,1237,3,female,0,,348125
57,25.0,F G63,S,7.65,"Abelseth, Mr. Olaus Jorgensen",0,949,3,male,0,,348122
308,30.0,,C,24.0,"Abelson, Mr. Samuel",0,309,2,male,1,0.0,P/PP 3381
874,28.0,,C,24.0,"Abelson, Mrs. Samuel (Hannah Wizosky)",0,875,2,female,1,1.0,P/PP 3381
251,20.0,,S,7.925,"Abrahamsson, Mr. Abraham August Johannes",0,1143,3,male,0,,SOTON/O2 3101284
8,18.0,,C,7.2292,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",0,900,3,female,0,,2657


In [14]:
# Capture everything before the comma in Name (e.g. "Abbott, Mr. Rossmore Edward"
# -> "Abbott") and store it as a LastName column on both frames.
LAST_NAME_PATTERN = "(.+),.+"

# expand=False is passed explicitly: since pandas 0.23 the default (expand=True)
# returns a one-column DataFrame even for a single capture group, whereas a plain
# Series is what should be assigned to a single column.
data_all['LastName'] = data_all.Name.str.extract(LAST_NAME_PATTERN, expand=False)
data_train['LastName'] = data_train.Name.str.extract(LAST_NAME_PATTERN, expand=False)

In [15]:
# Training rows where Parch + SibSp is at least 2, counted per (LastName, Survived) pair.
(
    data_train
    .loc[data_train["Parch"].add(data_train["SibSp"]).ge(2)]
    .groupby(['LastName', 'Survived'])
    .size()
)

# Open question: does it matter for survival whether a woman was travelling with family rather than alone?

LastName       Survived
Abbott         0           1
               1           1
Allison        0           2
               1           1
Andersson      0           7
               1           1
Appleton       1           1
Asplund        0           1
               1           3
Backstrom      1           1
Baclini        1           4
Becker         1           2
Beckwith       1           2
Boulos         0           2
Bourke         0           3
Brown          0           1
               1           1
Caldwell       1           2
Carter         1           4
Christy        1           1
Collyer        0           1
               1           2
Compton        1           1
Coutts         1           2
Crosby         0           1
               1           1
Danbom         0           2
Davies         0           1
               1           1
Dean           0           1
                          ..
Nicholls       0           1
Palsson        0           4
Panula         0   