# Kaggle Titanic First Competition

https://www.kaggle.com/c/titanic/overview

## Exploratory Data Analysis

There were an estimated 2,224 passengers and crew aboard the RMS Titanic.  Over 1500 peopled died.  Survival percentage 32.5%.

**Initialization**

In [1]:
%run init.ipynb

In [97]:
from data.data import ExtractData, TransformData, TransformBin
from zeetle.data import eda

import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

# Extract Data

In [99]:
[1,2,3] == [1,2,3]

True

In [96]:
train = ExtractData('../data/raw/kaggle_train.csv')
holdout = ExtractData('../data/raw/kaggle_test.csv')

Xy_raw_train = train.Xy_raw
Xy_raw_holdout = holdout.Xy_raw

Xy_raw_train['fare_raw'] = Xy_raw_train['fare_raw'].replace(0, np.nan)

In [98]:
TransformBin(train.Xy_raw, 'fare_raw')

ValueError: too many values to unpack (expected 2)

In [86]:
tmp = pd.qcut(train.Xy_raw.fare_raw, 5)
tmp[tmp.isna()]

passengerid
180    NaN
264    NaN
272    NaN
278    NaN
303    NaN
      ... 
675    NaN
733    NaN
807    NaN
816    NaN
823    NaN
Name: fare_raw, Length: 15, dtype: category
Categories (5, interval[float64]): [(4.010999999999999, 7.896] < (7.896, 11.133] < (11.133, 23.0] < (23.0, 40.125] < (40.125, 512.329]]

In [52]:
train.Xy_raw[train.Xy_raw.fare_raw.isna()]

Unnamed: 0_level_0,survived,pclass,name,sex,age_raw,sibsp,parch,ticket,fare_raw,cabin,embarked
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,,,S
264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,,B94,S
272,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,,,S
278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,,,S
303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,,,S
...,...,...,...,...,...,...,...,...,...,...,...
675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,,,S
733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,,,S
807,0,1,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,,A36,S
816,0,1,"Fry, Mr. Richard",male,,0,0,112058,,B102,S


In [4]:
df_nan = pd.concat([train.Xy_raw.isna().sum().sort_values(ascending=False).to_frame(), 
                  holdout.Xy_raw.isna().sum().sort_values(ascending=False).to_frame()], axis=1)

df_nan.columns = ['train', 'holdout']

with pd.option_context('display.max_rows', 21):
    df_nan
    
    

Unnamed: 0,train,holdout
age_raw,177,86.0
cabin,687,327.0
embarked,2,0.0
fare_raw,15,1.0
name,0,0.0
parch,0,0.0
pclass,0,0.0
sex,0,0.0
sibsp,0,0.0
survived,0,


# Transform Data according to the rules determined in the EDA

In [53]:
translate_title_dictionary = {
        "Mlle": "Miss",
        "Mme": "Miss",
        "Sir": "Mr",
        "Ms": "Mrs",
        "Rev": np.nan,
        "Col": "Mr",
        "Capt": "Mr",
        "Lady": "Miss",
        "the Countess of": "Mrs",
        "Dr": np.nan,
    }

age_bins = (0, 5, 12, 18, 25, 35, 60, 120)
age_bin_label =   ['baby','child','teen','student','young_adult','adult','senior']

assert len(age_bins) == len(age_bin_label) + 1

In [55]:
train_transformed = TransformData(train,
                                  translate_title_dictionary = translate_title_dictionary,
                                  age_bins = age_bins,
                                  xy_age_estimate=None,
                                  age_bin_label = age_bin_label)

train_transformed.transform()

len(train_transformed.Xy)

train_transformed.Xy[train_transformed.Xy.fare_raw.isna()]

891

Unnamed: 0_level_0,survived,pclass,name,sex,age_raw,sibsp,parch,ticket,fare_raw,cabin,...,last_name,family_size,fare_estimate,fare,fare_bin,age_estimate,age,age_bin,is_child,is_traveling_alone
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,,,...,Leonard,1,8.05,8.05,q2,33.0,36.0,adult,False,True
264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,,B94,...,Harrison,1,61.98,61.98,q4,33.0,40.0,adult,False,True
272,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,,,...,Tornquist,1,8.05,8.05,q2,33.0,25.0,student,False,True
278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,,,...,Parkes,1,15.02,15.02,q3,33.0,33.0,young_adult,False,True
303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,,,...,Johnson,1,8.05,8.05,q2,33.0,19.0,student,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,,,...,Watson,1,15.02,15.02,q3,33.0,33.0,young_adult,False,True
733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,,,...,Knight,1,15.02,15.02,q3,33.0,33.0,young_adult,False,True
807,0,1,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,,A36,...,Andrews,1,61.98,61.98,q4,33.0,39.0,adult,False,True
816,0,1,"Fry, Mr. Richard",male,,0,0,112058,,B102,...,Fry,1,61.98,61.98,q4,33.0,33.0,young_adult,False,True


In [57]:
train_transformed.Xy.to_csv(filename)

In [58]:
filename = '../tests/data/expected_transformed_kaggle_train.csv'

test_train_transformed = pd.read_csv(filename, index_col='passengerid')
test_train_transformed

test_train_transformed.pclass = test_train_transformed.pclass.astype('category')

fare_bin_cat_type = pd.CategoricalDtype(categories=["q1", "q2", "q3", "q4"], ordered=True)
test_train_transformed.fare_bin = test_train_transformed.fare_bin.astype(fare_bin_cat_type)

age_bin_cat_type = pd.CategoricalDtype(categories=['baby','child','teen','student','young_adult','adult','senior'], ordered=True)
test_train_transformed.age_bin = test_train_transformed.age_bin.astype(age_bin_cat_type)

Unnamed: 0_level_0,survived,pclass,name,sex,age_raw,sibsp,parch,ticket,fare_raw,cabin,...,last_name,family_size,fare_estimate,fare,fare_bin,age_estimate,age,age_bin,is_child,is_traveling_alone
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,...,Braund,2,8.05,7.2500,q1,33.0,22.0,student,False,False
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,...,Cumings,2,61.98,71.2833,q4,35.9,38.0,adult,False,False
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,...,Heikkinen,1,8.05,7.9250,q1,22.0,26.0,young_adult,False,True
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,...,Futrelle,2,61.98,53.1000,q4,35.9,35.0,young_adult,False,False
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,...,Allen,1,8.05,8.0500,q2,33.0,35.0,young_adult,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,...,Montvila,1,15.02,13.0000,q2,33.0,27.0,young_adult,False,True
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,...,Graham,1,61.98,30.0000,q3,22.0,19.0,student,False,True
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,...,Johnston,4,8.05,23.4500,q3,22.0,22.0,student,False,False
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,...,Behr,1,61.98,30.0000,q3,33.0,26.0,young_adult,False,True


In [59]:
assert train_transformed.Xy.equals(test_train_transformed)

In [60]:
translate_title_dictionary = {
    "Mlle": "Miss",
    "Mme": "Miss",
    "Sir": "Mr",
    "Ms": "Mrs",
    "Rev": np.nan,
    "Col": "Mr",
    "Capt": "Mr",
    "Lady": "Miss",
    "the Countess of": "Mrs",
    "Dr": np.nan,
}

age_bins = (0, 5, 12, 18, 25, 35, 60, 120)
age_bin_label = ["baby", "child", "teen", "student", "young_adult", "adult", "senior"]

assert len(age_bins) == len(age_bin_label) + 1


def read_csv_expected_data():
    filename = '../tests/data/expected_transformed_kaggle_train.csv'

    test_train_transformed = pd.read_csv(filename, index_col='passengerid')

    test_train_transformed.pclass = test_train_transformed.pclass.astype('category')

    fare_bin_cat_type = pd.CategoricalDtype(categories=["q1", "q2", "q3", "q4"], ordered=True)
    test_train_transformed.fare_bin = test_train_transformed.fare_bin.astype(fare_bin_cat_type)

    age_bin_cat_type = pd.CategoricalDtype(categories=['baby','child','teen','student','young_adult','adult','senior'], ordered=True)
    test_train_transformed.age_bin = test_train_transformed.age_bin.astype(age_bin_cat_type)

    return test_train_transformed


def test_refactor_transformed_data():
    train = ExtractData("../data/raw/kaggle_train.csv")

    transformed_train = TransformData(
        train,
        translate_title_dictionary=translate_title_dictionary,
        age_bins=age_bins,
        xy_age_estimate=None,
        age_bin_label=age_bin_label,
    )

    transformed_train.transform()
    expected_train = read_csv_expected_data()
    
    #assert expected_train.equals(transformed_train.Xy)
    
    return expected_train, transformed_train
    
expected_train, transformed_train = test_refactor_transformed_data()

In [77]:
transformed_train.Xy_fare_estimate

pclass
1    60.29
2    14.25
3     8.05
Name: fare_estimate, dtype: float64

In [61]:
columns = ['survived', 'pclass', 'name', 'sex', 'age_raw', 'sibsp', 'parch',
           'ticket','cabin', 'embarked', 'title', 'last_name',
           'family_size', 
     #      'fare_raw', 'fare_estimate', 'fare', 'fare_bin', 
       'age_estimate', 'age', 'age_bin', 
           'is_child', 'is_traveling_alone',
          ]

columns = ['fare_raw', 'fare_estimate', 'fare', 'fare_bin', ]
assert expected_train[columns].equals(transformed_train.Xy[columns])

AssertionError: 

In [68]:
with pd.option_context('display.max_rows', 23):
	transformed_train.Xy.dtypes

survived                 int64
pclass                category
name                    object
sex                     object
age_raw                float64
sibsp                    int64
parch                    int64
ticket                  object
fare_raw               float64
cabin                   object
embarked                object
title                   object
last_name               object
family_size              int64
fare_estimate          float64
fare                   float64
fare_bin              category
age_estimate           float64
age                    float64
age_bin               category
is_child                  bool
is_traveling_alone        bool
dtype: object

In [78]:
pid = expected_train[expected_train.fare_raw.isna()].index
pid

expected_train.loc[pid]
transformed_train.Xy.loc[pid]

Int64Index([180, 264, 272, 278, 303, 414, 467, 482, 598, 634, 675, 733, 807,
            816, 823],
           dtype='int64', name='passengerid')

Unnamed: 0_level_0,survived,pclass,name,sex,age_raw,sibsp,parch,ticket,fare_raw,cabin,...,last_name,family_size,fare_estimate,fare,fare_bin,age_estimate,age,age_bin,is_child,is_traveling_alone
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,,,...,Leonard,1,8.05,8.05,q2,33.0,36.0,adult,False,True
264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,,B94,...,Harrison,1,61.98,61.98,q4,33.0,40.0,adult,False,True
272,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,,,...,Tornquist,1,8.05,8.05,q2,33.0,25.0,student,False,True
278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,,,...,Parkes,1,15.02,15.02,q3,33.0,33.0,young_adult,False,True
303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,,,...,Johnson,1,8.05,8.05,q2,33.0,19.0,student,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,,,...,Watson,1,15.02,15.02,q3,33.0,33.0,young_adult,False,True
733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,,,...,Knight,1,15.02,15.02,q3,33.0,33.0,young_adult,False,True
807,0,1,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,,A36,...,Andrews,1,61.98,61.98,q4,33.0,39.0,adult,False,True
816,0,1,"Fry, Mr. Richard",male,,0,0,112058,,B102,...,Fry,1,61.98,61.98,q4,33.0,33.0,young_adult,False,True


Unnamed: 0_level_0,survived,pclass,name,sex,age_raw,sibsp,parch,ticket,fare_raw,cabin,...,last_name,family_size,fare_estimate,fare,fare_bin,age_estimate,age,age_bin,is_child,is_traveling_alone
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,0.0,,...,Leonard,1,8.05,0.0,q1,33.0,36.0,adult,False,True
264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0,B94,...,Harrison,1,60.29,0.0,q1,33.0,40.0,adult,False,True
272,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,0.0,,...,Tornquist,1,8.05,0.0,q1,33.0,25.0,student,False,True
278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0.0,,...,Parkes,1,14.25,0.0,q1,33.0,33.0,young_adult,False,True
303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,0.0,,...,Johnson,1,8.05,0.0,q1,33.0,19.0,student,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0.0,,...,Watson,1,14.25,0.0,q1,33.0,33.0,young_adult,False,True
733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0.0,,...,Knight,1,14.25,0.0,q1,33.0,33.0,young_adult,False,True
807,0,1,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,...,Andrews,1,60.29,0.0,q1,33.0,39.0,adult,False,True
816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0.0,B102,...,Fry,1,60.29,0.0,q1,33.0,33.0,young_adult,False,True


In [36]:
expected_train[columns]
transformed_train.Xy[columns]

Unnamed: 0_level_0,fare_raw,fare_estimate,fare,fare_bin
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,7.2500,8.05,7.2500,q1
2,71.2833,61.98,71.2833,q4
3,7.9250,8.05,7.9250,q1
4,53.1000,61.98,53.1000,q4
5,8.0500,8.05,8.0500,q2
...,...,...,...,...
887,13.0000,15.02,13.0000,q2
888,30.0000,61.98,30.0000,q3
889,23.4500,8.05,23.4500,q3
890,30.0000,61.98,30.0000,q3


Unnamed: 0_level_0,fare_raw,fare_estimate,fare,fare_bin
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,7.2500,8.05,7.2500,q1
2,71.2833,60.29,71.2833,q4
3,7.9250,8.05,7.9250,q2
4,53.1000,60.29,53.1000,q4
5,8.0500,8.05,8.0500,q2
...,...,...,...,...
887,13.0000,14.25,13.0000,q2
888,30.0000,60.29,30.0000,q3
889,23.4500,8.05,23.4500,q3
890,30.0000,60.29,30.0000,q3


In [20]:
holdout_transformed = TransformData(holdout, 
                                    adult_age_threshold_min=train_transformed.adult_age_threshold_min,
                                    translate_title_dictionary = translate_title_dictionary,
                                    age_bins=train_transformed.age_bins,
                                    age_bin_label=train_transformed.age_bin_label,
                                    Xy_age_estimate = train_transformed.Xy_age_estimate,
                                    Xy_fare_estimate = train_transformed.Xy_fare_estimate,
                                    fare_bins = train_transformed.fare_bins,
                                    fare_bin_labels = train_transformed.fare_bin_labels, 
                                    embarked_mode =  train_transformed.embarked_mode, 
                                    )

#assert len(holdout.Xy_raw) == len(holdout_transformed.Xy)

assert holdout_transformed.adult_age_threshold_min == train_transformed.adult_age_threshold_min
assert holdout_transformed.age_bins == train_transformed.age_bins
assert (holdout_transformed.Xy_age_estimate == train_transformed.Xy_age_estimate).all()[0]
assert (holdout_transformed.Xy_fare_estimate == train_transformed.Xy_fare_estimate).all()[0]
assert holdout_transformed.embarked_mode == train_transformed.embarked_mode

len(holdout.Xy_raw)
len(holdout_transformed.Xy)

418

418

In [22]:
train_transformed.Xy_age_estimate
train_transformed.Xy_fare_estimate

train_transformed.Xy.title.value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,age_estimate
sex,title,Unnamed: 2_level_1
female,Miss,22.0
female,Mrs,35.9
male,Master,4.6
male,Mr,33.0


Unnamed: 0_level_0,fare_estimate
pclass,Unnamed: 1_level_1
1,61.98
2,15.02
3,8.05


Mr        537
Miss      186
Mrs       128
Master     40
Name: title, dtype: int64

**Display NaN**

In [23]:
df_nan = pd.concat([train_transformed.Xy.isna().sum().to_frame(), 
                    holdout_transformed.Xy.isna().sum().to_frame()], axis=1)

df_nan.columns = ['train_transformed', 'holdout_transformed']

with pd.option_context('display.max_rows', 25):
    df_nan.sort_values(by='holdout_transformed',ascending=False)
    

Unnamed: 0,train_transformed,holdout_transformed
cabin,687,327.0
cabin_prefix,665,296.0
age_raw,177,86.0
fare_raw,15,3.0
age,0,0.0
is_traveling_alone,0,0.0
ticket,0,0.0
sibsp,0,0.0
sex,0,0.0
pclass,0,0.0


### Save Transformed Data to data/preprocessed

In [24]:
columns_to_drop = ['cabin', 'cabin_prefix', 'age_raw', 'fare_raw', 'fare_estimate', 'age_estimate']

train_transformed.Xy.drop(columns_to_drop, axis=1).to_csv('../data/processed/train_v4.csv', index=True)
holdout_transformed.Xy.drop(columns_to_drop, axis=1).to_csv('../data/processed/holdout_v4.csv', index=True)

# Analysis of Holdout Data