# Kaggle Titanic First Competition

https://www.kaggle.com/c/titanic/overview

## Exploratory Data Analysis

There were an estimated 2,224 passengers and crew aboard the RMS Titanic.  Over 1500 peopled died.  Survival percentage 32.5%.

**Initialization**

In [80]:
%run init.ipynb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [81]:
from data.data import ExtractData, TransformData
from zeetle.data import eda

import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

# Extract Data

In [82]:
train = ExtractData('../data/raw/train.csv')
holdout = ExtractData('../data/raw/holdout.csv')

Xy_raw_train = train.Xy_raw
Xy_raw_holdout = holdout.Xy_raw

In [83]:
df_nan = pd.concat([train.Xy_raw.isna().sum().sort_values(ascending=False).to_frame(), 
                  holdout.Xy_raw.isna().sum().sort_values(ascending=False).to_frame()], axis=1)

df_nan.columns = ['train', 'holdout']

with pd.option_context('display.max_rows', 21):
    df_nan
    
    

Unnamed: 0,train,holdout
age_known,177,86.0
cabin,687,327.0
embarked,2,0.0
fare,0,1.0
name,0,0.0
parch,0,0.0
pclass,0,0.0
sex,0,0.0
sibsp,0,0.0
survived,0,


# Transform Data according to the rules determined in the EDA

In [84]:
translate_title_dictionary = {
        "Capt":       "Officer",
        "Col":        "Officer",
        "Major":      "Officer",
        "Dr":         "Officer",
        "Rev":        "Officer",
        "Jonkheer":   "Royalty",
        "Don":        "Royalty",
        "Sir" :       "Royalty",
        "the Countess of":"Royalty",
        "Dona":       "Royalty",
        "Lady" :      "Royalty",
        "Mme":        "Mrs",
        "Ms":         "Mrs",
        "Mrs" :       "Mrs",
        "Mlle":       "Miss",
        "Miss" :      "Miss",
        "Mr" :        "Mr",
        "Master" :    "Master"
                }

age_bins = (0, 5, 12, 18, 25, 35, 60, 120)
age_bin_label =   ['baby','child','teen','student','young_adult','adult','senior']

assert len(age_bins) == len(age_bin_label) + 1

In [85]:
train_transformed = TransformData(train,
                                  translate_title_dictionary = translate_title_dictionary,
                                  age_bins = age_bins,
                                  age_bin_label = age_bin_label)

In [86]:
train_transformed.Xy.groupby(['age_bin']).age.describe()
train_transformed.Xy.groupby(['is_child']).age.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
age_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
baby,48.0,2.678542,1.496339,0.42,1.0,2.5,4.0,5.0
child,25.0,8.76,1.690168,6.0,8.0,9.0,10.0,12.0
teen,70.0,16.578571,1.438688,13.0,16.0,17.0,18.0,18.0
student,198.0,22.05202,1.834047,19.0,21.0,21.8,24.0,25.0
young_adult,315.0,31.133333,2.434427,26.0,29.0,32.5,32.5,35.0
adult,213.0,44.10892,6.914366,35.7,38.0,43.0,49.0,60.0
senior,22.0,66.022727,5.024884,61.0,62.0,64.5,70.0,80.0


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_child,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,818.0,32.006724,11.417732,13.0,23.0,32.0,36.0,80.0
True,73.0,4.761233,3.295099,0.42,2.0,4.0,8.0,12.0


In [87]:
holdout_transformed = TransformData(holdout, 
                                    adult_age_threshold_min=train_transformed.adult_age_threshold_min,
                                    translate_title_dictionary = translate_title_dictionary,
                                    age_bins=train_transformed.age_bins,
                                    age_bin_label=train_transformed.age_bin_label,
                                    Xy_age_estimate = train_transformed.Xy_age_estimate,
                                    fare_mode =  train_transformed.fare_mode, 
                                    fare_bins = train_transformed.fare_bins,
                                    fare_bin_labels = train_transformed.fare_bin_labels, 
                                    embarked_mode =  train_transformed.embarked_mode, 
                                    )

#assert len(holdout.Xy_raw) == len(holdout_transformed.Xy)

assert holdout_transformed.adult_age_threshold_min == train_transformed.adult_age_threshold_min
assert holdout_transformed.age_bins == train_transformed.age_bins
assert (holdout_transformed.Xy_age_estimate == train_transformed.Xy_age_estimate).all()[0]
assert holdout_transformed.fare_mode == train_transformed.fare_mode
assert holdout_transformed.embarked_mode == train_transformed.embarked_mode

len(holdout.Xy_raw)
len(holdout_transformed.Xy)

418

418

In [75]:
train_transformed.Xy_age_estimate

Unnamed: 0_level_0,Unnamed: 1_level_0,age_estimate
sex,title,Unnamed: 2_level_1
female,Miss,21.8
female,Mrs,35.7
female,Officer,49.0
female,Royalty,40.5
male,Master,4.6
male,Mr,32.5
male,Officer,46.3
male,Royalty,49.0


**Display NaN**

In [88]:
df_nan = pd.concat([train_transformed.Xy.isna().sum().to_frame(), 
                    holdout_transformed.Xy.isna().sum().to_frame()], axis=1)

df_nan.columns = ['train_transformed', 'holdout_transformed']

with pd.option_context('display.max_rows', 22):
    df_nan.sort_values(by='holdout_transformed',ascending=False)
    

Unnamed: 0,train_transformed,holdout_transformed
cabin,687,327.0
cabin_prefix,665,296.0
age_known,177,86.0
fare_bin,0,2.0
is_traveling_alone,0,0.0
ticket,0,0.0
sibsp,0,0.0
sex,0,0.0
pclass,0,0.0
parch,0,0.0


### Save Transformed Data to data/preprocessed

In [89]:
columns_to_drop = ['cabin', 'cabin_prefix', 'age_known']

In [90]:
train_transformed.Xy.drop(columns_to_drop, axis=1).to_csv('../data/processed/train_v2.csv', index=True)
holdout_transformed.Xy.drop(columns_to_drop, axis=1).to_csv('../data/processed/holdout_v2.csv', index=True)

In [91]:
holdout_transformed.Xy

Unnamed: 0_level_0,pclass,name,sex,age_known,sibsp,parch,ticket,fare,cabin,embarked,...,last_name,cabin_number,cabin_prefix,family_size,age_estimate,age,age_bin,fare_bin,is_child,is_traveling_alone
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,...,Kelly,330911,,1,32.5,34.5,young_adult,q1,False,True
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,...,Wilkes,363272,,2,35.7,47.0,adult,q1,False,False
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,...,Myles,240276,,1,32.5,62.0,senior,q2,False,True
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,...,Wirz,315154,,1,32.5,27.0,young_adult,q2,False,True
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,...,Hirvonen,3101298,,3,35.7,22.0,student,q2,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,...,Spector,3236,A.5.,1,32.5,32.5,young_adult,q2,False,True
1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,...,Oliva y Ocana,17758,PC,1,35.7,39.0,adult,q4,False,True
1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,...,Saether,3101262,SOTON/O.Q.,1,32.5,38.5,adult,q1,False,True
1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,...,Ware,359309,,1,32.5,32.5,young_adult,q2,False,True
