# Kaggle Titanic First Competition

https://www.kaggle.com/c/titanic/overview

## Exploratory Data Analysis

There were an estimated 2,224 passengers and crew aboard the RMS Titanic.  Over 1500 peopled died.  Survival percentage 32.5%.

**Initialization**

In [2]:
%run init.ipynb

In [3]:
from data.data import ExtractData, TransformData
from zeetle.data import eda

import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

# Extract Data

In [4]:
train = ExtractData('../data/raw/train.csv')
holdout = ExtractData('../data/raw/holdout.csv')

Xy_raw_train = train.Xy_raw
Xy_raw_holdout = holdout.Xy_raw

In [5]:
df_nan = pd.concat([train.Xy_raw.isna().sum().sort_values(ascending=False).to_frame(), 
                  holdout.Xy_raw.isna().sum().sort_values(ascending=False).to_frame()], axis=1)

df_nan.columns = ['train', 'holdout']

with pd.option_context('display.max_rows', 21):
    df_nan
    
    

Unnamed: 0,train,holdout
age_known,177,86.0
cabin,687,327.0
embarked,2,0.0
fare,0,1.0
name,0,0.0
parch,0,0.0
pclass,0,0.0
sex,0,0.0
sibsp,0,0.0
survived,0,


# Transform Data according to the rules determined in the EDA

In [10]:
train_transformed = TransformData(train)
holdout_transformed = TransformData(holdout, 
                                    adult_age_threshold_min=train_transformed.adult_age_threshold_min,
                                    age_bins=train_transformed.age_bins,
                                    Xy_age_estimate = train_transformed.Xy_age_estimate,
                                    fare_mode =  train_transformed.fare_mode, 
                                    embarked_mode =  train_transformed.embarked_mode, 
                                    )

assert holdout_transformed.adult_age_threshold_min == train_transformed.adult_age_threshold_min
assert holdout_transformed.age_bins == train_transformed.age_bins
assert (holdout_transformed.Xy_age_estimate == train_transformed.Xy_age_estimate).all()[0]
assert holdout_transformed.fare_mode == train_transformed.fare_mode
assert holdout_transformed.embarked_mode == train_transformed.embarked_mode

**Display NaN**

In [11]:
df_nan = pd.concat([train_transformed.Xy.isna().sum().to_frame(), 
                    holdout_transformed.Xy.isna().sum().to_frame()], axis=1)

df_nan.columns = ['train_transformed', 'holdout_transformed']

with pd.option_context('display.max_rows', 21):
    df_nan.sort_values(by='holdout_transformed',ascending=False)
    
    

Unnamed: 0,train_transformed,holdout_transformed
cabin,687,327.0
cabin_prefix,665,296.0
age_known,177,86.0
age,0,0.0
last_name,0,0.0
ticket,0,0.0
sibsp,0,0.0
sex,0,0.0
pclass,0,0.0
parch,0,0.0


### Save Transformed Data to data/preprocessed

In [12]:
columns_to_drop = ['cabin', 'cabin_prefix', 'age_known']

In [14]:
train_transformed.Xy.drop(columns_to_drop, axis=1).to_csv('../data/processed/train.csv', index=True)
holdout_transformed.Xy.drop(columns_to_drop, axis=1).to_csv('../data/processed/holdout.csv', index=True)

In [15]:
holdout_transformed.Xy

Unnamed: 0_level_0,pclass,name,sex,age_known,sibsp,parch,ticket,fare,cabin,embarked,title,last_name,cabin_number,cabin_prefix,family_size,age_estimate,age,age_bin,is_child,is_travelling_alone
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr,Kelly,330911,,1,33.0,34.5,"(30.0, 40.0]",False,True
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,Mrs,Wilkes,363272,,2,35.9,47.0,"(40.0, 50.0]",False,False
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr,Myles,240276,,1,33.0,62.0,"(60.0, inf]",False,True
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr,Wirz,315154,,1,33.0,27.0,"(20.0, 30.0]",False,True
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs,Hirvonen,3101298,,3,35.9,22.0,"(20.0, 30.0]",False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,Mr,Spector,3236,A.5.,1,33.0,33.0,"(30.0, 40.0]",False,True
1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,Mrs,Oliva y Ocana,17758,PC,1,35.9,39.0,"(30.0, 40.0]",False,True
1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,Mr,Saether,3101262,SOTON/O.Q.,1,33.0,38.5,"(30.0, 40.0]",False,True
1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,Mr,Ware,359309,,1,33.0,33.0,"(30.0, 40.0]",False,True
