# Kaggle Titanic First Competition

https://www.kaggle.com/c/titanic/overview

## Exploratory Data Analysis

There were an estimated 2,224 passengers and crew aboard the RMS Titanic.  Over 1500 peopled died.  Survival percentage 32.5%.

**Initialization**

In [1]:
%run init.ipynb

In [2]:
from data.data import ExtractData, TransformData
from zeetle.data import eda

import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

# Extract Data

In [3]:
train = ExtractData('../data/raw/train.csv')
holdout = ExtractData('../data/raw/holdout.csv')

Xy_raw_train = train.Xy_raw
Xy_raw_holdout = holdout.Xy_raw

Xy_raw_train['fare_raw'] = Xy_raw_train['fare_raw'].replace(0, np.nan)

In [4]:
df_nan = pd.concat([train.Xy_raw.isna().sum().sort_values(ascending=False).to_frame(), 
                  holdout.Xy_raw.isna().sum().sort_values(ascending=False).to_frame()], axis=1)

df_nan.columns = ['train', 'holdout']

with pd.option_context('display.max_rows', 21):
    df_nan
    
    

Unnamed: 0,train,holdout
age_raw,177,86.0
cabin,687,327.0
embarked,2,0.0
fare_raw,15,1.0
name,0,0.0
parch,0,0.0
pclass,0,0.0
sex,0,0.0
sibsp,0,0.0
survived,0,


# Transform Data according to the rules determined in the EDA

In [12]:
translate_title_dictionary = {
        "Mlle": "Miss",
        "Mme": "Miss",
        "Sir": "Mr",
        "Ms": "Mrs",
        "Rev": np.nan,
        "Col": "Mr",
        "Capt": "Mr",
        "Lady": "Miss",
        "the Countess of": "Mrs",
        "Dr": np.nan,
    }

age_bins = (0, 5, 12, 18, 25, 35, 60, 120)
age_bin_label =   ['baby','child','teen','student','young_adult','adult','senior']

assert len(age_bins) == len(age_bin_label) + 1

In [17]:
train_transformed = TransformData(train,
                                  translate_title_dictionary = translate_title_dictionary,
                                  age_bins = age_bins,
                                  Xy_age_estimate=None,
                                  age_bin_label = age_bin_label)

len(train_transformed.Xy)

891

In [18]:
train_transformed.Xy.groupby(['age_bin']).age.describe()
train_transformed.Xy.groupby(['is_child']).age.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
age_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
baby,48.0,2.678542,1.496339,0.42,1.0,2.5,4.0,5.0
child,25.0,8.76,1.690168,6.0,8.0,9.0,10.0,12.0
teen,70.0,16.578571,1.438688,13.0,16.0,17.0,18.0,18.0
student,198.0,22.088384,1.830652,19.0,21.0,22.0,24.0,25.0
young_adult,316.0,31.327532,2.54786,26.0,29.0,33.0,33.0,35.0
adult,212.0,44.114623,6.909738,35.9,38.0,43.0,49.0,60.0
senior,22.0,66.022727,5.024884,61.0,62.0,64.5,70.0,80.0


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_child,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,818.0,32.076161,11.404938,13.0,23.0,32.0,36.0,80.0
True,73.0,4.761233,3.295099,0.42,2.0,4.0,8.0,12.0


In [19]:
train_transformed.Xy.title.value_counts()

Mr        537
Miss      186
Mrs       128
Master     40
Name: title, dtype: int64

In [20]:
holdout_transformed = TransformData(holdout, 
                                    adult_age_threshold_min=train_transformed.adult_age_threshold_min,
                                    translate_title_dictionary = translate_title_dictionary,
                                    age_bins=train_transformed.age_bins,
                                    age_bin_label=train_transformed.age_bin_label,
                                    Xy_age_estimate = train_transformed.Xy_age_estimate,
                                    Xy_fare_estimate = train_transformed.Xy_fare_estimate,
                                    fare_bins = train_transformed.fare_bins,
                                    fare_bin_labels = train_transformed.fare_bin_labels, 
                                    embarked_mode =  train_transformed.embarked_mode, 
                                    )

#assert len(holdout.Xy_raw) == len(holdout_transformed.Xy)

assert holdout_transformed.adult_age_threshold_min == train_transformed.adult_age_threshold_min
assert holdout_transformed.age_bins == train_transformed.age_bins
assert (holdout_transformed.Xy_age_estimate == train_transformed.Xy_age_estimate).all()[0]
assert (holdout_transformed.Xy_fare_estimate == train_transformed.Xy_fare_estimate).all()[0]
assert holdout_transformed.embarked_mode == train_transformed.embarked_mode

len(holdout.Xy_raw)
len(holdout_transformed.Xy)

418

418

In [22]:
train_transformed.Xy_age_estimate
train_transformed.Xy_fare_estimate

train_transformed.Xy.title.value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,age_estimate
sex,title,Unnamed: 2_level_1
female,Miss,22.0
female,Mrs,35.9
male,Master,4.6
male,Mr,33.0


Unnamed: 0_level_0,fare_estimate
pclass,Unnamed: 1_level_1
1,61.98
2,15.02
3,8.05


Mr        537
Miss      186
Mrs       128
Master     40
Name: title, dtype: int64

**Display NaN**

In [23]:
df_nan = pd.concat([train_transformed.Xy.isna().sum().to_frame(), 
                    holdout_transformed.Xy.isna().sum().to_frame()], axis=1)

df_nan.columns = ['train_transformed', 'holdout_transformed']

with pd.option_context('display.max_rows', 25):
    df_nan.sort_values(by='holdout_transformed',ascending=False)
    

Unnamed: 0,train_transformed,holdout_transformed
cabin,687,327.0
cabin_prefix,665,296.0
age_raw,177,86.0
fare_raw,15,3.0
age,0,0.0
is_traveling_alone,0,0.0
ticket,0,0.0
sibsp,0,0.0
sex,0,0.0
pclass,0,0.0


### Save Transformed Data to data/preprocessed

In [24]:
columns_to_drop = ['cabin', 'cabin_prefix', 'age_raw', 'fare_raw', 'fare_estimate', 'age_estimate']

train_transformed.Xy.drop(columns_to_drop, axis=1).to_csv('../data/processed/train_v4.csv', index=True)
holdout_transformed.Xy.drop(columns_to_drop, axis=1).to_csv('../data/processed/holdout_v4.csv', index=True)

# Analysis of Holdout Data