# Kaggle Titanic First Competition

https://www.kaggle.com/c/titanic/overview

## Exploratory Data Analysis

There were an estimated 2,224 passengers and crew aboard the RMS Titanic.  Over 1500 peopled died.  Survival percentage 32.5%.

**Initialization**

In [1]:
%run init.ipynb

In [2]:
from data.data import ExtractData, TransformData
from zeetle.data import eda

import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

# Extract Data

In [3]:
train = ExtractData('../data/raw/kaggle_train.csv')
holdout = ExtractData('../data/raw/kaggle_test.csv')

Xy_raw_train = train.Xy_raw
Xy_raw_holdout = holdout.Xy_raw

Xy_raw_train['fare_raw'] = Xy_raw_train['fare_raw'].replace(0, np.nan)

In [4]:
df_nan = pd.concat([train.Xy_raw.isna().sum().sort_values(ascending=False).to_frame(), 
                  holdout.Xy_raw.isna().sum().sort_values(ascending=False).to_frame()], axis=1)

df_nan.columns = ['train', 'holdout']

with pd.option_context('display.max_rows', 21):
    df_nan
    
    

Unnamed: 0,train,holdout
age_raw,177,86.0
cabin,687,327.0
embarked,2,0.0
fare_raw,15,1.0
name,0,0.0
parch,0,0.0
pclass,0,0.0
sex,0,0.0
sibsp,0,0.0
survived,0,


# Transform Data according to the rules determined in the EDA

In [5]:
translate_title_dictionary = {
        "Mlle": "Miss",
        "Mme": "Miss",
        "Sir": "Mr",
        "Ms": "Mrs",
        "Rev": np.nan,
        "Col": "Mr",
        "Capt": "Mr",
        "Lady": "Miss",
        "the Countess of": "Mrs",
        "Dr": np.nan,
    }

age_bins = (0, 5, 12, 18, 25, 35, 60, 120)
age_bin_label =   ['baby','child','teen','student','young_adult','adult','senior']

assert len(age_bins) == len(age_bin_label) + 1

In [6]:
train_transformed = TransformData(train,
                                  translate_title_dictionary = translate_title_dictionary,
                                  age_bins = age_bins,
                                  xy_age_estimate=None,
                                  age_bin_label = age_bin_label)

len(train_transformed.Xy)

891

In [11]:
train_transformed.Xy

Unnamed: 0_level_0,survived,pclass,name,sex,age_raw,sibsp,parch,ticket,fare_raw,cabin,...,last_name,family_size,fare_estimate,fare,fare_bin,age_estimate,age,age_bin,is_child,is_traveling_alone
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,...,Braund,2,8.05,7.2500,q1,33.0,22.0,student,False,False
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,...,Cumings,2,61.98,71.2833,q4,35.9,38.0,adult,False,False
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,...,Heikkinen,1,8.05,7.9250,q1,22.0,26.0,young_adult,False,True
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,...,Futrelle,2,61.98,53.1000,q4,35.9,35.0,young_adult,False,False
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,...,Allen,1,8.05,8.0500,q2,33.0,35.0,young_adult,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,...,Montvila,1,15.02,13.0000,q2,33.0,27.0,young_adult,False,True
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,...,Graham,1,61.98,30.0000,q3,22.0,19.0,student,False,True
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,...,Johnston,4,8.05,23.4500,q3,22.0,22.0,student,False,False
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,...,Behr,1,61.98,30.0000,q3,33.0,26.0,young_adult,False,True


In [15]:
filename = '../tests/data/expected_transformed_kaggle_train.csv'

In [90]:
train_transformed.Xy.to_csv(filename, index=True)
train_transformed.Xy.age_bin

passengerid
1          student
2            adult
3      young_adult
4      young_adult
5      young_adult
          ...     
887    young_adult
888        student
889        student
890    young_adult
891    young_adult
Name: age_bin, Length: 891, dtype: category
Categories (7, object): [baby < child < teen < student < young_adult < adult < senior]

In [91]:
test_train_transformed = pd.read_csv(filename, index_col='passengerid')
test_train_transformed.pclass = test_train_transformed.pclass.astype('category')

fare_bin_cat_type = pd.CategoricalDtype(categories=["q1", "q2", "q3", "q4"], ordered=True)
test_train_transformed.fare_bin = test_train_transformed.fare_bin.astype(fare_bin_cat_type)

age_bin_cat_type = pd.CategoricalDtype(categories=['baby','child','teen','student','young_adult','adult','senior'], ordered=True)
test_train_transformed.age_bin = test_train_transformed.age_bin.astype(age_bin_cat_type)
test_train_transformed

Unnamed: 0_level_0,survived,pclass,name,sex,age_raw,sibsp,parch,ticket,fare_raw,cabin,...,last_name,family_size,fare_estimate,fare,fare_bin,age_estimate,age,age_bin,is_child,is_traveling_alone
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,...,Braund,2,8.05,7.2500,q1,33.0,22.0,student,False,False
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,...,Cumings,2,61.98,71.2833,q4,35.9,38.0,adult,False,False
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,...,Heikkinen,1,8.05,7.9250,q1,22.0,26.0,young_adult,False,True
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,...,Futrelle,2,61.98,53.1000,q4,35.9,35.0,young_adult,False,False
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,...,Allen,1,8.05,8.0500,q2,33.0,35.0,young_adult,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,...,Montvila,1,15.02,13.0000,q2,33.0,27.0,young_adult,False,True
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,...,Graham,1,61.98,30.0000,q3,22.0,19.0,student,False,True
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,...,Johnston,4,8.05,23.4500,q3,22.0,22.0,student,False,False
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,...,Behr,1,61.98,30.0000,q3,33.0,26.0,young_adult,False,True


In [92]:
columns = ['survived', 'pclass', 'name', 'sex',
           'age_raw', 'sibsp', 'parch',
           'ticket', 'fare_raw', 'cabin', 'embarked', 'title', 'last_name',
           'family_size', 'fare_estimate', 'fare', 'age_estimate',
           'age', 'is_child', 'is_traveling_alone', 
           'age_bin',
           'fare_bin'
          ]

assert train_transformed.Xy[columns].equals(test_train_transformed[columns])

In [39]:
train_transformed.Xy.columns

Index(['survived', 'pclass', 'name', 'sex', 'age_raw', 'sibsp', 'parch',
       'ticket', 'fare_raw', 'cabin', 'embarked', 'title', 'last_name',
       'family_size', 'fare_estimate', 'fare', 'fare_bin', 'age_estimate',
       'age', 'age_bin', 'is_child', 'is_traveling_alone'],
      dtype='object')

In [18]:
train_transformed.Xy.groupby(['age_bin']).age.describe()
train_transformed.Xy.groupby(['is_child']).age.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
age_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
baby,48.0,2.678542,1.496339,0.42,1.0,2.5,4.0,5.0
child,25.0,8.76,1.690168,6.0,8.0,9.0,10.0,12.0
teen,70.0,16.578571,1.438688,13.0,16.0,17.0,18.0,18.0
student,198.0,22.088384,1.830652,19.0,21.0,22.0,24.0,25.0
young_adult,316.0,31.327532,2.54786,26.0,29.0,33.0,33.0,35.0
adult,212.0,44.114623,6.909738,35.9,38.0,43.0,49.0,60.0
senior,22.0,66.022727,5.024884,61.0,62.0,64.5,70.0,80.0


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_child,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,818.0,32.076161,11.404938,13.0,23.0,32.0,36.0,80.0
True,73.0,4.761233,3.295099,0.42,2.0,4.0,8.0,12.0


In [19]:
train_transformed.Xy.title.value_counts()

Mr        537
Miss      186
Mrs       128
Master     40
Name: title, dtype: int64

In [20]:
holdout_transformed = TransformData(holdout, 
                                    adult_age_threshold_min=train_transformed.adult_age_threshold_min,
                                    translate_title_dictionary = translate_title_dictionary,
                                    age_bins=train_transformed.age_bins,
                                    age_bin_label=train_transformed.age_bin_label,
                                    Xy_age_estimate = train_transformed.Xy_age_estimate,
                                    Xy_fare_estimate = train_transformed.Xy_fare_estimate,
                                    fare_bins = train_transformed.fare_bins,
                                    fare_bin_labels = train_transformed.fare_bin_labels, 
                                    embarked_mode =  train_transformed.embarked_mode, 
                                    )

#assert len(holdout.Xy_raw) == len(holdout_transformed.Xy)

assert holdout_transformed.adult_age_threshold_min == train_transformed.adult_age_threshold_min
assert holdout_transformed.age_bins == train_transformed.age_bins
assert (holdout_transformed.Xy_age_estimate == train_transformed.Xy_age_estimate).all()[0]
assert (holdout_transformed.Xy_fare_estimate == train_transformed.Xy_fare_estimate).all()[0]
assert holdout_transformed.embarked_mode == train_transformed.embarked_mode

len(holdout.Xy_raw)
len(holdout_transformed.Xy)

418

418

In [22]:
train_transformed.Xy_age_estimate
train_transformed.Xy_fare_estimate

train_transformed.Xy.title.value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,age_estimate
sex,title,Unnamed: 2_level_1
female,Miss,22.0
female,Mrs,35.9
male,Master,4.6
male,Mr,33.0


Unnamed: 0_level_0,fare_estimate
pclass,Unnamed: 1_level_1
1,61.98
2,15.02
3,8.05


Mr        537
Miss      186
Mrs       128
Master     40
Name: title, dtype: int64

**Display NaN**

In [23]:
df_nan = pd.concat([train_transformed.Xy.isna().sum().to_frame(), 
                    holdout_transformed.Xy.isna().sum().to_frame()], axis=1)

df_nan.columns = ['train_transformed', 'holdout_transformed']

with pd.option_context('display.max_rows', 25):
    df_nan.sort_values(by='holdout_transformed',ascending=False)
    

Unnamed: 0,train_transformed,holdout_transformed
cabin,687,327.0
cabin_prefix,665,296.0
age_raw,177,86.0
fare_raw,15,3.0
age,0,0.0
is_traveling_alone,0,0.0
ticket,0,0.0
sibsp,0,0.0
sex,0,0.0
pclass,0,0.0


### Save Transformed Data to data/preprocessed

In [24]:
columns_to_drop = ['cabin', 'cabin_prefix', 'age_raw', 'fare_raw', 'fare_estimate', 'age_estimate']

train_transformed.Xy.drop(columns_to_drop, axis=1).to_csv('../data/processed/train_v4.csv', index=True)
holdout_transformed.Xy.drop(columns_to_drop, axis=1).to_csv('../data/processed/holdout_v4.csv', index=True)

# Analysis of Holdout Data