In [24]:
import pandas as pd

# Load the test and training sets from CSV (test first, then train)
test_data, train_data = (
    pd.read_csv(csv_path) for csv_path in ('data/test.csv', 'data/train.csv')
)

First, take a brief look at the data to understand its structure.

In [25]:
# Preview the first five rows (head() defaults to n=5)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


I'll start with data cleaning: drop the columns I won't use and handle missing values.

In [26]:
# Columns that are dropped from the training frame before cleaning
columns_to_exclude = ['Name', 'Ticket']
train_data_cleaned = train_data.drop(columns_to_exclude, axis=1)

# Count the missing entries per column and show the first ten columns
missing_values_count = train_data_cleaned.isna().sum()
missing_values_count.iloc[:10]

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Next, deal with the missing values. `Embarked` has only 2 missing values, so I will fill them with the most common port of embarkation. For `Age`, I will fill each missing value with the mean age of passengers of the same gender; this seems to be the most natural solution. More than half of the values for `Cabin` are missing. I could remove this column, but the fact that a value is missing may itself be useful later in the feature engineering phase, so I keep it for now.

In [28]:
# Calculate average age for males and females (on the raw training frame)
average_male_age = train_data.loc[train_data['Sex'] == 'male', 'Age'].mean()
average_female_age = train_data.loc[train_data['Sex'] == 'female', 'Age'].mean()

# Fill missing values in 'Age' based on gender.
# Vectorised instead of a row-wise apply: map every row's gender to the
# matching mean and use it only where 'Age' is NaN. Any 'Sex' value other than
# 'male'/'female' maps to NaN, so those rows keep their original 'Age'.
mean_age_for_row = train_data_cleaned['Sex'].map(
    {'male': average_male_age, 'female': average_female_age}
)
train_data_cleaned['Age'] = train_data_cleaned['Age'].fillna(mean_age_for_row)

# Fill missing values in 'Embarked' with the most common port of embarkation.
# The port is computed from the data rather than hard-coded, and the result is
# assigned back to the column: `df[col].fillna(..., inplace=True)` acts on a
# temporary object and is not guaranteed to update the frame (it is deprecated
# in pandas 2.x and has no effect under copy-on-write).
embarked_modes = train_data_cleaned['Embarked'].mode()
if not embarked_modes.empty:  # mode() is empty when every value is missing
    most_common_port = embarked_modes.iloc[0]
    train_data_cleaned['Embarked'] = train_data_cleaned['Embarked'].fillna(most_common_port)
train_data_cleaned.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,7.25,,S
1,2,1,1,female,38.0,1,0,71.2833,C85,C
2,3,1,3,female,26.0,0,0,7.925,,S
3,4,1,1,female,35.0,1,0,53.1,C123,S
4,5,0,3,male,35.0,0,0,8.05,,S


In [29]:
# Dummy-encode gender from the raw frame; with drop_first=True the first
# category's column is dropped (the code below expects 'Sex_male' to remain).
sex_encoded = pd.get_dummies(
    train_data['Sex'],
    drop_first=True,
    prefix='Sex',
)

# Replace the textual 'Sex' column with the male indicator column
train_data_cleaned['Sex'] = sex_encoded['Sex_male']
train_data_cleaned.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,True,22.0,1,0,7.25,,S
1,2,1,1,False,38.0,1,0,71.2833,C85,C
2,3,1,3,False,26.0,0,0,7.925,,S
3,4,1,1,False,35.0,1,0,53.1,C123,S
4,5,0,3,True,35.0,0,0,8.05,,S
