In [15]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [29]:
def clean_data(train_data):
    # Categorical to binary / one-hot
    train_data["Sex"] = (train_data["Sex"] == "male").astype(int)
    train_data["HasCabin"] = (~train_data["Cabin"].isna()).astype(int)
    train_data = pd.concat([train_data.drop("Embarked", axis=1), pd.get_dummies(train_data["Embarked"], dummy_na=True, prefix="Embarked")], axis=1)
    
    # Drop columns not used for regression
    del train_data["PassengerId"]
    del train_data["Name"]
    del train_data["Ticket"]
    del train_data["Cabin"]
    
    return train_data

In [30]:
train_data = pd.read_csv("data/train.csv")
train_data = clean_data(train_data)
print(train_data.head())
print("Values of Parch: {}".format(train_data["Parch"].unique()))
print("Values of Pclass: {}".format(train_data["Parch"].unique()))
print("Values of SibSp: {}".format(train_data["SibSp"].unique()))

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  HasCabin  Embarked_C  \
0         0       3    1  22.0      1      0   7.2500         0           0   
1         1       1    0  38.0      1      0  71.2833         1           1   
2         1       3    0  26.0      0      0   7.9250         0           0   
3         1       1    0  35.0      1      0  53.1000         1           0   
4         0       3    1  35.0      0      0   8.0500         0           0   

   Embarked_Q  Embarked_S  Embarked_nan  
0           0           1             0  
1           0           0             0  
2           0           1             0  
3           0           1             0  
4           0           1             0  
Values of Parch: [0 1 2 5 3 4 6]
Values of Pclass: [0 1 2 5 3 4 6]
Values of SibSp: [1 0 3 4 2 5 8]


In [31]:
# Correlation
for x in filter(lambda x: x != "Survived", train_data.columns):
    print("{}: {:.3f}".format(x, train_data[["Survived", x]].corr().iloc[0,1]))

Pclass: -0.338
Sex: -0.543
Age: -0.077
SibSp: -0.035
Parch: 0.082
Fare: 0.257
HasCabin: 0.317
Embarked_C: 0.168
Embarked_Q: 0.004
Embarked_S: -0.156
Embarked_nan: 0.060


* Pclass, age, sibsp & male sex negatively correlated with survival
* Fare & HasCabin (presumably describe similar phenomenon) positively correlated with survival
* Different ports of embarcation have different effects on survival (??!)

* Test:
    * How correlated are Pclass, Fare & HasCabin? (all capture socioeconomic status)
    * What is going on with the ports of embarkation?