In [1]:
# Titanic: Machine Learning from Disaster
# Download data from here : https://www.kaggle.com/c/titanic/data

# Explaining the data
# Variable          Definition                                          Key
#-------------------------------------------------------------------------------------------------------------------------
# survival          Survival                                            0 = No, 1 = Yes
# pclass            Ticket class                                        1 = 1st, 2 = 2nd, 3 = 3rd
# sex               Sex	
# Age               Age in years	
# sibsp	            # of siblings / spouses aboard the Titanic	
# parch             # of parents / children aboard the Titanic	
# ticket            Ticket number	
# fare              Passenger fare	
# cabin             Cabin number	
# embarked          Port of Embarkation                                 C = Cherbourg, Q = Queenstown, S = Southampton
#-------------------------------------------------------------------------------------------------------------------------

In [2]:
# import pandas library for data analysis
import pandas as pd

# Load both Kaggle splits, using "PassengerId" as the row index.
# The training split carries the "Survived" label; the test split does not.
train, test = (
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in ("../Downloads/train.csv", "../Downloads/test.csv")
)
# Report (rows, columns) for each split, one per line
print(train.shape, test.shape, sep="\n")

(891, 11)
(418, 10)


In [3]:
# Preview the first five rows of the training data (includes the "Survived" label)
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the first five rows of the test data (no "Survived" column)
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Exploratory analysis
# Mean of "Survived" (the survival rate) per ticket class, highest first:
# 1st class > 2nd class > 3rd class
(
    train.groupby("Pclass", as_index=False)["Survived"]
    .mean()
    .sort_values("Survived", ascending=False)
)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [6]:
# Survival rate per sex, highest first: female > male
(
    train.groupby("Sex", as_index=False)["Survived"]
    .mean()
    .sort_values("Survived", ascending=False)
)

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [7]:
# Survival rate per number of siblings / spouses aboard, highest first
(
    train.groupby("SibSp", as_index=False)["Survived"]
    .mean()
    .sort_values("Survived", ascending=False)
)

Unnamed: 0,SibSp,Survived
1,1,0.535885
2,2,0.464286
0,0,0.345395
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [8]:
# Survival rate per number of parents / children aboard, highest first
(
    train.groupby("Parch", as_index=False)["Survived"]
    .mean()
    .sort_values("Survived", ascending=False)
)

Unnamed: 0,Parch,Survived
3,3,0.6
1,1,0.550847
2,2,0.5
0,0,0.343658
5,5,0.2
4,4,0.0
6,6,0.0


In [9]:
# Family size aboard = siblings/spouses + parents/children + the passenger themself
train["family"] = train["SibSp"].add(train["Parch"]).add(1)
# Survival rate per family size, highest first
(
    train.groupby("family", as_index=False)["Survived"]
    .mean()
    .sort_values("Survived", ascending=False)
)
# -> Conclusion : small families (2-4 members) survived most often (55-72%); passengers travelling alone (30%) and families of 5 or more (33% or less) fared worse

Unnamed: 0,family,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [10]:

# Survival rate per port of embarkation, highest first:
# C (Cherbourg) > Q (Queenstown) > S (Southampton)
(
    train.groupby("Embarked", as_index=False)["Survived"]
    .mean()
    .sort_values("Survived", ascending=False)
)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.336957


In [11]:
# Preprocessing
def _add_model_features(frame):
    """Add the engineered model-input columns to ``frame`` in place.

    Columns written:
      * ``family``       - ``Parch + SibSp + 1`` (the passenger counts as one).
      * ``Sex_encode``   - 0 for "male", 1 for "female"; any other value is NaN.
      * ``Embarked_C`` / ``Embarked_S`` / ``Embarked_Q`` - boolean one-hot flags
        for the port of embarkation; a missing port is False in all three.
      * ``Fare_filled``  - ``Fare`` with missing values replaced by 0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the "Parch", "SibSp", "Sex", "Embarked" and "Fare" columns.
    """
    # Computed here for every frame so the model features never depend on an
    # exploratory cell having been run first.
    frame["family"] = frame["Parch"] + frame["SibSp"] + 1

    # Encode "Sex" column
    frame["Sex_encode"] = frame["Sex"].map({"male": 0, "female": 1})

    # Encode "Embarked" column
    for port in ("C", "S", "Q"):
        frame[f"Embarked_{port}"] = frame["Embarked"] == port

    # Remove null data: a missing fare becomes 0
    frame["Fare_filled"] = frame["Fare"].fillna(0)


# Apply identical preprocessing to both splits so their columns always match
for _frame in (train, test):
    _add_model_features(_frame)


In [12]:
# Train a decision tree on the engineered features
from sklearn.tree import DecisionTreeClassifier

label_name = "Survived"
feature_names = ["Pclass", "Sex_encode", "Fare_filled", "Embarked_C", "Embarked_S", "Embarked_Q", "family"]

# Same feature columns for both splits; only the training split has the label
X_train, y_train = train[feature_names], train[label_name]
X_test = test[feature_names]

# Tree depth is capped at 8 and random_state is pinned to 0
model = DecisionTreeClassifier(random_state=0, max_depth=8)
model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=8, random_state=0)

In [13]:
# Predict the "Survived" label for every row of the test features
predictions = model.predict(X_test)

In [14]:
# Submit
# Build the submission directly from the index of the frame the predictions
# were made on. This keeps each prediction attached to its own PassengerId and
# removes the need for the sample "gender_submission.csv" template, whose row
# order was previously assumed (but never checked) to match the test set.
submission = pd.DataFrame({"Survived": predictions}, index=X_test.index)
# index_label pins the header of the id column regardless of the index name
submission.to_csv("../Downloads/prediction_submission.csv", index_label="PassengerId")
