# TODO: Predict Survivor at Titanic shipwreck

In [2]:
import pandas as pd

In [96]:
base_path = '../data/'
train = pd.read_csv(base_path + 'train.csv')
test = pd.read_csv(base_path + 'test.csv')
g_submission = pd.read_csv(base_path + 'gender_submission.csv')

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Data pre-processing

### Handling Missing-value

In [12]:
train[train.isnull().any(axis=1)].shape

(708, 12)

#### Embarked

In [14]:
train[train.Embarked.isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [15]:
train.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

##### Inference Embarked

In [21]:
class_1 = train.Pclass == 1 # Masks
female = train.Sex == 'female'

train[class_1 & female].Embarked.value_counts()

S    48
C    43
Q     1
Name: Embarked, dtype: int64

In [24]:
train.loc[train.Embarked.isnull(), 'Embarked'] = 'S'

In [26]:
train[class_1 & female].Embarked.value_counts()

S    50
C    43
Q     1
Name: Embarked, dtype: int64

#### Cabin

In [27]:
train.Cabin.value_counts()

B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: Cabin, Length: 147, dtype: int64

##### Drop Cabin and Other Columns

In [30]:
train.drop(columns=[
    'PassengerId',
    'Name',
    'Ticket',
    'Cabin',
], inplace=True)

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


#### Age

In [28]:
train.Age.value_counts()

24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: Age, Length: 88, dtype: int64

In [33]:
print(train.Age.mean())
train.fillna(train.Age.mean(), inplace=True)

train[train.Age.isnull()]

29.69911764705882


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked


In [34]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


## Feature Engineering

### Encoding Categorical feature
* One-hot: Nominal feature

In [49]:
train_OHE = pd.get_dummies(data=train, columns=['Sex', 'Embarked'])
train_OHE

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.000000,1,0,7.2500,0,1,0,0,1
1,1,1,38.000000,1,0,71.2833,1,0,1,0,0
2,1,3,26.000000,0,0,7.9250,1,0,0,0,1
3,1,1,35.000000,1,0,53.1000,1,0,0,0,1
4,0,3,35.000000,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,0,1,0,0,1
887,1,1,19.000000,0,0,30.0000,1,0,0,0,1
888,0,3,29.699118,1,2,23.4500,1,0,0,0,1
889,1,1,26.000000,0,0,30.0000,0,1,1,0,0


### Normalization: Scaling features

In [40]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [50]:
X, y = train_OHE.drop(columns='Survived'), train_OHE.Survived

In [67]:
# scaler.fit(): Find Min-Max value
# scaler.transform()

# tmp = scaler.fit_transform(X[['Age', 'Fare', 'Pclass']])
# X['Age'] = tmp[:, 0]
# X['Fare'] = tmp[:, 1]
# X['Pclass'] = tmp[:, 2]
# X

X_scaled = scaler.fit_transform(X)
X_scaled

array([[1.        , 0.27117366, 0.125     , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.4722292 , 0.125     , ..., 1.        , 0.        ,
        0.        ],
       [1.        , 0.32143755, 0.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [1.        , 0.36792055, 0.125     , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.32143755, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [1.        , 0.39683338, 0.        , ..., 0.        , 1.        ,
        0.        ]])

## Training
1. Linear Classifier
2. Logistic Regression
3. Decision Tree
4. Random Forest

In [68]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [82]:
# Evaluation metric
from sklearn.metrics import accuracy_score

In [83]:
clf = SGDClassifier()
clf_2 = LogisticRegression()
clf_3 = DecisionTreeClassifier()
clf_4 = RandomForestClassifier()

In [103]:
clf.fit(X_scaled, y)
clf_2.fit(X_scaled, y)
clf_3.fit(X_scaled, y)
clf_4.fit(X_scaled, y)

pred = clf.predict(X_scaled)
pred_2 = clf_2.predict(X_scaled)
pred_3 = clf_3.predict(X_scaled)
pred_4 = clf_4.predict(X_scaled)

In [104]:
print('1. Linear Classifier, Accuracy for Training: %.4f' % accuracy_score(y, pred))
print('2. Logistic Classifier, Accuracy for Training: %.4f' % accuracy_score(y, pred_2))
print('3. Decision Tree, Accuracy for Training: %.4f' % accuracy_score(y, pred_3))
print('4. Random Forest Classifier, Accuracy for Training: %.4f' % accuracy_score(y, pred_4))

1. Linear Classifier, Accuracy for Training: 0.8058
2. Logistic Classifier, Accuracy for Training: 0.8013
3. Decision Tree, Accuracy for Training: 0.9820
4. Random Forest Classifier, Accuracy for Training: 0.9820


## Test

In [93]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


### Pre-processing

In [97]:
test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [99]:
test.Fare.fillna(train.Fare.mean(), inplace=True) # Watch for 'train'
test.Age.fillna(train.Age.mean(), inplace=True) # Watch for 'train'
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       418 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      418 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


### Feature engineering

In [106]:
test_OHE = pd.get_dummies(data=test, columns=['Sex', 'Embarked'])

test_scaled = scaler.fit_transform(test_OHE)
test_scaled

array([[1.        , 0.4527232 , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [1.        , 0.61756561, 0.125     , ..., 0.        , 0.        ,
        1.        ],
       [0.5       , 0.8153765 , 0.        , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [1.        , 0.50547277, 0.        , ..., 0.        , 0.        ,
        1.        ],
       [1.        , 0.38941207, 0.        , ..., 0.        , 0.        ,
        1.        ],
       [1.        , 0.38941207, 0.125     , ..., 1.        , 0.        ,
        0.        ]])

### Prediction

In [107]:
result = clf.predict(test_scaled)
result2 = clf_2.predict(test_scaled)
result3 = clf_3.predict(test_scaled)
result4 = clf_4.predict(test_scaled)

## Submit

In [111]:
g_submission.Survived = result4
g_submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,1
414,1306,1
415,1307,0
416,1308,1


In [112]:
g_submission.to_csv('../out/submission.csv', index=False)