In [2]:
import numpy as np
import pandas as pd

In [3]:
train_pd = pd.read_csv('data/train.csv')

In [4]:
train_pd = train_pd[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
train_pd

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [5]:
train_pd = train_pd.dropna()
train_pd

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
889,1,1,male,26.0,0,0,30.0000,C


In [6]:
train_pd.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,712.0,712.0,712.0,712.0,712.0,712.0
mean,0.404494,2.240169,29.642093,0.514045,0.432584,34.567251
std,0.491139,0.836854,14.492933,0.930692,0.854181,52.938648
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,1.0,20.0,0.0,0.0,8.05
50%,0.0,2.0,28.0,0.0,0.0,15.64585
75%,1.0,3.0,38.0,1.0,1.0,33.0
max,1.0,3.0,80.0,5.0,6.0,512.3292


In [7]:
pclass = train_pd['Pclass'].to_numpy()

# Convert to one-hot encoding
as_categorical_pclass = np.eye(3)[pclass - 1]
as_categorical_pclass

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [8]:
def sex_str_to_binary(s):
    if s=="male":
        return 1
    else:
        return 0
sex = train_pd['Sex']

as_binary_sex = sex.apply(sex_str_to_binary).to_numpy()
as_binary_sex[:10]

array([1, 0, 0, 0, 1, 1, 1, 0, 0, 0])

In [9]:
age_np = train_pd['Age'].to_numpy()
age_np = (age_np - train_pd['Age'].mean()) / train_pd['Age'].std()
age_np[:10]

array([-0.52729787,  0.57668847, -0.25130129,  0.36969103,  0.36969103,
        1.68067481, -1.9072808 , -0.18230214, -1.07929105, -1.76928251])

In [10]:
sibsp = train_pd['SibSp']
sibsp_np = sibsp.to_numpy()
sibsp_np = (sibsp_np - train_pd['SibSp'].mean()) / train_pd['SibSp'].std()
sibsp_np[:10]

array([ 0.52214373,  0.52214373, -0.55232545,  0.52214373, -0.55232545,
       -0.55232545,  2.67108207, -0.55232545,  0.52214373,  0.52214373])

In [11]:
parch = train_pd['Parch']
parch = parch.to_numpy()
parch = (parch - train_pd['Parch'].mean()) / train_pd['Parch'].std()
parch[:10]

array([-0.50643136, -0.50643136, -0.50643136, -0.50643136, -0.50643136,
       -0.50643136,  0.66428009,  1.83499154, -0.50643136,  0.66428009])

In [12]:
fare = train_pd['Fare'].to_numpy()
fare = (fare - train_pd['Fare'].mean()) / train_pd['Fare'].std()
fare[:10]

array([-0.51601717,  0.69355848, -0.50326656,  0.35007975, -0.50090534,
        0.32670363, -0.25486581, -0.44266244, -0.08493703, -0.33750864])

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
train_Y = train_pd['Survived'].to_numpy()
train_Y[:10]

array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1])

In [15]:
train_X = np.concatenate((as_categorical_pclass,
                as_binary_sex.reshape(-1, 1),
                age_np.reshape(-1, 1),
                sibsp_np.reshape(-1,1),
                parch.reshape(-1,1),
                fare.reshape(-1,1)), axis=1)
train_X

array([[ 0.        ,  0.        ,  1.        , ...,  0.52214373,
        -0.50643136, -0.51601717],
       [ 1.        ,  0.        ,  0.        , ...,  0.52214373,
        -0.50643136,  0.69355848],
       [ 0.        ,  0.        ,  1.        , ..., -0.55232545,
        -0.50643136, -0.50326656],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -0.55232545,
        -0.50643136, -0.08627442],
       [ 1.        ,  0.        ,  0.        , ..., -0.55232545,
        -0.50643136, -0.08627442],
       [ 0.        ,  0.        ,  1.        , ..., -0.55232545,
        -0.50643136, -0.50657227]])

In [16]:
model = LogisticRegression(random_state=0).fit(train_X, train_Y)
model.score(train_X, train_Y)

0.800561797752809

In [17]:
test_pd = pd.read_csv('data/test.csv')
test_pd

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [18]:
test_pd = test_pd[['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
test_pd

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
0,892,3,male,34.5,0,0,7.8292
1,893,3,female,47.0,1,0,7.0000
2,894,2,male,62.0,0,0,9.6875
3,895,3,male,27.0,0,0,8.6625
4,896,3,female,22.0,1,1,12.2875
...,...,...,...,...,...,...,...
413,1305,3,male,,0,0,8.0500
414,1306,1,female,39.0,0,0,108.9000
415,1307,3,male,38.5,0,0,7.2500
416,1308,3,male,,0,0,8.0500


In [19]:
test_pd = test_pd.fillna(test_pd['Age'].mean())
test_pd

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
0,892,3,male,34.50000,0,0,7.8292
1,893,3,female,47.00000,1,0,7.0000
2,894,2,male,62.00000,0,0,9.6875
3,895,3,male,27.00000,0,0,8.6625
4,896,3,female,22.00000,1,1,12.2875
...,...,...,...,...,...,...,...
413,1305,3,male,30.27259,0,0,8.0500
414,1306,1,female,39.00000,0,0,108.9000
415,1307,3,male,38.50000,0,0,7.2500
416,1308,3,male,30.27259,0,0,8.0500


In [20]:
pclass = test_pd['Pclass'].to_numpy()

# Convert to one-hot encoding
as_categorical_pclass = np.eye(3)[pclass - 1]
as_categorical_pclass[:10]

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [21]:
sex = test_pd['Sex']
as_binary_sex = sex.apply(sex_str_to_binary).to_numpy()
as_binary_sex[:10]

array([1, 0, 1, 1, 0, 1, 0, 1, 0, 1])

In [22]:
age_np = test_pd['Age'].to_numpy()
age_np = (age_np - train_pd['Age'].mean()) / train_pd['Age'].std()
age_np[:10]

array([ 0.33519146,  1.19768079,  2.23266799, -0.18230214, -0.52729787,
       -1.07929105,  0.0246953 , -0.25130129, -0.80329446, -0.59629702])

In [23]:
sibsp = test_pd['SibSp']
sibsp_np = sibsp.to_numpy()
sibsp_np = (sibsp_np - train_pd['SibSp'].mean()) / train_pd['SibSp'].std()
sibsp_np[:10]

array([-0.55232545,  0.52214373, -0.55232545, -0.55232545,  0.52214373,
       -0.55232545, -0.55232545,  0.52214373, -0.55232545,  1.5966129 ])

In [24]:
parch = test_pd['Parch']
parch = parch.to_numpy()
parch = (parch - train_pd['Parch'].mean()) / train_pd['Parch'].std()
parch[:10]

array([-0.50643136, -0.50643136, -0.50643136, -0.50643136,  0.66428009,
       -0.50643136, -0.50643136,  0.66428009, -0.50643136, -0.50643136])

In [25]:
fare = test_pd['Fare'].to_numpy()
fare = (fare - train_pd['Fare'].mean()) / train_pd['Fare'].std()
fare[:10]

array([-0.5050762 , -0.52073962, -0.4699733 , -0.48933534, -0.42085985,
       -0.47870983, -0.50885416, -0.10516422, -0.51641008, -0.1967797 ])

In [26]:
test_X = np.concatenate((as_categorical_pclass,
                as_binary_sex.reshape(-1, 1),
                age_np.reshape(-1, 1),
                sibsp_np.reshape(-1,1),
                parch.reshape(-1,1),
                fare.reshape(-1,1)), axis=1)
test_X

array([[ 0.        ,  0.        ,  1.        , ..., -0.55232545,
        -0.50643136, -0.5050762 ],
       [ 0.        ,  0.        ,  1.        , ...,  0.52214373,
        -0.50643136, -0.52073962],
       [ 0.        ,  1.        ,  0.        , ..., -0.55232545,
        -0.50643136, -0.4699733 ],
       ...,
       [ 0.        ,  0.        ,  1.        , ..., -0.55232545,
        -0.50643136, -0.51601717],
       [ 0.        ,  0.        ,  1.        , ..., -0.55232545,
        -0.50643136, -0.50090534],
       [ 0.        ,  0.        ,  1.        , ...,  0.52214373,
         0.66428009, -0.23062454]])

In [27]:
predictions = model.predict(test_X)
passenger_ids = test_pd['PassengerId'].to_numpy()
sub_df = pd.DataFrame({'PassengerId':passenger_ids, 'Survived':predictions})

In [28]:
sub_df.to_csv('submission/sub2.csv', index=False)

In [29]:
submission_df = pd.read_csv('data/gender_submission.csv')
submission_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
