In [19]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from pathlib import Path

In [20]:
data_path = Path('data')

train_path = data_path / 'train.csv'
test_path = data_path / 'test.csv'

In [21]:
train_data = pd.read_csv(str(train_path))
test_data = pd.read_csv(str(test_path))

In [22]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [23]:
y = train_data['Survived']
train_data = train_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Survived'], axis=1)

In [24]:
train_data.isnull().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [25]:
train_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.2500,,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.9250,,S
3,1,female,35.0,1,0,53.1000,C123,S
4,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,,S
887,1,female,19.0,0,0,30.0000,B42,S
888,3,female,,1,2,23.4500,,S
889,1,male,26.0,0,0,30.0000,C148,C


In [26]:
# Treat missing cabin values as another category

train_data['Cabin'] = train_data['Cabin'].where(train_data['Cabin'].isnull(), 1)
train_data['Cabin'] = train_data['Cabin'].fillna(0)

In [27]:
# Fill missing age values with average of all ages

age_average = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(age_average)

In [28]:
# Fill missing embarked with most common values

most_common = train_data['Embarked'].value_counts().index[0]
train_data['Embarked'] = train_data['Embarked'].fillna(most_common)

In [29]:
train_data.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [31]:
train_data['Sex'] = train_data['Sex'].map({'male':0, 'female':1})

In [32]:
for col in train_data.columns:
    print(train_data[col].value_counts())

3    491
1    216
2    184
Name: Pclass, dtype: int64
Series([], Name: Sex, dtype: int64)
29.699118    177
24.000000     30
22.000000     27
18.000000     26
28.000000     25
            ... 
55.500000      1
53.000000      1
20.500000      1
23.500000      1
0.420000       1
Name: Age, Length: 89, dtype: int64
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
8.4583      1
9.8375      1
8.3625      1
14.1083     1
17.4000     1
Name: Fare, Length: 248, dtype: int64
0    687
1    204
Name: Cabin, dtype: int64
S    646
C    168
Q     77
Name: Embarked, dtype: int64


In [33]:
train_data = pd.get_dummies(train_data, columns=['Embarked'], prefix='emb_')

In [34]:
train_data = pd.get_dummies(train_data, columns=['Parch'], prefix='parch_')

In [35]:
train_data = pd.get_dummies(train_data, columns=['SibSp'], prefix='sibsp_')

In [36]:
train_data = pd.get_dummies(train_data, columns=['Pclass'], prefix='pclass_')

In [37]:
train_data.head()

Unnamed: 0,Sex,Age,Fare,Cabin,emb__C,emb__Q,emb__S,parch__0,parch__1,parch__2,...,sibsp__0,sibsp__1,sibsp__2,sibsp__3,sibsp__4,sibsp__5,sibsp__8,pclass__1,pclass__2,pclass__3
0,,22.0,7.25,0,0,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,1
1,,38.0,71.2833,1,1,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
2,,26.0,7.925,0,0,0,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1
3,,35.0,53.1,1,0,0,1,1,0,0,...,0,1,0,0,0,0,0,1,0,0
4,,35.0,8.05,0,0,0,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1
