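# Data EDA and code testing for the Titanic Docker tutorial

This notebook explores the Titanic training data and tests the preprocessing code for this tutorial:

https://towardsdatascience.com/build-and-run-a-docker-container-for-your-machine-learning-model-60209c2d7a7f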

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [42]:
# Relative locations of the raw train and test CSV files.
train_fp, test_fp = "../data/train.csv", "../data/test.csv"

In [43]:
# Read the raw training CSV, report its (rows, columns) shape, and preview the first rows.
train = pd.read_csv(filepath_or_buffer=train_fp)
print(train.shape)
train.head(n=5)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [44]:
# Show each column's dtype to tell numeric columns from string (object) ones.
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [45]:
# Columns kept for modelling: the predictor columns plus the label column.
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
target = "Survived"
# Take an explicit copy so that later column assignments act on an independent
# frame rather than on a slice of the raw data (avoids SettingWithCopyWarning).
train = train[features + [target]].copy()
train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,male,22.0,1,0,7.25,S,0
1,1,female,38.0,1,0,71.2833,C,1
2,3,female,26.0,0,0,7.925,S,1
3,1,female,35.0,1,0,53.1,S,1
4,3,male,35.0,0,0,8.05,S,0


In [46]:
# Count missing values in every selected column.
train.isna().sum(axis=0)

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
Survived      0
dtype: int64

In [47]:
# Summary statistics of Age (non-missing values only) before imputing.
train.loc[:, "Age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [48]:
# Impute missing ages with the mean age. `assign` builds a new frame instead of
# writing into a column of a frame that was sliced from the raw data, which is
# what triggers pandas' SettingWithCopyWarning.
# NOTE(review): the mean is computed over all rows before the train/test split,
# so held-out rows influence the imputed value — confirm this is acceptable.
train = train.assign(Age=train["Age"].fillna(train["Age"].mean()))

In [49]:
# Frequency of each embarkation code, with missing values counted as their own row.
train.loc[:, "Embarked"].value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

In [50]:
# Discard the rows whose Embarked value is missing.
train = train.dropna(axis=0, how="any", subset=["Embarked"])

In [51]:
# Re-count missing values per column after imputing Age and dropping rows without Embarked.
train.isna().sum(axis=0)

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Survived    0
dtype: int64

In [52]:
# Fare summary statistics broken down by passenger class.
train.groupby(by="Pclass")["Fare"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,214.0,84.193516,78.746457,0.0,30.77185,58.6896,93.5,512.3292
2,184.0,20.662183,13.417399,0.0,13.0,14.25,26.0,73.5
3,491.0,13.67555,11.778142,0.0,7.75,8.05,15.5,69.55


In [53]:
# (rows, columns) of the frame after the cleaning steps above.
train.shape

(889, 8)

In [54]:
# Preview the first five rows of the cleaned frame.
train.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,male,22.0,1,0,7.25,S,0
1,1,female,38.0,1,0,71.2833,C,1
2,3,female,26.0,0,0,7.925,S,1
3,1,female,35.0,1,0,53.1,S,1
4,3,male,35.0,0,0,8.05,S,0


In [55]:
# Echo the name of the label column used by the split in the next cell.
target

'Survived'

In [56]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows. A fixed seed keeps the split, and the CSV files
# written from it later in the notebook, identical across kernel restarts.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns=target),
    train[target],
    test_size=0.2,
    random_state=SPLIT_SEED,
)
print(X_train.shape, X_test.shape)

(711, 7) (178, 7)


In [57]:
# Peek at the first five training labels (the index shows the original row ids).
y_train.head(n=5)

341    1
65     1
661    0
415    0
247    1
Name: Survived, dtype: int64

In [59]:
# Re-attach the label column so the training frame holds features and target together.
# `assign` returns a new frame (values aligned on the index) instead of writing
# into the object returned by train_test_split, which can raise
# SettingWithCopyWarning.
X_train = X_train.assign(**{target: y_train})
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
341,1,female,24.0,3,2,263.0,S,1
65,3,male,29.699118,1,1,15.2458,C,1
661,3,male,40.0,0,0,7.225,C,0
415,3,female,29.699118,0,0,8.05,S,0
247,2,female,24.0,0,2,14.5,S,1


In [60]:
# Re-attach the label column so the held-out frame holds features and target together.
# `assign` returns a new frame (values aligned on the index) instead of writing
# into the object returned by train_test_split, which can raise
# SettingWithCopyWarning.
X_test = X_test.assign(**{target: y_test})
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
309,1,female,30.0,0,0,56.9292,C,1
557,1,male,29.699118,0,0,227.525,C,0
100,3,female,28.0,0,0,7.8958,S,0
432,2,female,42.0,1,0,26.0,S,1
288,2,male,42.0,0,0,13.0,S,1


In [61]:
# Write both splits (features plus label) to CSV without the index column.
X_train.to_csv(path_or_buf="../titanic_train_preprocessed.csv", index=False)
X_test.to_csv(path_or_buf="../titanic_test_preprocessed.csv", index=False)

In [62]:
# Count missing values per column in the training split.
X_train.isna().sum(axis=0)

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Survived    0
dtype: int64