# Implementation 2

Here are some examples of code implementation for data preprocessing.<br>
Objective: create a "simplified" data set from the Titanic data set.

## Import libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn 

In [4]:
# Load the raw Titanic table. The path is relative, so the CSV file must be
# in the notebook's working directory.
dataset_raw = pd.read_csv(filepath_or_buffer='Titanic.csv')

In [5]:
# Preview the first five rows of the raw table.
dataset_raw.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Surname,Fsize,Family,FsizeD,Deck,Child,Mother
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,Braund,2,Braund_2,small,,Adult,Not Mother
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Cumings,2,Cumings_2,small,C,Adult,Not Mother
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Heikkinen,1,Heikkinen_1,singleton,,Adult,Not Mother
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,Futrelle,2,Futrelle_2,small,C,Adult,Not Mother
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,Allen,1,Allen_1,singleton,,Adult,Not Mother


## Remove irrelevant data

Removing irrelevant columns and the ones we will not use as predictors:

In [6]:
# Drop every column that is not used as a predictor in this notebook,
# leaving the response (Survived) plus Pclass, Sex, Age, Fare and Embarked.
dataset = dataset_raw.drop(
    columns=[
        'PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin',
        'Title', 'Surname', 'Fsize', 'Family', 'FsizeD', 'Deck',
        'Child', 'Mother',
    ]
)
# Show five random rows; the fixed random_state keeps the sample reproducible.
dataset.sample(n=5, random_state=20)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
347,1,3,female,24.0,16.1,S
674,0,2,male,26.0,0.0,S
791,0,2,male,16.0,26.0,S
836,0,3,male,21.0,8.6625,S
56,1,2,female,21.0,10.5,S


Removing irrelevant rows (exact duplicates and rows with a non-positive fare):

In [7]:
# Count rows that exactly repeat an earlier row across all remaining columns
# (the first occurrence of each group is not counted).
dataset.duplicated(keep='first').sum()

76

In [8]:
# Keep only the first occurrence of each repeated row ...
dataset = dataset.drop_duplicates(keep='first')
# ... and re-count to confirm that no duplicate is left.
dataset.duplicated(keep='first').sum()

0

In [9]:
# Number of rows whose fare is zero or negative.
dataset['Fare'].le(0).sum()

15

In [10]:
# Remove, by index label, every row whose fare is zero or negative ...
dataset = dataset.drop(index=dataset.loc[dataset['Fare'].le(0)].index)
# ... and re-count to confirm that none is left.
dataset['Fare'].le(0).sum()

0

In [11]:
# Preview the cleaned table (first five rows).
dataset.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,male,22.0,7.25,S
1,1,1,female,38.0,71.2833,C
2,1,3,female,26.0,7.925,S
3,1,1,female,35.0,53.1,S
4,0,3,male,35.0,8.05,S


## Prepare predictors and response

Response:

In [28]:
# Response vector: the 'Survived' column, selected by name so that this cell
# does not silently pick another column if the column order ever changes.
y = dataset['Survived']
# Show the first six values as a quick sanity check.
print(y.head(6))

0    0
1    1
2    1
3    1
4    0
5    0
Name: Survived, dtype: int64


Features:

In [29]:
# Feature matrix: every column except the response. `drop` builds a new
# DataFrame instead of a positional slice of `dataset`, so the column
# assignments made on X in the encoding cells do not act on a view of
# `dataset` (which is what triggers pandas' SettingWithCopyWarning), and the
# selection no longer depends on 'Survived' being the first column.
X = dataset.drop(columns=['Survived'])
# Show the first six rows as a quick sanity check.
print(X.head(6))

   Pclass     Sex   Age     Fare Embarked
0       3    male  22.0   7.2500        S
1       1  female  38.0  71.2833        C
2       3  female  26.0   7.9250        S
3       1  female  35.0  53.1000        S
4       3    male  35.0   8.0500        S
5       3    male  21.0   8.4583        Q


## Data encoding

Label encoding for a binary (two-category) column:

In [30]:
from sklearn.preprocessing import LabelEncoder

# Learn the categories of 'Sex' and replace the column with their integer
# codes in a single step (female -> 0, male -> 1, as the printed rows show).
label_encoder = LabelEncoder()
X['Sex'] = label_encoder.fit_transform(X['Sex'])
print(X[:6])

   Pclass  Sex   Age     Fare Embarked
0       3    1  22.0   7.2500        S
1       1    0  38.0  71.2833        C
2       3    0  26.0   7.9250        S
3       1    0  35.0  53.1000        S
4       3    1  35.0   8.0500        S
5       3    1  21.0   8.4583        Q


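One-hot encoding for a categorical column. Two alternatives are shown: first scikit-learn's `OneHotEncoder` (its result is only printed as a demonstration), then `pd.get_dummies`, whose result is the one kept in `X`: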

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode 'Embarked' and pass every other column through unchanged.
# The encoded indicator columns come first in the result, followed by the
# passthrough columns.
oh_encoder = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['Embarked'])],
    remainder='passthrough',
)
# X itself is not modified here; the transformed array is stored separately.
test = oh_encoder.fit_transform(X)
print(test[:6])

[[ 0.      0.      1.      3.      1.     22.      7.25  ]
 [ 1.      0.      0.      1.      0.     38.     71.2833]
 [ 0.      0.      1.      3.      0.     26.      7.925 ]
 [ 0.      0.      1.      1.      0.     35.     53.1   ]
 [ 0.      0.      1.      3.      1.     35.      8.05  ]
 [ 0.      1.      0.      3.      1.     21.      8.4583]]


In [32]:
# Replace 'Embarked' with one indicator column per port, named
# 'Embarked_from_<code>'.
# dtype=int pins the indicators to 0/1 integers: pandas 2.0+ defaults to
# boolean (True/False) columns, which would not match the table shown below.
# NOTE: this cell overwrites X, so it cannot be re-run on its own once
# 'Embarked' has been replaced.
X = pd.get_dummies(
    X,
    columns=['Embarked'],
    prefix=['Embarked_from'],
    dtype=int,
)
X.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked_from_C,Embarked_from_Q,Embarked_from_S
0,3,1,22.0,7.25,0,0,1
1,1,0,38.0,71.2833,1,0,0
2,3,0,26.0,7.925,0,0,1
3,1,0,35.0,53.1,0,0,1
4,3,1,35.0,8.05,0,0,1


## Data set split

In [54]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [55]:
# Print the shape of each split, one per line, to check the partition sizes.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(600, 7)
(200, 7)
(600,)
(200,)


In [56]:
# Preview the training features (first five rows) before scaling.
X_train.head(n=5)

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked_from_C,Embarked_from_Q,Embarked_from_S
138,3,1,16.0,9.2167,0,0,1
373,1,1,22.0,135.6333,1,0,0
215,1,0,31.0,113.275,1,0,0
458,2,0,50.0,10.5,0,0,1
571,1,0,53.0,51.4792,0,0,1


## Scaling

In [57]:
# The two continuous columns, shown before scaling for comparison.
X_train[['Age', 'Fare']].head(5)

Unnamed: 0,Age,Fare
138,16.0,9.2167
373,22.0,135.6333
215,31.0,113.275
458,50.0,10.5
571,53.0,51.4792


In [58]:
from sklearn.preprocessing import StandardScaler

# Work on explicit copies of the split frames. Assigning scaled columns then
# cannot act on a view of X, so pandas' chained-assignment warning does not
# have to be switched off globally (which would also hide genuine mistakes
# in later cells).
X_train = X_train.copy()
X_test = X_test.copy()

# Standardise only the continuous columns. The scaler is fitted on the
# training rows alone and the same fitted parameters are applied to the test
# rows, so no information from the test set enters the fit.
# NOTE: this cell overwrites X_train / X_test; re-running it without
# re-running the split would scale already-scaled values.
scaler = StandardScaler()
X_train[['Age', 'Fare']] = scaler.fit_transform(X_train[['Age', 'Fare']])
X_test[['Age', 'Fare']] = scaler.transform(X_test[['Age', 'Fare']])

In [60]:
# The same two columns after standardisation.
X_train[['Age', 'Fare']].head(5)

Unnamed: 0,Age,Fare
138,-0.930789,-0.505767
373,-0.525425,1.943312
215,0.082621,1.510162
458,1.366274,-0.480906
571,1.568956,0.312987


In [62]:
# Preview the full training feature table with the scaled columns in place.
X_train.head(n=5)

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked_from_C,Embarked_from_Q,Embarked_from_S
138,3,1,-0.930789,-0.505767,0,0,1
373,1,1,-0.525425,1.943312,1,0,0
215,1,0,0.082621,1.510162,1,0,0
458,2,0,1.366274,-0.480906,0,0,1
571,1,0,1.568956,0.312987,0,0,1
