# Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer

# Dataset

In [2]:
dataset = pd.read_csv('titanic.csv')
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
rows, cols = dataset.shape
print('Jumlah baris\t:', rows)
print('Jumlah kolom\t:', cols)

Jumlah baris	: 418
Jumlah kolom	: 12


# Select Feature

In [4]:
data = dataset[['Age', 'Fare']]
data.head()

Unnamed: 0,Age,Fare
0,34.5,7.8292
1,47.0,7.0
2,62.0,9.6875
3,27.0,8.6625
4,22.0,12.2875


In [5]:
class_ = dataset[['Survived']]
class_.head()

Unnamed: 0,Survived
0,0
1,1
2,0
3,0
4,1


In [6]:
combined_data = pd.concat([data, class_], axis=1)
combined_data.head()

Unnamed: 0,Age,Fare,Survived
0,34.5,7.8292,0
1,47.0,7.0,1
2,62.0,9.6875,0
3,27.0,8.6625,0
4,22.0,12.2875,1


# Handling Missing Values

In [7]:
mean_age_by_survived = combined_data.groupby('Survived')['Age'].mean()
print(mean_age_by_survived)

Survived
0    30.272732
1    30.272362
Name: Age, dtype: float64


In [8]:
for survived, mean_age in mean_age_by_survived.items():
    combined_data.loc[(combined_data['Survived'] == survived) & (combined_data['Age'].isnull()), 'Age'] = mean_age

combined_data.tail(10)

Unnamed: 0,Age,Fare,Survived
408,30.272362,7.7208,1
409,3.0,13.775,1
410,30.272362,7.75,1
411,37.0,90.0,1
412,28.0,7.775,1
413,30.272732,8.05,0
414,39.0,108.9,1
415,38.5,7.25,0
416,30.272732,8.05,0
417,30.272732,22.3583,0


In [9]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       418 non-null    float64
 1   Fare      417 non-null    float64
 2   Survived  418 non-null    int64  
dtypes: float64(2), int64(1)
memory usage: 9.9 KB


In [10]:
combined_data = combined_data.dropna()

# Data Normalization

In [11]:
data = combined_data[['Age', 'Fare']]
data

Unnamed: 0,Age,Fare
0,34.500000,7.8292
1,47.000000,7.0000
2,62.000000,9.6875
3,27.000000,8.6625
4,22.000000,12.2875
...,...,...
413,30.272732,8.0500
414,39.000000,108.9000
415,38.500000,7.2500
416,30.272732,8.0500


In [12]:
norm_data_minmax = pd.DataFrame()

# MinMax Algorithm
norm_data_minmax[['Age_algorithm', 'Fare_algorithm']] = (data - data.min()) / (data.max() - data.min())

# MinMax from sklearn
minmax_scaler = MinMaxScaler()
norm_data_minmax[['Age_library', 'Fare_library']] = minmax_scaler.fit_transform(data)

norm_data_minmax = norm_data_minmax[['Age_algorithm', 'Age_library', 'Fare_algorithm', 'Fare_library']]
norm_data_minmax

Unnamed: 0,Age_algorithm,Age_library,Fare_algorithm,Fare_library
0,0.452723,0.452723,0.015282,0.015282
1,0.617566,0.617566,0.013663,0.013663
2,0.815377,0.815377,0.018909,0.018909
3,0.353818,0.353818,0.016908,0.016908
4,0.287881,0.287881,0.023984,0.023984
...,...,...,...,...
413,0.396977,0.396977,0.015713,0.015713
414,0.512066,0.512066,0.212559,0.212559
415,0.505473,0.505473,0.014151,0.014151
416,0.396977,0.396977,0.015713,0.015713


In [13]:
norm_data_zscore = pd.DataFrame()

# Z-Score Algorithm
norm_data_zscore[['Age_algorithm', 'Fare_algorithm']] = (data - data.mean()) / data.std()

# Z-Score from sklearn
zscore_scaler = StandardScaler()
norm_data_zscore[['Age_library', 'Fare_library']] = zscore_scaler.fit_transform(data)

norm_data_zscore = norm_data_zscore[['Age_algorithm', 'Age_library', 'Fare_algorithm', 'Fare_library']]
norm_data_zscore

Unnamed: 0,Age_algorithm,Age_library,Fare_algorithm,Fare_library
0,0.342283,0.342694,-0.497213,-0.497811
1,1.337317,1.338923,-0.512045,-0.512660
2,2.531358,2.534398,-0.463974,-0.464532
3,-0.254737,-0.255043,-0.482308,-0.482888
4,-0.652751,-0.653535,-0.417469,-0.417971
...,...,...,...,...
413,0.005781,0.005788,-0.493264,-0.493856
414,0.700495,0.701337,1.310606,1.312180
415,0.660694,0.661487,-0.507573,-0.508183
416,0.005781,0.005788,-0.493264,-0.493856


In [14]:
norm_data_sigmoid = pd.DataFrame()

# Sigmoidal Algorithm
norm_data_sigmoid[['Age_algorithm', 'Fare_algorithm']] = 1 / (1 + np.exp(-data))

# Sigmoidal from sklearn
sigmoid_scaler = Normalizer()
norm_data_sigmoid[['Age_library', 'Fare_library']] = sigmoid_scaler.fit_transform(data)

norm_data_sigmoid = norm_data_sigmoid[['Age_algorithm', 'Age_library', 'Fare_algorithm', 'Fare_library']]
norm_data_sigmoid

Unnamed: 0,Age_algorithm,Age_library,Fare_algorithm,Fare_library
0,1.0,0.975204,0.999602,0.221306
1,1.0,0.989090,0.999089,0.147311
2,1.0,0.988012,0.999938,0.154377
3,1.0,0.952194,0.999827,0.305495
4,1.0,0.873055,0.999995,0.487621
...,...,...,...,...
413,1.0,0.966415,0.999681,0.256985
414,1.0,0.337158,1.000000,0.941448
415,1.0,0.982727,0.999290,0.185059
416,1.0,0.966415,0.999681,0.256985
