In [1]:
# @title Data manipulation with Python
import pandas as pd

url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(url)

In [2]:
# @title menampilkan dataset
# menampilkan 5 baris pertama
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# @title melihat missing value
# .sum().sum() untuk menjumlahkan missing value secara keseluruhan
df.isnull().sum() 

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

terdapat missing value pada kolom/atribut, cabin dan embarked masing-masing yaitu:
- age = 177
- cabin = 687
- embarked = 2

to-do:
- hapus kolom cabin
- isi nilai kosong pada kolom age dengan mediannya
- hapus data yang kosong pada kolom embarked

In [10]:
# @title Menghapus Kolom yang tidak relevan untuk pelatihan model
kolom_sampah = ['PassengerId', 'Name', 'Cabin', 'Ticket']
df_bersih = df.drop(columns=kolom_sampah)

In [11]:
# @title Menangani missing value pada kolom age
median_umur = df_bersih['Age'].median()
df_bersih['Age'] = df_bersih['Age'].fillna(median_umur)

In [12]:
# @title Menghapus sisa baris yang masih memiliki missing values (misal ada 2 baris kosong di
# kolom 'Embarked')
df_bersih = df_bersih.dropna()

In [13]:
# @title Melihat kembali sum of missing value
df_bersih.isnull().sum() # ataua kalau kamu lebih lengkap bisa pake df_bersih.info()


Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [14]:
df_bersih.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [18]:
# @title One hot encoding kalo kategoorikal
df_final = pd.get_dummies(df_bersih, columns=['Sex', 'Embarked'], drop_first=True, dtype=int)
df_final.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


In [21]:
# @title normalisasi menggunakan min max scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
hasil_scaling = scaler.fit_transform(df_final)

In [22]:
# @title tampilkan kembali hasil data final
hasil_scaling

array([[0.        , 1.        , 0.27117366, ..., 1.        , 0.        ,
        1.        ],
       [1.        , 0.        , 0.4722292 , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 1.        , 0.32143755, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.        , 1.        , 0.34656949, ..., 0.        , 0.        ,
        1.        ],
       [1.        , 0.        , 0.32143755, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.39683338, ..., 1.        , 1.        ,
        0.        ]], shape=(889, 9))

In [24]:
# @title normalisasi menggunakan standard scaler
from sklearn.preprocessing import StandardScaler    
scaler = StandardScaler()
hasil_scaling_std = scaler.fit_transform(df_final)

In [25]:
hasil_scaling_std

array([[-0.78696114,  0.82520863, -0.56367407, ...,  0.73534203,
        -0.30794088,  0.61679395],
       [ 1.27071078, -1.57221121,  0.66921696, ..., -1.35991138,
        -0.30794088, -1.62128697],
       [ 1.27071078,  0.82520863, -0.25545131, ..., -1.35991138,
        -0.30794088,  0.61679395],
       ...,
       [-0.78696114,  0.82520863, -0.10133993, ..., -1.35991138,
        -0.30794088,  0.61679395],
       [ 1.27071078, -1.57221121, -0.25545131, ...,  0.73534203,
        -0.30794088, -1.62128697],
       [-0.78696114,  0.82520863,  0.20688282, ...,  0.73534203,
         3.24737656, -1.62128697]], shape=(889, 9))

DONE, TIMEKASEH