# Data Pipeline Demo

Data Source : https://data.jakarta.go.id/dataset/indeks-standar-pencemaran-udara-ispu-tahun-2021

In [246]:
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np 
import joblib
import os
import yaml

In [164]:
params_dir = "../config/params.yaml"

In [165]:
def load_params(param_dir):
    with open(param_dir, 'r') as file:
        params = yaml.safe_load(file)
        
    return params

In [248]:
params = load_params(params_dir)

## 1. Data Collection

In [3]:
# fungsi untuk membaca nama file, memuat file, dan menggabungkan dataset
def read_dataset(dataset_dir):
    dataset = pd.DataFrame()

    for i in tqdm(os.listdir(dataset_dir)):
        dataset = pd.concat([pd.read_csv(dataset_dir + i), dataset])
    
    return dataset

In [4]:
# melakukan pembacaan nama file, memuat file, dan menggabungkan dataset
dataset = read_dataset(params["dataset_dir"])

100%|██████████| 7/7 [00:00<00:00, 174.98it/s]


In [5]:
# cek kondisi dataset
dataset

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
0,2021-09-01,DKI1 (Bunderan HI),63,88,29,15,24,38,88,PM25,SEDANG,
1,2021-09-02,DKI1 (Bunderan HI),60,83,29,11,30,28,83,PM25,SEDANG,
2,2021-09-03,DKI1 (Bunderan HI),60,82,27,11,37,30,82,PM25,SEDANG,
3,2021-09-04,DKI1 (Bunderan HI),58,77,26,10,31,28,77,PM25,SEDANG,
4,2021-09-05,DKI1 (Bunderan HI),63,85,27,11,28,28,85,PM25,SEDANG,
...,...,...,...,...,...,...,...,...,...,...,...,...
150,2021-08-27,DKI5 (Kebon Jeruk) Jakarta Barat,61,96,34,8,29,15,96,PM25,SEDANG,
151,2021-08-28,DKI5 (Kebon Jeruk) Jakarta Barat,63,100,31,8,44,12,100,PM25,SEDANG,
152,2021-08-29,DKI5 (Kebon Jeruk) Jakarta Barat,67,111,32,10,36,13,111,PM25,TIDAK SEHAT,
153,2021-08-30,DKI5 (Kebon Jeruk) Jakarta Barat,83,126,35,16,32,29,126,PM25,TIDAK SEHAT,


In [6]:
# terdapat beberapa temuan disini:
# 1. index hanya terlihat sampai 154 padahal jumlah rows sampai 1070
# 2. tanggal hanya terlihat dari bulan 8 dan bulan 9
# 3. kolom location terlihat banyak NaN

# harus diselidiki lebih lanjut

In [7]:
# simpan dataset yang telah digabungkan
joblib.dump(dataset, "../data/processed/dataset.pkl")

['../data/processed/dataset.pkl']

## 2. Data Definition

In [8]:
# definisikan data

## 3. Data Validation

### 3.1. Tipe Data

In [9]:
# cek tipe data
dataset.dtypes

tanggal     object
stasiun     object
pm10        object
pm25        object
so2         object
co          object
o3          object
no2         object
max         object
critical    object
categori    object
location    object
dtype: object

In [10]:
# dari pengecekan data terlihat bahwa semuanya adalah data objek (string), perlu diselidiki lebih lanjut

### 3.2. Range

In [11]:
# pengecekan cakupan data menjadi kacau jika tipe data tidak sesuai
dataset.describe()

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
count,1070,765,1070,1070,1070,1070,1070,1070,1070,1062,1068,305
unique,214,5,112,180,175,76,108,100,206,96,8,4
top,2021-09-01,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),71,---,10,13,15,77,PM25,SEDANG,SEDANG
freq,5,153,61,27,104,65,44,43,20,714,609,182


### 3.3. Dimensi Data

In [12]:
# dimensi data kemungkinan besar tidak terpengaruh, namun nanti kita kembali lagi
dataset.shape

(1070, 12)

### 3.4. Handling Columns Error

#### 3.4.1. Memperbaiki dataset yang kolomnya error

In [13]:
# dengan melihat file dataset satu persatu, terdapat 2 file yang nama kolomnya kemungkinan besar tidak benar
dataset_juli = pd.read_csv("../data/raw/indeks-standar-pencemar-udara-di-spku-bulan-juli-tahun-2021.csv")
dataset_juni = pd.read_csv("../data/raw/indeks-standar-pencemar-udara-di-spku-bulan-juni-tahun-2021.csv")

In [14]:
# cek dataset yang kolomnya bermasalah
dataset_juli

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
0,2021-07-01,DKI1 (Bunderan HI),55,76,27,14,20,31,76,PM25,SEDANG
1,2021-07-02,DKI1 (Bunderan HI),55,70,33,16,14,30,70,PM25,SEDANG
2,2021-07-03,DKI1 (Bunderan HI),58,81,33,12,19,25,81,PM25,SEDANG
3,2021-07-04,DKI1 (Bunderan HI),68,102,27,11,27,22,102,PM25,TIDAK SEHAT
4,2021-07-05,DKI1 (Bunderan HI),71,106,28,12,34,25,106,PM25,TIDAK SEHAT
...,...,...,...,...,...,...,...,...,...,...,...
150,2021-07-27,DKI5 (Kebon Jeruk) Jakarta Barat,64,110,32,13,29,35,110,PM25,TIDAK SEHAT
151,2021-07-28,DKI5 (Kebon Jeruk) Jakarta Barat,70,130,33,17,28,45,130,PM25,TIDAK SEHAT
152,2021-07-29,DKI5 (Kebon Jeruk) Jakarta Barat,78,140,32,18,29,39,140,PM25,TIDAK SEHAT
153,2021-07-30,DKI5 (Kebon Jeruk) Jakarta Barat,75,121,37,12,50,21,121,PM25,TIDAK SEHAT


In [15]:
# cek dataset yang kolomnya bermasalah
dataset_juni

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
0,2021-06-01,DKI1 (Bunderan HI),59,83,22,18,19,35,83,PM25,SEDANG
1,2021-06-02,DKI1 (Bunderan HI),59,84,21,20,24,38,84,PM25,SEDANG
2,2021-06-03,DKI1 (Bunderan HI),54,76,22,20,17,41,76,PM25,SEDANG
3,2021-06-04,DKI1 (Bunderan HI),63,87,20,13,14,30,87,PM25,SEDANG
4,2021-06-05,DKI1 (Bunderan HI),59,79,23,20,19,38,79,PM25,SEDANG
...,...,...,...,...,...,...,...,...,...,...,...
145,2021-06-26,DKI5 (Kebon Jeruk) Jakarta Barat,45,70,39,8,16,22,70,PM25,SEDANG
146,2021-06-27,DKI5 (Kebon Jeruk) Jakarta Barat,68,120,28,22,17,41,120,PM25,TIDAK SEHAT
147,2021-06-28,DKI5 (Kebon Jeruk) Jakarta Barat,59,99,25,17,21,30,99,PM25,SEDANG
148,2021-06-29,DKI5 (Kebon Jeruk) Jakarta Barat,72,128,35,24,21,50,128,PM25,TIDAK SEHAT


In [16]:
# buat kolom baru yang urutannya sesuai dengan tipe datanya
dataset_new_columns = dataset_juni.columns.to_list()
dataset_new_columns.remove("location")
dataset_new_columns.insert(1, "stasiun")
dataset_new_columns

['tanggal',
 'stasiun',
 'pm10',
 'pm25',
 'so2',
 'co',
 'o3',
 'no2',
 'max',
 'critical',
 'categori']

In [17]:
# memperbaiki nama kolom agar sesuai dengan tipe datanya
dataset_juli.columns = dataset_new_columns
dataset_juni.columns = dataset_new_columns

In [18]:
# cek dataset setelah diperbaiki
dataset_juli

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-07-01,DKI1 (Bunderan HI),55,76,27,14,20,31,76,PM25,SEDANG
1,2021-07-02,DKI1 (Bunderan HI),55,70,33,16,14,30,70,PM25,SEDANG
2,2021-07-03,DKI1 (Bunderan HI),58,81,33,12,19,25,81,PM25,SEDANG
3,2021-07-04,DKI1 (Bunderan HI),68,102,27,11,27,22,102,PM25,TIDAK SEHAT
4,2021-07-05,DKI1 (Bunderan HI),71,106,28,12,34,25,106,PM25,TIDAK SEHAT
...,...,...,...,...,...,...,...,...,...,...,...
150,2021-07-27,DKI5 (Kebon Jeruk) Jakarta Barat,64,110,32,13,29,35,110,PM25,TIDAK SEHAT
151,2021-07-28,DKI5 (Kebon Jeruk) Jakarta Barat,70,130,33,17,28,45,130,PM25,TIDAK SEHAT
152,2021-07-29,DKI5 (Kebon Jeruk) Jakarta Barat,78,140,32,18,29,39,140,PM25,TIDAK SEHAT
153,2021-07-30,DKI5 (Kebon Jeruk) Jakarta Barat,75,121,37,12,50,21,121,PM25,TIDAK SEHAT


In [19]:
# cek dataset setelah diperbaiki
dataset_juni

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-06-01,DKI1 (Bunderan HI),59,83,22,18,19,35,83,PM25,SEDANG
1,2021-06-02,DKI1 (Bunderan HI),59,84,21,20,24,38,84,PM25,SEDANG
2,2021-06-03,DKI1 (Bunderan HI),54,76,22,20,17,41,76,PM25,SEDANG
3,2021-06-04,DKI1 (Bunderan HI),63,87,20,13,14,30,87,PM25,SEDANG
4,2021-06-05,DKI1 (Bunderan HI),59,79,23,20,19,38,79,PM25,SEDANG
...,...,...,...,...,...,...,...,...,...,...,...
145,2021-06-26,DKI5 (Kebon Jeruk) Jakarta Barat,45,70,39,8,16,22,70,PM25,SEDANG
146,2021-06-27,DKI5 (Kebon Jeruk) Jakarta Barat,68,120,28,22,17,41,120,PM25,TIDAK SEHAT
147,2021-06-28,DKI5 (Kebon Jeruk) Jakarta Barat,59,99,25,17,21,30,99,PM25,SEDANG
148,2021-06-29,DKI5 (Kebon Jeruk) Jakarta Barat,72,128,35,24,21,50,128,PM25,TIDAK SEHAT


In [20]:
# gabungkan kedua dataset juni dan juli
dataset_juni_juli = pd.concat([dataset_juni, dataset_juli])

In [21]:
# cek dataset
dataset_juni_juli

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-06-01,DKI1 (Bunderan HI),59,83,22,18,19,35,83,PM25,SEDANG
1,2021-06-02,DKI1 (Bunderan HI),59,84,21,20,24,38,84,PM25,SEDANG
2,2021-06-03,DKI1 (Bunderan HI),54,76,22,20,17,41,76,PM25,SEDANG
3,2021-06-04,DKI1 (Bunderan HI),63,87,20,13,14,30,87,PM25,SEDANG
4,2021-06-05,DKI1 (Bunderan HI),59,79,23,20,19,38,79,PM25,SEDANG
...,...,...,...,...,...,...,...,...,...,...,...
150,2021-07-27,DKI5 (Kebon Jeruk) Jakarta Barat,64,110,32,13,29,35,110,PM25,TIDAK SEHAT
151,2021-07-28,DKI5 (Kebon Jeruk) Jakarta Barat,70,130,33,17,28,45,130,PM25,TIDAK SEHAT
152,2021-07-29,DKI5 (Kebon Jeruk) Jakarta Barat,78,140,32,18,29,39,140,PM25,TIDAK SEHAT
153,2021-07-30,DKI5 (Kebon Jeruk) Jakarta Barat,75,121,37,12,50,21,121,PM25,TIDAK SEHAT


In [22]:
# cek kembali tipe datanya
dataset_juni_juli.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 305 entries, 0 to 154
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tanggal   305 non-null    object
 1   stasiun   305 non-null    object
 2   pm10      305 non-null    object
 3   pm25      305 non-null    object
 4   so2       305 non-null    object
 5   co        305 non-null    object
 6   o3        305 non-null    object
 7   no2       305 non-null    object
 8   max       305 non-null    int64 
 9   critical  304 non-null    object
 10  categori  305 non-null    object
dtypes: int64(1), object(10)
memory usage: 28.6+ KB


In [23]:
# terlihat ada kolom yang telah berubah dan terlihat lebih baik
# namun masih harus di selidiki lebih lanjut

### 3.5. Handing Column "Tanggal"

#### 3.5.1. Convert tipe data tanggal pada dataset juni juli ke datetime

In [24]:
# convert data teks ke datetime
dataset_juni_juli.tanggal = pd.to_datetime(dataset_juni_juli.tanggal)

In [25]:
# cek data tanggal yang telah di convert
sorted(dataset_juni_juli.tanggal.value_counts().index)

[Timestamp('2021-06-01 00:00:00'),
 Timestamp('2021-06-02 00:00:00'),
 Timestamp('2021-06-03 00:00:00'),
 Timestamp('2021-06-04 00:00:00'),
 Timestamp('2021-06-05 00:00:00'),
 Timestamp('2021-06-06 00:00:00'),
 Timestamp('2021-06-07 00:00:00'),
 Timestamp('2021-06-08 00:00:00'),
 Timestamp('2021-06-09 00:00:00'),
 Timestamp('2021-06-10 00:00:00'),
 Timestamp('2021-06-11 00:00:00'),
 Timestamp('2021-06-12 00:00:00'),
 Timestamp('2021-06-13 00:00:00'),
 Timestamp('2021-06-14 00:00:00'),
 Timestamp('2021-06-15 00:00:00'),
 Timestamp('2021-06-16 00:00:00'),
 Timestamp('2021-06-17 00:00:00'),
 Timestamp('2021-06-18 00:00:00'),
 Timestamp('2021-06-19 00:00:00'),
 Timestamp('2021-06-20 00:00:00'),
 Timestamp('2021-06-21 00:00:00'),
 Timestamp('2021-06-22 00:00:00'),
 Timestamp('2021-06-23 00:00:00'),
 Timestamp('2021-06-24 00:00:00'),
 Timestamp('2021-06-25 00:00:00'),
 Timestamp('2021-06-26 00:00:00'),
 Timestamp('2021-06-27 00:00:00'),
 Timestamp('2021-06-28 00:00:00'),
 Timestamp('2021-06-

In [26]:
# cek tipe data tanggal yang telah di convert
dataset_juni_juli.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 305 entries, 0 to 154
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   305 non-null    datetime64[ns]
 1   stasiun   305 non-null    object        
 2   pm10      305 non-null    object        
 3   pm25      305 non-null    object        
 4   so2       305 non-null    object        
 5   co        305 non-null    object        
 6   o3        305 non-null    object        
 7   no2       305 non-null    object        
 8   max       305 non-null    int64         
 9   critical  304 non-null    object        
 10  categori  305 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(9)
memory usage: 28.6+ KB


#### 3.5.2. Casting kolom tanggal pada dataset utama

In [27]:
# casting tipe data ke datetime
dataset.tanggal = pd.to_datetime(dataset.tanggal)

In [28]:
# cek tipe data untuk kolom tanggal
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1070 entries, 0 to 154
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   765 non-null    object        
 2   pm10      1070 non-null   object        
 3   pm25      1070 non-null   object        
 4   so2       1070 non-null   object        
 5   co        1070 non-null   object        
 6   o3        1070 non-null   object        
 7   no2       1070 non-null   object        
 8   max       1070 non-null   object        
 9   critical  1062 non-null   object        
 10  categori  1068 non-null   object        
 11  location  305 non-null    object        
dtypes: datetime64[ns](1), object(11)
memory usage: 108.7+ KB


#### 3.5.3. Replace data bulan Juni dan Juli di dataset utama

In [29]:
# cek dataset shape
dataset.shape, dataset_juni_juli.shape

((1070, 12), (305, 11))

In [30]:
# set tanggal menjadi index agar gampang untuk dropingnya pada dataset utama
dataset.set_index("tanggal", inplace = True)
dataset_juni_juli.set_index("tanggal", inplace = True)

In [31]:
# drop row untuk bulan juni dan juli pada dataset utama berdasarkan data datetime pada dataset_juni_juli
dataset.drop(index = dataset_juni_juli.index.unique(), inplace = True)

In [32]:
# cek dataset shape
# length dataset utama telah berkurang 305 menjadi 765
dataset.shape, dataset_juni_juli.shape

((765, 11), (305, 10))

In [33]:
# gabung dataset utama dengan dataset bulan juni juli
dataset = pd.concat([dataset, dataset_juni_juli])

In [34]:
# cek dataset shape
# length dataset utama telah kembali menjadi 1070
dataset.shape, dataset_juni_juli.shape

((1070, 11), (305, 10))

In [35]:
# kembalikan tanggal menjadi kolom
dataset.reset_index(inplace = True)

In [36]:
# cek kembali tipe data
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   object        
 3   pm25      1070 non-null   object        
 4   so2       1070 non-null   object        
 5   co        1070 non-null   object        
 6   o3        1070 non-null   object        
 7   no2       1070 non-null   object        
 8   max       1070 non-null   object        
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
 11  location  0 non-null      object        
dtypes: datetime64[ns](1), object(11)
memory usage: 100.4+ KB


In [37]:
# cek dataset
dataset

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
0,2021-09-01,DKI1 (Bunderan HI),63,88,29,15,24,38,88,PM25,SEDANG,
1,2021-09-02,DKI1 (Bunderan HI),60,83,29,11,30,28,83,PM25,SEDANG,
2,2021-09-03,DKI1 (Bunderan HI),60,82,27,11,37,30,82,PM25,SEDANG,
3,2021-09-04,DKI1 (Bunderan HI),58,77,26,10,31,28,77,PM25,SEDANG,
4,2021-09-05,DKI1 (Bunderan HI),63,85,27,11,28,28,85,PM25,SEDANG,
...,...,...,...,...,...,...,...,...,...,...,...,...
1065,2021-07-27,DKI5 (Kebon Jeruk) Jakarta Barat,64,110,32,13,29,35,110,PM25,TIDAK SEHAT,
1066,2021-07-28,DKI5 (Kebon Jeruk) Jakarta Barat,70,130,33,17,28,45,130,PM25,TIDAK SEHAT,
1067,2021-07-29,DKI5 (Kebon Jeruk) Jakarta Barat,78,140,32,18,29,39,140,PM25,TIDAK SEHAT,
1068,2021-07-30,DKI5 (Kebon Jeruk) Jakarta Barat,75,121,37,12,50,21,121,PM25,TIDAK SEHAT,


In [38]:
# drop kolom categori yang telah tidak dibutuhkan
dataset.drop(columns = "location", inplace = True)

### 3.6. Handling Column "PM10"

In [39]:
# terlihat tidak ada masalah pada kolom pm10
dataset

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-09-01,DKI1 (Bunderan HI),63,88,29,15,24,38,88,PM25,SEDANG
1,2021-09-02,DKI1 (Bunderan HI),60,83,29,11,30,28,83,PM25,SEDANG
2,2021-09-03,DKI1 (Bunderan HI),60,82,27,11,37,30,82,PM25,SEDANG
3,2021-09-04,DKI1 (Bunderan HI),58,77,26,10,31,28,77,PM25,SEDANG
4,2021-09-05,DKI1 (Bunderan HI),63,85,27,11,28,28,85,PM25,SEDANG
...,...,...,...,...,...,...,...,...,...,...,...
1065,2021-07-27,DKI5 (Kebon Jeruk) Jakarta Barat,64,110,32,13,29,35,110,PM25,TIDAK SEHAT
1066,2021-07-28,DKI5 (Kebon Jeruk) Jakarta Barat,70,130,33,17,28,45,130,PM25,TIDAK SEHAT
1067,2021-07-29,DKI5 (Kebon Jeruk) Jakarta Barat,78,140,32,18,29,39,140,PM25,TIDAK SEHAT
1068,2021-07-30,DKI5 (Kebon Jeruk) Jakarta Barat,75,121,37,12,50,21,121,PM25,TIDAK SEHAT


In [40]:
# namun kolom pm10 bukanlah bertipe integer
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   object        
 3   pm25      1070 non-null   object        
 4   so2       1070 non-null   object        
 5   co        1070 non-null   object        
 6   o3        1070 non-null   object        
 7   no2       1070 non-null   object        
 8   max       1070 non-null   object        
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
dtypes: datetime64[ns](1), object(10)
memory usage: 92.1+ KB


In [41]:
# terdapat beberapa data yang sebenarnya numerik namun bertipe teks serta ada beberapa data yang memang benar berupa teks ("---")
dataset.pm10.to_list()

['63',
 '60',
 '60',
 '58',
 '63',
 '68',
 '52',
 '53',
 '45',
 '51',
 '66',
 '61',
 '51',
 '31',
 '54',
 '48',
 '42',
 '50',
 '53',
 '51',
 '55',
 '57',
 '54',
 '57',
 '49',
 '52',
 '51',
 '45',
 '54',
 '63',
 '79',
 '66',
 '66',
 '62',
 '74',
 '66',
 '58',
 '64',
 '68',
 '58',
 '70',
 '66',
 '56',
 '36',
 '69',
 '52',
 '46',
 '56',
 '52',
 '58',
 '61',
 '64',
 '61',
 '57',
 '55',
 '56',
 '53',
 '49',
 '54',
 '71',
 '57',
 '65',
 '56',
 '53',
 '61',
 '73',
 '60',
 '60',
 '48',
 '49',
 '61',
 '55',
 '51',
 '32',
 '53',
 '51',
 '47',
 '55',
 '56',
 '45',
 '54',
 '57',
 '57',
 '55',
 '54',
 '54',
 '48',
 '46',
 '61',
 '67',
 '68',
 '65',
 '62',
 '57',
 '69',
 '62',
 '56',
 '63',
 '52',
 '45',
 '59',
 '63',
 '54',
 '36',
 '60',
 '53',
 '50',
 '57',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '64',
 '62',
 '56',
 '58',
 '57',
 '65',
 '56',
 '51',
 '47',
 '46',
 '60',
 '58',
 '51',
 '26',
 '53',
 '38',
 '35',
 '46',
 '53',
 '45',
 '47',
 

In [42]:
# replace data teks dengan -1
dataset.pm10 = dataset.pm10.replace("---", -1).astype(int)

In [43]:
# cek tipe data
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   int32         
 3   pm25      1070 non-null   object        
 4   so2       1070 non-null   object        
 5   co        1070 non-null   object        
 6   o3        1070 non-null   object        
 7   no2       1070 non-null   object        
 8   max       1070 non-null   object        
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
dtypes: datetime64[ns](1), int32(1), object(9)
memory usage: 87.9+ KB


### 3.7. Handling Column "PM25"

In [47]:
# cek tipe data
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   int32         
 3   pm25      1070 non-null   object        
 4   so2       1070 non-null   object        
 5   co        1070 non-null   object        
 6   o3        1070 non-null   object        
 7   no2       1070 non-null   object        
 8   max       1070 non-null   object        
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
dtypes: datetime64[ns](1), int32(1), object(9)
memory usage: 87.9+ KB


In [48]:
# cek janis data
dataset.pm25.to_list()

['88',
 '83',
 '82',
 '77',
 '85',
 '92',
 '65',
 '76',
 '62',
 '66',
 '93',
 '80',
 '64',
 '42',
 '81',
 '65',
 '66',
 '73',
 '78',
 '69',
 '81',
 '80',
 '80',
 '74',
 '67',
 '72',
 '72',
 '63',
 '76',
 '94',
 '102',
 '89',
 '87',
 '79',
 '100',
 '87',
 '72',
 '87',
 '88',
 '77',
 '99',
 '82',
 '71',
 '47',
 '95',
 '66',
 '66',
 '79',
 '73',
 '81',
 '85',
 '82',
 '85',
 '76',
 '81',
 '74',
 '77',
 '71',
 '81',
 '105',
 '83',
 '99',
 '86',
 '77',
 '93',
 '108',
 '84',
 '82',
 '61',
 '71',
 '89',
 '78',
 '65',
 '51',
 '81',
 '72',
 '72',
 '80',
 '81',
 '59',
 '78',
 '78',
 '82',
 '77',
 '75',
 '74',
 '71',
 '---',
 '26',
 '27',
 '118',
 '115',
 '111',
 '93',
 '120',
 '109',
 '86',
 '106',
 '82',
 '80',
 '109',
 '111',
 '88',
 '61',
 '110',
 '89',
 '85',
 '101',
 '---',
 '---',
 '99',
 '119',
 '116',
 '109',
 '102',
 '98',
 '89',
 '69',
 '102',
 '117',
 '95',
 '101',
 '92',
 '84',
 '89',
 '95',
 '77',
 '78',
 '69',
 '73',
 '98',
 '84',
 '70',
 '37',
 '79',
 '59',
 '60',
 '69',
 '83',
 '6

In [49]:
# replace data teks ("---") dengan nilai -1
dataset.pm25 = dataset.pm25.replace("---", -1).astype(int)

In [50]:
# cek tipe data
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   int32         
 3   pm25      1070 non-null   int32         
 4   so2       1070 non-null   object        
 5   co        1070 non-null   object        
 6   o3        1070 non-null   object        
 7   no2       1070 non-null   object        
 8   max       1070 non-null   object        
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
dtypes: datetime64[ns](1), int32(2), object(8)
memory usage: 83.7+ KB


### 3.8. Handling Column "SO2"

In [51]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   int32         
 3   pm25      1070 non-null   int32         
 4   so2       1070 non-null   object        
 5   co        1070 non-null   object        
 6   o3        1070 non-null   object        
 7   no2       1070 non-null   object        
 8   max       1070 non-null   object        
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
dtypes: datetime64[ns](1), int32(2), object(8)
memory usage: 83.7+ KB


In [52]:
dataset.so2.to_list()

['29',
 '29',
 '27',
 '26',
 '27',
 '31',
 '28',
 '25',
 '25',
 '27',
 '32',
 '26',
 '29',
 '24',
 '29',
 '29',
 '29',
 '29',
 '26',
 '27',
 '28',
 '29',
 '29',
 '34',
 '31',
 '28',
 '27',
 '25',
 '26',
 '27',
 '55',
 '52',
 '51',
 '50',
 '52',
 '51',
 '51',
 '52',
 '54',
 '53',
 '53',
 '51',
 '52',
 '50',
 '54',
 '52',
 '51',
 '53',
 '50',
 '52',
 '52',
 '54',
 '53',
 '53',
 '52',
 '---',
 '---',
 '---',
 '57',
 '63',
 '46',
 '46',
 '45',
 '45',
 '46',
 '48',
 '48',
 '48',
 '46',
 '49',
 '50',
 '47',
 '47',
 '46',
 '50',
 '50',
 '49',
 '50',
 '48',
 '49',
 '50',
 '49',
 '51',
 '52',
 '50',
 '50',
 '49',
 '49',
 '49',
 '49',
 '40',
 '39',
 '42',
 '42',
 '43',
 '42',
 '42',
 '40',
 '41',
 '42',
 '43',
 '41',
 '42',
 '41',
 '42',
 '42',
 '42',
 '43',
 '---',
 '---',
 '44',
 '44',
 '43',
 '46',
 '45',
 '45',
 '42',
 '41',
 '45',
 '45',
 '30',
 '28',
 '7',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',
 '---',

In [53]:
dataset.so2 = dataset.so2.replace("---", -1).astype(int)

In [54]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   int32         
 3   pm25      1070 non-null   int32         
 4   so2       1070 non-null   int32         
 5   co        1070 non-null   object        
 6   o3        1070 non-null   object        
 7   no2       1070 non-null   object        
 8   max       1070 non-null   object        
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
dtypes: datetime64[ns](1), int32(3), object(7)
memory usage: 79.5+ KB


### 3.9. Handling Column "CO"

In [55]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   int32         
 3   pm25      1070 non-null   int32         
 4   so2       1070 non-null   int32         
 5   co        1070 non-null   object        
 6   o3        1070 non-null   object        
 7   no2       1070 non-null   object        
 8   max       1070 non-null   object        
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
dtypes: datetime64[ns](1), int32(3), object(7)
memory usage: 79.5+ KB


In [56]:
dataset.co.to_list()

['15',
 '11',
 '11',
 '10',
 '11',
 '11',
 '9',
 '16',
 '12',
 '10',
 '12',
 '10',
 '8',
 '8',
 '20',
 '9',
 '7',
 '8',
 '8',
 '10',
 '11',
 '11',
 '8',
 '8',
 '7',
 '6',
 '12',
 '9',
 '12',
 '13',
 '14',
 '10',
 '9',
 '9',
 '12',
 '10',
 '9',
 '13',
 '12',
 '9',
 '11',
 '9',
 '9',
 '8',
 '17',
 '8',
 '7',
 '8',
 '8',
 '11',
 '10',
 '10',
 '8',
 '8',
 '8',
 '7',
 '11',
 '8',
 '10',
 '12',
 '13',
 '15',
 '9',
 '9',
 '11',
 '13',
 '11',
 '12',
 '9',
 '9',
 '10',
 '9',
 '8',
 '7',
 '17',
 '10',
 '7',
 '9',
 '10',
 '7',
 '9',
 '14',
 '10',
 '8',
 '11',
 '10',
 '12',
 '10',
 '13',
 '16',
 '16',
 '12',
 '10',
 '9',
 '12',
 '9',
 '8',
 '15',
 '12',
 '8',
 '10',
 '10',
 '7',
 '6',
 '19',
 '7',
 '6',
 '7',
 '---',
 '---',
 '9',
 '10',
 '9',
 '8',
 '7',
 '7',
 '11',
 '6',
 '13',
 '13',
 '12',
 '9',
 '6',
 '6',
 '7',
 '8',
 '6',
 '8',
 '8',
 '6',
 '8',
 '7',
 '6',
 '3',
 '14',
 '4',
 '3',
 '5',
 '7',
 '5',
 '6',
 '7',
 '6',
 '6',
 '4',
 '4',
 '9',
 '5',
 '11',
 '10',
 '11',
 '11',
 '10',
 '11',
 

In [57]:
dataset.co = dataset.co.replace("---", -1).astype(int)

In [58]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   int32         
 3   pm25      1070 non-null   int32         
 4   so2       1070 non-null   int32         
 5   co        1070 non-null   int32         
 6   o3        1070 non-null   object        
 7   no2       1070 non-null   object        
 8   max       1070 non-null   object        
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
dtypes: datetime64[ns](1), int32(4), object(6)
memory usage: 75.4+ KB


### 3.10. Handling Column "O3"

In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   int32         
 3   pm25      1070 non-null   int32         
 4   so2       1070 non-null   object        
 5   co        1070 non-null   object        
 6   o3        1070 non-null   object        
 7   no2       1070 non-null   object        
 8   max       1070 non-null   object        
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
dtypes: datetime64[ns](1), int32(2), object(8)
memory usage: 83.7+ KB


In [60]:
dataset.o3.to_list()

['24',
 '30',
 '37',
 '31',
 '28',
 '26',
 '19',
 '23',
 '27',
 '25',
 '34',
 '28',
 '23',
 '20',
 '20',
 '23',
 '26',
 '23',
 '33',
 '25',
 '29',
 '24',
 '27',
 '20',
 '24',
 '28',
 '22',
 '24',
 '29',
 '38',
 '30',
 '39',
 '39',
 '37',
 '35',
 '34',
 '27',
 '27',
 '35',
 '32',
 '46',
 '36',
 '27',
 '47',
 '45',
 '52',
 '53',
 '51',
 '68',
 '54',
 '58',
 '48',
 '53',
 '47',
 '50',
 '54',
 '47',
 '49',
 '63',
 '63',
 '32',
 '29',
 '39',
 '30',
 '36',
 '29',
 '26',
 '29',
 '31',
 '36',
 '43',
 '30',
 '26',
 '21',
 '24',
 '27',
 '29',
 '27',
 '31',
 '33',
 '35',
 '26',
 '29',
 '27',
 '22',
 '27',
 '26',
 '32',
 '31',
 '40',
 '26',
 '26',
 '34',
 '31',
 '27',
 '29',
 '21',
 '29',
 '27',
 '26',
 '33',
 '24',
 '18',
 '19',
 '18',
 '28',
 '29',
 '23',
 '---',
 '---',
 '32',
 '22',
 '24',
 '21',
 '26',
 '33',
 '29',
 '38',
 '38',
 '40',
 '28',
 '25',
 '45',
 '37',
 '36',
 '27',
 '19',
 '28',
 '32',
 '39',
 '45',
 '29',
 '20',
 '17',
 '22',
 '23',
 '25',
 '27',
 '28',
 '30',
 '34',
 '25',
 '27

In [61]:
dataset.o3 = dataset.o3.replace("---", -1).astype(int)

In [62]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   int32         
 3   pm25      1070 non-null   int32         
 4   so2       1070 non-null   int32         
 5   co        1070 non-null   int32         
 6   o3        1070 non-null   int32         
 7   no2       1070 non-null   object        
 8   max       1070 non-null   object        
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
dtypes: datetime64[ns](1), int32(5), object(5)
memory usage: 71.2+ KB


### 3.11. Handling Column "NO2"

In [63]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   int32         
 3   pm25      1070 non-null   int32         
 4   so2       1070 non-null   int32         
 5   co        1070 non-null   int32         
 6   o3        1070 non-null   int32         
 7   no2       1070 non-null   object        
 8   max       1070 non-null   object        
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
dtypes: datetime64[ns](1), int32(5), object(5)
memory usage: 71.2+ KB


In [64]:
dataset.no2.to_list()

['38',
 '28',
 '30',
 '28',
 '28',
 '24',
 '24',
 '34',
 '31',
 '32',
 '41',
 '27',
 '21',
 '19',
 '45',
 '24',
 '20',
 '22',
 '19',
 '34',
 '36',
 '32',
 '22',
 '26',
 '20',
 '19',
 '33',
 '28',
 '38',
 '37',
 '29',
 '19',
 '18',
 '18',
 '22',
 '13',
 '16',
 '31',
 '29',
 '20',
 '22',
 '12',
 '14',
 '15',
 '35',
 '15',
 '9',
 '15',
 '10',
 '25',
 '19',
 '22',
 '13',
 '13',
 '13',
 '6',
 '17',
 '13',
 '16',
 '24',
 '14',
 '16',
 '13',
 '13',
 '14',
 '15',
 '15',
 '15',
 '12',
 '15',
 '16',
 '13',
 '11',
 '11',
 '22',
 '15',
 '13',
 '15',
 '12',
 '10',
 '15',
 '18',
 '16',
 '15',
 '16',
 '13',
 '16',
 '13',
 '22',
 '19',
 '27',
 '25',
 '23',
 '19',
 '26',
 '16',
 '21',
 '25',
 '24',
 '19',
 '30',
 '19',
 '16',
 '15',
 '23',
 '15',
 '14',
 '15',
 '---',
 '---',
 '20',
 '21',
 '18',
 '18',
 '16',
 '12',
 '19',
 '15',
 '24',
 '27',
 '26',
 '18',
 '9',
 '11',
 '11',
 '13',
 '17',
 '15',
 '15',
 '13',
 '16',
 '12',
 '10',
 '8',
 '26',
 '14',
 '12',
 '12',
 '10',
 '10',
 '13',
 '16',
 '14',
 

In [65]:
dataset.no2 = dataset.no2.replace("---", -1).astype(int)

In [66]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   int32         
 3   pm25      1070 non-null   int32         
 4   so2       1070 non-null   int32         
 5   co        1070 non-null   int32         
 6   o3        1070 non-null   int32         
 7   no2       1070 non-null   int32         
 8   max       1070 non-null   object        
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
dtypes: datetime64[ns](1), int32(6), object(4)
memory usage: 67.0+ KB


### 3.12. Handling Column "Max"

In [67]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   int32         
 3   pm25      1070 non-null   int32         
 4   so2       1070 non-null   int32         
 5   co        1070 non-null   int32         
 6   o3        1070 non-null   int32         
 7   no2       1070 non-null   int32         
 8   max       1070 non-null   object        
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
dtypes: datetime64[ns](1), int32(6), object(4)
memory usage: 67.0+ KB


In [69]:
dataset["max"].to_list()

[88,
 83,
 82,
 77,
 85,
 92,
 65,
 76,
 62,
 66,
 93,
 80,
 64,
 42,
 81,
 65,
 66,
 73,
 78,
 69,
 81,
 80,
 80,
 74,
 67,
 72,
 72,
 63,
 76,
 94,
 102,
 89,
 87,
 79,
 100,
 87,
 72,
 87,
 88,
 77,
 99,
 82,
 71,
 50,
 95,
 66,
 66,
 79,
 73,
 81,
 85,
 82,
 85,
 76,
 81,
 74,
 77,
 71,
 81,
 105,
 83,
 99,
 86,
 77,
 93,
 108,
 84,
 82,
 61,
 71,
 89,
 78,
 65,
 51,
 81,
 72,
 72,
 80,
 81,
 59,
 78,
 78,
 82,
 77,
 75,
 74,
 71,
 49,
 61,
 67,
 118,
 115,
 111,
 93,
 120,
 109,
 86,
 106,
 82,
 80,
 109,
 111,
 88,
 61,
 110,
 89,
 85,
 101,
 0,
 0,
 99,
 119,
 116,
 109,
 102,
 98,
 89,
 69,
 102,
 117,
 95,
 101,
 92,
 84,
 89,
 95,
 77,
 78,
 69,
 73,
 98,
 84,
 70,
 37,
 79,
 59,
 60,
 69,
 83,
 65,
 77,
 78,
 77,
 73,
 64,
 67,
 78,
 64,
 87,
 108,
 81,
 99,
 85,
 82,
 76,
 83,
 62,
 77,
 69,
 83,
 80,
 76,
 67,
 90,
 106,
 95,
 93,
 55,
 58,
 53,
 65,
 95,
 83,
 85,
 95,
 78,
 76,
 69,
 64,
 68,
 70,
 84,
 103,
 86,
 95,
 90,
 91,
 64,
 85,
 77,
 83,
 100,
 105,
 76,
 105,


In [73]:
dataset["max"] = dataset["max"].replace("PM25", -1).astype(int)

In [74]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1070 non-null   datetime64[ns]
 1   stasiun   1070 non-null   object        
 2   pm10      1070 non-null   int32         
 3   pm25      1070 non-null   int32         
 4   so2       1070 non-null   int32         
 5   co        1070 non-null   int32         
 6   o3        1070 non-null   int32         
 7   no2       1070 non-null   int32         
 8   max       1070 non-null   int32         
 9   critical  1061 non-null   object        
 10  categori  1069 non-null   object        
dtypes: datetime64[ns](1), int32(7), object(3)
memory usage: 62.8+ KB


### 3.13. Handling Column "Critical"

In [77]:
dataset.critical.value_counts()

PM25    1005
PM10      25
SO2       21
O3         9
BAIK       1
Name: critical, dtype: int64

In [78]:
dataset[dataset.critical == "BAIK"]

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
457,2021-12-03,DKI1 (Bunderan HI),49,31,9,19,7,49,-1,BAIK,


In [86]:
dataset.drop(index = 457, inplace = True)

In [87]:
dataset.loc[457]

KeyError: 457

### 3.14. Handling Column "Categori"

In [89]:
dataset.categori.value_counts()

SEDANG            791
TIDAK SEHAT       207
BAIK               62
TIDAK ADA DATA      9
Name: categori, dtype: int64

In [93]:
dataset.drop(index = dataset[dataset.categori == "TIDAK ADA DATA"].index, inplace = True)

In [94]:
dataset.categori.value_counts()

SEDANG         791
TIDAK SEHAT    207
BAIK            62
Name: categori, dtype: int64

In [96]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1060 entries, 0 to 1069
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1060 non-null   datetime64[ns]
 1   stasiun   1060 non-null   object        
 2   pm10      1060 non-null   int32         
 3   pm25      1060 non-null   int32         
 4   so2       1060 non-null   int32         
 5   co        1060 non-null   int32         
 6   o3        1060 non-null   int32         
 7   no2       1060 non-null   int32         
 8   max       1060 non-null   int32         
 9   critical  1060 non-null   object        
 10  categori  1060 non-null   object        
dtypes: datetime64[ns](1), int32(7), object(3)
memory usage: 70.4+ KB


In [97]:
joblib.dump(dataset, "../data/processed/dataset_clean.pkl")

['../data/processed/dataset_clean.pkl']

## 4. Data Defense

In [220]:
def check_data(input_data, params):
    # check data types
    assert input_data.select_dtypes("datetime").columns.to_list() == params["datetime_columns"], "an error occurs in datetime column(s)."
    assert input_data.select_dtypes("object").columns.to_list() == params["object_columns"], "an error occurs in object column(s)."
    assert input_data.select_dtypes("int32").columns.to_list() == params["int32_columns"], "an error occurs in int32 column(s)."

    # check range of data
    assert set(input_data.stasiun).issubset(set(params["range_stasiun"])), "an error occurs in stasiun range."
    assert input_data.pm10.between(params["range_pm10"][0], params["range_pm10"][1]).sum() == len(input_data), "an error occurs in pm10 range."
    assert input_data.pm25.between(params["range_pm25"][0], params["range_pm25"][1]).sum() == len(input_data), "an error occurs in pm25 range."
    assert input_data.so2.between(params["range_so2"][0], params["range_so2"][1]).sum() == len(input_data), "an error occurs in so2 range."
    assert input_data.co.between(params["range_co"][0], params["range_co"][1]).sum() == len(input_data), "an error occurs in co range."
    assert input_data.o3.between(params["range_o3"][0], params["range_o3"][1]).sum() == len(input_data), "an error occurs in o3 range."
    assert input_data.no2.between(params["range_no2"][0], params["range_no2"][1]).sum() == len(input_data), "an error occurs in no2 range."

In [245]:
check_data(dataset, params)

## 5. Data Splitting

In [249]:
x = dataset[params["predictors"]].copy()
y = dataset.categori.copy()

In [266]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 42, stratify = y)

In [269]:
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size = 0.5, random_state = 42, stratify = y_test)

In [271]:
joblib.dump(x_train, "../data/processed/x_train.pkl")
joblib.dump(y_train, "../data/processed/y_train.pkl")
joblib.dump(x_valid, "../data/processed/x_valid.pkl")
joblib.dump(y_valid, "../data/processed/y_valid.pkl")
joblib.dump(x_test, "../data/processed/x_test.pkl")
joblib.dump(y_test, "../data/processed/y_test.pkl")

['../data/processed/y_test.pkl']