# 4. Data Pipeline

## 4.1. Import needed libraries

In [39]:
import os
import joblib
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split

## 4.2. Loading dataset from files

### 4.2.1. Load single file dataset

In [3]:
# Read a single monthly CSV file into a DataFrame.
single_csv_path = "../data/raw/files/indeks-standar-pencemar-udara-di-spku-bulan-agustus-tahun-2021.csv"
dataset = pd.read_csv(single_csv_path)

In [4]:
# Preview the frame read from the single CSV file.
dataset

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-08-01,DKI1 (Bunderan HI),51,68,25,8,29,22,68,PM25,SEDANG
1,2021-08-02,DKI1 (Bunderan HI),47,63,24,10,25,28,63,PM25,SEDANG
2,2021-08-03,DKI1 (Bunderan HI),50,68,26,11,19,35,68,PM25,SEDANG
3,2021-08-04,DKI1 (Bunderan HI),52,70,29,8,24,26,70,PM25,SEDANG
4,2021-08-05,DKI1 (Bunderan HI),52,66,29,9,21,27,66,PM25,SEDANG
...,...,...,...,...,...,...,...,...,...,...,...
150,2021-08-27,DKI5 (Kebon Jeruk) Jakarta Barat,61,96,34,8,29,15,96,PM25,SEDANG
151,2021-08-28,DKI5 (Kebon Jeruk) Jakarta Barat,63,100,31,8,44,12,100,PM25,SEDANG
152,2021-08-29,DKI5 (Kebon Jeruk) Jakarta Barat,67,111,32,10,36,13,111,PM25,TIDAK SEHAT
153,2021-08-30,DKI5 (Kebon Jeruk) Jakarta Barat,83,126,35,16,32,29,126,PM25,TIDAK SEHAT


### 4.2.2. Load multiple files dataset

In [5]:
# Directory (relative to this notebook) that holds the raw monthly CSV files.
dataset_root_path = "../data/raw/files/"

In [6]:
# os.listdir returns every entry of the directory in arbitrary order, so keep
# only the CSV files and sort them to make the load order reproducible.
list_files = sorted(
    file_name
    for file_name in os.listdir(dataset_root_path)
    if file_name.lower().endswith(".csv")
)

In [7]:
# Show the file names found in the raw-data directory.
list_files

['indeks-standar-pencemar-udara-di-spku-bulan-agustus-tahun-2021.csv',
 'indeks-standar-pencemar-udara-di-spku-bulan-desember-tahun-2021.csv',
 'indeks-standar-pencemar-udara-di-spku-bulan-juli-tahun-2021.csv',
 'indeks-standar-pencemar-udara-di-spku-bulan-juni-tahun-2021.csv',
 'indeks-standar-pencemar-udara-di-spku-bulan-november-tahun-2021.csv',
 'indeks-standar-pencemar-udara-di-spku-bulan-oktober-tahun-2021.csv',
 'indeks-standar-pencemar-udara-di-spku-bulan-september-tahun-2021.csv']

In [8]:
# os.path.join adds a separator only when one is missing, so the paths stay
# valid even if dataset_root_path is written without its trailing slash.
list_dataset_path = [
    os.path.join(dataset_root_path, file_name) for file_name in list_files
]

In [9]:
# Show the relative paths that will be read with pd.read_csv.
list_dataset_path

['../data/raw/files/indeks-standar-pencemar-udara-di-spku-bulan-agustus-tahun-2021.csv',
 '../data/raw/files/indeks-standar-pencemar-udara-di-spku-bulan-desember-tahun-2021.csv',
 '../data/raw/files/indeks-standar-pencemar-udara-di-spku-bulan-juli-tahun-2021.csv',
 '../data/raw/files/indeks-standar-pencemar-udara-di-spku-bulan-juni-tahun-2021.csv',
 '../data/raw/files/indeks-standar-pencemar-udara-di-spku-bulan-november-tahun-2021.csv',
 '../data/raw/files/indeks-standar-pencemar-udara-di-spku-bulan-oktober-tahun-2021.csv',
 '../data/raw/files/indeks-standar-pencemar-udara-di-spku-bulan-september-tahun-2021.csv']

In [10]:
# Start from an empty DataFrame so `dataset` no longer refers to the
# single-file frame loaded earlier.
dataset = pd.DataFrame()

In [11]:
# Read every file first and concatenate once: calling pd.concat inside the
# loop re-copies the accumulated frame on each iteration.
# Each file keeps its own row labels, exactly as before (no ignore_index).
monthly_frames = [pd.read_csv(dataset_path) for dataset_path in list_dataset_path]
# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
dataset = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

In [12]:
# Inspect the combined frame. The row labels are the per-file labels kept by
# pd.concat, so they repeat across files rather than running 0..n-1.
dataset

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
0,2021-08-01,DKI1 (Bunderan HI),51,68,25,8,29,22,68,PM25,SEDANG,
1,2021-08-02,DKI1 (Bunderan HI),47,63,24,10,25,28,63,PM25,SEDANG,
2,2021-08-03,DKI1 (Bunderan HI),50,68,26,11,19,35,68,PM25,SEDANG,
3,2021-08-04,DKI1 (Bunderan HI),52,70,29,8,24,26,70,PM25,SEDANG,
4,2021-08-05,DKI1 (Bunderan HI),52,66,29,9,21,27,66,PM25,SEDANG,
...,...,...,...,...,...,...,...,...,...,...,...,...
145,2021-09-26,DKI5 (Kebon Jeruk) Jakarta Barat,48,67,---,4,30,9,67,PM25,SEDANG,
146,2021-09-27,DKI5 (Kebon Jeruk) Jakarta Barat,51,78,---,9,22,18,78,PM25,SEDANG,
147,2021-09-28,DKI5 (Kebon Jeruk) Jakarta Barat,42,64,---,5,26,14,64,PM25,SEDANG,
148,2021-09-29,DKI5 (Kebon Jeruk) Jakarta Barat,56,87,---,11,34,19,87,PM25,SEDANG,


## 4.3. Loading dataset from database

#### 4.3.1. Connect to database

In [13]:
# Open the SQLite database file that holds the raw data.
# Note: sqlite3.connect creates an empty database if the file does not exist,
# so a wrong path only shows up later as a missing-table error.
# NOTE(review): no cell visible here calls connection.close() — confirm whether
# the connection should be closed once loading is done.
connection = sqlite3.connect("../data/raw/database/dataset.db")

In [14]:
# Cursor used to execute SQL statements on the connection.
cursor = connection.cursor()

#### 4.3.2. Select data from database

In [15]:
# Run the query and fetch every row of the `ispu` table as a list in one step.
result = cursor.execute("SELECT * FROM ispu").fetchall()

In [16]:
# Build a DataFrame from the fetched rows. The rows carry no column names, so
# the frame gets default integer column labels at this point.
dataset = pd.DataFrame(result)

In [17]:
# Inspect the frame: the columns are still labelled 0..11 here.
dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,2021-08-01,DKI1 (Bunderan HI),51,68,25,8,29,22,68,PM25,SEDANG,
1,2021-08-02,DKI1 (Bunderan HI),47,63,24,10,25,28,63,PM25,SEDANG,
2,2021-08-03,DKI1 (Bunderan HI),50,68,26,11,19,35,68,PM25,SEDANG,
3,2021-08-04,DKI1 (Bunderan HI),52,70,29,8,24,26,70,PM25,SEDANG,
4,2021-08-05,DKI1 (Bunderan HI),52,66,29,9,21,27,66,PM25,SEDANG,
...,...,...,...,...,...,...,...,...,...,...,...,...
1065,2021-09-26,DKI5 (Kebon Jeruk) Jakarta Barat,48,67,---,4,30,9,67,PM25,SEDANG,
1066,2021-09-27,DKI5 (Kebon Jeruk) Jakarta Barat,51,78,---,9,22,18,78,PM25,SEDANG,
1067,2021-09-28,DKI5 (Kebon Jeruk) Jakarta Barat,42,64,---,5,26,14,64,PM25,SEDANG,
1068,2021-09-29,DKI5 (Kebon Jeruk) Jakarta Barat,56,87,---,11,34,19,87,PM25,SEDANG,


In [18]:
# Ask SQLite for the schema of the `ispu` table: one row per column.
result = cursor.execute("PRAGMA table_info(ispu)").fetchall()

In [19]:
# Show the schema rows; the second field of each row is the column name.
result

[(0, 'tanggal', '', 0, None, 0),
 (1, 'stasiun', '', 0, None, 0),
 (2, 'pm10', '', 0, None, 0),
 (3, 'pm25', '', 0, None, 0),
 (4, 'so2', '', 0, None, 0),
 (5, 'co', '', 0, None, 0),
 (6, 'o3', '', 0, None, 0),
 (7, 'no2', '', 0, None, 0),
 (8, 'max', '', 0, None, 0),
 (9, 'critical', '', 0, None, 0),
 (10, 'categori', '', 0, None, 0),
 (11, 'location', '', 0, None, 0)]

In [20]:
# Keep only the name field (second position) of every table_info row.
columns_name = [name for _cid, name, *_others in result]

In [21]:
# Replace the default integer column labels with the names read from the
# table schema.
dataset.columns = columns_name

In [22]:
# Confirm the columns now carry their real names.
dataset

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
0,2021-08-01,DKI1 (Bunderan HI),51,68,25,8,29,22,68,PM25,SEDANG,
1,2021-08-02,DKI1 (Bunderan HI),47,63,24,10,25,28,63,PM25,SEDANG,
2,2021-08-03,DKI1 (Bunderan HI),50,68,26,11,19,35,68,PM25,SEDANG,
3,2021-08-04,DKI1 (Bunderan HI),52,70,29,8,24,26,70,PM25,SEDANG,
4,2021-08-05,DKI1 (Bunderan HI),52,66,29,9,21,27,66,PM25,SEDANG,
...,...,...,...,...,...,...,...,...,...,...,...,...
1065,2021-09-26,DKI5 (Kebon Jeruk) Jakarta Barat,48,67,---,4,30,9,67,PM25,SEDANG,
1066,2021-09-27,DKI5 (Kebon Jeruk) Jakarta Barat,51,78,---,9,22,18,78,PM25,SEDANG,
1067,2021-09-28,DKI5 (Kebon Jeruk) Jakarta Barat,42,64,---,5,26,14,64,PM25,SEDANG,
1068,2021-09-29,DKI5 (Kebon Jeruk) Jakarta Barat,56,87,---,11,34,19,87,PM25,SEDANG,


## 4.4. Drop Duplicates

#### 4.4.1. Check for duplicates

In [23]:
# Count rows that exactly repeat an earlier row (all columns are compared).
dataset.duplicated().sum()

0

#### 4.4.2. Adding copies of existing rows to demonstrate dropping duplicates

In [24]:
# Copy every row recorded on 2021-08-01 and give the copy a fresh 0-based
# index; these rows are later appended as deliberate duplicates.
is_first_of_august = dataset["tanggal"] == "2021-08-01"
new_same_data = dataset.loc[is_first_of_august].reset_index(drop=True)

In [25]:
# Show the rows selected for 2021-08-01.
new_same_data

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
0,2021-08-01,DKI1 (Bunderan HI),51,68,25,8,29,22,68,PM25,SEDANG,
1,2021-08-01,DKI2 (Kelapa Gading),55,75,50,9,67,13,75,PM25,SEDANG,
2,2021-08-01,DKI3 (Jagakarsa),51,66,44,9,31,11,66,PM25,SEDANG,
3,2021-08-01,DKI4 (Lubang Buaya),51,86,39,8,30,22,86,PM25,SEDANG,
4,2021-08-01,DKI5 (Kebon Jeruk) Jakarta Barat,44,70,30,8,29,11,70,PM25,SEDANG,


In [26]:
# Append the copied rows to the end of the dataset so it now contains exact
# duplicates. Re-running this cell appends the rows again.
dataset = pd.concat([dataset, new_same_data])

In [27]:
# Rebuild a clean 0..n-1 row index after the append. Assigning the result back
# instead of using inplace=True keeps the step explicit and chainable.
dataset = dataset.reset_index(drop=True)

#### 4.4.3. Rechecking duplicate data

In [28]:
# keep=False flags every member of a duplicate group, not only the later
# repeats, so both the original rows and their copies are marked True.
duplicate_data = dataset.duplicated(keep=False)

In [29]:
# Show the boolean mask (True where a row has at least one exact copy).
duplicate_data

0        True
1       False
2       False
3       False
4       False
        ...  
1070     True
1071     True
1072     True
1073     True
1074     True
Length: 1075, dtype: bool

In [30]:
# A boolean Series can be used directly as a mask; comparing it with
# `== True` is redundant.
duplicate_data_index = duplicate_data[duplicate_data].index

In [31]:
# Show the row labels of all rows that take part in a duplicate group.
duplicate_data_index

Index([0, 31, 62, 93, 124, 1070, 1071, 1072, 1073, 1074], dtype='int64')

In [32]:
# Pull out the flagged rows and order them by station so that each original
# row appears next to its copy.
duplicated_rows = dataset.loc[duplicate_data_index]
duplicated_rows.sort_values(by="stasiun")

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
0,2021-08-01,DKI1 (Bunderan HI),51,68,25,8,29,22,68,PM25,SEDANG,
1070,2021-08-01,DKI1 (Bunderan HI),51,68,25,8,29,22,68,PM25,SEDANG,
31,2021-08-01,DKI2 (Kelapa Gading),55,75,50,9,67,13,75,PM25,SEDANG,
1071,2021-08-01,DKI2 (Kelapa Gading),55,75,50,9,67,13,75,PM25,SEDANG,
62,2021-08-01,DKI3 (Jagakarsa),51,66,44,9,31,11,66,PM25,SEDANG,
1072,2021-08-01,DKI3 (Jagakarsa),51,66,44,9,31,11,66,PM25,SEDANG,
93,2021-08-01,DKI4 (Lubang Buaya),51,86,39,8,30,22,86,PM25,SEDANG,
1073,2021-08-01,DKI4 (Lubang Buaya),51,86,39,8,30,22,86,PM25,SEDANG,
124,2021-08-01,DKI5 (Kebon Jeruk) Jakarta Barat,44,70,30,8,29,11,70,PM25,SEDANG,
1074,2021-08-01,DKI5 (Kebon Jeruk) Jakarta Barat,44,70,30,8,29,11,70,PM25,SEDANG,


In [33]:
# With the default keep="first", only the later repeats are counted, so this
# is the number of rows that would be dropped.
dataset.duplicated().sum()

5

#### 4.4.4. Dropping duplicates

In [34]:
# Keep the first occurrence of every row and drop the later repeats.
# Assigning the result back instead of using inplace=True keeps the step
# explicit and chainable.
dataset = dataset.drop_duplicates()

In [35]:
# Verify that no duplicated rows remain.
dataset.duplicated().sum()

0

## 4.5. Serialization

In [48]:
# joblib.dump does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs("../data/processed", exist_ok=True)
# Persist the cleaned frame; the returned list of written file names is shown
# as the cell output.
joblib.dump(dataset, "../data/processed/dataset.pkl")

['../data/processed/dataset.pkl']