# 5. Exploratory Data Analysis

## 5.1. Importing required libraries

In [1]:
import os
from typing import Any

import joblib
import numpy as np
import pandas as pd
import yaml
from sklearn.model_selection import train_test_split

## 5.2. Load config file

In [9]:
# Location of the YAML configuration file. The path is relative, so it is
# resolved against the current working directory of the running kernel.
path = "../config/config.yaml"

In [10]:
def load_yaml_file(path: str) -> dict:
    '''
    Brief:
    Load a single YAML file and return its parsed content.

    The file is decoded explicitly as UTF-8 so the result does not depend
    on the platform's default locale encoding.

    Params:
    1.  path
        String, location of the target YAML file.

    Return:
    1.  data
        Dict, YAML data in the form of a dictionary. The value is whatever
        yaml.safe_load produces for the document's top-level node: a
        mapping yields a dict, while an empty file yields None.
    '''
    # safe_load only builds plain Python objects (no arbitrary object
    # construction), which is the appropriate loader for a config file.
    with open(path, 'r', encoding='utf-8') as file:
        data = yaml.safe_load(file)

    return data

In [11]:
# Parse the configuration file; `params` holds the loaded settings that the
# following cells read (e.g. the dataset location).
params = load_yaml_file(path)

In [12]:
# Display the loaded configuration to verify its content.
params

{'combined_raw_dataset_dir': '../data/processed/dataset.pkl',
 'datetime_columns': ['tanggal'],
 'int32_columns': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'object_columns': ['stasiun', 'critical', 'categori'],
 'predictors': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'label': 'categori',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'label_categories_new': ['BAIK', 'TIDAK BAIK'],
 'missing_value_co': 11,
 'missing_value_no2': 18,
 'missing_value_o3': 29,
 'missing_value_pm10': {'BAIK': 28, 'TIDAK BAIK': 55},
 'missing_value_pm25': {'BAIK': 38, 'TIDAK BAIK': 82},
 'missing_value_so2': 35,
 'range_co': [-1, 100],
 'range_no2': [-1, 100],
 'range_o3': [-1, 160],
 'range_pm10': [-1, 800],
 'range_pm25': [-1, 400],
 'range_so2': [-1, 500],
 'range_stasiun': ['DKI1 (Bunderan HI)',
  'DKI2 (Kelapa Gading)',
  'DKI3 (Jagakarsa)',
  'DKI4 (Lubang Buaya)',
  'DKI5 (Kebon Jeruk) Jakarta Barat']}

## 5.3. Data definition

## 5.4. Data validation

#### 5.4.1. Load the combined dataset

In [20]:
def load_pickle(path: str) -> Any:
    '''
    Brief:
    Load a single pickle file using joblib.

    Params:
    1.  path
        String, location of the target pickle file.

    Return:
    1.  pickle_data
        Any, loaded data. It could be any type of data, depending on what
        was stored in the file.
    '''
    # NOTE: joblib.load unpickles the file, and unpickling can execute
    # arbitrary code. Only call this on files from a trusted source.
    return joblib.load(path)

In [21]:
# Load the combined raw dataset from the pickle path stored in the config
# under "combined_raw_dataset_dir". Presumably a pandas DataFrame, given the
# .dtypes / .describe() calls below -- confirm against the producing step.
dataset = load_pickle(params["combined_raw_dataset_dir"])

In [22]:
# Display the loaded dataset for a quick visual check.
dataset

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
0,2021-08-01,DKI1 (Bunderan HI),51,68,25,8,29,22,68,PM25,SEDANG,
1,2021-08-02,DKI1 (Bunderan HI),47,63,24,10,25,28,63,PM25,SEDANG,
2,2021-08-03,DKI1 (Bunderan HI),50,68,26,11,19,35,68,PM25,SEDANG,
3,2021-08-04,DKI1 (Bunderan HI),52,70,29,8,24,26,70,PM25,SEDANG,
4,2021-08-05,DKI1 (Bunderan HI),52,66,29,9,21,27,66,PM25,SEDANG,
...,...,...,...,...,...,...,...,...,...,...,...,...
1065,2021-09-26,DKI5 (Kebon Jeruk) Jakarta Barat,48,67,---,4,30,9,67,PM25,SEDANG,
1066,2021-09-27,DKI5 (Kebon Jeruk) Jakarta Barat,51,78,---,9,22,18,78,PM25,SEDANG,
1067,2021-09-28,DKI5 (Kebon Jeruk) Jakarta Barat,42,64,---,5,26,14,64,PM25,SEDANG,
1068,2021-09-29,DKI5 (Kebon Jeruk) Jakarta Barat,56,87,---,11,34,19,87,PM25,SEDANG,


#### 5.4.2. Checking data type

In [23]:
# Inspect the data type of every column in the loaded dataset.
dataset.dtypes

tanggal     object
stasiun     object
pm10        object
pm25        object
so2         object
co          object
o3          object
no2         object
max         object
critical    object
categori    object
location    object
dtype: object

#### 5.4.3. Checking data range

In [24]:
# Summarise each column with describe() as a first look at the value ranges.
dataset.describe()

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori,location
count,1070,765,1070,1070,1070,1070,1070,1070,1070,1062,1068,305
unique,214,5,112,180,175,76,108,100,206,96,8,4
top,2021-08-01,DKI1 (Bunderan HI),DKI1 (Bunderan HI),71,---,10,13,15,77,PM25,SEDANG,SEDANG
freq,5,153,61,27,104,65,44,43,20,714,609,182
