In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as scs
import joblib
import yaml
import matplotlib.pyplot as plt

# 1. Import Params

In [21]:
# Relative path to the project's YAML configuration file.
params_dir: str = "../config/config.yaml"

In [22]:
def load_params(param_dir):
    """Read a YAML configuration file and return its parsed content.

    Parameters
    ----------
    param_dir : str or path-like
        Path to the YAML file to read.

    Returns
    -------
    object
        Whatever ``yaml.safe_load`` produces for the file (a dict for a
        mapping-style config such as the one used in this notebook).

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(param_dir, "r", encoding="utf-8") as config_file:
        return yaml.safe_load(config_file)

In [23]:
# Parse the configuration file into `params`.
params = load_params(param_dir=params_dir)

In [24]:
# Show the parsed configuration.
params

{'raw_dataset_dir': 'data/raw/',
 'train_set_path': ['data/processed/X_train.pkl',
  'data/processed/y_train.pkl'],
 'valid_set_path': ['data/processed/X_valid.pkl',
  'data/processed/y_valid.pkl'],
 'test_set_path': ['data/processed/X_test.pkl', 'data/processed/y_test.pkl'],
 'train_feng_set_path': ['data/processed/X_train_feng.pkl',
  'data/processed/y_train_feng.pkl'],
 'valid_feng_set_path': ['data/processed/X_valid_feng.pkl',
  'data/processed/y_valid_feng.pkl'],
 'test_feng_set_path': ['data/processed/X_test_feng.pkl',
  'data/processed/y_test_feng.pkl'],
 'raw_dataset_path': 'data/processed/raw_dataset.pkl',
 'cleaned_raw_dataset_path': 'data/processed/cleaned_raw_dataset.pkl',
 'production_model_path': 'models/production_model.pkl',
 'ohe_stasiun_path': 'models/ohe_stasiun.pkl',
 'le_encoder_path': 'models/le_encoder.pkl',
 'training_log_path': 'log/training_log.json',
 'print_debug': True,
 'datetime_columns': ['tanggal'],
 'int64_columns': ['pm10', 'pm25', 'so2', 'co', 'o3', 

# 2. Load Train Set

In [26]:
# Take the train-set locations from the loaded config instead of repeating
# them as literals. The config paths are prefixed with "../", the same prefix
# this notebook uses to reach the config file itself.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted files.
X_train = joblib.load("../" + params["train_set_path"][0])
y_train = joblib.load("../" + params["train_set_path"][1])

In [27]:
# Preview the training features.
X_train

Unnamed: 0,stasiun,pm10,pm25,so2,co,o3,no2
983,DKI3 (Jagakarsa),59,88,49,7,23,15
1717,DKI2 (Kelapa Gading),-1,78,-1,16,52,25
1021,DKI4 (Lubang Buaya),47,84,33,16,21,22
1608,DKI3 (Jagakarsa),45,64,52,8,33,13
1055,DKI5 (Kebon Jeruk) Jakarta Barat,45,70,39,8,16,22
...,...,...,...,...,...,...,...
520,DKI3 (Jagakarsa),79,119,51,7,34,17
1561,DKI2 (Kelapa Gading),68,100,64,12,66,26
290,DKI5 (Kebon Jeruk) Jakarta Barat,54,78,32,7,27,18
546,DKI4 (Lubang Buaya),84,161,38,14,27,22


In [28]:
# Preview the training target.
y_train

983          SEDANG
1717         SEDANG
1021         SEDANG
1608         SEDANG
1055         SEDANG
           ...     
520     TIDAK SEHAT
1561         SEDANG
290          SEDANG
546     TIDAK SEHAT
1500         SEDANG
Name: categori, Length: 1157, dtype: object

In [29]:
# Put features and target side by side (aligned on their index) in one frame for EDA.
dataset = pd.concat(objs=[X_train, y_train], axis="columns")

In [30]:
# Preview the combined features-plus-target frame.
dataset

Unnamed: 0,stasiun,pm10,pm25,so2,co,o3,no2,categori
983,DKI3 (Jagakarsa),59,88,49,7,23,15,SEDANG
1717,DKI2 (Kelapa Gading),-1,78,-1,16,52,25,SEDANG
1021,DKI4 (Lubang Buaya),47,84,33,16,21,22,SEDANG
1608,DKI3 (Jagakarsa),45,64,52,8,33,13,SEDANG
1055,DKI5 (Kebon Jeruk) Jakarta Barat,45,70,39,8,16,22,SEDANG
...,...,...,...,...,...,...,...,...
520,DKI3 (Jagakarsa),79,119,51,7,34,17,TIDAK SEHAT
1561,DKI2 (Kelapa Gading),68,100,64,12,66,26,SEDANG
290,DKI5 (Kebon Jeruk) Jakarta Barat,54,78,32,7,27,18,SEDANG
546,DKI4 (Lubang Buaya),84,161,38,14,27,22,TIDAK SEHAT


# 3. EDA

## 3.1. Cek missing value

In [31]:
# This notebook treats -1 as the missing-value marker: convert it to NaN so the
# null counts and statistics in the following cells see it as missing.
# Rebind the result instead of using inplace=True so the cell does not mutate
# a frame in place across cells.
dataset = dataset.replace(-1, np.nan)

In [32]:
# Count the missing entries in each column.
dataset.isna().sum()

stasiun      0
pm10        38
pm25        59
so2         63
co          11
o3          33
no2         12
categori     0
dtype: int64

## 3.2. Cek Informasi Statistik

In [33]:
# Skewness of every numeric column; non-numeric columns are left out.
dataset.skew(numeric_only=True)

pm10    0.290332
pm25    0.265499
so2     0.230249
co      1.676094
o3      1.635067
no2     0.930391
dtype: float64