In [1]:
import numpy as np
import src.utils as utils

## 1. Todo List

1. Missing value handling<br>
1.1. pm10       : mean<br>
1.2. pm25       : mean<br>
1.3. so2        : mean<br>
1.4. co         : median<br>
1.5. o3         : median<br>
1.6. no2        : median<br>

2. Merge the categories `SEDANG` and `TIDAK SEHAT` into a single category, `TIDAK BAIK`

3. Encoding Feature: Stasiun

4. Encoding Label

## 2. Import Params

In [2]:
config_dir = "config/config.yaml"

In [3]:
config = utils.load_yaml(config_dir)

## 3. Load Dataset

In [4]:
# Load predictors and targets for each split from the paths listed in the config.
x_train, y_train = (
    utils.deserialize_data(config["x_train_path"]),
    utils.deserialize_data(config["y_train_path"]),
)

x_valid, y_valid = (
    utils.deserialize_data(config["x_valid_path"]),
    utils.deserialize_data(config["y_valid_path"]),
)

x_test, y_test = (
    utils.deserialize_data(config["x_test_path"]),
    utils.deserialize_data(config["y_test_path"]),
)

In [5]:
# Put predictors and target side by side (axis = 1) so each split can be
# preprocessed as a single frame.
train_set, valid_set, test_set = (
    utils.combine_dataframe([features, labels], axis = 1)
    for features, labels in (
        (x_train, y_train),
        (x_valid, y_valid),
        (x_test, y_test),
    )
)

## 4. Join Categories

In [6]:
def join_cat(set_data, params):
    """Collapse two of the original target classes into a single new class.

    Every occurrence of ``params["target_categories"][1]`` or
    ``params["target_categories"][2]`` in the target column is replaced by
    ``params["target_categories_new"][1]``. The input frame is not modified.

    Parameters
    ----------
    set_data : pandas.DataFrame
        Data that contains the target column named by ``params["target"]``.
    params : dict
        Configuration providing the keys ``"target"``, ``"target_categories"``
        and ``"target_categories_new"``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``set_data`` with the target labels merged.

    Raises
    ------
    RuntimeError
        If the target column is not present in ``set_data``.
    """
    target_col = params["target"]
    if target_col not in set_data.columns:
        raise RuntimeError("Kolom label tidak terdeteksi pada set data yang diberikan!")

    old_labels = params["target_categories"]
    merged_label = params["target_categories_new"][1]

    set_data = set_data.copy()
    # Assign the replaced column back instead of calling an inplace method on
    # an attribute-accessed Series: the chained inplace form does not update
    # the frame when pandas Copy-on-Write is active. Using the configured
    # column name also keeps the function working if the target is renamed.
    set_data[target_col] = set_data[target_col].replace(
        [old_labels[1], old_labels[2]], merged_label
    )
    return set_data

### 4.1. Train Set

In [7]:
train_set.categori.value_counts()

categori
SEDANG         914
TIDAK SEHAT    223
BAIK           132
Name: count, dtype: int64

In [8]:
train_set = join_cat(train_set, config)

In [9]:
train_set.categori.value_counts()

categori
TIDAK BAIK    1137
BAIK           132
Name: count, dtype: int64

### 4.2. Valid Set

In [10]:
valid_set.categori.value_counts()

categori
SEDANG         196
TIDAK SEHAT     48
BAIK            28
Name: count, dtype: int64

In [11]:
valid_set = join_cat(valid_set, config)

In [12]:
valid_set.categori.value_counts()

categori
TIDAK BAIK    244
BAIK           28
Name: count, dtype: int64

### 4.3. Test Set

In [13]:
test_set.categori.value_counts()

categori
SEDANG         195
TIDAK SEHAT     48
BAIK            29
Name: count, dtype: int64

In [14]:
test_set = join_cat(test_set, config)

In [15]:
test_set.categori.value_counts()

categori
TIDAK BAIK    243
BAIK           29
Name: count, dtype: int64

## 5. Handling Missing Value

In [16]:
def nan_detector(set_data):
    """Return a new frame in which every -1 entry of ``set_data`` is NaN.

    The dataset uses -1 as its missing-value marker; the input frame itself
    is left unchanged.
    """
    # The non-inplace ``replace`` already produces a separate object.
    return set_data.replace(to_replace = -1, value = np.nan)

### 5.1. Convert -1 to NaN

#### 5.1.1. Train Set

In [17]:
train_set.describe()

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,max
count,1269,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0
mean,2021-07-04 20:23:15.744680960,51.152088,75.731284,33.360126,11.536643,30.835303,19.267928,78.855004
min,2021-01-01 00:00:00,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,20.0
25%,2021-04-03 00:00:00,42.0,60.0,25.0,8.0,21.0,13.0,62.0
50%,2021-07-17 00:00:00,54.0,77.0,34.0,11.0,28.0,18.0,78.0
75%,2021-10-01 00:00:00,62.0,93.0,43.0,14.0,37.0,25.0,94.0
max,2021-12-31 00:00:00,100.0,174.0,82.0,44.0,151.0,65.0,174.0
std,,17.246488,29.30646,14.710573,5.045186,15.319329,9.231295,24.325463


In [18]:
train_set.isnull().sum()

tanggal     0
stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
max         0
critical    0
categori    0
dtype: int64

In [19]:
train_set = nan_detector(train_set)

In [20]:
train_set.isnull().sum()

tanggal      0
stasiun      0
pm10        35
pm25        55
so2         68
co          10
o3          35
no2          9
max          0
critical     0
categori     0
dtype: int64

#### 5.1.2. Valid Set

In [21]:
valid_set.describe()

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,max
count,272,272.0,272.0,272.0,272.0,272.0,272.0,272.0
mean,2021-07-09 09:47:38.823529472,51.496324,75.459559,34.341912,11.290441,32.533088,18.786765,78.768382
min,2021-01-01 00:00:00,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,27.0
25%,2021-04-05 18:00:00,43.0,60.0,26.0,8.0,23.0,12.0,62.0
50%,2021-07-21 00:00:00,54.0,77.0,34.0,10.5,30.0,17.0,77.0
75%,2021-09-28 00:00:00,62.0,95.0,45.0,13.0,41.0,24.25,95.0
max,2021-12-30 00:00:00,94.0,150.0,80.0,47.0,85.0,62.0,150.0
std,,17.315394,28.64051,15.026324,5.297794,14.238053,9.295802,22.863175


In [22]:
valid_set.isnull().sum()

tanggal     0
stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
max         0
critical    0
categori    0
dtype: int64

In [23]:
valid_set = nan_detector(valid_set)

In [24]:
valid_set.isnull().sum()

tanggal      0
stasiun      0
pm10         9
pm25        13
so2         15
co           3
o3           3
no2          4
max          0
critical     0
categori     0
dtype: int64

#### 5.1.3. Test Set

In [25]:
test_set.describe()

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,max
count,272,272.0,272.0,272.0,272.0,272.0,272.0,272.0
mean,2021-06-27 22:19:24.705882368,51.121324,73.444853,32.360294,11.816176,31.194853,18.669118,77.827206
min,2021-01-01 00:00:00,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,17.0
25%,2021-03-24 12:00:00,40.0,59.0,23.0,9.0,21.0,13.0,61.0
50%,2021-07-12 00:00:00,54.0,76.0,32.0,11.0,28.0,18.0,77.0
75%,2021-09-29 06:00:00,61.0,91.0,43.0,14.0,38.25,24.0,92.0
max,2021-12-31 00:00:00,179.0,150.0,64.0,44.0,93.0,49.0,179.0
std,,18.589466,29.945903,14.567148,5.170456,16.550122,8.879725,24.049831


In [26]:
test_set.isnull().sum()

tanggal     0
stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
max         0
critical    0
categori    0
dtype: int64

In [27]:
test_set = nan_detector(test_set)

In [28]:
test_set.isnull().sum()

tanggal      0
stasiun      0
pm10         9
pm25        18
so2         14
co           3
o3          10
no2          6
max          0
critical     0
categori     0
dtype: int64

### 5.2. Imputing

#### 5.2.1. Preparation of imputation values

In [29]:
# Imputation statistics are taken from the training split only and reused
# for the valid and test splits.
# Note: int() drops the fractional part (truncation, not rounding).
impute_pm10, impute_pm25, impute_so2 = (
    int(train_set[column].mean()) for column in ("pm10", "pm25", "so2")
)
impute_co, impute_o3, impute_no2 = (
    int(train_set[column].median()) for column in ("co", "o3", "no2")
)

In [30]:
# Column name -> fill value, in the mapping form passed to fillna(value = ...).
impute_values = dict(
    pm10 = impute_pm10,
    pm25 = impute_pm25,
    so2 = impute_so2,
    co = impute_co,
    o3 = impute_o3,
    no2 = impute_no2,
)

In [31]:
impute_values

{'pm10': 52, 'pm25': 79, 'so2': 35, 'co': 11, 'o3': 29, 'no2': 18}

#### 5.2.2. Train Set

In [32]:
train_set.isnull().sum()

tanggal      0
stasiun      0
pm10        35
pm25        55
so2         68
co          10
o3          35
no2          9
max          0
critical     0
categori     0
dtype: int64

In [33]:
# Fill each pollutant column with its train-derived value; rebind the result
# instead of mutating the frame in place.
train_set = train_set.fillna(value = impute_values)

In [34]:
train_set.isnull().sum()

tanggal     0
stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
max         0
critical    0
categori    0
dtype: int64

#### 5.2.3. Valid Set

In [35]:
valid_set.isnull().sum()

tanggal      0
stasiun      0
pm10         9
pm25        13
so2         15
co           3
o3           3
no2          4
max          0
critical     0
categori     0
dtype: int64

In [36]:
# Fill with the values computed from the training split; rebind the result
# instead of mutating the frame in place.
valid_set = valid_set.fillna(value = impute_values)

In [37]:
valid_set.isnull().sum()

tanggal     0
stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
max         0
critical    0
categori    0
dtype: int64

#### 5.2.4. Test Set

In [38]:
test_set.isnull().sum()

tanggal      0
stasiun      0
pm10         9
pm25        18
so2         14
co           3
o3          10
no2          6
max          0
critical     0
categori     0
dtype: int64

In [39]:
# Fill with the values computed from the training split; rebind the result
# instead of mutating the frame in place.
test_set = test_set.fillna(value = impute_values)

In [40]:
test_set.isnull().sum()

tanggal     0
stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
max         0
critical    0
categori    0
dtype: int64

## 6. Encoding Stasiun

### 6.1. Fitting OHE Model

In [41]:
# Fit the station encoder on the values listed under config["range_stasiun"]
# rather than on the stations that happen to occur in the training split.
# reshape(-1, 1) turns the flat list into a single-column 2-D array.
# NOTE(review): presumably utils.ohe_fit wraps a scikit-learn OneHotEncoder,
# which expects 2-D input — confirm in src/utils.
ohe_fit_data = np.array(config["range_stasiun"]).reshape(-1, 1)
ohe_station = utils.ohe_fit(ohe_fit_data)

In [42]:
utils.serialize_data(ohe_station, config["ohe_station_path"])

Serialized models/ohe_stasiun.pkl


### 6.2. Transforming Categoric Data

#### 6.2.1. Train Set

In [43]:
# Encode the "stasiun" column with the fitted encoder and rebind train_set.
# NOTE(review): the displayed result no longer has a "stasiun" column, so
# re-running this cell on its own will presumably fail — confirm against
# utils.ohe_transform_combine.
train_set = utils.ohe_transform_combine(ohe_station, train_set, "stasiun")
train_set

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,tanggal,pm10,pm25,so2,co,o3,no2,max,critical,categori
1527,1.0,0.0,0.0,0.0,0.0,2021-04-03,25.0,42.0,20.0,8.0,18.0,22.0,42,PM25,BAIK
433,0.0,0.0,0.0,0.0,1.0,2021-11-09,57.0,86.0,35.0,16.0,19.0,30.0,86,PM25,TIDAK BAIK
1357,0.0,0.0,0.0,0.0,1.0,2021-02-16,24.0,35.0,21.0,7.0,22.0,9.0,35,PM25,BAIK
1136,0.0,1.0,0.0,0.0,0.0,2021-01-31,38.0,55.0,24.0,11.0,68.0,7.0,68,O3,TIDAK BAIK
1097,1.0,0.0,0.0,0.0,0.0,2021-01-23,72.0,108.0,14.0,43.0,44.0,20.0,108,PM25,TIDAK BAIK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1802,0.0,0.0,0.0,0.0,1.0,2021-08-04,46.0,74.0,33.0,7.0,26.0,18.0,74,PM25,TIDAK BAIK
1433,0.0,0.0,1.0,0.0,0.0,2021-12-02,35.0,55.0,42.0,7.0,29.0,9.0,55,PM25,TIDAK BAIK
977,0.0,1.0,0.0,0.0,0.0,2021-07-27,82.0,112.0,56.0,12.0,41.0,27.0,112,PM25,TIDAK BAIK
983,0.0,0.0,1.0,0.0,0.0,2021-07-02,60.0,84.0,47.0,7.0,23.0,18.0,84,PM25,TIDAK BAIK


#### 6.2.2. Valid Set

In [44]:
# Apply the same fitted station encoder to the validation split and rebind
# valid_set to the result.
valid_set = utils.ohe_transform_combine(ohe_station, valid_set, "stasiun")
valid_set

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,tanggal,pm10,pm25,so2,co,o3,no2,max,critical,categori
1394,1.0,0.0,0.0,0.0,0.0,2021-12-25,53.0,68.0,56.0,17.0,17.0,9.0,68,PM25,TIDAK BAIK
1692,1.0,0.0,0.0,0.0,0.0,2021-08-18,59.0,78.0,25.0,11.0,27.0,26.0,78,PM25,TIDAK BAIK
724,0.0,0.0,0.0,1.0,0.0,2021-03-22,52.0,78.0,41.0,15.0,20.0,11.0,78,PM25,TIDAK BAIK
1609,0.0,0.0,1.0,0.0,0.0,2021-04-25,59.0,88.0,19.0,13.0,27.0,19.0,88,PM25,TIDAK BAIK
707,0.0,0.0,0.0,1.0,0.0,2021-03-05,51.0,79.0,40.0,14.0,32.0,13.0,79,PM25,TIDAK BAIK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1167,0.0,0.0,1.0,0.0,0.0,2021-01-31,38.0,13.0,17.0,7.0,39.0,18.0,39,O3,BAIK
337,0.0,1.0,0.0,0.0,0.0,2021-11-03,52.0,79.0,14.0,11.0,41.0,27.0,52,PM10,TIDAK BAIK
1223,0.0,0.0,0.0,0.0,1.0,2021-01-25,55.0,79.0,19.0,29.0,67.0,13.0,67,CO,TIDAK BAIK
929,1.0,0.0,0.0,0.0,0.0,2021-07-10,64.0,83.0,35.0,7.0,24.0,23.0,83,PM25,TIDAK BAIK


#### 6.2.3. Test Set

In [45]:
# Apply the same fitted station encoder to the test split and rebind
# test_set to the result.
test_set = utils.ohe_transform_combine(ohe_station, test_set, "stasiun")
test_set

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,tanggal,pm10,pm25,so2,co,o3,no2,max,critical,categori
1823,0.0,0.0,0.0,0.0,1.0,2021-08-25,63.0,95.0,31.0,10.0,29.0,20.0,95,PM25,TIDAK BAIK
1533,1.0,0.0,0.0,0.0,0.0,2021-04-09,54.0,69.0,26.0,14.0,15.0,28.0,69,PM25,TIDAK BAIK
381,0.0,0.0,1.0,0.0,0.0,2021-11-17,47.0,66.0,49.0,9.0,28.0,11.0,66,PM25,TIDAK BAIK
871,0.0,0.0,0.0,1.0,0.0,2021-07-14,57.0,116.0,39.0,13.0,20.0,26.0,116,PM25,TIDAK BAIK
1746,0.0,0.0,1.0,0.0,0.0,2021-08-10,36.0,54.0,41.0,6.0,27.0,12.0,54,PM25,TIDAK BAIK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,0.0,0.0,0.0,1.0,0.0,2021-03-12,53.0,83.0,41.0,19.0,22.0,13.0,83,PM25,TIDAK BAIK
225,0.0,0.0,1.0,0.0,0.0,2021-10-14,61.0,88.0,51.0,12.0,32.0,17.0,88,PM25,TIDAK BAIK
548,0.0,0.0,0.0,1.0,0.0,2021-05-01,58.0,91.0,37.0,11.0,26.0,13.0,91,PM25,TIDAK BAIK
806,0.0,1.0,0.0,0.0,0.0,2021-07-11,52.0,79.0,54.0,10.0,54.0,13.0,54,SO2,TIDAK BAIK


## 7. Label Encoding

### 7.1. Fitting Label Encoding Model

In [46]:
le_categori = utils.le_fit(config["target_categories_new"])

In [47]:
# Persist the fitted label encoder.
# NOTE(review): this path is hard-coded, whereas the station encoder is saved
# to config["ohe_station_path"]; consider moving this path into the config too.
utils.serialize_data(le_categori, "models/le_categori.pkl")

Serialized models/le_categori.pkl


### 7.2. Train Set

In [48]:
# Sanity check: the training labels are exactly the configured new classes.
# Compared as sets because Series.unique() returns values in order of first
# appearance, so a list comparison would depend on the row order.
set(train_set.categori.unique()) == set(config["target_categories_new"])

True

In [49]:
train_set.categori = utils.le_transform(train_set.categori, le_categori)

### 7.3. Validation Set

In [50]:
len(set(valid_set.categori.unique()) - set(config["target_categories_new"])) == 0

True

In [51]:
valid_set.categori = utils.le_transform(valid_set.categori, le_categori)

### 7.4. Test Set

In [52]:
len(set(test_set.categori.unique()) - set(config["target_categories_new"])) == 0

True

In [53]:
test_set.categori = utils.le_transform(test_set.categori, le_categori)

## 8. Serialize Dataset

In [54]:
# Separate each preprocessed split back into predictors and target.
(x_train, y_train), (x_valid, y_valid), (x_test, y_test) = (
    utils.split_predictor_target(subset, config)
    for subset in (train_set, valid_set, test_set)
)

In [55]:
# Write every preprocessed split to the path configured for it, in the same
# order as before: train, valid, test (predictors first, then target).
for dataset, path_key in (
    (x_train, "x_train_prep_path"),
    (y_train, "y_train_prep_path"),
    (x_valid, "x_valid_prep_path"),
    (y_valid, "y_valid_prep_path"),
    (x_test, "x_test_prep_path"),
    (y_test, "y_test_prep_path"),
):
    utils.serialize_data(dataset, config[path_key])

Serialized data/processed/x_train_prep.pkl
Serialized data/processed/y_train_prep.pkl
Serialized data/processed/x_valid_prep.pkl
Serialized data/processed/y_valid_prep.pkl
Serialized data/processed/x_test_prep.pkl
Serialized data/processed/y_test_prep.pkl
