In [1]:
import numpy as np
import pandas as pd
import src.utils as utils
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

## 1. Todo List

1. Missing value handling<br>
1.1. pm10       : mean<br>
1.2. pm25       : mean<br>
1.3. so2        : mean<br>
1.4. co         : median<br>
1.5. o3         : median<br>
1.6. no2        : median<br>

2. Join kategori sedang dan tidak sehat menjadi tidak baik

3. Balancing label baik dan tidak baik

4. Outlier removal

## 2. Import Params

In [2]:
params_dir = "config/params.yaml"

In [3]:
params = utils.load_params(params_dir)
params

{'dataset_root_path': 'data/raw/',
 'dataset_combined_path': 'data/processed/combined.pkl',
 'dataset_cleaned_path': 'data/processed/cleaned.pkl',
 'x_train_path': 'data/processed/x_train.pkl',
 'y_train_path': 'data/processed/y_train.pkl',
 'x_valid_path': 'data/processed/x_valid.pkl',
 'y_valid_path': 'data/processed/y_valid.pkl',
 'x_test_path': 'data/processed/x_test.pkl',
 'y_test_path': 'data/processed/y_test.pkl',
 'ohe_station_path': 'models/ohe_stasiun.pkl',
 'datetime_columns': ['tanggal'],
 'int32_columns': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'object_columns': ['stasiun', 'critical', 'categori'],
 'predictors': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'target': 'categori',
 'target_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'target_categories_new': ['BAIK', 'TIDAK BAIK'],
 'missing_value_co': 11,
 'missing_value_no2': 18,
 'missing_value_o3': 29,
 'missing_value_pm10': {'BAIK': 28, 'TIDAK BAIK': 55},
 'missing_value_pm25': {'BAIK': 38, 'T

## 3. Load Dataset

In [4]:
# Load the six serialized splits; each path is looked up in params by its key.
x_train, y_train, x_valid, y_valid, x_test, y_test = (
    utils.deserialize_data(params[path_key])
    for path_key in (
        "x_train_path", "y_train_path",
        "x_valid_path", "y_valid_path",
        "x_test_path", "y_test_path",
    )
)

In [5]:
# Re-attach each target to its predictors (axis = 1) so rows stay paired during preprocessing.
train_set, valid_set, test_set = (
    utils.combine_dataframe([features, labels], axis = 1)
    for features, labels in ((x_train, y_train), (x_valid, y_valid), (x_test, y_test))
)

## 4. Join Categories

In [6]:
def join_cat(set_data, params):
    """Collapse the second and third target classes into the merged class.

    Values equal to ``params["target_categories"][1]`` or
    ``params["target_categories"][2]`` in the target column are replaced by
    ``params["target_categories_new"][1]``. All other values are left as-is.

    Parameters
    ----------
    set_data : pd.DataFrame
        Frame that must contain the column named by ``params["target"]``.
    params : dict
        Needs the keys ``"target"``, ``"target_categories"`` and
        ``"target_categories_new"``.

    Returns
    -------
    pd.DataFrame
        A modified copy; the input frame is not changed.

    Raises
    ------
    RuntimeError
        If the target column is not present in ``set_data``.
    """
    target = params["target"]
    if target not in set_data.columns:
        raise RuntimeError("Kolom label tidak terdeteksi pada set data yang diberikan!")

    old_categories = params["target_categories"]
    merged_label = params["target_categories_new"][1]
    mapping = {old_categories[1]: merged_label, old_categories[2]: merged_label}

    set_data = set_data.copy()
    # Assign the replaced column back instead of calling replace(inplace=True)
    # on the column accessor: the chained in-place form operates on a temporary
    # under pandas Copy-on-Write and would leave the frame unchanged.
    set_data[target] = set_data[target].replace(mapping)
    return set_data

### 4.1. Train set

In [7]:
train_set.categori.value_counts()

categori
SEDANG         914
TIDAK SEHAT    223
BAIK           132
Name: count, dtype: int64

In [8]:
train_set = join_cat(train_set, params)

In [9]:
train_set.categori.value_counts()

categori
TIDAK BAIK    1137
BAIK           132
Name: count, dtype: int64

### 4.2. Valid set

In [10]:
valid_set.categori.value_counts()

categori
SEDANG         196
TIDAK SEHAT     48
BAIK            28
Name: count, dtype: int64

In [11]:
valid_set = join_cat(valid_set, params)

In [12]:
valid_set.categori.value_counts()

categori
TIDAK BAIK    244
BAIK           28
Name: count, dtype: int64

### 4.3. Test set

In [13]:
test_set.categori.value_counts()

categori
SEDANG         195
TIDAK SEHAT     48
BAIK            29
Name: count, dtype: int64

In [14]:
test_set = join_cat(test_set, params)

In [15]:
test_set.categori.value_counts()

categori
TIDAK BAIK    243
BAIK           29
Name: count, dtype: int64

## 5. Handling Missing Value

In [16]:
def nan_detector(set_data):
    """Return a new frame in which every -1 value is replaced by NaN.

    The replacement is applied to the whole frame, not to selected columns.
    The input frame is not modified.
    """
    # Without inplace=True, replace() already hands back a fresh object.
    return set_data.replace(-1, np.nan)

### 5.1. Convert -1 to NaN

#### 5.1.1. Train Set

In [17]:
train_set.describe()

Unnamed: 0,pm10,pm25,so2,co,o3,no2
count,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0
mean,51.152088,75.731284,33.360126,11.536643,30.835303,19.267928
std,17.246488,29.30646,14.710573,5.045186,15.319329,9.231295
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,42.0,60.0,25.0,8.0,21.0,13.0
50%,54.0,77.0,34.0,11.0,28.0,18.0
75%,62.0,93.0,43.0,14.0,37.0,25.0
max,100.0,174.0,82.0,44.0,151.0,65.0


In [18]:
train_set.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
categori    0
dtype: int64

In [19]:
train_set = nan_detector(train_set)

In [20]:
train_set.isnull().sum()

stasiun      0
pm10        35
pm25        55
so2         68
co          10
o3          35
no2          9
categori     0
dtype: int64

#### 5.1.2. Valid Set

In [21]:
valid_set.describe()

Unnamed: 0,pm10,pm25,so2,co,o3,no2
count,272.0,272.0,272.0,272.0,272.0,272.0
mean,51.496324,75.459559,34.341912,11.290441,32.533088,18.786765
std,17.315394,28.64051,15.026324,5.297794,14.238053,9.295802
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,43.0,60.0,26.0,8.0,23.0,12.0
50%,54.0,77.0,34.0,10.5,30.0,17.0
75%,62.0,95.0,45.0,13.0,41.0,24.25
max,94.0,150.0,80.0,47.0,85.0,62.0


In [22]:
valid_set.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
categori    0
dtype: int64

In [23]:
valid_set = nan_detector(valid_set)

In [24]:
valid_set.isnull().sum()

stasiun      0
pm10         9
pm25        13
so2         15
co           3
o3           3
no2          4
categori     0
dtype: int64

#### 5.1.3. Test Set

In [25]:
test_set.describe()

Unnamed: 0,pm10,pm25,so2,co,o3,no2
count,272.0,272.0,272.0,272.0,272.0,272.0
mean,51.121324,73.444853,32.360294,11.816176,31.194853,18.669118
std,18.589466,29.945903,14.567148,5.170456,16.550122,8.879725
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,40.0,59.0,23.0,9.0,21.0,13.0
50%,54.0,76.0,32.0,11.0,28.0,18.0
75%,61.0,91.0,43.0,14.0,38.25,24.0
max,179.0,150.0,64.0,44.0,93.0,49.0


In [26]:
test_set.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
categori    0
dtype: int64

In [27]:
test_set = nan_detector(test_set)

In [28]:
test_set.isnull().sum()

stasiun      0
pm10         9
pm25        18
so2         14
co           3
o3          10
no2          6
categori     0
dtype: int64

### 5.2. Imputing

#### 5.2.1. Preparation of imputation values

In [29]:
# Imputation statistics are taken from the training set only.
# int() truncates the fractional part of each statistic.
impute_pm10, impute_pm25, impute_so2 = (
    int(train_set[column].mean()) for column in ("pm10", "pm25", "so2")
)
impute_co, impute_o3, impute_no2 = (
    int(train_set[column].median()) for column in ("co", "o3", "no2")
)

In [30]:
# Column name -> fill value, in the shape DataFrame.fillna(value=...) accepts.
impute_values = dict(
    pm10=impute_pm10,
    pm25=impute_pm25,
    so2=impute_so2,
    co=impute_co,
    o3=impute_o3,
    no2=impute_no2,
)

In [31]:
impute_values

{'pm10': 52, 'pm25': 79, 'so2': 35, 'co': 11, 'o3': 29, 'no2': 18}

#### 5.2.2. Train Set

In [32]:
train_set.isnull().sum()

stasiun      0
pm10        35
pm25        55
so2         68
co          10
o3          35
no2          9
categori     0
dtype: int64

In [33]:
# Fill each column's NaNs with its prepared value; re-bind rather than mutate in place.
train_set = train_set.fillna(value = impute_values)

In [34]:
train_set.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
categori    0
dtype: int64

#### 5.2.3. Valid Set

In [35]:
valid_set.isnull().sum()

stasiun      0
pm10         9
pm25        13
so2         15
co           3
o3           3
no2          4
categori     0
dtype: int64

In [36]:
# Validation NaNs are filled with the values derived from the training set.
valid_set = valid_set.fillna(value = impute_values)

In [37]:
valid_set.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
categori    0
dtype: int64

#### 5.2.4. Test Set

In [38]:
test_set.isnull().sum()

stasiun      0
pm10         9
pm25        18
so2         14
co           3
o3          10
no2          6
categori     0
dtype: int64

In [39]:
# Test NaNs are filled with the values derived from the training set.
test_set = test_set.fillna(value = impute_values)

In [40]:
test_set.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
categori    0
dtype: int64

## 6. Encoding Stasiun

### 6.1. Fitting OHE Model

In [41]:
def ohe_fit(fit_data: np.array) -> OneHotEncoder:
    """Create a dense-output OneHotEncoder and fit it on ``fit_data``.

    ``fit_data`` is passed straight to ``OneHotEncoder.fit``; the call site in
    this notebook supplies a column vector built with ``reshape(-1, 1)``.
    Returns the fitted encoder.
    """
    encoder = OneHotEncoder(sparse_output = False)
    # fit() returns the encoder itself, so it can be returned directly.
    return encoder.fit(fit_data)

In [42]:
# Fit the encoder on the configured list of values, shaped as a single column.
ohe_fit_data = np.reshape(np.array(params["range_stasiun"]), (-1, 1))
ohe_station = ohe_fit(ohe_fit_data)

In [43]:
utils.serialize_data(ohe_station, params["ohe_station_path"])

Serialized models/ohe_stasiun.pkl


### 6.2. Transforming Categoric Data

#### 6.2.1. Train Set

In [44]:
def ohe_transform(ohe_model: OneHotEncoder, data: np.array) -> pd.DataFrame:
    """Encode ``data`` with a fitted encoder and return the result as a DataFrame.

    Columns are named after ``ohe_model.categories_[0]``, the categories of the
    encoder's first feature. The returned frame has a default index.
    """
    encoded = ohe_model.transform(data)
    column_labels = [label for label in ohe_model.categories_[0]]
    return pd.DataFrame(encoded, columns=column_labels)

def combine_ohetransformed_to_master(master_data: pd.DataFrame, ohe_transformed: pd.DataFrame, column_name: str = None):
    """Prepend one-hot columns to ``master_data`` and optionally drop the source column.

    Parameters
    ----------
    master_data : pd.DataFrame
        Frame whose index is imposed on the encoded columns.
    ohe_transformed : pd.DataFrame
        Encoded columns; must have as many rows as ``master_data``.
    column_name : str, optional
        Column(s) to drop from the combined frame. Nothing is dropped when None.

    Returns
    -------
    pd.DataFrame
        New frame with the encoded columns first, followed by the columns of
        ``master_data``. Neither input is modified.
    """
    # Give the encoded rows the master's index so concat pairs rows by
    # position rather than by the encoded frame's own labels.
    encoded = ohe_transformed.set_index(master_data.index)
    combined = pd.concat([encoded, master_data.copy(deep=True)], axis=1)

    # Identity test: `!= None` is evaluated element-wise for array-like
    # arguments and makes the `if` raise instead of dropping the columns.
    if column_name is not None:
        combined = combined.drop(columns=column_name)

    return combined

def ohe_transform_combine(ohe_model: OneHotEncoder, master_data: pd.DataFrame, column_name: str = None):
    """One-hot encode ``master_data[column_name]`` and merge the result back.

    The column's values are reshaped into a single-column array, encoded with
    ``ohe_transform``, and combined via ``combine_ohetransformed_to_master``,
    which also receives ``column_name`` as the column to drop. Works on a deep
    copy of ``master_data``.
    """
    working = master_data.copy(deep=True)

    column_values = working[column_name].to_list()
    feature_matrix = np.array(column_values).reshape(-1, 1)

    return combine_ohetransformed_to_master(
        working,
        ohe_transform(ohe_model, feature_matrix),
        column_name,
    )

In [45]:
train_set = ohe_transform_combine(ohe_station, train_set, "stasiun")
train_set

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,categori
1527,1.0,0.0,0.0,0.0,0.0,25.0,42.0,20.0,8.0,18.0,22.0,BAIK
433,0.0,0.0,0.0,0.0,1.0,57.0,86.0,35.0,16.0,19.0,30.0,TIDAK BAIK
1357,0.0,0.0,0.0,0.0,1.0,24.0,35.0,21.0,7.0,22.0,9.0,BAIK
1136,0.0,1.0,0.0,0.0,0.0,38.0,55.0,24.0,11.0,68.0,7.0,TIDAK BAIK
1097,1.0,0.0,0.0,0.0,0.0,72.0,108.0,14.0,43.0,44.0,20.0,TIDAK BAIK
...,...,...,...,...,...,...,...,...,...,...,...,...
1802,0.0,0.0,0.0,0.0,1.0,46.0,74.0,33.0,7.0,26.0,18.0,TIDAK BAIK
1433,0.0,0.0,1.0,0.0,0.0,35.0,55.0,42.0,7.0,29.0,9.0,TIDAK BAIK
977,0.0,1.0,0.0,0.0,0.0,82.0,112.0,56.0,12.0,41.0,27.0,TIDAK BAIK
983,0.0,0.0,1.0,0.0,0.0,60.0,84.0,47.0,7.0,23.0,18.0,TIDAK BAIK


#### 6.2.2. Valid Set

In [46]:
valid_set = ohe_transform_combine(ohe_station, valid_set, "stasiun")
valid_set

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,categori
1394,1.0,0.0,0.0,0.0,0.0,53.0,68.0,56.0,17.0,17.0,9.0,TIDAK BAIK
1692,1.0,0.0,0.0,0.0,0.0,59.0,78.0,25.0,11.0,27.0,26.0,TIDAK BAIK
724,0.0,0.0,0.0,1.0,0.0,52.0,78.0,41.0,15.0,20.0,11.0,TIDAK BAIK
1609,0.0,0.0,1.0,0.0,0.0,59.0,88.0,19.0,13.0,27.0,19.0,TIDAK BAIK
707,0.0,0.0,0.0,1.0,0.0,51.0,79.0,40.0,14.0,32.0,13.0,TIDAK BAIK
...,...,...,...,...,...,...,...,...,...,...,...,...
1167,0.0,0.0,1.0,0.0,0.0,38.0,13.0,17.0,7.0,39.0,18.0,BAIK
337,0.0,1.0,0.0,0.0,0.0,52.0,79.0,14.0,11.0,41.0,27.0,TIDAK BAIK
1223,0.0,0.0,0.0,0.0,1.0,55.0,79.0,19.0,29.0,67.0,13.0,TIDAK BAIK
929,1.0,0.0,0.0,0.0,0.0,64.0,83.0,35.0,7.0,24.0,23.0,TIDAK BAIK


#### 6.2.3. Test Set

In [47]:
test_set = ohe_transform_combine(ohe_station, test_set, "stasiun")
test_set

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,categori
1823,0.0,0.0,0.0,0.0,1.0,63.0,95.0,31.0,10.0,29.0,20.0,TIDAK BAIK
1533,1.0,0.0,0.0,0.0,0.0,54.0,69.0,26.0,14.0,15.0,28.0,TIDAK BAIK
381,0.0,0.0,1.0,0.0,0.0,47.0,66.0,49.0,9.0,28.0,11.0,TIDAK BAIK
871,0.0,0.0,0.0,1.0,0.0,57.0,116.0,39.0,13.0,20.0,26.0,TIDAK BAIK
1746,0.0,0.0,1.0,0.0,0.0,36.0,54.0,41.0,6.0,27.0,12.0,TIDAK BAIK
...,...,...,...,...,...,...,...,...,...,...,...,...
714,0.0,0.0,0.0,1.0,0.0,53.0,83.0,41.0,19.0,22.0,13.0,TIDAK BAIK
225,0.0,0.0,1.0,0.0,0.0,61.0,88.0,51.0,12.0,32.0,17.0,TIDAK BAIK
548,0.0,0.0,0.0,1.0,0.0,58.0,91.0,37.0,11.0,26.0,13.0,TIDAK BAIK
806,0.0,1.0,0.0,0.0,0.0,52.0,79.0,54.0,10.0,54.0,13.0,TIDAK BAIK


## 7. Splitting Predictor Target

### 7.1. Train Set

In [48]:
x_train, y_train = utils.split_predictor_target(train_set, params)

In [49]:
x_train

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2
1527,1.0,0.0,0.0,0.0,0.0,25.0,42.0,20.0,8.0,18.0,22.0
433,0.0,0.0,0.0,0.0,1.0,57.0,86.0,35.0,16.0,19.0,30.0
1357,0.0,0.0,0.0,0.0,1.0,24.0,35.0,21.0,7.0,22.0,9.0
1136,0.0,1.0,0.0,0.0,0.0,38.0,55.0,24.0,11.0,68.0,7.0
1097,1.0,0.0,0.0,0.0,0.0,72.0,108.0,14.0,43.0,44.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...
1802,0.0,0.0,0.0,0.0,1.0,46.0,74.0,33.0,7.0,26.0,18.0
1433,0.0,0.0,1.0,0.0,0.0,35.0,55.0,42.0,7.0,29.0,9.0
977,0.0,1.0,0.0,0.0,0.0,82.0,112.0,56.0,12.0,41.0,27.0
983,0.0,0.0,1.0,0.0,0.0,60.0,84.0,47.0,7.0,23.0,18.0


In [50]:
y_train

1527          BAIK
433     TIDAK BAIK
1357          BAIK
1136    TIDAK BAIK
1097    TIDAK BAIK
           ...    
1802    TIDAK BAIK
1433    TIDAK BAIK
977     TIDAK BAIK
983     TIDAK BAIK
747     TIDAK BAIK
Name: categori, Length: 1269, dtype: object

### 7.2. Validation Set

In [51]:
x_valid, y_valid = utils.split_predictor_target(valid_set, params)

In [52]:
x_valid

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2
1394,1.0,0.0,0.0,0.0,0.0,53.0,68.0,56.0,17.0,17.0,9.0
1692,1.0,0.0,0.0,0.0,0.0,59.0,78.0,25.0,11.0,27.0,26.0
724,0.0,0.0,0.0,1.0,0.0,52.0,78.0,41.0,15.0,20.0,11.0
1609,0.0,0.0,1.0,0.0,0.0,59.0,88.0,19.0,13.0,27.0,19.0
707,0.0,0.0,0.0,1.0,0.0,51.0,79.0,40.0,14.0,32.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...
1167,0.0,0.0,1.0,0.0,0.0,38.0,13.0,17.0,7.0,39.0,18.0
337,0.0,1.0,0.0,0.0,0.0,52.0,79.0,14.0,11.0,41.0,27.0
1223,0.0,0.0,0.0,0.0,1.0,55.0,79.0,19.0,29.0,67.0,13.0
929,1.0,0.0,0.0,0.0,0.0,64.0,83.0,35.0,7.0,24.0,23.0


In [53]:
y_valid

1394    TIDAK BAIK
1692    TIDAK BAIK
724     TIDAK BAIK
1609    TIDAK BAIK
707     TIDAK BAIK
           ...    
1167          BAIK
337     TIDAK BAIK
1223    TIDAK BAIK
929     TIDAK BAIK
310     TIDAK BAIK
Name: categori, Length: 272, dtype: object

### 7.3. Test Set

In [54]:
x_test, y_test = utils.split_predictor_target(test_set, params)

In [55]:
x_test

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2
1823,0.0,0.0,0.0,0.0,1.0,63.0,95.0,31.0,10.0,29.0,20.0
1533,1.0,0.0,0.0,0.0,0.0,54.0,69.0,26.0,14.0,15.0,28.0
381,0.0,0.0,1.0,0.0,0.0,47.0,66.0,49.0,9.0,28.0,11.0
871,0.0,0.0,0.0,1.0,0.0,57.0,116.0,39.0,13.0,20.0,26.0
1746,0.0,0.0,1.0,0.0,0.0,36.0,54.0,41.0,6.0,27.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...
714,0.0,0.0,0.0,1.0,0.0,53.0,83.0,41.0,19.0,22.0,13.0
225,0.0,0.0,1.0,0.0,0.0,61.0,88.0,51.0,12.0,32.0,17.0
548,0.0,0.0,0.0,1.0,0.0,58.0,91.0,37.0,11.0,26.0,13.0
806,0.0,1.0,0.0,0.0,0.0,52.0,79.0,54.0,10.0,54.0,13.0


In [56]:
y_test

1823    TIDAK BAIK
1533    TIDAK BAIK
381     TIDAK BAIK
871     TIDAK BAIK
1746    TIDAK BAIK
           ...    
714     TIDAK BAIK
225     TIDAK BAIK
548     TIDAK BAIK
806     TIDAK BAIK
754     TIDAK BAIK
Name: categori, Length: 272, dtype: object

## 7. Label Encoding

In [64]:
def le_fit(label_data: dict) -> LabelEncoder:
    """Create a LabelEncoder, fit it on ``label_data`` and return it.

    NOTE(review): the parameter is annotated as ``dict``, but the call site in
    this notebook passes the list ``params["target_categories_new"]``.
    """
    # fit() returns the encoder itself.
    return LabelEncoder().fit(label_data)

In [65]:
le_categori = le_fit(params["target_categories_new"])

In [66]:
utils.serialize_data(le_categori, "models/le_categori.pkl")

Serialized models/le_categori.pkl


### 7.1. Train Set

In [67]:
# The train set must contain exactly the merged classes. Comparing sets keeps
# the check independent of which class happens to appear first in the data.
set(train_set.categori.unique()) == set(params["target_categories_new"])

True

In [68]:
# Replace the string labels with the encoder's integer codes.
train_set["categori"] = le_categori.transform(train_set["categori"])

#### 7.4. Validation Set

In [71]:
len(set(valid_set.categori.unique()) - set(params["target_categories_new"])) == 0

True

In [72]:
# Replace the string labels with the encoder's integer codes.
valid_set["categori"] = le_categori.transform(valid_set["categori"])

#### 7.5. Test Set

In [73]:
len(set(test_set.categori.unique()) - set(params["target_categories_new"])) == 0

True

In [74]:
# Replace the string labels with the encoder's integer codes.
test_set["categori"] = le_categori.transform(test_set["categori"])

## 8. Dump Data Latih

In [82]:
params

{'dataset_root_path': 'data/raw/',
 'dataset_combined_path': 'data/processed/combined.pkl',
 'dataset_cleaned_path': 'data/processed/cleaned.pkl',
 'x_train_path': 'data/processed/x_train.pkl',
 'y_train_path': 'data/processed/y_train.pkl',
 'x_valid_path': 'data/processed/x_valid.pkl',
 'y_valid_path': 'data/processed/y_valid.pkl',
 'x_test_path': 'data/processed/x_test.pkl',
 'y_test_path': 'data/processed/y_test.pkl',
 'ohe_station_path': 'models/ohe_stasiun.pkl',
 'datetime_columns': ['tanggal'],
 'int32_columns': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'object_columns': ['stasiun', 'critical', 'categori'],
 'predictors': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'target': 'categori',
 'target_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'target_categories_new': ['BAIK', 'TIDAK BAIK'],
 'missing_value_co': 11,
 'missing_value_no2': 18,
 'missing_value_o3': 29,
 'missing_value_pm10': {'BAIK': 28, 'TIDAK BAIK': 55},
 'missing_value_pm25': {'BAIK': 38, 'T

In [84]:
# Split again so the x/y objects reflect the label-encoded target column.
(x_train, y_train), (x_valid, y_valid), (x_test, y_test) = (
    utils.split_predictor_target(split_frame, params)
    for split_frame in (train_set, valid_set, test_set)
)

In [91]:
# Persist the preprocessed splits. Each file receives its matching object:
# predictors go to x_*_prep.pkl and targets to y_*_prep.pkl. Writing the full
# valid_set / test_set frames here would put the target column into the
# predictor files and a whole frame into the target files.
utils.serialize_data(x_train, "data/processed/x_train_prep.pkl")
utils.serialize_data(y_train, "data/processed/y_train_prep.pkl")

utils.serialize_data(x_valid, "data/processed/x_valid_prep.pkl")
utils.serialize_data(y_valid, "data/processed/y_valid_prep.pkl")

utils.serialize_data(x_test, "data/processed/x_test_prep.pkl")
utils.serialize_data(y_test, "data/processed/y_test_prep.pkl")

Serialized data/processed/x_train_prep.pkl
Serialized data/processed/y_train_prep.pkl
Serialized data/processed/x_valid_prep.pkl
Serialized data/processed/y_valid_prep.pkl
Serialized data/processed/x_test_prep.pkl
Serialized data/processed/y_test_prep.pkl
