In [51]:
import copy
import pandas as pd
import src.util as utils
from sklearn.model_selection import train_test_split

## Load Configuration File

In [52]:
# Load the project configuration; later cells index it like a dict
# (dataset path, column lists, value ranges, output paths).
config = utils.load_config()

## Load Dataset

In [53]:
# Read the raw CSV from the path stored under "dataset_path" in the config.
dataset = pd.read_csv(config["dataset_path"])

In [54]:
# Display the loaded frame to eyeball its columns and values.
dataset

Unnamed: 0,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,jam_level,median_length,median_delay_seconds,median_regular_speed,total_records,cause_type,median_seconds,median_speed,date,median_jam_level,id,geometry
0,2022-07-06 00:00:00.000,32.75,KOTA BEKASI,Tol Cikampek (Cikunir-Cikarang),4,869.0,803.5,67.32000,38,,674.0,4.355000,2022-07-06,4.0,1493116,"MULTILINESTRING ((106.972028 -6.250261, 106.97..."
1,2022-07-06 01:00:00.000,32.75,KOTA BEKASI,Tol Cikampek (Cikunir-Cikarang),4,1717.0,1049.0,67.48000,12,,1135.0,5.475000,2022-07-06,4.0,1493118,"MULTILINESTRING ((106.972028 -6.250261, 106.97..."
2,2022-07-06 01:00:00.000,32.75,KOTA BEKASI,Tol Cikampek (Cikunir-Cikarang),3,1985.5,394.5,67.57001,10,,494.0,14.075001,2022-07-06,3.0,1493117,"MULTILINESTRING ((106.972028 -6.250261, 106.97..."
3,2022-07-06 06:00:00.000,32.75,KOTA BEKASI,Wibawa Mukti 2,4,1913.0,1254.0,7.29000,11,,1486.0,4.970000,2022-07-06,4.0,1493127,"MULTILINESTRING ((106.951773 -6.313015, 106.95..."
4,2022-07-06 06:00:00.000,32.75,KOTA BEKASI,Cipendawa Baru,4,708.0,769.0,6.05000,5,,852.0,2.960000,2022-07-06,4.0,1493119,"MULTILINESTRING ((106.975595 -6.299589, 106.97..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14468,2022-09-04 21:00:00.000,32.75,KOTA BEKASI,,2,651.0,122.0,27.56000,7,,194.0,12.110000,2022-09-04,2.0,1583680,"MULTILINESTRING ((106.958862 -6.249992, 106.95..."
14469,2022-09-04 22:00:00.000,32.75,KOTA BEKASI,Tol Becakayu,2,2600.0,154.0,84.78500,4,,271.5,34.510002,2022-09-04,2.0,1583683,"MULTILINESTRING ((106.944381 -6.249804, 106.94..."
14470,2022-09-04 22:00:00.000,32.75,KOTA BEKASI,,3,651.0,255.0,28.23000,7,,327.0,7.150000,2022-09-04,3.0,1583684,"MULTILINESTRING ((106.958862 -6.249992, 106.95..."
14471,2022-09-06 00:00:00.000,32.75,KOTA BEKASI,Tol Cikampek (Cikunir-Cikarang),4,1507.0,399.0,71.86000,33,,475.0,10.600000,2022-09-06,4.0,1584500,"MULTILINESTRING ((106.97037 -6.251164, 106.968..."


## Data Validation

In [55]:
# Count missing values per column.
dataset.isnull().sum()

time                             0
kemendagri_kabupaten_kode        0
kemendagri_kabupaten_nama        0
street                         113
jam_level                        0
median_length                    0
median_delay_seconds             0
median_regular_speed             0
total_records                    0
cause_type                   14473
median_seconds                   0
median_speed                     0
date                             0
median_jam_level                 0
id                               0
geometry                         0
dtype: int64

In [56]:
# Inspect the dtype pandas inferred for each column.
dataset.dtypes

time                          object
kemendagri_kabupaten_kode    float64
kemendagri_kabupaten_nama     object
street                        object
jam_level                      int64
median_length                float64
median_delay_seconds         float64
median_regular_speed         float64
total_records                  int64
cause_type                   float64
median_seconds               float64
median_speed                 float64
date                          object
median_jam_level             float64
id                             int64
geometry                      object
dtype: object

In [57]:
# Summary statistics of the numeric columns, transposed so each column is a row.
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
kemendagri_kabupaten_kode,14473.0,32.75,0.0,32.75,32.75,32.75,32.75,32.75
jam_level,14473.0,3.441097,0.666461,1.0,3.0,4.0,4.0,4.0
median_length,14473.0,1406.26,1222.173324,500.0,732.0,1089.0,1607.0,15551.0
median_delay_seconds,14473.0,574.4575,297.477652,-821.0,382.0,543.0,716.0,3777.5
median_regular_speed,14473.0,20.92735,16.206334,3.23,12.73,17.045,22.02,333.86
total_records,14473.0,14.77745,15.529021,1.0,4.0,9.0,20.0,155.0
cause_type,0.0,,,,,,,
median_seconds,14473.0,714.8761,330.331465,111.0,505.0,672.0,870.0,4164.0
median_speed,14473.0,7.787513,5.871103,0.655,4.325,6.34,9.115,53.68
median_jam_level,14473.0,3.441097,0.666461,1.0,3.0,4.0,4.0,4.0


In [58]:
# Parse the "time" column into pandas datetimes.
dataset["time"] = pd.to_datetime(dataset["time"])

In [59]:
# Parse the "date" column into pandas datetimes.
dataset["date"] = pd.to_datetime(dataset["date"])

## Data Defense

In [60]:
def check_data(input_data: pd.DataFrame, config: dict, api: bool = False) -> None:
    """Validate the dtypes and value ranges of ``input_data`` against ``config``.

    The function only reads its arguments; neither the frame nor the config
    is modified.

    Parameters
    ----------
    input_data : pd.DataFrame
        Frame to validate.
    config : dict
        Must provide ``int_columns`` and ``float_columns`` (ordered lists of
        column names) plus a ``range_<name>`` entry whose first two items are
        the inclusive lower and upper bound, for each of ``median_length``,
        ``median_delay_seconds``, ``median_regular_speed``, ``median_speed``,
        ``jam_level`` and ``total_records``.
    api : bool, default False
        When False, the frame's int and float columns must equal the full
        configured lists. When True, only the first two configured int
        columns and the float columns at positions 1, 2, 3 and 6 of
        ``float_columns`` are expected.

    Raises
    ------
    AssertionError
        If the int/float column lists differ from what is expected, or if any
        value of a checked column lies outside its configured range (missing
        values count as out of range). Note that Python strips ``assert``
        statements when run with ``-O``.
    """
    if api:
        # Per the original note, the last configured int column is not sent
        # as a predictor, so only the first two int columns are expected.
        expected_int_columns = config["int_columns"][:2]
        # Same float positions that the range checks below look at.
        expected_float_columns = [config["float_columns"][idx] for idx in (1, 2, 3, 6)]
    else:
        expected_int_columns = config["int_columns"]
        expected_float_columns = config["float_columns"]

    assert input_data.select_dtypes("int").columns.to_list() == expected_int_columns, "an error occurs in int column(s)."
    assert input_data.select_dtypes("float").columns.to_list() == expected_float_columns, "an error occurs in float column(s)."

    # (config list holding the column name, position in that list, range key)
    range_checks = (
        ("float_columns", 1, "range_median_length"),
        ("float_columns", 2, "range_median_delay_seconds"),
        ("float_columns", 3, "range_median_regular_speed"),
        ("float_columns", 6, "range_median_speed"),
        ("int_columns", 0, "range_jam_level"),
        ("int_columns", 1, "range_total_records"),
    )
    for columns_key, position, range_key in range_checks:
        column = config[columns_key][position]
        lower, upper = config[range_key][0], config[range_key][1]
        # ``between`` is inclusive on both ends and False for NaN.
        assert input_data[column].between(lower, upper).all(), f"an error occurs in {range_key}."

In [61]:
# Run the full (non-API) validation on the loaded frame; raises AssertionError on failure.
check_data(input_data=dataset, config=config)

## Data Splitting

In [62]:
# Independent copies of the predictor columns and the label column,
# selected by the names listed in the config.
x = dataset.loc[:, config["predictors"]].copy()
y = dataset.loc[:, config["label"]].copy()

In [63]:
# Display the predictor frame.
x

Unnamed: 0,jam_level,median_length,median_delay_seconds,median_regular_speed,total_records,median_speed
0,4,869.0,803.5,67.32000,38,4.355000
1,4,1717.0,1049.0,67.48000,12,5.475000
2,3,1985.5,394.5,67.57001,10,14.075001
3,4,1913.0,1254.0,7.29000,11,4.970000
4,4,708.0,769.0,6.05000,5,2.960000
...,...,...,...,...,...,...
14468,2,651.0,122.0,27.56000,7,12.110000
14469,2,2600.0,154.0,84.78500,4,34.510002
14470,3,651.0,255.0,28.23000,7,7.150000
14471,4,1507.0,399.0,71.86000,33,10.600000


In [64]:
# Display the label series.
y

0         674.0
1        1135.0
2         494.0
3        1486.0
4         852.0
          ...  
14468     194.0
14469     271.5
14470     327.0
14471     475.0
14472     346.0
Name: median_seconds, Length: 14473, dtype: float64

In [65]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# Predictors and label side by side in one frame.
cleaned_dataset = pd.concat([x, y], axis="columns")

In [66]:
# Persist the cleaned frame (predictors + label) built in the split cell.
# Previously the raw `dataset` was written to the "cleaned" path and
# `cleaned_dataset` was never used.
utils.pickle_dump(cleaned_dataset, config["dataset_cleaned_path"])

# Train split: path index 0 receives the predictors, index 1 the label.
utils.pickle_dump(x_train, config["train_set_path"][0])
utils.pickle_dump(y_train, config["train_set_path"][1])

# Test split, same path layout as the train split.
utils.pickle_dump(x_test, config["test_set_path"][0])
utils.pickle_dump(y_test, config["test_set_path"][1])