### 전처리 노트북 3/5: 변수 정규화
사용법: 
- 전처리 노트북 2를 실행한 후 생성된 01_electric_train_features.csv, 01_electric_test_features.csv를 asset_path에 저장하세요
- 만약 data_path가 없는 경로라면 생성해주세요
- 각 셀을 순서대로 실행

결과물:
- 02_train_input_transformed.csv
- 02_train_elec_transformed.csv
- 02_train_meta_transformed.csv
- 02_test_input_transformed.csv
- 02_test_meta_transformed.csv

In [1]:
# Standard library imports
from pathlib import Path
from datetime import datetime
import json
import pickle

from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

# Data handling and preprocessing
import pandas as pd
import numpy as np
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler, StandardScaler, OneHotEncoder


# Parallel processing
from multiprocessing import Pool

# YAML file handling
import yaml

from preprocessing_utils import Backup, get_data_paths
# Resolve the project directories. In this notebook asset_path is where the
# feature CSVs are read from and the transformed CSVs are written to, and
# data_path is where the fitted transformers are pickled.
asset_path, data_path = get_data_paths()

# NOTE(review): SAVE is not read by any of the visible cells; presumably it is
# meant to gate file output -- confirm before relying on it.
SAVE = True

In [2]:
# Load the feature tables for both splits from asset_path.
train_data = pd.read_csv(Path(asset_path) / '01_electric_train_features.csv')
test_data = pd.read_csv(Path(asset_path) / '01_electric_test_features.csv')

# Parse the timestamp column into pandas datetimes, train first, then test.
train_data['datetime'] = pd.to_datetime(train_data['datetime'])
test_data['datetime'] = pd.to_datetime(test_data['datetime'])

# Keep the `num` and `datetime` columns of each split as plain arrays.
pre_num_train, pre_datetime_train = train_data['num'].values, train_data['datetime'].values
pre_num_test, pre_datetime_test = test_data['num'].values, test_data['datetime'].values

In [3]:
# Show the column index and the first rows of the training table.
print(train_data.columns)
train_data.head()

Index(['is_dummy', 'num', 'lat', 'lon', 'district', 'units', 'year',
       'datetime', 'holiday', 'altitude', 'temp', 'humid', 'wind', 'rain',
       'tchi', 'dci', 'hi', 'wchi', 'atemp', 'sum_qctr', 'sum_load',
       'avg_load', 'n_mean_load', 'elec'],
      dtype='object')


Unnamed: 0,is_dummy,num,lat,lon,district,units,year,datetime,holiday,altitude,...,tchi,dci,hi,wchi,atemp,sum_qctr,sum_load,avg_load,n_mean_load,elec
0,0,4821,33.273132,126.544771,3,11,2021,2021-01-01 00:00:00,1,18.254837,...,-1.0,40.481506,63.479224,681.185362,-1.580046,6950,751.32,68.301818,68.606449,99.56
1,0,4821,33.273132,126.544771,3,11,2021,2021-01-01 01:00:00,1,26.118528,...,-0.6,40.576487,63.122127,702.765266,-1.669947,6950,692.6,62.963636,68.606449,91.78
2,0,4821,33.273132,126.544771,3,11,2021,2021-01-01 02:00:00,1,31.523261,...,-1.3,40.517872,63.42806,733.39076,-2.077127,6950,597.48,54.316364,68.606449,79.17
3,0,4821,33.273132,126.544771,3,11,2021,2021-01-01 03:00:00,1,33.736168,...,-0.2,39.665205,65.751762,683.542071,-2.043783,6950,553.48,50.316364,68.606449,73.34
4,0,4821,33.273132,126.544771,3,11,2021,2021-01-01 04:00:00,1,32.382356,...,-0.8,39.72829,65.665011,674.487163,-1.985171,6950,526.24,47.84,68.606449,69.73


In [4]:
# Show the column index and the first rows of the test table.
print(test_data.columns)
test_data.head()

Index(['is_dummy', 'num', 'lat', 'lon', 'district', 'units', 'year',
       'datetime', 'holiday', 'altitude', 'temp', 'humid', 'wind', 'rain',
       'tchi', 'dci', 'hi', 'wchi', 'atemp'],
      dtype='object')


Unnamed: 0,is_dummy,num,lat,lon,district,units,year,datetime,holiday,altitude,temp,humid,wind,rain,tchi,dci,hi,wchi,atemp
0,0,4816,33.274294,126.272389,3,11,2023,2023-01-01 00:00:00,1,18.097368,3.0,68.6,2.9,0.0,-0.1,40.95762,60.453669,738.881591,-1.316025
1,0,4816,33.274294,126.272389,3,11,2023,2023-01-01 01:00:00,1,25.985794,3.1,69.4,2.7,0.0,0.3,41.016686,59.985268,724.527134,-1.043713
2,0,4816,33.274294,126.272389,3,11,2023,2023-01-01 02:00:00,1,31.427094,3.6,68.3,2.3,0.0,1.2,41.883312,57.956953,686.953076,-0.229598
3,0,4816,33.274294,126.272389,3,11,2023,2023-01-01 03:00:00,1,33.687383,4.0,69.2,3.1,0.0,1.1,42.38472,56.264272,725.197689,-0.314628
4,0,4816,33.274294,126.272389,3,11,2023,2023-01-01 04:00:00,1,32.384968,4.2,69.5,2.5,0.0,2.0,42.65331,55.431667,685.767983,0.339766


____

In [5]:
# Electricity-related columns; the cells below skip them for the test set.
elec_variables = [
    'sum_qctr',
    'sum_load',
    'avg_load',
    'n_mean_load',
    'elec',
]

# Columns scaled with StandardScaler (output suffix `_st`).
standard_transformed = [
    'temp',
    'humid',
    'tchi',
    'dci',
    'hi',
    'wchi',
    'atemp',
]

# Columns mapped with QuantileTransformer (output suffix `_qt`).
to_quantile_transformed = ['rain', 'units', *elec_variables]

# Columns scaled with MinMaxScaler (output suffix `_mm`).
min_max_transformed = [
    'lon',
    'lat',
    'wind',
    'altitude',
]

# (column, categories) pairs that are one-hot encoded (output suffix `_c<i>`).
categorical_to_onehot = [
    ('district', [0, 1, 2, 3]),
]

# Columns copied through unchanged; the electricity columns are kept raw as
# well as quantile-transformed.
no_transform = ['is_dummy', 'num', 'year', 'datetime', 'holiday', *elec_variables]

In [6]:
# Model input features, in the column order used for the saved input tables.
input_cols = [
    # one-hot district indicators
    'district_c0', 'district_c1', 'district_c2', 'district_c3',
    # static attributes
    'units_qt', 'lon_mm', 'lat_mm',
    'holiday',
    'altitude_mm',
    # standard-scaled columns
    'temp_st', 'humid_st', 'tchi_st', 'dci_st', 'hi_st', 'wchi_st', 'atemp_st',
    # remaining columns
    'rain_qt', 'wind_mm',
]

# Electricity columns: quantile-transformed versions first, then the raw values.
elec_cols = [
    'sum_qctr_qt', 'sum_load_qt', 'avg_load_qt', 'n_mean_load_qt', 'elec_qt',
    'sum_qctr', 'sum_load', 'avg_load', 'n_mean_load', 'elec',
]

# Bookkeeping columns saved alongside the features.
meta_cols = [
    'is_dummy',
    'num',
    'year',
    'datetime',
]

### Fitting Transforms on Train Data (Calculate Statistics, etc.)

In [7]:
# Fitted transformer per source column; reused later to transform the test set.
transforms = {}

# Output column name -> 1-D array of training values (transformed and raw).
train_value_dict = {}

# QuantileTransformer draws a random subsample of rows when the data exceeds
# its `subsample` limit, so a fixed seed is needed for the fit (and the pickled
# transformers) to be reproducible between runs.
QUANTILE_RANDOM_STATE = 0

# (output suffix, columns, factory) -- the factory builds a fresh, unfitted
# transformer for every column. Dispatching on the group keeps the transformer
# consistent with the suffix and removes the need for a fallback branch.
transform_groups = [
    ('_st', standard_transformed, StandardScaler),
    (
        '_qt',
        to_quantile_transformed,
        lambda: QuantileTransformer(
            output_distribution='uniform',
            random_state=QUANTILE_RANDOM_STATE,
        ),
    ),
    ('_mm', min_max_transformed, MinMaxScaler),
]

# The pickles below are written into data_path, so make sure it exists.
Path(data_path).mkdir(parents=True, exist_ok=True)

for tag, cols, make_transformer in transform_groups:
    for col in cols:
        # Double brackets keep the column 2-D (n_rows, 1) for scikit-learn.
        col_values = train_data[[col]].values

        transformer = make_transformer()
        col_transformed = transformer.fit_transform(col_values)
        transforms[col] = transformer

        train_value_dict[col + tag] = col_transformed.ravel()

        # Persist the transformers of the electricity columns to data_path.
        if col in elec_variables:
            with open(Path(data_path, f'{col}_transform.pkl'), 'wb') as f:
                pickle.dump(transformer, f)

# One-hot encode the categorical columns against their fixed category lists;
# each category becomes its own `<col>_c<i>` output column.
for col, cat in categorical_to_onehot:
    encoder = OneHotEncoder(categories=[cat])
    onehot = encoder.fit_transform(train_data[col].values.reshape(-1, 1)).toarray()
    transforms[col] = encoder
    for i in range(onehot.shape[1]):
        train_value_dict[col + f'_c{i}'] = onehot[:, i].ravel()

# Pass the remaining columns through unchanged.
for col in no_transform:
    train_value_dict[col] = train_data[col].values.ravel()

In [8]:
# List the column names collected for the training set.
train_value_dict.keys()

dict_keys(['temp_st', 'humid_st', 'tchi_st', 'dci_st', 'hi_st', 'wchi_st', 'atemp_st', 'rain_qt', 'units_qt', 'sum_qctr_qt', 'sum_load_qt', 'avg_load_qt', 'n_mean_load_qt', 'elec_qt', 'lon_mm', 'lat_mm', 'wind_mm', 'altitude_mm', 'district_c0', 'district_c1', 'district_c2', 'district_c3', 'is_dummy', 'num', 'year', 'datetime', 'holiday', 'sum_qctr', 'sum_load', 'avg_load', 'n_mean_load', 'elec'])

In [9]:
# Sanity check: the collected training columns must be exactly the union of
# the input, elec and meta column lists -- nothing missing, nothing extra.
assert {*input_cols, *elec_cols, *meta_cols} == set(train_value_dict)

### Apply Transforms on Test Data

In [10]:
# Output column name -> 1-D array of test values.
test_value_dict = {}

suffix_groups = (
    ('_st', standard_transformed),
    ('_qt', to_quantile_transformed),
    ('_mm', min_max_transformed),
)

# Apply the transformers fitted on the training data; nothing is re-fitted here.
for tag, cols in suffix_groups:
    for col in cols:
        # Electricity columns are skipped for the test set.
        if col in elec_variables:
            continue

        fitted = transforms[col]
        scaled = fitted.transform(test_data[[col]].values).ravel()

        # Min-max scaled test values can leave the training range; trim to 0~1.
        if col in min_max_transformed:
            scaled = np.clip(scaled, 0, 1)

        test_value_dict[col + tag] = scaled

# One-hot encode with the fitted encoders, one output column per category.
for col, cat in categorical_to_onehot:
    encoded = transforms[col].transform(test_data[col].values.reshape(-1, 1)).toarray()
    for i in range(encoded.shape[1]):
        test_value_dict[col + f'_c{i}'] = encoded[:, i].ravel()

# Copy the untransformed columns, again leaving out the electricity columns.
for col in no_transform:
    if col not in elec_variables:
        test_value_dict[col] = test_data[col].values.ravel()


In [11]:
# Sanity check: the collected test columns must be exactly the input and meta
# columns. The message is a real string (the previous `print(...)` expression
# evaluated to None) and reports both missing and unexpected columns.
expected_test_cols = set(input_cols + meta_cols)
actual_test_cols = set(test_value_dict)
assert actual_test_cols == expected_test_cols, (
    f'test columns mismatch: '
    f'missing={sorted(expected_test_cols - actual_test_cols)}, '
    f'unexpected={sorted(actual_test_cols - expected_test_cols)}'
)

In [12]:
# Assemble one table per split from the collected column arrays
# (each dictionary key becomes a column).
train_whole_df = pd.DataFrame.from_dict(train_value_dict)
test_whole_df = pd.DataFrame.from_dict(test_value_dict)

In [13]:
# Split the training table into its input, elec and meta column groups.
train_input_df, train_elec_df, train_meta_df = (
    train_whole_df[group] for group in (input_cols, elec_cols, meta_cols)
)

# The test table is split into input and meta column groups only.
test_input_df, test_meta_df = (
    test_whole_df[group] for group in (input_cols, meta_cols)
)

In [14]:
# File name -> table to write; every table goes to asset_path without its index.
transformed_outputs = {
    '02_train_input_transformed.csv': train_input_df,
    '02_train_elec_transformed.csv': train_elec_df,
    '02_train_meta_transformed.csv': train_meta_df,
    '02_test_input_transformed.csv': test_input_df,
    '02_test_meta_transformed.csv': test_meta_df,
}

for filename, frame in transformed_outputs.items():
    frame.to_csv(Path(asset_path, filename), index=False)

___