### 전처리 노트북 4/5: Day Padding
설명:
- 본 전처리에서는 편의성을 위하여 연도마다 데이터가 독립적으로 구분되도록 yeartag를 지정하는 중임
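- 시간 지연 변수를 추가할 때 1월 1일에서 과거 연도의 데이터를 참조하지 않도록 하기 위해, 실제 모델 학습에는 사용되지 않지만 같은 yeartag를 가지는 dummy 데이터를 각 연도의 첫 시각 이전 PAD_HOUR 시간만큼 생성함
- 연도 시작 이전의 정보가 없는 경우(2020년, 그리고 중간 연도의 데이터가 존재하지 않는 격자인 경우), 해당 연도의 첫 PAD_HOUR 시간 정보를 그대로 복사하여 직전 PAD_HOUR 시간의 정보로 추가 (예: PAD_HOUR=24이면 1월 1일의 정보를 그대로 복사하여 작년 12월 31일 정보로 추가)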

사용법: 
- 전처리 노트북 3을 실행한 후 결과물들을 asset_path에 저장
- 각 opt에 대하여 셀을 순서대로 실행

결과물:
- 03_train_input_padding.csv
- 03_train_elec_padding.csv
- 03_train_meta_padding.csv
- 03_test_input_padding.csv
- 03_test_meta_padding.csv

In [1]:
import yaml
from pathlib import Path
import pandas as pd
import numpy as np

import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

from preprocessing_utils import Backup, get_data_paths
# Directories used below: intermediate CSVs are read from / written to asset_path,
# and the optional final copies go to data_path (both come from the project helper).
asset_path, data_path = get_data_paths()

# When True, the last cell additionally writes the padded CSVs into data_path.
SAVE_TO_DATA = False

# Number of hourly rows prepended before the first timestamp of each (num, year) group.
PAD_HOUR = 240

In [2]:
# The pad must cover whole days: when no earlier data exists, the i-th pad row is a
# copy of the group's i-th row, so a multiple of 24 keeps hour-of-day aligned
# (assumes each group starts at a day boundary with contiguous hourly rows — TODO confirm).
assert PAD_HOUR % 24 == 0

In [3]:
# Column groups used to split the padded frame back into the input / elec / meta CSVs.
input_cols = ['district_c0', 'district_c1', 'district_c2', 'district_c3', 'units_qt', 'lon_mm', 'lat_mm', 'holiday', 'altitude_mm', 'temp_st', 'humid_st', 'tchi_st', 'dci_st', 'hi_st', 'wchi_st', 'atemp_st', 'rain_qt', 'wind_mm']
# Electricity columns; only written for the train split.
elec_cols  = ['sum_qctr_qt', 'sum_load_qt', 'avg_load_qt', 'n_mean_load_qt', 'elec_qt', 'sum_qctr', 'sum_load', 'avg_load', 'n_mean_load', 'elec']
# Bookkeeping columns; 'is_dummy' is updated later to also mark the padding rows.
meta_cols  = ['is_dummy', 'num', 'year', 'datetime']

In [4]:
# Load the step-02 ("transformed") CSVs from asset_path.
# The train split has input, elec and meta parts; the test split has no elec part.
train_input_df = pd.read_csv(Path(asset_path, '02_train_input_transformed.csv'))
train_elec_df = pd.read_csv(Path(asset_path, '02_train_elec_transformed.csv'))
train_meta_df = pd.read_csv(Path(asset_path, '02_train_meta_transformed.csv'))

test_input_df = pd.read_csv(Path(asset_path, '02_test_input_transformed.csv'))
test_meta_df = pd.read_csv(Path(asset_path, '02_test_meta_transformed.csv'))

In [16]:
# Split to process in the cells below: index 0 selects 'train', index 1 selects 'test'.
# Change the index and re-run the following cells to pad the other split.
opt = ['train', 'test'][0]

In [17]:
# Pick the per-split parts (the test split has no elec part), place them side by
# side as one frame, and parse 'datetime' into real timestamps.
if opt == 'train':
    _parts = [train_input_df, train_elec_df, train_meta_df]
else:
    _parts = [test_input_df, test_meta_df]

total_df = pd.concat(_parts, axis=1)
total_df['datetime'] = pd.to_datetime(total_df['datetime'])

In [18]:
# Remember the pre-padding 'num' and 'datetime' sequences; after padding they are
# compared against the non-padding rows to verify nothing real was lost or reordered.
pre_num = total_df['num'].values
pre_datetime = total_df['datetime'].values

In [19]:
# Store total_df under a split-specific key with the project Backup helper.
Backup.save_with_key(total_df, f'backup1_{opt}')

# Set to True to replace total_df with the stored snapshot
# (presumably a copy of what was saved — verify against preprocessing_utils.Backup).
get_backup = False
if get_backup:
    total_df = Backup.copy_data_with_key(f'backup1_{opt}')

In [20]:
# Group the rows by (num, year); each group is padded independently.
grouped = total_df.groupby(['num', 'year'])

# Maps each (num, year) key to its padded DataFrame.
result_dict = {}

# Worker that pads a single (num, year) group.
def process_group(name_group):
    """Prepend PAD_HOUR hourly padding rows to one ``(num, year)`` group.

    For each of the PAD_HOUR hourly timestamps immediately before the group's
    first timestamp:

    * if ``total_df`` holds a row for the same ``num`` at that timestamp, that
      row is copied and re-tagged so it belongs to this group's year;
    * otherwise the group's own row at the same offset is copied and only its
      ``datetime`` is moved onto the pad timestamp.

    Every padding row gets ``padding_dummy == 1``; the group's own rows get
    ``padding_dummy == 0``.

    Args:
        name_group: ``((num, year), group_df)`` pair as yielded by iterating
            ``total_df.groupby(['num', 'year'])``. ``group_df`` is expected to
            be ordered by ``datetime`` (its first row is taken as the start).

    Returns:
        ``((num, year), padded_df)`` where ``padded_df`` is sorted by
        ``datetime`` and has a fresh 0..n-1 index.

    Raises:
        IndexError: if a fallback copy is needed at an offset beyond the
            group's length (group shorter than the pad).

    Reads the module-level ``total_df`` and ``PAD_HOUR``.
    """
    name, group = name_group
    num, year = name

    # Work on a copy so the caller's frame (a groupby slice) is never modified.
    group = group.copy()
    group['padding_dummy'] = 0

    first_datetime = group['datetime'].iloc[0]

    # The PAD_HOUR hourly timestamps immediately preceding the first timestamp.
    new_datetimes = pd.date_range(end=first_datetime, periods=PAD_HOUR+1, freq='h')[:-1]

    # Narrow total_df to this num's rows inside the pad window once, instead of
    # rescanning the whole frame for every pad timestamp.
    history = total_df[(total_df['num'] == num) & total_df['datetime'].isin(new_datetimes)]

    new_rows = []
    for idx, dts in enumerate(new_datetimes):
        search = history[history['datetime'] == dts]
        if search.shape[0] > 0:
            if search.shape[0] > 1:
                # More than one row for the same (num, datetime); the first one is used.
                print('error')
            new_row = search.iloc[0].copy()
            new_row['padding_dummy'] = 1
            # Kept so the output columns stay as before.
            new_row['yeartag'] = year
            # The copied row still carries the previous year's tag; re-tag it so
            # the padding belongs to the group it pads.
            new_row['year'] = year
        else:
            # No earlier data: reuse the group's own row at the same offset.
            new_row = group.iloc[idx].copy()
            new_row['datetime'] = dts
            new_row['padding_dummy'] = 1
        new_rows.append(new_row)

    if not new_rows:
        # PAD_HOUR == 0: nothing to prepend.
        return name, group.sort_values(by='datetime').reset_index(drop=True)

    # Each element is a row Series; concatenating as columns and transposing
    # turns them back into rows.
    new_rows = pd.concat(new_rows, axis=1).T

    group = pd.concat([new_rows, group]).sort_values(by='datetime').reset_index(drop=True)
    return name, group

# Pad every group in a pool of worker processes and store each padded frame under
# its (num, year) key as soon as its worker finishes.
with ProcessPoolExecutor() as pool:
    pending = [pool.submit(process_group, pair) for pair in grouped]

    for finished in tqdm(as_completed(pending), total=len(pending)):
        key, padded = finished.result()
        result_dict[key] = padded

100%|██████████| 866/866 [34:01<00:00,  2.36s/it]  


In [21]:
# Positions (not index labels) of the first row of every run of identical 'num'
# values. Working positionally keeps this correct even if total_df does not have
# a default 0..n-1 index.
is_run_start = (total_df['num'].shift() != total_df['num']).to_numpy()
change_indices = np.flatnonzero(is_run_start)
num_order = total_df['num'].to_numpy()[change_indices]

# Prints True only when every 'num' forms a single contiguous run, i.e. when
# num_order lists each 'num' exactly once in order of first appearance.
print(len(num_order) == len(total_df['num'].unique()))

# Combine all groups into a single DataFrame
final_df = pd.concat(result_dict.values()).reset_index(drop=True)

# Make 'num' an ordered categorical so sorting follows the original order of
# appearance instead of numeric order.
final_df['num'] = pd.Categorical(final_df['num'], categories=num_order, ordered=True)

# Sort by 'num' (original order) and then by 'datetime'.
final_df = final_df.sort_values(by=['num', 'datetime']).reset_index(drop=True)

final_df.head()

True


Unnamed: 0,district_c0,district_c1,district_c2,district_c3,units_qt,lon_mm,lat_mm,holiday,altitude_mm,temp_st,...,sum_load,avg_load,n_mean_load,elec,is_dummy,num,year,datetime,padding_dummy,yeartag
0,0.0,0.0,0.0,1.0,0.066066,0.081734,0.0,1,0.369594,-1.332963,...,751.32,68.301818,68.606449,99.56,0,4821,2021,2020-12-22 00:00:00,1,
1,0.0,0.0,0.0,1.0,0.066066,0.081734,0.0,1,0.449759,-1.321778,...,692.6,62.963636,68.606449,91.78,0,4821,2021,2020-12-22 01:00:00,1,
2,0.0,0.0,0.0,1.0,0.066066,0.081734,0.0,1,0.504857,-1.332963,...,597.48,54.316364,68.606449,79.17,0,4821,2021,2020-12-22 02:00:00,1,
3,0.0,0.0,0.0,1.0,0.066066,0.081734,0.0,1,0.527416,-1.388888,...,553.48,50.316364,68.606449,73.34,0,4821,2021,2020-12-22 03:00:00,1,
4,0.0,0.0,0.0,1.0,0.066066,0.081734,0.0,1,0.513615,-1.388888,...,526.24,47.84,68.606449,69.73,0,4821,2021,2020-12-22 04:00:00,1,


In [22]:
# The non-padding rows must reproduce the pre-padding 'num' and 'datetime'
# sequences exactly, in the same order.
assert (pre_num == final_df[final_df['padding_dummy'] == 0]['num'].values).all()
assert (pre_datetime ==  pd.to_datetime(final_df[final_df['padding_dummy'] == 0]['datetime'])).all()

In [23]:
# Store the padded frame under a split-specific key with the project Backup helper.
Backup.save_with_key(final_df, f'backup2_{opt}')

# Enabled here: final_df is immediately replaced by the stored snapshot
# (presumably a copy of what was just saved — verify against preprocessing_utils.Backup).
get_backup = True
if get_backup:
    final_df = Backup.copy_data_with_key(f'backup2_{opt}')

In [24]:
# Fold the padding marker into 'is_dummy'.
pad_flag = final_df['padding_dummy'].values.astype(int)
orig_flag = final_df['is_dummy'].values.astype(int)

# For 0/1 flags, x + y - x*y is 1 when either flag is set and 0 otherwise.
merged_flag = pad_flag + orig_flag - pad_flag * orig_flag
final_df['is_dummy'] = merged_flag.astype(int)

# Counts shown as the cell output: merged dummies, padding rows, original dummies.
(
    np.sum(final_df['is_dummy'] == 1),
    np.sum(final_df['padding_dummy'] == 1),
    np.sum(orig_flag == 1),
)

(207840, 207840, 0)

In [25]:
# Split the padded frame back into its column groups and write the step-03 CSVs
# into asset_path. The test split has no elec part.
if opt == 'train':
    train_input_df, train_elec_df, train_meta_df = (
        final_df[cols] for cols in (input_cols, elec_cols, meta_cols)
    )
    _outputs = (
        (train_input_df, '03_train_input_padding.csv'),
        (train_elec_df, '03_train_elec_padding.csv'),
        (train_meta_df, '03_train_meta_padding.csv'),
    )
else:
    test_input_df, test_meta_df = final_df[input_cols], final_df[meta_cols]
    _outputs = (
        (test_input_df, '03_test_input_padding.csv'),
        (test_meta_df, '03_test_meta_padding.csv'),
    )

for _frame, _filename in _outputs:
    _frame.to_csv(Path(asset_path, _filename), index=False)

In [26]:
# Optionally also publish the padded frames into data_path under their final names.
# Nothing is written unless SAVE_TO_DATA is enabled.
if SAVE_TO_DATA and opt == 'train':
    train_input_df.to_csv(Path(data_path, 'train_input_variables.csv'), index=False)
    train_elec_df.to_csv(Path(data_path, 'train_elec_variables.csv'), index=False)
    train_meta_df.to_csv(Path(data_path, 'train_meta_information.csv'), index=False)
elif SAVE_TO_DATA:
    test_input_df.to_csv(Path(data_path, 'test_input_variables.csv'), index=False)
    test_meta_df.to_csv(Path(data_path, 'test_meta_information.csv'), index=False)