In [125]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle

from datetime import datetime, timedelta

In [126]:
# Load the four competition tables in one pass; the unpacking order matches
# the order of the paths below.
building_info, train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dacon_data/building_info.csv",
        "dacon_data/train.csv",
        "dacon_data/test.csv",
        "dacon_data/sample_submission.csv",
    )
)

In [127]:
# Parse the "일시" column once ("YYYYMMDD HH": an 8-digit date, a space, then
# the hour) and derive every calendar feature from that single parsed Series
# with vectorized .dt accessors, instead of one row-wise apply per feature.
timestamp = pd.to_datetime(train_df["일시"], format="%Y%m%d %H")

# Cast to int64 so the columns keep the dtype produced by plain Python ints.
train_df["year"] = timestamp.dt.year.astype("int64")
train_df["month"] = timestamp.dt.month.astype("int64")
train_df["day"] = timestamp.dt.day.astype("int64")
train_df["hour"] = timestamp.dt.hour.astype("int64")

# Midnight of the same calendar day (the hour is dropped); later cells compare
# this column against datetime cut-offs.
train_df["date"] = timestamp.dt.normalize()

# Monday=0 ... Sunday=6, the same convention as datetime.weekday().
train_df["dow"] = timestamp.dt.dayofweek.astype("int64")

In [128]:
# Fill gaps in the weather readings one building at a time. Interpolating over
# the whole frame would let a gap at a building boundary be filled from a
# different building's readings. limit_direction="both" also fills gaps at the
# very start or end of a building's series from its nearest valid reading, so
# the per-building split does not leave NaNs behind at the edges.
for weather_col in ["풍속(m/s)", "습도(%)"]:
    train_df[weather_col] = train_df.groupby("건물번호")[weather_col].transform(
        lambda series: series.interpolate(limit_direction="both")
    )

In [129]:
# Map each distinct "건물유형" value to an integer code, numbered in order of
# first appearance in building_info.
type_dict = {
    label: code for code, label in enumerate(building_info["건물유형"].unique())
}

In [130]:
# Integer-encode the building type with the lookup table built above; an
# unknown label raises KeyError.
building_info["type"] = [type_dict[label] for label in building_info["건물유형"]]

In [131]:
# Attach the per-building static attributes to every training row. A left join
# keeps all rows of train_df even if a building had no entry in building_info.
building_cols = ["건물번호", "연면적(m2)", "냉방면적(m2)", "type"]
train_df2 = train_df.merge(building_info[building_cols], on="건물번호", how="left")

In [132]:
# Natural log of the "연면적(m2)" column, stored as the area feature.
floor_area = train_df2["연면적(m2)"]
train_df2["area_norm"] = np.log(floor_area)

In [133]:
# Compute the temperature mean/std only from rows dated before 2022-08-01, then
# apply that standardization to every row.
before_cutoff = train_df2["date"] < datetime(2022, 8, 1)
train_df2_train = train_df2[before_cutoff]

train_temperature = train_df2_train["기온(C)"]
celcius_mean = train_temperature.mean()
celcius_std = train_temperature.std()

train_df2["celcius"] = train_df2["기온(C)"].sub(celcius_mean).div(celcius_std)
# Humidity is divided by 100 (the column name marks it as a percentage).
train_df2["humidity"] = train_df2["습도(%)"].div(100)

In [134]:
def get_week_no(date):
    """Return the Sunday-based week-of-month number for ``date``.

    Weeks start on Sunday. The reference Sunday ("origin") depends on the
    weekday of the 1st of the month:

    * 1st is a Sunday: the origin is the 1st itself.
    * 1st is Mon-Wed: the origin is the Sunday just before the 1st, so the
      partial first week counts as week 1.
    * 1st is Thu-Sat: the origin is the first Sunday inside the month, so the
      days before it get week number 0.

    Args:
        date: a ``datetime``-like object supporting ``replace(day=...)``,
            ``weekday()`` and subtraction yielding an object with ``.days``.

    Returns:
        int: ``(date - origin).days // 7 + 1``.
    """
    month_start = date.replace(day=1)
    start_dow = month_start.weekday()  # Monday=0 ... Sunday=6

    if start_dow == 6:
        shift_days = 0
    elif start_dow < 3:
        shift_days = -(start_dow + 1)
    else:
        shift_days = 6 - start_dow

    week_origin = month_start + timedelta(days=shift_days)
    elapsed = date - week_origin
    # Floor division keeps dates before the origin at week 0 (-1 + 1).
    return elapsed.days // 7 + 1

In [135]:
# Week-of-month number for every row, computed element-wise from the day-level
# "date" column.
train_df2["week_num"] = train_df2["date"].map(get_week_no)

In [47]:
# ---- Configuration ---------------------------------------------------------
# Continuous inputs; column 0 (power consumption) is also the prediction target.
value_features = ["전력소비량(kWh)", "celcius", "humidity", "풍속(m/s)", "area_norm"]
# Integer-coded inputs, stored separately under the "e" key of each output.
cat_features = ["week_num", "dow", "type", "hour", "건물번호"]

data_dir = "dacon_train"
seq_len = 336   # rows in each input window
pred_len = 168  # rows in each target window
split_date = datetime(2022, 8, 1)  # rows dated before this belong to training

os.makedirs(data_dir, exist_ok=True)

# Offsets relative to an anchor row t: inputs cover t-(seq_len-1) .. t,
# targets cover t+1 .. t+pred_len.
x_offsets = np.arange(-(seq_len - 1), 1)
y_offsets = np.arange(1, pred_len + 1)


def _make_windows(values, embeddings, x_offsets, y_offsets):
    """Slice every full (input, target) window pair out of one building's rows.

    Args:
        values: 2-D array of continuous features, one row per timestep.
        embeddings: 2-D array of categorical features, row-aligned with ``values``.
        x_offsets: 1-D int array of input offsets relative to the anchor row.
        y_offsets: 1-D int array of target offsets relative to the anchor row.

    Returns:
        Tuple ``(x, e, y)`` with shapes ``(n, len(x_offsets), values.shape[1])``,
        ``(n, len(x_offsets), embeddings.shape[1])`` and ``(n, len(y_offsets), 1)``.
        ``y`` holds only column 0 of ``values``. ``n`` is 0 when the series is
        too short to hold a single full window pair.
    """
    first_anchor = -x_offsets.min()
    stop_anchor = len(values) - y_offsets.max()  # exclusive upper bound
    anchors = np.arange(first_anchor, stop_anchor)[:, np.newaxis]
    x_index = anchors + x_offsets
    y_index = anchors + y_offsets
    return values[x_index], embeddings[x_index], values[y_index, :1]


x_train_list, train_embedding_list, y_train_list = [], [], []
x_valid_list, valid_embedding_list, y_valid_list = [], [], []
test_dict = dict()

for num in train_df2["건물번호"].unique():
    temp = train_df2[train_df2["건물번호"] == num]

    # The slices are deliberately not named train_df / test_df, so the
    # notebook-level frames loaded from CSV are not overwritten by this loop.
    train_part = temp[temp["date"] < split_date]
    # Validation starts 14 days before the split so that the first validation
    # window has input context. With hourly rows 14 days equals seq_len rows --
    # TODO keep in sync if seq_len changes.
    valid_part = temp[temp["date"] >= (split_date - timedelta(days=14))]
    # The most recent seq_len rows are the input for the final forecast.
    test_part = temp.iloc[-seq_len:]

    x_part, e_part, y_part = _make_windows(
        train_part[value_features].values,
        train_part[cat_features].values,
        x_offsets,
        y_offsets,
    )
    x_train_list.append(x_part)
    train_embedding_list.append(e_part)
    y_train_list.append(y_part)

    x_part, e_part, y_part = _make_windows(
        valid_part[value_features].values,
        valid_part[cat_features].values,
        x_offsets,
        y_offsets,
    )
    x_valid_list.append(x_part)
    valid_embedding_list.append(e_part)
    y_valid_list.append(y_part)

    test_value_ = test_part[value_features].values
    test_embedding_ = test_part[cat_features].values

    # Anchor the offsets at the last row, as the train/valid windows do.
    # Indexing with the raw offsets [-335..-1, 0] would wrap around and return
    # rows [1..335, 0], putting the oldest row at the end of the sequence.
    last_t = len(test_value_) - 1
    x_test = test_value_[last_t + x_offsets, :].reshape(-1, seq_len, len(value_features))
    embedding_test = test_embedding_[last_t + x_offsets, :].reshape(-1, seq_len, len(cat_features))

    print(f"{num} x_size:{x_test.shape}, e_size:{embedding_test.shape}")
    test_dict[num] = {"x": x_test, "e": embedding_test}


# Each list entry is already (n_windows, steps, features) for one building, so
# concatenating along axis 0 gives the building-major ordering.
x_train = np.concatenate(x_train_list, axis=0)
y_train = np.concatenate(y_train_list, axis=0)
embedding_train = np.concatenate(train_embedding_list, axis=0)

np.savez_compressed(f"{data_dir}/train.npz", x=x_train, y=y_train, e=embedding_train)
print(f"train x_size:{x_train.shape}, y_size:{y_train.shape}, e_size:{embedding_train.shape}")

x_valid = np.concatenate(x_valid_list, axis=0)
y_valid = np.concatenate(y_valid_list, axis=0)
embedding_valid = np.concatenate(valid_embedding_list, axis=0)

np.savez_compressed(f"{data_dir}/valid.npz", x=x_valid, y=y_valid, e=embedding_valid)
print(f"valid x_size:{x_valid.shape}, y_size:{y_valid.shape}, e_size:{embedding_valid.shape}")

# Per-building test inputs, keyed by building number.
with open(f"{data_dir}/test.pkl", "wb") as f:
    pickle.dump(test_dict, f)

1 x_size:(1, 336, 5), e_size:(1, 336, 5)
2 x_size:(1, 336, 5), e_size:(1, 336, 5)
3 x_size:(1, 336, 5), e_size:(1, 336, 5)
4 x_size:(1, 336, 5), e_size:(1, 336, 5)
5 x_size:(1, 336, 5), e_size:(1, 336, 5)
6 x_size:(1, 336, 5), e_size:(1, 336, 5)
7 x_size:(1, 336, 5), e_size:(1, 336, 5)
8 x_size:(1, 336, 5), e_size:(1, 336, 5)
9 x_size:(1, 336, 5), e_size:(1, 336, 5)
10 x_size:(1, 336, 5), e_size:(1, 336, 5)
11 x_size:(1, 336, 5), e_size:(1, 336, 5)
12 x_size:(1, 336, 5), e_size:(1, 336, 5)
13 x_size:(1, 336, 5), e_size:(1, 336, 5)
14 x_size:(1, 336, 5), e_size:(1, 336, 5)
15 x_size:(1, 336, 5), e_size:(1, 336, 5)
16 x_size:(1, 336, 5), e_size:(1, 336, 5)
17 x_size:(1, 336, 5), e_size:(1, 336, 5)
18 x_size:(1, 336, 5), e_size:(1, 336, 5)
19 x_size:(1, 336, 5), e_size:(1, 336, 5)
20 x_size:(1, 336, 5), e_size:(1, 336, 5)
21 x_size:(1, 336, 5), e_size:(1, 336, 5)
22 x_size:(1, 336, 5), e_size:(1, 336, 5)
23 x_size:(1, 336, 5), e_size:(1, 336, 5)
24 x_size:(1, 336, 5), e_size:(1, 336, 5)
2