In [None]:
import os

import numpy as np

def find_most_similar_sequence(PV_data, time_stamps, x, pred_len):
    """Retrieve the window of ``PV_data`` closest to ``x`` and what follows it.

    Every window of ``len(x)`` consecutive samples that still leaves room for
    ``pred_len`` following samples is compared with ``x`` by mean squared
    error (MSE). The earliest window with the smallest MSE is selected.
    Windows whose MSE is NaN are never selected; if no window has a finite
    MSE, the first window (index 0) is returned.

    Parameters
    ----------
    PV_data : array-like
        Series to search (the retrieval pool), indexed along its first axis.
    time_stamps : sequence
        Time stamps aligned index-for-index with ``PV_data``. They are only
        sliced, never interpreted.
    x : array-like
        Query window; its length defines the window size.
    pred_len : int
        Number of samples following the matched window to return.

    Returns
    -------
    R_x
        The matched window, ``PV_data[start:start + len(x)]``.
    Time_R_x
        Time stamps of the matched window.
    R_x_next_step
        The ``pred_len`` samples immediately after the matched window.
    Time_R_x_next_step
        Time stamps of those following samples.

    Raises
    ------
    ValueError
        If ``x`` is empty, ``pred_len`` is negative, or ``PV_data`` is shorter
        than ``len(x) + pred_len`` (no complete candidate exists).
    """
    # Function-scope import: available in numpy >= 1.20.
    from numpy.lib.stride_tricks import sliding_window_view

    pool = np.asarray(PV_data)
    query = np.asarray(x)
    seq_len = len(query)
    total_len = len(pool)

    if seq_len == 0:
        raise ValueError("x must contain at least one sample")
    if pred_len < 0:
        raise ValueError("pred_len must be non-negative")

    n_candidates = total_len - seq_len - pred_len + 1
    if n_candidates <= 0:
        # Previously this case skipped the search and silently returned
        # truncated slices starting at index 0.
        raise ValueError(
            "PV_data has %d samples but at least %d are required "
            "(len(x) + pred_len)" % (total_len, seq_len + pred_len)
        )

    # Zero-copy view of every candidate window; the window axis comes last.
    windows = sliding_window_view(pool, seq_len, axis=0)[:n_candidates]
    # Materialises one (n_candidates x window) temporary for the squared errors.
    sq_err = (np.moveaxis(query, 0, -1) - windows) ** 2
    mse = sq_err.reshape(n_candidates, -1).mean(axis=1)

    # A NaN score must never win (a strict "<" comparison rejects it), so treat
    # it as +inf. argmin returns the first minimum, i.e. the earliest window.
    mse = np.where(np.isnan(mse), np.inf, mse)
    best_start = int(np.argmin(mse))

    match_end = best_start + seq_len
    future_end = match_end + pred_len

    # Slice the caller's own objects so the returned types are unchanged.
    R_x = PV_data[best_start:match_end]
    Time_R_x = time_stamps[best_start:match_end]
    R_x_next_step = PV_data[match_end:future_end]
    Time_R_x_next_step = time_stamps[match_end:future_end]

    return R_x, Time_R_x, R_x_next_step, Time_R_x_next_step

In [None]:
import pandas as pd
# Load the solar power table; later cells read its 'Power (MW)' and 'date' columns.
df = pd.read_csv('./dataset_hour_csv/solar_state_1_hour.csv')

In [None]:
# Split the data by row position (presumably chronological order — confirm the
# CSV is sorted by date): first 50% -> retrieval pool, next 30% -> training set,
# last 20% -> test set.
PV_data = df['Power (MW)'].values
time_stamps = df['date'].values

n_samples = len(PV_data)
pool_end = int(n_samples * 0.5)    # boundary between retrieval pool and training set
train_end = int(n_samples * 0.8)   # boundary between training set and test set

retrival_pool, retrical_time_stamps = PV_data[:pool_end], time_stamps[:pool_end]
train_pv_data, train_date = PV_data[pool_end:train_end], time_stamps[pool_end:train_end]
test_pv_data, test_date = PV_data[train_end:], time_stamps[train_end:]

# Window sizes used when building samples: 48 input steps, 24 steps to predict.
input_len, pred_len = 48, 24


In [4]:
# Sanity check: shape of the full power series before it is windowed.
PV_data.shape

(17544,)

In [5]:
# Training-set construction.
# Slide a window over the training series; for every input window, look up the
# most similar window in the retrieval pool and keep both, together with their
# continuations (labels) and all matching time stamps.
train_input, train_input_time = [], []
train_retrival, train_retrival_time = [], []
train_retrival_label, train_retrival_label_time = [], []
train_label, train_label_time = [], []

n_train_windows = len(train_pv_data) - input_len - pred_len + 1
for start in range(n_train_windows):
    split = start + input_len   # first index of the span to predict
    stop = split + pred_len     # one past the last index to predict

    window = train_pv_data[start:split]
    # One full search of the retrieval pool per input window.
    retrieved = find_most_similar_sequence(retrival_pool, retrical_time_stamps, window, pred_len)
    match, match_time, match_next, match_next_time = retrieved

    train_input.append(window)
    train_input_time.append(train_date[start:split])
    train_retrival.append(match)
    train_retrival_time.append(match_time)
    train_retrival_label.append(match_next)
    train_retrival_label_time.append(match_next_time)
    train_label.append(train_pv_data[split:stop])
    train_label_time.append(train_date[split:stop])

# Stack each list of equally sized windows along a new leading (sample) axis.
(
    train_input,
    train_input_time,
    train_retrival,
    train_retrival_time,
    train_retrival_label,
    train_retrival_label_time,
    train_label,
    train_label_time,
) = [
    np.stack(columns)
    for columns in (
        train_input,
        train_input_time,
        train_retrival,
        train_retrival_time,
        train_retrival_label,
        train_retrival_label_time,
        train_label,
        train_label_time,
    )
]

In [None]:
# Persist all training arrays in one .npz archive, stored under the keyword names below.
# np.savez does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout fails here with FileNotFoundError.
os.makedirs('./solar_1_48_24', exist_ok=True)
# NOTE(review): the *_time arrays come from df['date'].values; if that column is
# stored as Python strings (object dtype) they are pickled, and loading them
# needs np.load(..., allow_pickle=True) — confirm against the consumer.
np.savez('./solar_1_48_24/train_dataset.npz',
        train_input=train_input,
        train_input_time=train_input_time,
        train_retrival=train_retrival,
        train_retrival_time=train_retrival_time,
        train_retrival_label=train_retrival_label,
        train_retrival_label_time=train_retrival_label_time,
        train_label=train_label,
        train_label_time=train_label_time
)

In [7]:
# Test-set construction.
# Same procedure as for the training set, applied to the test series: every
# input window is paired with its closest window from the retrieval pool, and
# the continuations and time stamps of both are recorded.
test_input, test_input_time = [], []
test_retrival, test_retrival_time = [], []
test_retrival_label, test_retrival_label_time = [], []
test_label, test_label_time = [], []

n_test_windows = len(test_pv_data) - input_len - pred_len + 1
for start in range(n_test_windows):
    split = start + input_len   # first index of the span to predict
    stop = split + pred_len     # one past the last index to predict

    window = test_pv_data[start:split]
    # Retrieval always searches the pool, never the test series itself.
    retrieved = find_most_similar_sequence(retrival_pool, retrical_time_stamps, window, pred_len)
    match, match_time, match_next, match_next_time = retrieved

    test_input.append(window)
    test_input_time.append(test_date[start:split])
    test_retrival.append(match)
    test_retrival_time.append(match_time)
    test_retrival_label.append(match_next)
    test_retrival_label_time.append(match_next_time)
    test_label.append(test_pv_data[split:stop])
    test_label_time.append(test_date[split:stop])

# Stack each list of equally sized windows along a new leading (sample) axis.
(
    test_input,
    test_input_time,
    test_retrival,
    test_retrival_time,
    test_retrival_label,
    test_retrival_label_time,
    test_label,
    test_label_time,
) = [
    np.stack(columns)
    for columns in (
        test_input,
        test_input_time,
        test_retrival,
        test_retrival_time,
        test_retrival_label,
        test_retrival_label_time,
        test_label,
        test_label_time,
    )
]

In [None]:
# Persist all test arrays in one .npz archive, stored under the keyword names below.
# np.savez does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout fails here with FileNotFoundError.
os.makedirs('./solar_1_48_24', exist_ok=True)
# NOTE(review): the *_time arrays come from df['date'].values; if that column is
# stored as Python strings (object dtype) they are pickled, and loading them
# needs np.load(..., allow_pickle=True) — confirm against the consumer.
np.savez('./solar_1_48_24/test_dataset.npz',
        test_input=test_input,
        test_input_time=test_input_time,
        test_retrival=test_retrival,
        test_retrival_time=test_retrival_time,
        test_retrival_label=test_retrival_label,
        test_retrival_label_time=test_retrival_label_time,
        test_label=test_label,
        test_label_time=test_label_time
         )