In [1]:

import pandas as pd
import os

WINDOW_SIZE = 10
PREDICT_SIZE = 1

# Paths to split datasets
TRAIN_FILE = "../data/processed/etth1_train.csv"
VAL_FILE = "../data/processed/etth1_val.csv"
OUTPUT_DIR = "../data/llm_preprocessed/"

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [2]:
def generate_prompt_target_pairs(values, window_size, predict_size):
    inputs, targets = [], []
    for i in range(len(values) - window_size - predict_size):
        input_seq = values[i : i + window_size]
        target_seq = values[i + window_size : i + window_size + predict_size]
        prompt = f"Given past values: {input_seq}, predict next {predict_size} value(s):"
        target = str(target_seq)
        inputs.append(prompt)
        targets.append(target)
    return inputs, targets

In [3]:
# 🔹 1. Load train and val sets
train_df = pd.read_csv(TRAIN_FILE)
val_df = pd.read_csv(VAL_FILE)

train_values = train_df['OT'].tolist()
val_values = val_df['OT'].tolist()

In [4]:
# 🔹 2. Generate prompt-target pairs
train_inputs, train_targets = generate_prompt_target_pairs(train_values, WINDOW_SIZE, PREDICT_SIZE)
val_inputs, val_targets = generate_prompt_target_pairs(val_values, WINDOW_SIZE, PREDICT_SIZE)

In [6]:

# 🔹 3. Save the preprocessed LLM inputs
pd.DataFrame({"prompt": train_inputs, "completion": train_targets}).to_csv(f"{OUTPUT_DIR}/train.csv", index=False)
pd.DataFrame({"prompt": val_inputs, "completion": val_targets}).to_csv(f"{OUTPUT_DIR}/val.csv", index=False)


In [7]:
# 🔹 4. Print sample
print("Sample prompt-target pairs:")
for i in range(3):
    print(f"Prompt: {train_inputs[i]}")
    print(f"Completion: {train_targets[i]}\n")

print(f"✅ Preprocessing done!\n→ {len(train_inputs)} train samples saved to {OUTPUT_DIR}/train.csv\n→ {len(val_inputs)} val samples saved to {OUTPUT_DIR}/val.csv")

Sample prompt-target pairs:
Prompt: Given past values: [0.6910176320171663, 0.6362329675922095, 0.6362329675922095, 0.5814682574604971, 0.5196557975809077, 0.5042026826110103, 0.5365064746921488, 0.5435342320660694, 0.5140455566415751, 0.4297721787114119], predict next 1 value(s):
Completion: [0.4803442024190923]

Prompt: Given past values: [0.6362329675922095, 0.6362329675922095, 0.5814682574604971, 0.5196557975809077, 0.5042026826110103, 0.5365064746921488, 0.5435342320660694, 0.5140455566415751, 0.4297721787114119, 0.4803442024190923], predict next 1 value(s):
Completion: [0.4831393267017803]

Prompt: Given past values: [0.6362329675922095, 0.5814682574604971, 0.5196557975809077, 0.5042026826110103, 0.5365064746921488, 0.5435342320660694, 0.5140455566415751, 0.4297721787114119, 0.4803442024190923, 0.4831393267017803], predict next 1 value(s):
Completion: [0.464891087449195]

✅ Preprocessing done!
→ 13925 train samples saved to ../data/llm_preprocessed//train.csv
→ 1731 val samples s