In [3]:
# preprocess_for_llm.py

import pandas as pd
import os
from sklearn.model_selection import train_test_split

INPUT_FILE = "../data/processed/etth1_processed.csv"  # <-- your data
OUTPUT_DIR = "../data/llm_preprocessed/"
WINDOW_SIZE = 10  # how many past points to use
PREDICT_SIZE = 1  # how many future points to predict

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
# 1. Load
df = pd.read_csv(INPUT_FILE)

# Assuming your timeseries column is named 'OT'
values = df['OT'].tolist()

In [5]:

# 2. Create prompt-target pairs
inputs = []
targets = []

for i in range(len(values) - WINDOW_SIZE - PREDICT_SIZE):
    input_seq = values[i : i + WINDOW_SIZE]
    target_seq = values[i + WINDOW_SIZE : i + WINDOW_SIZE + PREDICT_SIZE]
    
    prompt = f"Given past values: {input_seq}, predict next {PREDICT_SIZE} value(s):"
    target = str(target_seq)
    
    inputs.append(prompt)
    targets.append(target)


In [6]:
#show some part of traind and test data
print("Sample input-output pairs:")
for i in range(5):
    print(f"Input: {inputs[i]}")
    print(f"Target: {targets[i]}")
    print()


Sample input-output pairs:
Input: Given past values: [0.6910176320171663, 0.6362329675922095, 0.6362329675922095, 0.5814682574604971, 0.5196557975809077, 0.5042026826110102, 0.5365064746921488, 0.5435342320660694, 0.5140455566415751, 0.4297721787114119], predict next 1 value(s):
Target: [0.4803442024190923]

Input: Given past values: [0.6362329675922095, 0.6362329675922095, 0.5814682574604971, 0.5196557975809077, 0.5042026826110102, 0.5365064746921488, 0.5435342320660694, 0.5140455566415751, 0.4297721787114119, 0.4803442024190923], predict next 1 value(s):
Target: [0.4831393267017803]

Input: Given past values: [0.6362329675922095, 0.5814682574604971, 0.5196557975809077, 0.5042026826110102, 0.5365064746921488, 0.5435342320660694, 0.5140455566415751, 0.4297721787114119, 0.4803442024190923, 0.4831393267017803], predict next 1 value(s):
Target: [0.464891087449195]

Input: Given past values: [0.5814682574604971, 0.5196557975809077, 0.5042026826110102, 0.5365064746921488, 0.5435342320660694

In [7]:
# 3. Split into train/test
train_inputs, val_inputs, train_targets, val_targets = train_test_split(inputs, targets, test_size=0.2, random_state=42)

In [8]:
# 4. Save
pd.DataFrame({"prompt": train_inputs, "completion": train_targets}).to_csv(f"{OUTPUT_DIR}/train.csv", index=False)
pd.DataFrame({"prompt": val_inputs, "completion": val_targets}).to_csv(f"{OUTPUT_DIR}/val.csv", index=False)

print(f"✅ Preprocessing done! {len(train_inputs)} train samples, {len(val_inputs)} val samples saved at {OUTPUT_DIR}")

✅ Preprocessing done! 13927 train samples, 3482 val samples saved at ../data/llm_preprocessed/
