# Reading JSPN files and compiling the X and y vectors for LSTM

In [1]:
import os
import json
import numpy as np

DATA_DIR = "ml_data/"

X_list = []
y_list = []

param_keys = None
feature_keys = None

for fname in sorted(os.listdir(DATA_DIR)):
    if not fname.endswith(".json"):
        continue

    fpath = os.path.join(DATA_DIR, fname)
    with open(fpath, "r") as f:
        data = json.load(f)

    # -------- 1) check presence of outputs --------
    if "outputs" not in data:
        print(f"[WARN] File {fname} has no 'outputs' key, skipping")
        continue

    outputs = data["outputs"]

    if not isinstance(outputs, dict) or len(outputs) == 0:
        print(f"[WARN] File {fname} has empty or invalid 'outputs', skipping")
        continue

    # -------- 2) build time_keys --------
    try:
        time_keys = sorted(outputs.keys(), key=float)
    except ValueError:
        print(f"[WARN] File {fname} has non-numeric time keys: {list(outputs.keys())[:5]}")
        continue

    if len(time_keys) == 0:
        print(f"[WARN] File {fname} has no time steps, skipping")
        continue

    # -------- 3) params / target vector --------
    if "params" not in data:
        print(f"[WARN] File {fname} has no 'params' key, skipping")
        continue

    params = data["params"]
    if param_keys is None:
        param_keys = sorted(params.keys())
    y_vec = [params[k] for k in param_keys]
    y_list.append(y_vec)

    # -------- 4) feature names --------
    if feature_keys is None:
        sample_step = outputs[time_keys[0]]
        feature_keys = sorted(sample_step.keys())

    # Optional: check consistency of feature keys across files
    # current_keys = set(outputs[time_keys[0]].keys())
    # if set(feature_keys) != current_keys:
    #     print(f"[WARN] Feature mismatch in {fname}")

    # -------- 5) build (T, F) matrix for this file --------
    rows = []
    for t in time_keys:
        step_dict = outputs[t]
        row = [step_dict[k] for k in feature_keys]
        rows.append(row)

    X_sample = np.array(rows, dtype=np.float32)  # shape (T, F)
    X_list.append(X_sample)

# -------- 6) stack to arrays --------
X = np.stack(X_list, axis=0)  # (N, T, F)
y = np.stack(y_list, axis=0)  # (N, P)

print("X shape:", X.shape)
print("y shape:", y.shape)
print("param_keys:", param_keys)
print("first 5 feature_keys:", feature_keys[:5])

# %%


[WARN] File experiment_108.json has empty or invalid 'outputs', skipping
[WARN] File experiment_121.json has empty or invalid 'outputs', skipping
[WARN] File experiment_123.json has empty or invalid 'outputs', skipping
[WARN] File experiment_126.json has empty or invalid 'outputs', skipping
[WARN] File experiment_132.json has empty or invalid 'outputs', skipping
[WARN] File experiment_137.json has empty or invalid 'outputs', skipping
[WARN] File experiment_17.json has empty or invalid 'outputs', skipping
[WARN] File experiment_178.json has empty or invalid 'outputs', skipping
[WARN] File experiment_180.json has empty or invalid 'outputs', skipping
[WARN] File experiment_181.json has empty or invalid 'outputs', skipping
[WARN] File experiment_188.json has empty or invalid 'outputs', skipping
[WARN] File experiment_196.json has empty or invalid 'outputs', skipping
[WARN] File experiment_198.json has empty or invalid 'outputs', skipping
[WARN] File experiment_212.json has empty or invalid

# lets play!

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# Assume X, y are numpy arrays:
# X: (N, T, F), y: (N, P)
N, T, F = X.shape
P = y.shape[1]   # number of parameters to predict

# 1) Train / test split
split = int(0.8 * N)
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# 2) Tensors and dataloaders
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_ds = TensorDataset(
    torch.from_numpy(X_train).float(),
    torch.from_numpy(y_train).float()
)
test_ds = TensorDataset(
    torch.from_numpy(X_test).float(),
    torch.from_numpy(y_test).float()
)

# You can play with this:
batch_size = 32
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=len(test_ds), shuffle=False)

# 3) Model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size=32, num_layers=1, output_size=P):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # x: (batch, seq_len, input_size)
        out, _ = self.lstm(x)         # out: (batch, seq_len, hidden_size)
        out = out[:, -1, :]           # last timestep -> (batch, hidden_size)
        out = self.fc(out)            # (batch, output_size=P)
        return out

model = LSTMModel(input_size=F, hidden_size=32).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 4) Train
epochs = 10
model.train()
for epoch in range(1, epochs + 1):
    epoch_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)   # xb: (batch, T, F), yb: (batch, P)
        optimizer.zero_grad()
        preds = model(xb)                       # (batch, P)
        loss = criterion(preds, yb)             # MSE over all params
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * xb.size(0)
    epoch_loss /= len(train_loader.dataset)
    print(f"Epoch {epoch}/{epochs} - Train MSE: {epoch_loss:.6f}")

# 5) Evaluate
model.eval()
with torch.no_grad():
    for xb, yb in test_loader:
        xb, yb = xb.to(device), yb.to(device)
        preds = model(xb)
        test_mse = criterion(preds, yb).item()
print(f"Test MSE: {test_mse:.6f}")

# 6) Show a few predictions (for all params)
with torch.no_grad():
    sample_x = torch.from_numpy(X_test[:5]).float().to(device)
    sample_y = y_test[:5]   # shape (5, P)
    sample_preds = model(sample_x).cpu().numpy()  # (5, P)

for i in range(5):
    print(f"\nSample {i}:")
    print("  true:     ", sample_y[i])
    print("  predicted:", sample_preds[i])


Epoch 1/10 - Train MSE: 1492369628.715543
Epoch 2/10 - Train MSE: 1492353983.436950
Epoch 3/10 - Train MSE: 1492334189.231672
Epoch 4/10 - Train MSE: 1492318591.624633
Epoch 5/10 - Train MSE: 1492302163.706745
Epoch 6/10 - Train MSE: 1492287042.439883
Epoch 7/10 - Train MSE: 1492272258.627566
Epoch 8/10 - Train MSE: 1492257895.976540
Epoch 9/10 - Train MSE: 1492243716.129032
Epoch 10/10 - Train MSE: 1492229594.463343
Test MSE: 1646607872.000000

Sample 0:
  true:      [1.91380e+04 4.23950e+04 2.55920e+04 1.06400e+04 1.97700e+03 1.07479e+05
 2.17550e+04 1.00000e+00 7.40000e+01 3.00000e+00 1.20000e+01]
  predicted: [2.991295  3.4140208 3.6838093 3.086982  4.088244  3.8095102 3.7016006
 1.948569  2.9484906 1.7861248 2.6109757]

Sample 1:
  true:      [4.0488e+04 2.6503e+04 2.0591e+04 2.1608e+04 4.5677e+04 6.8478e+04
 4.9210e+04 2.0000e+00 6.6000e+01 3.0000e+00 1.1000e+01]
  predicted: [2.991295  3.4140208 3.6838095 3.086982  4.088244  3.8095107 3.701601
 1.948569  2.9484906 1.7861247 2.61