#### Create `data/interim/interim.parquet`

```bash
bash scripts/make_interim.sh \
  --raw data/raw/*.csv \
  --out data/interim \
  --gap_hours 6 --max_sog 40
```



#### Sanity check


In [5]:
```python
# Set PWD to project root
import os
from pathlib import Path

# Hard-coded project root (notebook-safe)
root_dir = Path("/Users/alexanderschiotz/Desktop/DTU/Master/Deep Learning/projects/ais-mda")
if not root_dir.exists():
    raise FileNotFoundError(f"Configured project root does not exist: {root_dir}")

os.chdir(root_dir)
print(f"Set PWD to project root: {root_dir}")
```


Set PWD to project root: /Users/alexanderschiotz/Desktop/DTU/Master/Deep Learning/projects/ais-mda


In [6]:
```python
import pandas as pd
df = pd.read_parquet("data/interim/interim.parquet")
df.groupby(["mmsi","segment_id"]).size().describe()
df[["mmsi","segment_id"]].drop_duplicates().head()
```

Unnamed: 0,mmsi,segment_id
0,111000001,0
500,111000002,0
1000,111000003,0


## Build Processed Tensors for Model Training:
### Trajectory task
```bash
bash scripts/make_processed.sh \
  --interim data/interim/interim.parquet \
  --task trajectory --window 64 --horizon 12 \
  --out data/processed/traj_w64_h12
```

### ETA task
```bash
bash scripts/make_processed.sh \
  --interim data/interim/interim.parquet \
  --task eta --window 64 \
  --out data/processed/eta_w64
```
### Anomaly task
```bash
bash scripts/make_processed.sh \
  --interim data/interim/interim.parquet \
  --task anomaly --window 64 --horizon 12 \
  --out data/processed/anom_w64_h12
```
---

### Train Models
GRU baseline (trajectory)
```bash
python -m src.train.train_traj --config configs/traj_gru_small.yaml
```

TPTrans hybrid (CNN + Transformer)
```bash
python -m src.train.train_traj --config configs/traj_tptrans_base.yaml
```

ETA prediction (GRU)
```bash
python -m src.train.train_eta --config configs/eta_gru.yaml
```

Trajectory metrics plus a few sample trajectory plots written to `data/figures/`:
```bash
bash scripts/eval.sh
```

Or run individually:
```bash
python -m src.eval.evaluate_traj --processed_dir data/processed/traj_w64_h12 \
  --ckpt data/checkpoints/traj_model.pt --model tptrans --plot
```

```bash
python -m src.eval.evaluate_eta --processed_dir data/processed/eta_w64 \
  --ckpt data/checkpoints/eta_model.pt
```

Summary of improvements:

- Ensures `out_dir` exists before saving anything.
- Saves `scaler.npz` for all tasks (helps stable training/eval).
- Saves `window_mmsi.npy` for trajectory and anomaly for vessel-wise splits.
- Keeps dtypes tidy (float32).
- Leaves your existing behavior intact otherwise.


Rebuild trajectory with scaler + window_mmsi saved:
```bash
bash scripts/make_processed.sh \
  --interim data/interim/interim.parquet \
  --task trajectory --window 64 --horizon 12 \
  --out data/processed/traj_w64_h12
```

(Optional) Rebuild ETA and anomaly similarly:
```bash
bash scripts/make_processed.sh \
  --interim data/interim/interim.parquet \
  --task eta --window 64 \
  --out data/processed/eta_w64


bash scripts/make_processed.sh \
  --interim data/interim/interim.parquet \
  --task anomaly --window 64 --horizon 12 \
  --out data/processed/anom_w64_h12
```

Then retrain/evaluate:
```bash
python -m src.train.train_traj --config configs/traj_tptrans_base.yaml
python -m src.eval.evaluate_traj --processed_dir data/processed/traj_w64_h12 \
  --ckpt data/checkpoints/traj_model.pt --model tptrans --plot
```

Run nested CV:
```bash
 python -m src.train.nested_cv_traj \
  --processed_dir data/processed/traj_w64_h12 \
  --model tptrans \
  --outer_folds 5 \
  --inner_folds 3 \
  --max_trials 8
```

After changing it, rebuild and retrain:
```bash
# Rebuild trajectory processed (creates target_scaler.npz)
bash scripts/make_processed.sh \
  --interim data/interim/interim.parquet \
  --task trajectory --window 64 --horizon 12 \
  --out data/processed/traj_w64_h12

# Retrain (TPTrans or GRU)
python -m src.train.train_traj --config configs/traj_tptrans_base.yaml

# Re-evaluate (now unscales predictions automatically)
python -m src.eval.evaluate_traj \
  --processed_dir data/processed/traj_w64_h12 \
  --ckpt data/checkpoints/traj_tptrans.pt \
  --model tptrans \
  --plot
```