# Generate Data Files

This notebook generates normalized test data files for trajectory-based experiments using the Porto or San Francisco dataset (chosen via `selected_dataset`). It loads raw trajectory data, filters it to the dataset's selected square, applies min-max normalization, prepares input and output sequences for model testing, and saves the processed data as pickle files—including the test trajectories (`X`, `Y`), their sequence lengths, and the normalization parameters (per-feature minimum and maximum values).

## **Imports**

In [3]:
import os
# oneDNN warning suppression TF 2.4.1
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import copy

import numpy as np
import pandas as pd
import pickle
import statistics

from scipy.stats import energy_distance, wasserstein_distance
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from math import radians
from sklearn.metrics.pairwise import haversine_distances

from utils.data import *
from utils.plots import *
from utils.metrics import *
from models import *
from apu_trajgen import *

import warnings
warnings.filterwarnings('ignore', category=UserWarning)

## **Generate Files**

In [4]:
# Dataset to process. The value is used as the key into DATASET and DATA_SQUARE
# and, lower-cased, as the prefix of every output file name.
# Swap the two lines below to switch datasets.
# selected_dataset = "PORTO"
selected_dataset = "SANFRANCISCO"

#### **Load and Pre-process Data** 

In [5]:

# Pipeline for this cell: load trajectories -> keep those in the dataset's square ->
# record min/max values -> keep only COLUMNS -> min-max normalize -> build X/Y ->
# run test_data_preparation to obtain X_test, Y_test and the sequence lengths.

# Load trajectory data from a pickle file for faster execution
# NOTE(review): TOTAL_TRAJS presumably caps how many trajectories are loaded — confirm in load_data_from_pickle
data = load_data_from_pickle(DATASET[selected_dataset], TOTAL_TRAJS)

# Label used in the output file names ("test" or "train").
# In the cells shown here it only affects file names, not which trajectories are selected.
subset = "test" # or "train"

# Get the data in the selected square
data = get_data_in_square(data = data, square = DATA_SQUARE[selected_dataset])

# Get trajectories min and max values (computed after the square filter, before column selection)
mins, maxs =  get_min_max_from_data(data)

# Get number of trajectories (after the square filter)
num_of_traj = len(data)

# Bundle the min and max values so they can be saved alongside the data.
# This line does not normalize anything; normalization happens below.
normalization_ranges = {"min": mins, "max": maxs}

# Only keep the lat and lon columns for now
data = [data[i][COLUMNS] for i in range(num_of_traj)]

# Normalize the data with min-max normalization.
# `scaler` is returned but not used in the cells visible here.
# NOTE(review): the ranges saved later come from get_min_max_from_data, not from `scaler` —
# confirm that both describe the same transformation.
scaler, data = normalize_trajectory_data(dataset = data, normalization_type = 'min-max')

MAX_K = 1 # Maximum k value for the model MAX_K>=1

# Create X and Y from the data
X, Y =  create_X_Y_from_data(data, num_of_traj, k=MAX_K)

# Prepare the data with the test-data preparation helper (the original note indicates that
# train data is produced in this same format when subset is "train").
# Deep copies of X and Y are passed so the helper works on its own objects.
# NOTE(review): training_size = 0 presumably means no trajectories are held out for training,
# and TESTING_FILE = None that no separate test file is read — confirm in test_data_preparation.
X_test, Y_test, test_traj_seq_lengths = test_data_preparation(TRAINING_TESTING_SAME_FILE = TRAINING_TESTING_SAME_FILE,
                                                                X = copy.deepcopy(X), Y = copy.deepcopy(Y),
                                                                num_of_traj = num_of_traj,
                                                                training_size = 0,
                                                                SEQ_LEN = SEQ_LEN,
                                                                NUM_FEATS = NUM_FEATS,
                                                                TESTING_FILE = None,
                                                                data = data)



#### **Save Files** 

In [6]:
# Write every prepared object to its own file via save_pickle.
# Each path is built as: DATA_FOLDER + <dataset name, lower-cased> + <tag> + subset + ".pkl"
save_prefix = DATA_FOLDER + selected_dataset.lower()

artifacts = (
    ("_X_", X_test),                                    # prepared X sequences
    ("_Y_", Y_test),                                    # prepared Y sequences
    ("_seq_len_", test_traj_seq_lengths),               # sequence lengths of the test trajectories
    ("_normalization_ranges_", normalization_ranges),   # minimum and maximum values for each feature
)

# Files are written in the order listed above.
for name_tag, obj in artifacts:
    save_pickle(obj, save_prefix + name_tag + subset + ".pkl")