# Data Preprocessing for Synthetic Multi-Entity Multivariate Time-Series

**Author:** Bahar

This notebook demonstrates the preprocessing workflow for our **synthetic multi-entity time-series datasets** generated using the simulation in the Unity Environment.  


The steps include:  

1. **XML to CSV**  
   - Optional: Check Exported XML Files per Subfolder
       - In rare cases, some agents may not have any exported XML files.  
       - This step detects such cases and allows you to remove incomplete agents before preprocessing.
   - Save as CSV
   - Round Time
   
2. **Add Acceleration Using Savitzky–Golay Filter (SciPy)**  
   - Use `savgol_filter` with the second derivative to calculate acceleration.  
   
3. **Optional: Rename Files** 

> ⚠️ **Notes**
>
> 1. Some steps are optional and are meant to help clean and standardize the data before further analysis.
> 2. The number of agents is 20 in the Train Station Waiting Area simulation and 30 in the bi-directional Corridor Collision simulation.
> 3. The sampling rate is 10 Hz.

**Tip:** The code is provided as a starting point and "as is"—you are welcome to optimize or adapt it as desired.

## Importing the required libraries

In [1]:
import glob
import os
import csv
import re
import math
import pandas as pd
import numpy as np
from os import listdir, path
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
from scipy.interpolate import CubicSpline
from scipy.signal import medfilt
from scipy import signal
import shutil

In [2]:
# Root directory of the exported data; every later cell walks this tree recursively.
# NOTE(review): this is an absolute, machine-specific path — point it at your own copy of the data.
root_path = r'C:\myDrive\TuftsPhD2022\ProfMillerPhDResearch\PhDResearchProfMiller\Github-Official\BuildFiles_V2_CorrectxML'
# Minimum number of XML files a subfolder must contain to be considered complete
# (used as the threshold by the "few XML files" check and move helpers).
Num_agents = 20

## Remove the wrapper files

In [3]:
# Remove the unwanted "wrapper-*" files found anywhere below root_path.
pattern = os.path.join(root_path, "**", "wrapper-*")
fileList = glob.glob(pattern, recursive=True)
# Iterate over the list of file paths and remove each file.
for filePath in fileList:
    try:
        os.remove(filePath)
    except OSError as err:
        # Name the file and the reason so a failed deletion can be acted on.
        print(f"Error while deleting file {filePath}: {err}")

## Optional: Check Exported XML Files per Subfolder  

In rare cases, an agent may not have any exported XML files.  
You can detect such cases and remove those agents before preprocessing.  

- This step is optional.  
- Useful for ensuring that each agent has valid data before further processing.  

In [4]:
def count_subfolders_with_few_xml_files(parent_folder, min_xml_files=None):
    """Print how many direct subfolders of ``parent_folder`` hold too few XML files.

    Parameters
    ----------
    parent_folder : str
        Directory whose immediate subfolders are inspected (not recursive).
    min_xml_files : int, optional
        Minimum number of ``.xml`` files a subfolder needs in order not to be
        counted. When omitted, the notebook-level ``Num_agents`` is used.

    Returns
    -------
    None
        The result is only printed.
    """
    threshold = Num_agents if min_xml_files is None else min_xml_files

    # Initialize the counter
    subfolder_count_with_few_files = 0

    for subfolder in os.listdir(parent_folder):
        subfolder_path = os.path.join(parent_folder, subfolder)
        if os.path.isdir(subfolder_path):
            xml_files = [f for f in os.listdir(subfolder_path) if f.endswith('.xml')]
            if len(xml_files) < threshold:
                subfolder_count_with_few_files += 1

    # Report the threshold that was actually applied instead of a hard-coded "20",
    # so the message stays correct when the number of agents differs.
    print(f'Total number of subfolders with less than {threshold} XML files: {subfolder_count_with_few_files}')

# Report how many subfolders directly under root_path have fewer XML files than Num_agents.
count_subfolders_with_few_xml_files(root_path)

Total number of subfolders with less than 20 XML files: 0


In [None]:
def move_subfolders_with_few_xml_files(parent_folder, destination_folder, min_xml_files=None):
    """Move direct subfolders of ``parent_folder`` that hold too few XML files.

    Parameters
    ----------
    parent_folder : str
        Directory whose immediate subfolders are inspected (not recursive).
    destination_folder : str
        Directory that receives the moved subfolders; created if it is missing.
    min_xml_files : int, optional
        Minimum number of ``.xml`` files a subfolder needs in order to stay in
        place. When omitted, the notebook-level ``Num_agents`` is used.

    Returns
    -------
    None
        The number of moved subfolders is only printed.
    """
    threshold = Num_agents if min_xml_files is None else min_xml_files

    # Ensure the destination folder exists
    os.makedirs(destination_folder, exist_ok=True)
    destination_key = os.path.normcase(os.path.abspath(destination_folder))

    # Initialize the counter
    subfolder_count_with_few_files = 0

    for subfolder in os.listdir(parent_folder):
        subfolder_path = os.path.join(parent_folder, subfolder)
        if not os.path.isdir(subfolder_path):
            continue
        # If the destination lives inside parent_folder it must not be treated
        # as an incomplete subfolder and moved into itself.
        if os.path.normcase(os.path.abspath(subfolder_path)) == destination_key:
            continue

        xml_files = [f for f in os.listdir(subfolder_path) if f.endswith('.xml')]
        if len(xml_files) < threshold:
            # Move the subfolder to the destination folder
            shutil.move(subfolder_path, os.path.join(destination_folder, subfolder))
            subfolder_count_with_few_files += 1

    # Report the threshold that was actually applied instead of a hard-coded "20".
    print(f'Total number of subfolders with less than {threshold} XML files moved: {subfolder_count_with_few_files}')

# Folder that receives the incomplete subfolders (the helper creates it if missing).
# NOTE(review): 'pathTo\lessThan' looks like a placeholder — set a real location before running.
destination_folder_path = r'pathTo\lessThan'
# Move every subfolder of root_path that has fewer XML files than Num_agents.
move_subfolders_with_few_xml_files(root_path, destination_folder_path)

## Save XML files as CSV file

In [5]:
# Convert every XML log below root_path into a CSV written next to it
# (same file name with ".csv" appended).
cols = ["PX", "PY", "PZ", "RX", "RY", "RZ", "runTime", "AgentID"]

for subdir, dirs, files in os.walk(root_path):
    # AgentID restarts at 1 in every folder and only advances after a
    # successful conversion.
    t = 1
    # os.walk yields file names in arbitrary order; sorting makes the
    # AgentID assignment reproducible across platforms and re-runs.
    for file in sorted(files):
        if not file.endswith('.xml'):
            continue

        xml_file_path = os.path.join(subdir, file)
        rows = []
        try:
            root = ET.parse(xml_file_path).getroot()

            # One child element of the root per sample: the time stamp is the
            # "runningTime" attribute, the pose values are child elements.
            for i in root:
                rows.append({"PX": i.find("PX").text,
                             "PY": i.find("PY").text,
                             "PZ": i.find("PZ").text,
                             "RX": i.find("RX").text,
                             "RY": i.find("RY").text,
                             "RZ": i.find("RZ").text,
                             "runTime": i.attrib['runningTime'],
                             "AgentID": t})

            df = pd.DataFrame(rows, columns=cols)
            # Keep only the last sample recorded for each time stamp.
            df_new = df.drop_duplicates(subset='runTime', keep="last")

            with open(xml_file_path + '.csv', 'w', newline='') as result:
                df_new.to_csv(result, index=False)
                t = t + 1
        except Exception as e:
            # Still best-effort (a bad file must not stop the batch), but say
            # which file was skipped and why instead of failing silently.
            print(f"Skipping {xml_file_path} due to error: {e}")

## Removing XML files

In [6]:
# Delete each XML file below root_path once its converted CSV exists.
pattern = os.path.join(root_path, "**", "*.xml")
fileList = glob.glob(pattern, recursive=True)

for filePath in fileList:
    # The conversion step writes "<name>.xml.csv" next to the source file.
    # If that CSV is missing, the conversion did not succeed, so keep the
    # raw XML rather than destroying the only copy of the data.
    if not os.path.isfile(filePath + '.csv'):
        print(f"Keeping {filePath}: no converted CSV found")
        continue
    try:
        os.remove(filePath)
    except OSError as err:
        # Name the file and the reason so a failed deletion can be acted on.
        print(f"Error while deleting file {filePath}: {err}")

## Rounding the time 

In [7]:
# Add a uniform time column "t" to every CSV below root_path: it starts at the
# first runTime floored to one decimal and advances by 0.1 per row.
for root, dirs, files in os.walk(root_path):
    for file in files:
        if file.endswith(".csv"):
            filename = os.path.join(root, file)
            df = pd.read_csv(filename)

            # A header-only CSV has no first row to start from, and a file
            # without "runTime" is not a raw export; skip both instead of
            # aborting the whole batch with a KeyError.
            if df.empty or "runTime" not in df.columns:
                print(f"Skipping {filename}: no runTime samples to build a time axis from")
                continue

            # Round the first time stamp down to one decimal using math.floor()
            start = math.floor(df["runTime"].iloc[0]*10)/10
            # Compute every stamp directly from its row number rather than by
            # repeatedly adding 0.1, which is slow and accumulates float error.
            df["t"] = start + np.arange(len(df))/10

            # Write the updated DataFrame back to the CSV file
            df.to_csv(filename, index=False)

In [8]:
# Round the "t" column of every CSV below root_path to one decimal place and
# write the file back in place.
for folder, _subfolders, names in os.walk(root_path):
    csv_paths = [os.path.join(folder, name) for name in names if name.endswith(".csv")]
    for csv_path in csv_paths:
        frame = pd.read_csv(csv_path)
        rounded_t = frame['t'].round(1)
        frame['t'] = rounded_t
        frame.to_csv(csv_path, index=False)

## Add Acceleration Using Savitzky–Golay Filter (SciPy)

**Notes:**
- We use `scipy.signal.savgol_filter` with the *second derivative* to calculate acceleration.  
- This filter produces smoother and more accurate results in our tests.  
- Others are welcome to experiment with different methods for calculating acceleration.

In [9]:
# Savitzky–Golay settings shared by all three axes.
dt = 0.1                 # spacing between samples passed to the filter as `delta`
SG_WINDOW_LENGTH = 21    # number of samples in the filter window
SG_POLYORDER = 3         # order of the polynomial fitted inside each window

# Columns the raw CSV must provide before acceleration can be added.
required_columns = ['PX', 'PY', 'PZ', 'RX', 'RY', 'RZ', 't', 'AgentID']
# Output acceleration column -> input position column.
accel_sources = (('ax', 'PX'), ('ay', 'PY'), ('az', 'PZ'))

for subdir, dirs, files in os.walk(root_path):
    for file in files:
        if file.endswith('.csv'):
            # Read the csv files
            csv_file_path = os.path.join(subdir, file)
            df = pd.read_csv(csv_file_path)

            # This cell overwrites the file with renamed columns, so a second
            # run would otherwise fail with a KeyError on 'PX'. Skip anything
            # that is empty or not in the raw layout.
            missing = [c for c in required_columns if c not in df.columns]
            if missing or df.empty:
                print(f"Skipping {csv_file_path}: empty or missing columns {missing}")
                continue

            # Calculate the acceleration as the second derivative returned by
            # savgol_filter, one call per axis with identical settings.
            accel = {
                out_name: signal.savgol_filter(df[src].values,
                                               window_length=SG_WINDOW_LENGTH,
                                               polyorder=SG_POLYORDER,
                                               deriv=2,
                                               delta=dt,
                                               mode="nearest")
                for out_name, src in accel_sources
            }

            # Overwrite the CSV with the renamed columns plus the acceleration.
            filtered_data = pd.DataFrame({
                                  't': df['t'].values,
                                  'x': df['PX'].values,
                                  'y': df['PY'].values,
                                  'z': df['PZ'].values,
                                  'ax': accel['ax'],
                                  'ay': accel['ay'],
                                  'az': accel['az'],
                                  'rx': df['RX'].values,
                                  'ry': df['RY'].values,
                                  'rz': df['RZ'].values,
                                  'ID': df['AgentID'].values})

            filtered_data.to_csv(csv_file_path, index=False)


## Optional: Rename Files  

You may optionally rename the files (e.g., based on `agent_id`) to make them easier to work with during preprocessing.  

In [10]:
# Rename every CSV below root_path to "Agent_<ID>.csv", where <ID> is the value
# in the first row of the file's "ID" column.
csv_files = glob.glob(os.path.join(root_path, "**", "*.csv"), recursive=True)

for file in csv_files:
    try:
        # Read the Agent ID
        df = pd.read_csv(file, usecols=["ID"])
        agent_id = str(df["ID"].iloc[0])

        # new file path naming
        folder = os.path.dirname(file)
        new_name = f"Agent_{agent_id}.csv"
        new_path = os.path.join(folder, new_name)

        # Already carries its final name (e.g. the cell is being re-run).
        if os.path.basename(file) == new_name:
            continue

        # os.rename replaces an existing target without warning on POSIX, so
        # two files with the same ID in one folder would lose data. Refuse
        # to overwrite on every platform.
        if os.path.exists(new_path):
            print(f"Skipping {file}: target {new_path} already exists")
            continue

        # Rename
        os.rename(file, new_path)
        print(f"Renamed: {file} -> {new_path}")

    except Exception as e:
        print(f"Skipping {file} due to error: {e}")

Renamed: C:\myDrive\TuftsPhD2022\ProfMillerPhDResearch\PhDResearchProfMiller\Github-Official\BuildFiles_V2_CorrectxML\MyOutput0\log-45909.4175649769.xml.csv -> C:\myDrive\TuftsPhD2022\ProfMillerPhDResearch\PhDResearchProfMiller\Github-Official\BuildFiles_V2_CorrectxML\MyOutput0\Agent_1.csv
Renamed: C:\myDrive\TuftsPhD2022\ProfMillerPhDResearch\PhDResearchProfMiller\Github-Official\BuildFiles_V2_CorrectxML\MyOutput0\log-45909.4175660417.xml.csv -> C:\myDrive\TuftsPhD2022\ProfMillerPhDResearch\PhDResearchProfMiller\Github-Official\BuildFiles_V2_CorrectxML\MyOutput0\Agent_2.csv
Renamed: C:\myDrive\TuftsPhD2022\ProfMillerPhDResearch\PhDResearchProfMiller\Github-Official\BuildFiles_V2_CorrectxML\MyOutput0\log-45909.4175671528.xml.csv -> C:\myDrive\TuftsPhD2022\ProfMillerPhDResearch\PhDResearchProfMiller\Github-Official\BuildFiles_V2_CorrectxML\MyOutput0\Agent_3.csv
Renamed: C:\myDrive\TuftsPhD2022\ProfMillerPhDResearch\PhDResearchProfMiller\Github-Official\BuildFiles_V2_CorrectxML\MyOutput0