In [1]:
# Python libraries
import os
from scipy.io import loadmat
import pandas as pd
import numpy as np
import itertools

# Load Matlab .mat files in Python
[source](https://towardsdatascience.com/how-to-load-matlab-mat-files-in-python-1f200e1287b5)

In [2]:
def get_info_mat(dirname, flag):
    """
    Generate a list of information about .mat files, including camera, day, sequence, and file path.

    Args:
    - dirname: Directory where the .mat files are located.
    - flag: A flag to determine the type of information ("pre" or "post").

    Returns:
    - info_mat: A list of lists, each containing [pathfile, camera, day, sequence].
    """

    # Define prefix and suffix for the filename
    prefix, suffix = "sequence_data", ".mat"

    # Define possible values for cameras based on the flag
    match flag:
        case "pre":
            cameras = ["_1", "_2", "_3", "_4", "_5"]
        case "post":
            cameras = ["1", "2", "3", "4", "5"]
        case _:
            return f"No match for {flag}, you can only choose between \"pre\" and \"post\""

    # Define values for days and sequences
    days = ["-1_"]
    sequences = ["1", "2"]

    # Store the info in a list
    info_mat = []
    for cam, day, seq in [(cam, day, seq) for cam in cameras for day in days for seq in sequences]:
        filename = f"{prefix}{cam}{day}{seq}{suffix}"
        pathfile = os.path.join(dirname, filename)
        info_mat.append([pathfile, cam[-1], day[1], seq])

    return info_mat


In [3]:
# def load_mat(info):
#     """
#     Load a .mat file specified in the info and return its data as a Python dictionary.

#     Args:
#     - info: Information about the .mat file, including the file path.

#     Returns:
#     - mat: Python dictionary containing the data from the .mat file.
#     """

#     pathfile = info[0]      # Get the path of the .mat file from the info
#     mat = loadmat(pathfile) # Load the .mat file and return its data as a Python dictionary

#     return mat

In [4]:
def is_correct_scn(trajs):
    """
    Decide whether a scene is admissible.

    A scene is rejected as soon as one of its trajectories either decreases at some
    step (each value must be <= the next one) or consists only of zeros. Trajectories
    are inspected in order and the scan stops at the first offending one.

    Args:
    - trajs: An iterable of trajectories (each one a sliceable sequence of values).

    Returns:
    - flag: True if every trajectory passed both checks, False otherwise.
    - wrong_path: The first offending trajectory, or the placeholder [None] when
      the scene is correct.
    """

    for path in trajs:
        # Non-decreasing check on every pair of consecutive samples
        non_decreasing = all(prev <= nxt for prev, nxt in zip(path, path[1:]))
        if not non_decreasing:
            return False, path

        # A trajectory made only of zeros is treated as invalid as well
        only_zeros = all(value == 0 for value in path)
        if only_zeros:
            return False, path

    # No trajectory was rejected
    return True, [None]

In [5]:
def mat2pd(mat, info):
    """
    Build a pandas DataFrame with one row per scene from a loaded .mat dictionary.

    Args:
    - mat: Python dictionary containing sequence data under the 'sequences' key.
    - info: Information about the sequence; items 1, 2 and 3 are used as the
      camera, day and sequence labels.

    Returns:
    - df: A pandas DataFrame with the columns 'Tarr', 'Xarr', 'cons_dis',
      'N. vehicles', 'cam', 'day', 'seq', 'is_correct' and 'wrong_path'.
    """

    scenes = mat['sequences']
    n_scenes = scenes.shape[0]  # Number of scenes stored along the first axis

    # One accumulator per scene-dependent column
    timestamps, positions, gaps = [], [], []
    vehicle_counts, admissible, offenders = [], [], []

    for idx in range(n_scenes):
        # Unwrap the nested containers down to the (xpos, t) pair of this scene
        scene = scenes[idx][0][0][0]
        x_scene = scene[0]
        t_scene = scene[1][0]

        # Validate the scene; 'bad_path' is the first rejected trajectory, if any
        ok, bad_path = is_correct_scn(x_scene)

        positions.append(x_scene)
        timestamps.append(t_scene)
        vehicle_counts.append(len(x_scene))
        # Differences between consecutive rows of the position array
        gaps.append(np.diff(x_scene, axis=0))
        admissible.append(ok)
        offenders.append(bad_path)

    # Scalar entries (cam/day/seq) are broadcast to every row by pandas
    columns = {
        'Tarr': timestamps,
        'Xarr': positions,
        'cons_dis': gaps,
        'N. vehicles': vehicle_counts,
        'cam': info[1],
        'day': info[2],
        'seq': info[3],
        'is_correct': admissible,
        'wrong_path': offenders,
    }
    return pd.DataFrame(columns)

In [6]:
def df_purify(df):
    """
    Keep only the admissible scenes of a DataFrame and discard the validation columns.

    Args:
    - df: A pandas DataFrame with 'is_correct' and 'wrong_path' columns.

    Returns:
    - df_purified: A new DataFrame holding the rows where 'is_correct' equals True,
      without the 'is_correct' and 'wrong_path' columns. The original index labels
      of the kept rows are preserved.
    """

    # Boolean mask selecting the correct scenes
    keep = df['is_correct'].eq(True)

    # Select the rows first, then remove the two helper columns
    kept_rows = df.loc[keep]
    return kept_rows.drop(columns=['is_correct', 'wrong_path'])

In [7]:
def standardize_data(df):
    """
    Standardize 'Xarr' and 'cons_dis' columns over a DataFrame by adding new columns.

    The DataFrame is modified in place and the same object is returned, so the
    function can be used either for its side effect or in an assignment.

    The centering value is the average of the per-row means of 'Xarr' and the
    scaling value is the average of the per-row standard deviations (not the
    standard deviation of all values pooled together). The standardized
    consecutive distances are recomputed as row-wise differences (axis 0) of the
    standardized positions.

    Args:
    - df: A pandas DataFrame with an 'Xarr' column whose entries provide
      .mean() and .std() (e.g. numpy arrays).

    Returns:
    - df: The DataFrame with standardized columns added
      ('Xarr_std', 'XarrMean', 'XarrStd', 'cons_dis_std').
    """

    ## Standardize 'Xarr'
    Xarr = df['Xarr']
    # Average of the per-row means
    XarrMean = np.mean([row.mean() for row in Xarr])
    # Average of the per-row standard deviations
    XarrStd = np.mean([row.std() for row in Xarr])
    # Element-wise on the object column: each array is shifted and scaled
    Xarr_standardized = (Xarr - XarrMean) / XarrStd

    ## Standardize 'cons_dis'
    cons_dis_standardized = [np.diff(row, axis=0) for row in Xarr_standardized]

    # Add standardized columns to the DataFrame
    df['Xarr_std'] = Xarr_standardized
    df['XarrMean'] = XarrMean
    df['XarrStd'] = XarrStd
    df['cons_dis_std'] = cons_dis_standardized

    # Return the frame as documented (previously the function returned None)
    return df

In [8]:
def load_dataset(dirname, flag):
    """
    Load every .mat file of a dataset and turn each one into a cleaned DataFrame.

    For each file: the .mat content is loaded, converted to a DataFrame, tagged with
    a 1-based file number in the 'N. file' column, filtered down to the correct
    scenes (with the index reset to start from 0) and finally standardized.

    Args:
    - dirname: Directory where the .mat files are located.
    - flag: A flag to determine the type of information ("pre" or "post").

    Returns:
    - merged_df: A concatenated DataFrame of all scenes (per-file indexes are kept
      as they are, so index labels repeat across files).
    - dflist: A list of DataFrames, one for each .mat file.
    """

    per_file_frames = []  # One purified and standardized DataFrame per .mat file

    # File numbers start at 1 and follow the order of the generated file list
    for file_no, entry in enumerate(get_info_mat(dirname, flag), start=1):
        raw = loadmat(entry[0])  # entry[0] is the path of the .mat file

        scenes_df = mat2pd(raw, entry)
        scenes_df['N. file'] = [file_no] * len(scenes_df)

        # Drop the incorrect scenes and renumber the remaining rows from 0
        kept = df_purify(scenes_df).reset_index(drop=True)

        # Adds the standardized columns to 'kept' in place
        standardize_data(kept)

        per_file_frames.append(kept)

    merged_df = pd.concat(per_file_frames)

    return merged_df, per_file_frames
