In [1]:
# Python libraries
import os
from scipy.io import loadmat
import pandas as pd
import numpy as np
import itertools

# Load Matlab .mat files in Python
[source](https://towardsdatascience.com/how-to-load-matlab-mat-files-in-python-1f200e1287b5)

In [2]:
def get_info_mat(dirname, flag):
    
    """
    Build the list of .mat files to load, together with their metadata.

    Parameters
    ----------
    dirname : path of the directory that contains the .mat files.
    flag    : "pre" or "post"; selects the camera tag used in the file names
              ("_1".."_5" for "pre", "1".."5" for "post").

    Returns
    -------
    A list with one entry per (camera, day, sequence) combination; each entry
    is the list [pathfile, camera, day, sequence], where camera, day and
    sequence are single-character strings.

    Raises
    ------
    ValueError if flag is neither "pre" nor "post".
    """
    
    # pieces used to construct the file name
    prefix, suffix = "sequence_data", ".mat"
    if flag == "pre":
        cameras = ["_1","_2","_3","_4","_5"]
    elif flag == "post":
        cameras = ["1","2","3","4","5"]
    else:
        # Raise instead of returning the message: a returned string would be
        # iterated character by character by callers expecting a list.
        raise ValueError(f"No match for {flag}, you can only choose between \"pre\" and \"post\"")
    days = ["-1_"]
    sequences = ["1","2"]
    
    # store the info in a list
    info_mat = []
    for cam, day, seq in itertools.product(cameras, days, sequences):
        filename = prefix + cam + day + seq + suffix
        pathfile = os.path.join(dirname, filename)
        
        # cam[-1] drops the optional leading "_"; day[1] is the digit in "-1_"
        info_mat.append([pathfile, cam[-1], day[1], seq])
    
    return info_mat

In [3]:
def load_mat(info):
    
    """
    Load the .mat file described by one entry of info_mat.

    The first element of `info` is the path of the file; the content is
    returned as the python dictionary produced by scipy.io.loadmat.
    """
    return loadmat(info[0])

In [4]:
def is_correct_scn(trajs):
    
    """
    Tell whether a scene is admissible.

    A scene is admissible when every trajectory in `trajs` is non-decreasing,
    i.e. no vehicle travels in the opposite direction on the motorway.

    Returns the pair (flag, wrong_path): flag is True for an admissible scene,
    and wrong_path is the first offending trajectory, or [None] when there is
    none.
    """
    
    for candidate in trajs:
        # compare every sample with its successor
        non_decreasing = all(prev <= nxt for prev, nxt in zip(candidate, candidate[1:]))
        if not non_decreasing:
            return False, candidate
    return True, [None]

In [5]:
def mat2pd(mat,info):
    
    """
    Convert a loaded .mat dictionary into a pandas DataFrame, one row per scene.

    `mat` must hold the scenes under the key 'sequences'; `info` is the matching
    entry of info_mat, whose elements 1, 2 and 3 fill the 'cam', 'day' and 'seq'
    columns. Each row also stores the timestamps, the positions, their
    difference along axis 0, the number of vehicles and the outcome of
    is_correct_scn for that scene.
    """
    
    sequences = mat['sequences']
    n_scenes = sequences.shape[0] # number of scenes stored in this file
    
    # per-scene accumulators, one list per DataFrame column
    times, positions, gaps, n_vehicles = [], [], [], []
    admissible, offending = [], []
    
    for idx in range(n_scenes):
        
        # presumably the nested MATLAB struct/cell layout -- TODO confirm
        scene = sequences[idx][0][0][0]
        pos_scn = scene[0]      # x positions for this scene
        time_scn = scene[1][0]  # matching timestamps
        ok, bad_traj = is_correct_scn(pos_scn)
        
        positions.append(pos_scn)
        times.append(time_scn)
        n_vehicles.append(len(pos_scn))
        gaps.append(np.diff(pos_scn, axis=0)) # consecutive distances of vehicles
        admissible.append(ok)
        offending.append(bad_traj)
    
    # insertion order fixes the column order of the resulting frame
    table = {}
    table['Tarr'] = times
    table['Xarr'] = positions
    table['cons_dis'] = gaps
    table['N. vehicles'] = n_vehicles
    table['cam'] = info[1]
    table['day'] = info[2]
    table['seq'] = info[3]
    table['is_correct'] = admissible
    table['wrong_path'] = offending
    
    return pd.DataFrame(table)

In [6]:
def df_purify(df):
    
    """
    Return only the rows describing admissible scenes.

    Rows whose 'is_correct' value is not True are discarded, and the
    bookkeeping columns 'is_correct' and 'wrong_path' are removed from the
    result. The input frame is left untouched.
    """
    
    keep = df['is_correct'].eq(True)
    admissible_rows = df[keep]
    return admissible_rows.drop(columns=['is_correct', 'wrong_path'])

In [7]:
def standardize_data(df):
    
    """
    df_standardized = standardize_data(df)

    Standardize Xarr and cons_dis over a df, by adding columns to df.

    The columns 'Xarr_std', 'XarrMean', 'XarrStd' and 'cons_dis_std' are added
    to `df` IN PLACE, and the same (mutated) frame is returned so that the
    call can also be used as an expression.

    - XarrMean is the mean of the per-scene means of 'Xarr'.
    - XarrStd is the mean of the per-scene standard deviations of 'Xarr'.
    - Xarr_std is (Xarr - XarrMean) / XarrStd, scene by scene.
    - cons_dis_std is the difference along axis 0 of each standardized scene.
    """
    
    ## Xarr
    Xarr = df['Xarr']
    # Mean: average of the per-scene means
    XarrMean = np.mean([row.mean() for row in Xarr])
    # STD: average of the per-scene standard deviations
    XarrStd = np.mean([row.std() for row in Xarr])
    # Xarr standardized
    Xarr_standardized = (Xarr - XarrMean)/XarrStd

    ## Cons Dis
    cons_dis_standardized = [np.diff(row,axis=0) for row in Xarr_standardized]
    
    # Add the new columns to the df received as argument
    df['Xarr_std'] = Xarr_standardized
    df['XarrMean'] = XarrMean
    df['XarrStd'] = XarrStd
    df['cons_dis_std'] = cons_dis_standardized
    
    # Return the frame so that `df_standardized = standardize_data(df)` works
    return df

In [8]:
def load_dataset(dirname, flag):
    
    """
    Load every .mat file listed by get_info_mat and convert it to pandas.

    Each file is turned into a DataFrame, tagged with its 1-based position in
    the 'N. file' column, stripped of non-admissible scenes, re-indexed from 0
    and standardized.

    Returns the pair (merged_df, dflist): dflist holds one DataFrame per file
    and merged_df is their concatenation (each file keeps its own 0-based
    index).
    """
    
    frames = [] # one df for each .mat file
    
    for file_number, info in enumerate(get_info_mat(dirname, flag), start=1):
        
        # load the .mat and convert it into a pd dataframe
        scenes = mat2pd(load_mat(info), info)
        scenes['N. file'] = [file_number]*len(scenes)
        
        # drop incorrect paths and restart the index from 0
        scenes_ok = df_purify(scenes).reset_index(drop=True)
        
        # standardize_data adds its columns to scenes_ok in place
        standardize_data(scenes_ok)
        
        frames.append(scenes_ok)
    
    return pd.concat(frames), frames

## Testing

In [9]:
# load .mat
# The data directory is resolved relative to the current working directory:
# two levels up, then "NN-interaction".
# NOTE(review): the recorded run of this cell raised FileNotFoundError for
# sequence_data1-1_1.mat -- check that the .mat files are in that directory.
par_dir = os.path.dirname(os.getcwd()) # parent dir
dir_name = os.path.join(os.path.dirname(par_dir), "NN-interaction")
merged_df, dflist = load_dataset(dir_name, 'post')

FileNotFoundError: [Errno 2] No such file or directory: '/home/andrea/hetzner/Andrea/Università/master-thesis/mycodes/simulazioni/NN-interaction/sequence_data1-1_1.mat'

In [None]:
# Display the DataFrame built from the first .mat file
dflist[0]

In [None]:
df = dflist[0]
# standardize_data adds the standardized columns to df in place, so keep a
# handle on df itself rather than relying on the function's return value.
standardize_data(df)
df_standardized = df

In [None]:
# Show df_standardized as assigned in the previous cell
df_standardized