In [1]:
# Python libraries
import os
from scipy.io import loadmat
import pandas as pd
import numpy as np
import itertools

## Some info about the dataset

We have one-dimensional traffic data (x-coordinate) and the data contain 3+ vehicles in one lane.

First, we interpolated the timestamped position data onto a reference time discretization. We then kept only the sequences of data in which two or more vehicles are present in the camera frame. This yields a database of sequences with different lengths and different numbers of vehicles.

### How to read these filenames?
* The first number refers to the camera (1,...,5)
* The second number is the day
* The third number refers to the sequence

### What do .mat files contain?
Each .mat file contains a cell array of struct objects; each struct holds the positions and timestamps of the cars involved in that sequence.

### Observation
* sequence_data_1-1_1 is preprocessed
* sequence_data1-1_1 is postprocessed

## Loading .mat files

In [19]:
# Build the path of the .mat file to load. The file is looked up in the
# "NN-interaction" folder located two levels above the working directory.
par_dir = os.path.dirname(os.path.dirname(os.getcwd()))  # grandparent of the current working directory
# os.path.join picks the right separator and avoids a doubled "/" when
# par_dir is the filesystem root.
dir_name = os.path.join(par_dir, "NN-interaction")
base_filename = "sequence_data1-1_1"
suffix = '.mat'
pathfile = os.path.join(dir_name, base_filename + suffix)

In [20]:
# loadmat reads the MAT-file and hands its content back as a Python dictionary.
mat = loadmat(file_name=pathfile)

## Formatting the data

In [4]:
# Inspect the raw dictionary returned by loadmat: MAT-file metadata plus the 'sequences' entry.
mat

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Wed Aug 18 16:20:03 2021',
 '__version__': '1.0',
 '__globals__': [],
 'sequences': array([[array([[(array([[29.37332567, 34.22142943, 38.50653306, 43.68772327, 39.53432096,
                         42.9844338 ],
                        [58.35845393, 63.04878549, 67.56118147, 73.20173178, 49.09293513,
                         56.45433487],
                        [60.14504406, 65.53744755, 70.75192501, 78.36474838, 79.44444985,
                         86.59041956]]), array([[ 9.44,  9.64,  9.84, 10.04, 10.24, 10.44]]))]],
               dtype=[('Xarr', 'O'), ('Tarr', 'O')])                                            ],
        [array([[(array([[47.39131046, 53.82017734, 60.3346048 ],
                        [51.39004619, 55.54426992, 60.39859453],
                        [71.53057024, 77.25613794, 82.38676944]]), array([[14.24, 14.44, 14.64]]))]],
               dtype=[('Xarr', 'O'), ('Tarr', 'O')])                  

## Convert into a pandas data frame

In [None]:
# List the top-level keys of the loaded dictionary.
mat.keys()

In [None]:
# Container type and shape of the 'sequences' entry, plus the shape of its first row.
type(mat['sequences']), mat['sequences'].shape, mat['sequences'][0].shape

The structure of `mat['sequences']` is as follows:
* the shape (136, 1) means that we have 136 sequences
* each sequence is made up of two arrays: one holds the x-positions of 2+ vehicles, the other holds the corresponding timestamps.

In [None]:
matrix = mat['sequences']
nseq = matrix.shape[0]

# Per-sequence containers:
#   Xarr -> x-positions, one list per vehicle
#   Tarr -> timestamps shared by all vehicles of the sequence
#   data -> for every vehicle, the list of [x, t] samples
Xarr, Tarr, data = [], [], []

for row in matrix:
    # Unwrap the nested 1x1 struct stored in this row; the record carries
    # the ('Xarr', 'Tarr') fields in that order.
    record = row[0][0][0]
    positions = record[0].tolist()
    timestamps = record[1][0].tolist()  # timestamps are stored as a single row
    Xarr.append(positions)
    Tarr.append(timestamps)

    # Pair every x-position of each vehicle with the matching timestamp.
    data.append([
        [[x, t] for x, t in zip(vehicle_positions, timestamps)]
        for vehicle_positions in positions
    ])

# TODO: beware, some positions are equal to 0 and still have to be removed

In [None]:
# Each sequence contributes one entry to every list, so the three lengths should match.
len(data), len(Xarr), len(Tarr)

In [None]:
# Number of vehicles recorded in the first sequence.
len(data[0])

In [None]:
# NOTE(review): `columns` is a placeholder that is not passed to the DataFrame below — confirm it can be dropped.
columns = ["ciao"]
# One row per sequence and one column per vehicle; sequences with fewer vehicles are presumably padded with missing values.
df = pd.DataFrame(data)

In [None]:
df
# columns are vehicles
# rows are sequences
# each entry is the list of [x, t] couples of vehicle j in sequence i

In [None]:
# Count the null cells column by column, then add the per-column counts up.
missing_per_column = df.isnull().sum()
total_missing = missing_per_column.sum()
print("There are", total_missing, "missing values in total.")

In [None]:
# Entry at row label 0, column label 0 (first sequence, first vehicle).
df.loc[0, 0]