In [1]:
# Python libraries
import os
from scipy.io import loadmat
import pandas as pd
import numpy as np
import itertools

# Load Matlab .mat files in Python
[source](https://towardsdatascience.com/how-to-load-matlab-mat-files-in-python-1f200e1287b5)

## Some info about the dataset

We have one-dimensional traffic data (x-coordinate) and the data contain 3+ vehicles in one lane.

First, we interpolated the position data and their timestamps onto a
reference time discretization. We then selected the sequences of data in which two or more vehicles are present in the camera frame. This yields a database of sequences with different lengths and different numbers of vehicles.

### How to read these filenames?
* The first number refers to the camera (1,...,5)
* The second number is the day
* The third number refers to the sequence

### What do .mat files contain?
These .mat files contain a cell array of struct objects; each struct holds the positions and timestamps of the cars involved in that sequence.

### Observation
* sequence_data_1-1_1 is preprocessed
* sequence_data1-1_1 is postprocessed

## Loading .mat files

In [9]:
# Locate the .mat file to load. The data folder "NN-interaction" is looked up
# inside the parent of the current working directory.
par_dir = os.path.dirname(os.getcwd())  # parent dir
# Join path components with os.path.join instead of a hard-coded "/" so the
# separator is consistent with the final join below and is never doubled
# (e.g. when the parent directory is the filesystem root).
dir_name = os.path.join(par_dir, "NN-interaction")
base_filename = "sequence_data1-1_1"  # encodes camera, day and sequence numbers
suffix = '.mat'
pathfile = os.path.join(dir_name, base_filename + suffix)

In [10]:
# loadmat returns a plain Python dictionary: MATLAB variables become entries
# holding numpy arrays, next to the '__header__', '__version__' and
# '__globals__' metadata entries.
mat = loadmat(pathfile)

## Formatting the data

In [11]:
# Display the raw dictionary to see how the MATLAB cell array of structs
# was mapped onto nested numpy arrays.
mat

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Wed Aug 18 16:20:03 2021',
 '__version__': '1.0',
 '__globals__': [],
 'sequences': array([[array([[(array([[29.37332567, 34.22142943, 38.50653306, 43.68772327, 39.53432096,
                         42.9844338 ],
                        [58.35845393, 63.04878549, 67.56118147, 73.20173178, 49.09293513,
                         56.45433487],
                        [60.14504406, 65.53744755, 70.75192501, 78.36474838, 79.44444985,
                         86.59041956]]), array([[ 9.44,  9.64,  9.84, 10.04, 10.24, 10.44]]))]],
               dtype=[('Xarr', 'O'), ('Tarr', 'O')])                                            ],
        [array([[(array([[47.39131046, 53.82017734, 60.3346048 ],
                        [51.39004619, 55.54426992, 60.39859453],
                        [71.53057024, 77.25613794, 82.38676944]]), array([[14.24, 14.44, 14.64]]))]],
               dtype=[('Xarr', 'O'), ('Tarr', 'O')])                  

## Convert into a pandas data frame

In [12]:
# List the top-level entries of the loaded dictionary.
mat.keys()

dict_keys(['__header__', '__version__', '__globals__', 'sequences'])

In [13]:
# Inspect the container type, the overall shape, and the shape of a single
# row of the 'sequences' array.
type(mat['sequences']), mat['sequences'].shape, mat['sequences'][0].shape

(numpy.ndarray, (34, 1), (1,))

The structure of mat['sequences'] is as follows:
* a shape of (N, 1) means that we have N sequences (here N = 34, see the output above)
* each sequence is made of two arrays: one holds the x-positions of the 2+ vehicles, the other the corresponding timestamps.

In [14]:
# Unpack the nested MATLAB structure into plain Python lists.
#   Xarr[i] -> x-positions of sequence i, one inner list per vehicle
#   Tarr[i] -> timestamps of sequence i
#   data[i] -> for each vehicle of sequence i, its list of [xpos, time] pairs
matrix = mat['sequences']
nseq = matrix.shape[0]
Xarr, Tarr, data = [], [], []

for cell in matrix:
    # Peel off the wrapper arrays around the (positions, timestamps) record.
    record = cell[0][0][0]
    positions = record[0].tolist()
    times = record[1][0].tolist()  # timestamps are stored as a single-row 2-D array

    Xarr.append(positions)
    Tarr.append(times)

    # Pair every position of every vehicle with the matching timestamp.
    data.append(
        [[[xpos, time] for xpos, time in zip(track, times)] for track in positions]
    )

# TODO: beware, some positions are equal to 0 and still need to be removed

In [15]:
# Sanity check: each list received exactly one entry per sequence,
# so the three lengths should agree.
len(data), len(Xarr), len(Tarr)

(34, 34, 34)

In [16]:
# Number of vehicle tracks stored for the first sequence.
len(data[0])

3

In [17]:
# NOTE(review): `columns` is a placeholder that is not passed to the DataFrame
# below — confirm whether anything else relies on it.
columns = ["ciao"]
# One row per sequence and one column per vehicle index; each cell holds that
# vehicle's list of [x, t] pairs. Sequences with fewer vehicles than the widest
# one end up with missing values in the trailing columns.
df = pd.DataFrame(data)

In [18]:
df
# columns are vehicles (index j within a sequence)
# rows are sequences (index i)
# each entry is the list of (x, t) pairs of vehicle j in sequence i

Unnamed: 0,0,1,2,3
0,"[[29.373325674060805, 9.440000000000001], [34....","[[58.35845392899927, 9.440000000000001], [63.0...","[[60.14504406261975, 9.440000000000001], [65.5...",
1,"[[47.39131045666992, 14.240000000000002], [53....","[[51.39004618507265, 14.240000000000002], [55....","[[71.53057023567337, 14.240000000000002], [77....",
2,"[[39.20354853492598, 23.64], [44.5337306759091...","[[46.59381455709539, 23.64], [51.9240661660022...","[[67.10312149777134, 23.64], [73.2257052346478...",
3,"[[2.7333870214228604, 72.44], [7.9033894781644...","[[24.601255140716756, 72.44], [30.821354526237...","[[57.93441824708709, 72.44], [64.3348821421965...",
4,"[[21.74922515152677, 84.24000000000001], [27.6...","[[30.532749273267207, 84.24000000000001], [35....","[[72.12361338780065, 84.24000000000001], [78.6...",
5,"[[29.92533961013666, 131.64], [36.085487468240...","[[32.50621832641032, 131.64], [38.328353365185...","[[68.80830847052482, 131.64], [74.913909345537...",
6,"[[48.201888758537635, 148.04], [52.58811573357...","[[61.00463006069239, 148.04], [66.185036366166...","[[73.44573965300417, 148.04], [78.829366838328...",
7,"[[28.43130968766962, 161.64], [34.763434870437...","[[35.84145807203191, 161.64], [40.981627755239...","[[66.68112380788443, 161.64], [74.046846522531...",
8,"[[1.6731532650716647, 164.04], [7.263156064323...","[[29.867335503716497, 164.04], [36.37347718386...","[[69.96041499143246, 164.04], [75.515942114992...",
9,"[[7.567157827915416, 215.64], [13.423172011187...","[[33.01539370067994, 215.64], [39.963578614057...","[[66.04304808214894, 215.64], [73.527741365789...",


In [19]:
# Count the empty cells, i.e. vehicle slots that a sequence does not fill.
n_missing = df.isna().sum().sum()
print("There are", n_missing, "missing values in total.")

There are 32 missing values in total.


In [20]:
# (x, t) samples of the first vehicle in the first sequence:
# row label 0, column label 0, fetched with a single label-based lookup.
df.loc[0, 0]

[[29.373325674060805, 9.440000000000001],
 [34.22142942994519, 9.64],
 [38.50653305682816, 9.840000000000002],
 [43.68772327214238, 10.040000000000001],
 [39.534320959607726, 10.240000000000002],
 [42.98443379996635, 10.440000000000001]]