In [106]:
import os
from __future__ import print_function

import numpy as np
import pandas as pd

from settings import Paths

In [107]:
paths = Paths(
    '/media/blazaid/Saca1/Phd/data',
    'raw_csvs', 'synced_csvs', 'tmp'
)

TIME = 'time'  # The name of the time column which will be used for syncing
SYNC_FREQ = 10  # hZ
SUBJECTS = ['miguel']
DATASETS = ['train', 'test']
SENSORS = ['can', 'gps-position', 'gps-speed', 'kinect-image', 'lidar']

def ds_filename(subject, dataset, sensor, extension):
    return '-'.join([subject, dataset, sensor]) + '.' + extension

We'll load the dataframes into memory to ease the work. They are not to heavy because the binary data (pointclouds and images) have been saved apart.

In [108]:
dataframes = {}

# All the dataframes will be stored classified by subject, dataset and sensor
for subject in SUBJECTS:
    print('Loading subject ' + subject)
    subject_dfs = {}
    # Each subject can have 1+ dataset types, so we store them sepparate
    for dataset in DATASETS:
        print('\tLoading dataset ' + dataset)
        dataset_dfs = {}
        # The same with sensors. We store them inside its dataset's dictionary
        for sensor in SENSORS:
            print('\t\tLoading sensor ' + sensor + ' ... ', end='')
            # Now we have all for locate the sensor. Let's get its name ...
            filename = ds_filename(subject, dataset, sensor, 'csv')
            # ... load it ...
            df = pd.read_csv(os.path.join(paths.raw_csvs, filename))
            # ... and store it in its dictionary
            dataset_dfs[sensor] = df
            print('done')
        subject_dfs[dataset] = dataset_dfs
    dataframes[subject] = subject_dfs

Loading subject miguel
	Loading dataset train
		Loading sensor can ... done
		Loading sensor gps-position ... done
		Loading sensor gps-speed ... done
		Loading sensor kinect-image ... done
		Loading sensor lidar ... done
	Loading dataset test
		Loading sensor can ... done
		Loading sensor gps-position ... done
		Loading sensor gps-speed ... done
		Loading sensor kinect-image ... done
		Loading sensor lidar ... done


We will replace in each dataframe the `secs` and `nsecs` columns for a `time` column which will have the same information.

In [109]:
for subject in SUBJECTS:
    for dataset in DATASETS:
        for sensor in SENSORS:
            df = dataframes[subject][dataset][sensor]
            df[TIME] = df['secs'] + df['nsecs'] * pow(10, -9)
            df.drop(['secs', 'nsecs'], axis=1, inplace=True)

## Translating CAN messages

The CAN messages are written as they come. We need to sepparate them into different columns, one for each message type, and then translate them to their real values.

In [147]:
import math

def convert(msg, init_byte, n_bits, first_bit, resolution, offset):
    endByte = int(init_byte - math.ceil((n_bits + first_bit) / 8.0) + 1)
    totbits = (init_byte - endByte + 1) * 8
    ss = msg[endByte * 2: (init_byte + 1) * 2]
    s = int(ss, 16)
    if n_bits > totbits or n_bits < 0:
        return None
    mask = '1' * (n_bits - 1)
    #for j in range(n_bits - 1):
    #    mask = mask << 1
    #    mask = mask + 1
    #mask = mask << first_bit
    res = s & int(mask, 2)
    res = res >> first_bit
    return res*resolution+offset


df = dataframes['miguel']['test']['can']
print(df.columns)
df['frame_type'] = df['frame'].str[1:4]
for index, row in df.iterrows():
    if row['frame_type'] == '412':
        print(row['frame'][4:])
        print(convert(row['frame'][4:], 1, 9, 0, 1, 0))
        break

Index([u'frame', u'time', u'frame_type'], dtype='object')
8FE1C003425002106E328
225


In [105]:
frames_dfs = []
for frame_type in  df['frame_type'].unique():
    print(frame_type)

418
374
208
424
346
236
231
696
2F2
412


array(['418', '374', '208', '424', '346', '236', '231', '696', '2F2',
       '412'], dtype=object)

## Synchronizing all sensors

We will change the name of the columns because they can be duped. For this, we'll append the set name to the column.

In [97]:
for subject in SUBJECTS:
    for dataset in DATASETS:
        for sensor in SENSORS:
            df = dataframes[subject][dataset][sensor]
            mapping = {
                column: sensor + '_' + column
                for column in df.columns
            }
            df.rename(columns=mapping, inplace=True)

The next step is synchronizing the dataset. For this purpose, the first step is get the rows of all the datasets that are nearest in time and adjust the starting index in each row to that position. We'll define the distance between times as its MSE.

In [6]:
def starting_indices(dfs, columns):
    def error(dfs, rows, cols):
        return sum(
            pow(dfs[i].loc[rows[i], cols[i]] - dfs[i + 1].loc[rows[i + 1], cols[i + 1]], 2)
            for i in range(len(dfs) - 1)
        )

    # We start in 0 index for all the dataframes. This will be the best position (for now).
    indices = [0 for _ in dfs]
    min_error = error(dfs, indices, columns)
    possible_indices = [(min_error, indices)]
    while possible_indices:
        del possible_indices[:]  # .clear() doesn't exists in python2
        # We go one by one over all the dfs.
        for i_df, df in enumerate(dfs):
            # If there is a row over the current one, we check it's contents
            if indices[i_df] < len(df.index):
                new_indices = indices[:]
                new_indices[i_df] += 1
                # Is the new time difference better?
                this_error = error(dfs, new_indices, columns)
                if this_error <= min_error:
                    possible_indices.append((this_error, new_indices))

        # Si hay filas mejores que la actual, cogemos la mejor
        if possible_indices:
            possible_indices.sort()
            min_error, indices = possible_indices[0]

    return indices

    # If it's necessary to remove columns, now it's the moment
    if exclude_columns:
        print(exclude_columns)
        master_df = master_df[[col for col in master_df.columns if col not in exclude_columns]]

for subject in SUBJECTS:
    print('Adjusting dataset starting time for ' + subject)
    for dataset in DATASETS:
        sensors = dataframes[subject][dataset].keys()
        dfs = [dataframes[subject][dataset][sensor] for sensor in sensors]
        time_columns = [sensor + '_' + TIME for sensor in sensors]
        # Get the indexes of the nearest rows
        indices = starting_indices(dfs, time_columns)
        print('\tNearest rows in subsets: ' + ', '.join(map(str, indices)))
        # Adjust the df to that starting indexes
        for sensor, df, index in zip(sensors, dfs, indices):
            dataframes[subject][dataset][sensor] = df.shift(-index)

Adjusting dataset starting time for miguel
	Nearest rows in subsets: 0, 2, 1, 42, 0
	Nearest rows in subsets: 0, 4, 3, 113, 1


For the sake of clarity, after synchronizing all the dataframes, their timestamps will be set as relative to the smallest one

In [7]:
for subject in SUBJECTS:
    print('Adjusting dataset minimum time for ' + subject)
    for dataset in DATASETS:
        print('\tDataset: ' + dataset)
        dfs = [dataframes[subject][dataset][sensor] for sensor in SENSORS]
        time_columns = [sensor + '_' + TIME for sensor in SENSORS]
        minimum_value = min(df[tc].min() for df, tc in zip(dfs, time_columns))
        print('\t\tMinimum time between sensors: ' + str(minimum_value))
        for df, tc in zip(dfs, time_columns):
            df[tc] -= minimum_value

Adjusting dataset minimum time for miguel
	Dataset: train
		Minimum time between sensors: 1517236091.42862
	Dataset: test
		Minimum time between sensors: 1517237130.141652


In [11]:
def syncronize_dataframes(dfs, time_columns, freq=10, exclude_columns=None):
    master_df = pd.DataFrame(columns=[col for df in dfs for col in df])
    rows = [0 for _ in dfs]
    step = 0
    time = 1 / freq
    half_time = time / 2
    while all(row < len(df) - 1 for df, row in zip(dfs, rows)):
        data_row = []
        for df_i, (df, row, col) in enumerate(zip(dfs, rows, time_columns)):
            possible_values = []
            next_row = None
            for i in range(len(df) - row):
                value = df.loc[row + i, col]
                diff = step * time - value
                if -half_time < diff < half_time:
                    # We're inside the thresshold so we take the value
                    possible_values.append((value, row + i))
                elif diff < -half_time:
                    # We're over the thresshold, so no more values should be taken
                    break

            if possible_values:
                possible_values.sort()
                _, row = possible_values[0]
                possible_values.clear()

                data_row.extend(list(df.loc[row, :]))
                rows[df_i] = row + 1
            else:
                data_row.extend([np.nan for _ in df.columns])

        master_df.loc[step] = data_row

        step += 1

    # If there are starting or ending rows with null data, we remove them too
    while master_df[time_columns].loc[0, :].isnull().any():
        master_df = master_df[1:]
        master_df.reset_index(drop=True, inplace=True)
    while master_df[time_columns].loc[len(master_df) - 1, :].isnull().any():
        master_df = master_df[:-1]
        master_df.reset_index(drop=True, inplace=True)

    # If it's necessary to remove columns, now it's the moment
    if exclude_columns:
        master_df = master_df[[col for col in master_df.columns if col not in exclude_columns]]

    return master_df

master_datasets = {}
for subject in SUBJECTS:
    print('Synchronizing dataframes for ' + subject)
    subject_dfs = {}
    for dataset in DATASETS:
        print('\tSynchronizing {} at {} hz'.format(dataset, SYNC_FREQ))
        dfs = [dataframes[subject][dataset][sensor] for sensor in SENSORS]
        tcs = [sensor + '_' + TIME for sensor in SENSORS]
        freq = SYNC_FREQ
        print('\tDataframe created')
        subject_dfs[dataset] = syncronize_dataframes(dfs, tcs, freq=freq)
    master_datasets[subject] = subject_dfs

KeyboardInterrupt: 

In [None]:
master_datasets[SUBJECTS[0]][DATASETS[0]]

In [None]:
# Traducir la columna de 