In [1]:
%matplotlib notebook

import os, glob
import shutil
from pprint import pprint

from __future__ import print_function

import pandas as pd
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt
from pynsia.pointcloud import Deepmap

figsize = (8, 4)

In [2]:
# Directory the curated per-subject CSV sequences are read from; deepmap files
# are also copied out of this tree.
ORIG_PATH = '/media/blazaid/Saca/Phd/data/curated'
# Directory the generated dataset CSV files are written to.
DEST_PATH = '/media/blazaid/Saca/Phd/data/datasets'
# Name of the sub-directory (inside DEST_PATH) that receives the deepmap files.
DMS_DIR = 'deepmaps'
# Subject identifiers, as they appear in the input file names.
SUBJECTS = ('edgar', 'jj', 'miguel')
# Row offsets (how many rows back) used to build the lagged columns.
MOMENTS_BEFORE = [5, 10, 20]

In [3]:
# Directory holding the source deepmap files. Built from DMS_DIR so the
# directory name is defined in a single place.
DM_DIR = os.path.join(ORIG_PATH, DMS_DIR)
# Identifiers of the two submodels; they prefix both the input and the output
# file names (presumably car-following / lane-change -- TODO confirm).
CF = 'cf'
LC = 'lc'
# Keep the offsets in ascending order so the file-name suffix and the lagged
# columns come out in a stable order. Rebinding with sorted() (instead of an
# in-place .sort()) keeps the cell idempotent and also accepts a tuple.
MOMENTS_BEFORE = sorted(MOMENTS_BEFORE)

In [4]:
def load_sequences(path, subjects):
    """Load every sequence CSV found in ``path`` for the given subjects.

    Files are matched with the pattern
    ``<submodel>_<subject>_<dataset>_*.csv`` for every combination of dataset
    (``'training'``, ``'validation'``), submodel (``CF``, ``LC``) and subject.
    Progress is printed while loading.

    :param path: Directory containing the CSV files.
    :param subjects: Iterable with the subject names used in the file names.
    :return: Nested dict ``sequences[dataset][submodel][subject]`` whose
        values are lists of DataFrames (one per matched file, in file-name
        order). Each dataset/submodel also gets an ``'all'`` entry with the
        frames of every subject; it holds the *same* DataFrame objects as the
        per-subject lists, not copies.
    """
    # Iterated once per dataset/submodel, so materialise one-shot iterables.
    subjects = list(subjects)

    sequences = {}
    for dataset in ('training', 'validation'):
        print('{} dataset'.format(dataset))
        sequences[dataset] = {}
        for submodel in (CF, LC):
            print('\t{} model'.format(submodel))
            by_subject = {}
            for subject in subjects:
                print('\t\tLoading data for subject {} .. '.format(subject), end='')
                file_pattern = os.path.join(path, '{}_{}_{}_*.csv'.format(submodel, subject, dataset))
                # glob returns matches in arbitrary (filesystem) order; sort
                # them so the resulting datasets are reproducible across runs.
                by_subject[subject] = [
                    pd.read_csv(filepath, index_col=None, engine='python')
                    for filepath in sorted(glob.glob(file_pattern))
                ]
                print('{} loaded'.format(len(by_subject[subject])))

            # Aggregate entry with the sequences of every requested subject.
            combined = []
            for subject in subjects:
                combined.extend(by_subject[subject])
            by_subject['all'] = combined

            sequences[dataset][submodel] = by_subject

    return sequences

In [5]:
# Read all the sequences of the configured subjects from the source directory;
# the result is indexed as base_sequences[dataset][submodel][subject].
base_sequences = load_sequences(path=ORIG_PATH, subjects=SUBJECTS)

training dataset
	cf model
		Loading data for subject edgar .. 10 loaded
		Loading data for subject jj .. 6 loaded
		Loading data for subject miguel .. 12 loaded
	lc model
		Loading data for subject edgar .. 1584 loaded
		Loading data for subject jj .. 1958 loaded
		Loading data for subject miguel .. 1914 loaded
validation dataset
	cf model
		Loading data for subject edgar .. 3 loaded
		Loading data for subject jj .. 5 loaded
		Loading data for subject miguel .. 7 loaded
	lc model
		Loading data for subject edgar .. 484 loaded
		Loading data for subject jj .. 660 loaded
		Loading data for subject miguel .. 660 loaded


In [6]:
# Make sure the output directory and its deepmaps sub-directory exist.
deepmaps_dest = os.path.join(DEST_PATH, DMS_DIR)
for directory in (DEST_PATH, deepmaps_dest):
    if not os.path.isdir(directory):
        os.makedirs(directory)

# WARNING: destructive. Every regular file directly under DEST_PATH and every
# entry under its deepmaps sub-directory is deleted so each run starts clean.
stale_paths = [
    candidate
    for candidate in glob.glob(os.path.join(DEST_PATH, '*'))
    if not os.path.isdir(candidate)
]
stale_paths.extend(glob.glob(os.path.join(deepmaps_dest, '*')))
for stale_path in stale_paths:
    os.remove(stale_path)

In [8]:
def _with_lagged_columns(df, moments, columns):
    """Return a copy of ``df`` extended with lagged versions of ``columns``.

    For every ``moment`` in ``moments`` and every ``column`` in ``columns`` a
    column named ``'<column> t_<moment>'`` is added, holding the value that
    ``column`` had ``moment`` rows earlier. The first ``max(moments)`` rows,
    for which the furthest lag is not available, are dropped.

    ``df`` itself is left untouched: the loaded frames are shared between the
    per-subject lists and the ``'all'`` list, so writing into them would leak
    the extra columns into every later use of the loaded sequences.
    """
    lagged = df.copy()
    for moment in moments:
        # Only the columns that are actually needed get shifted.
        shifted = df[columns].shift(moment)
        suffix = ' t_{}'.format(moment)
        for column in columns:
            lagged[column + suffix] = shifted[column]
    warmup_rows = max(moments) if moments else 0
    return lagged.iloc[warmup_rows:]


def _copy_deepmaps(dataset_df, orig_path, dest_path, dms_dir):
    """Copy the deepmap files referenced by ``dataset_df`` to the destination.

    Every column whose name starts with ``'Deepmap'`` is scanned. Each
    distinct path is handled once, instead of once per cell, which avoids
    millions of redundant filesystem checks on large datasets.
    """
    dm_columns = [c for c in dataset_df.columns if c.startswith('Deepmap')]
    target_dir = os.path.join(dest_path, dms_dir)
    for rel_path in pd.unique(dataset_df[dm_columns].values.ravel()):
        if pd.isnull(rel_path):
            # No deepmap recorded for that row; nothing to copy.
            continue
        # NOTE(review): the existence check joins the stored path directly to
        # dest_path, so it only matches the copy target if the stored path
        # already starts with the deepmaps directory name -- TODO confirm.
        if not os.path.exists(os.path.join(dest_path, rel_path)):
            shutil.copy(os.path.join(orig_path, rel_path), target_dir)


# Both values depend only on the configuration, so compute them once.
moments_suffix = 't-' + '-'.join('t{}'.format(x) for x in MOMENTS_BEFORE)
temporal_columns = ['Acceleration', 'Next TLS status', 'Deepmap', 'Relative speed']

for dataset in base_sequences:
    for submodel in base_sequences[dataset]:
        for subject in base_sequences[dataset][submodel]:
            print('Building datasets')
            dfs = base_sequences[dataset][submodel][subject]
            # NOTE(review): the label says "moments" but the value printed is
            # the subject; the message is kept as is to match recorded logs.
            print('\tBuilding {} {} dataset for moments {} ...'.format(submodel, dataset, subject), end='')
            filename = '{}-{}-{}-{}.csv'.format(submodel, subject, dataset, moments_suffix)

            if not dfs:
                # pd.concat raises on an empty list; skip instead of aborting
                # the whole run because one subject has no files.
                print('skipped (no sequences loaded)')
                continue

            if submodel == CF:
                # These sequences are used as they are, just stacked.
                datasets = pd.concat(dfs, ignore_index=True)
            else:
                # Every sequence is lagged on its own, so the shifted values
                # never cross the boundary between two sequences.
                datasets = pd.concat(
                    [_with_lagged_columns(df, MOMENTS_BEFORE, temporal_columns) for df in dfs],
                    ignore_index=True,
                )

            print('done')
            print('\tSaving dataset {} ... '.format(filename), end='')
            datasets.to_csv(os.path.join(DEST_PATH, filename), index=False)
            print('done')
            if submodel == LC:
                print('\tSaving deepmaps ... ', end='')
                _copy_deepmaps(datasets, ORIG_PATH, DEST_PATH, DMS_DIR)
                print('done')

Building datasets
	Building cf training dataset for moments all ...done
	Saving dataset cf-all-training-t-t5-t10-t20.csv ... done
Building datasets
	Building cf training dataset for moments miguel ...done
	Saving dataset cf-miguel-training-t-t5-t10-t20.csv ... done
Building datasets
	Building cf training dataset for moments jj ...done
	Saving dataset cf-jj-training-t-t5-t10-t20.csv ... done
Building datasets
	Building cf training dataset for moments edgar ...done
	Saving dataset cf-edgar-training-t-t5-t10-t20.csv ... done
Building datasets
	Building lc training dataset for moments all ...done
	Saving dataset lc-all-training-t-t5-t10-t20.csv ... done
	Saving deepmaps ... 

KeyboardInterrupt: 