In [1]:
import os
import datetime
import numpy as np
import pandas as pd
import torch

In [2]:
def get_csv_files_from_dir(dir):
    files = []
    for file in os.listdir(dir):
        f = os.path.join(dir, file)
        if os.path.isfile(f) and f.endswith('.csv'):
            files.append(f)
    return files

In [3]:
def transform_cp_data(data, error=False, velocity=False, acceleration=False):
    data = data.sort_values(by=['rs_id', 'id', 'time']).reset_index(drop=True)
    data = data.drop(columns=['load_time'])

    if not error:
        data = data.drop(columns=[f'{axis}_err' for axis in ('x', 'y', 'z')])

    if not velocity:
        data = data.drop(columns=[f'v_{axis}_est' for axis in ('x', 'y', 'z')])

    if not acceleration:
        data = data.drop(columns=[f'a_{axis}_est' for axis in ('x', 'y', 'z')])

    data = data.dropna()

    return data

In [4]:
def describe_cp_data(data):
    return data.groupby(by=['rs_id', 'id'])['time'].describe()[['count', 'min', 'max']]

In [5]:
def get_track(data, rs_id, id):
    return data[(data['rs_id'] == rs_id) & (data['id'] == id)]

In [6]:
def get_tracks_timeranges_intersection(track_1, track_2):
    t_min_1, t_max_1 = track_1['time'].min(), track_1['time'].max()
    t_min_2, t_max_2 = track_2['time'].min(), track_2['time'].max()

    t_min, t_max = max(t_min_1, t_min_2), min(t_max_1, t_max_2)
    return t_min, t_max

In [7]:
def merge_tracks(track_1, track_2, track_length, error, velocity, acceleration):
    t_min, t_max = get_tracks_timeranges_intersection(track_1, track_2)

    # dt_1_mean = track_1['time'].diff().mean()
    # dt_2_mean = track_2['time'].diff().mean()
    # dt = max(dt_1_mean, dt_2_mean)
    # t_min, t_max = t_min - int(dt/2), t_max + int(dt/2)
    
    if t_min >= t_max:
        return None

    t_mid = t_min + int(0.5 * (t_max - t_min))

    track = pd.concat([track_1, track_2])

    if len(track) < track_length:
        return None
                    
    track['dt_mid'] = np.abs(track['time'] - t_mid)
    track = track.sort_values(by=['dt_mid']).head(track_length)
    
    if len(track['id'].unique()) == 1:
        return None
    
    track = track.sort_values(by=['time']).reset_index(drop=True)
    track = track[
        ['time', 'x', 'y', 'z'] + \
        (['x_err', 'y_err', 'z_err'] if error else []) + \
        (['v_x_est', 'v_y_est', 'v_z_est'] if velocity else []) + \
        (['a_x_est', 'a_y_est', 'a_z_est'] if acceleration else [])
    ]
    
    return track

In [8]:
def generate_tl_data_from_cp_data_file(file, track_length, error, velocity, acceleration):
    row_length = 4 + int(error) * 3 + int(velocity) * 3 + int(acceleration) * 3
    x, y = torch.empty((0, track_length * row_length)), torch.empty((0, 1))
    cp_data = pd.read_csv(file)

    cp_data = transform_cp_data(cp_data, error, velocity, acceleration)
    
    tracks = [(rs_id, id) for rs_id in cp_data['rs_id'].unique() for id in cp_data[cp_data['rs_id'] == rs_id]['id'].unique()]
    n = len(tracks)
    
    for i in range(n):
        for j in range(i + 1, n):
            rs_id_1, id_1 = tracks[i]
            rs_id_2, id_2 = tracks[j]
        
            if rs_id_1 == rs_id_2:
                continue
        
            track_1 = get_track(cp_data, rs_id_1, id_1)
            track_2 = get_track(cp_data, rs_id_2, id_2)
                    
            track = merge_tracks(track_1, track_2, track_length, error, velocity, acceleration)
            if track is None:
                continue
        
            label = int(id_1 == id_2)
            x_cur = torch.tensor(track.values, dtype=torch.float32).reshape((-1,)).unsqueeze(0)
            y_cur = torch.tensor([label], dtype=torch.float32).unsqueeze(0)

            x = torch.cat((x, x_cur), 0)
            y = torch.cat((y, y_cur), 0)
        
    return x, y

In [9]:
def generate_tl_data_from_cp_data_dir(dir, track_length, error, velocity, acceleration):
    row_length = 4 + int(error) * 3 + int(velocity) * 3 + int(acceleration) * 3
    x, y = torch.empty((0, track_length * row_length)), torch.empty((0, 1))
    files = get_csv_files_from_dir(dir)

    for file in files:
        print(f'- file {file} ', end='')
        x_cur, y_cur = generate_tl_data_from_cp_data_file(file, track_length, error, velocity, acceleration)
        print(f'({x_cur.shape[0]} rows)')
        
        x = torch.cat((x, x_cur), 0)
        y = torch.cat((y, y_cur), 0)

    return x, y

In [10]:
cp_data_dir = 'CP_data'
tl_data_dir = 'TL_data'

track_length = 16

error        = False
velocity     = False
acceleration = False

In [11]:
for data_usage_aim in ('train', 'test'):
    print(f'Processing {cp_data_dir}/{data_usage_aim}')
    
    x, y = generate_tl_data_from_cp_data_dir(f'{cp_data_dir}/{data_usage_aim}', track_length, error, velocity, acceleration)
    
    if x.shape[0] != 0:
        print(f'Saving to {tl_data_dir}/{data_usage_aim} ({x.shape[0]} rows)')
        torch.save(x, f'{tl_data_dir}/{data_usage_aim}/x.pt')
        torch.save(y, f'{tl_data_dir}/{data_usage_aim}/y.pt')
    else:
        print(f'Nothing to save to {tl_data_dir}/{data_usage_aim}')
    
    print()

Processing CP_data/train
- file CP_data/train/50ao_25rs_xyz_quadratic.csv (1563 rows)
- file CP_data/train/1ao_2rs_x-linear.csv (0 rows)
Saving to TL_data/train (1563 rows)

Processing CP_data/test
- file CP_data/test/1ao_2rs.csv (0 rows)
Nothing to save to TL_data/test

