## Imports

In [1]:
import pandas as pd
import sys, os, importlib
from datetime import datetime

In [2]:
# Render up to 100 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)

# Make the shared helper directory importable, adding it to sys.path only once
# so that re-running this cell does not accumulate duplicate entries.
_base_funcs_dir = os.path.join('..','0_funcoes_base')
if _base_funcs_dir not in sys.path:
    sys.path.append(_base_funcs_dir)

# Project helper modules, resolved by name through the path added above.
df_manipulator = importlib.import_module('df_manipulator')
file_manipulator = importlib.import_module('file_manipulator')


## Dados de configuração

In [3]:
# Input settings: where the exported CSV lives and how the time index is
# regularised (resample step = df_resample + df_resample_freq).
input_config = dict(
    file=dict(
        ref_dir='./out/',
        filename='1_export_lunar_20200517_20200620.csv',
        delimiter=';',
    ),
    max_consecutive_nan=10,
    df_resample=100,
    df_resample_freq='ms',
)

# Output settings: destination file options and the search window, expressed
# as (amount before, amount after) the manual mark in mapping_window_freq units.
output_config = dict(
    file=dict(
        ref_dir='./out',
        delimiter=';',
        with_header=True,
        prefix='2_',
    ),
    mapping_window_width=(1500,500),
    mapping_window_freq='milliseconds',
)

## Funções

In [4]:
def find_speed_bump_timestamps_relationship(df, indexes, mapping_window_width, mapping_window_freq, verbose=False):
    """Map each manually marked speed-bump timestamp to the timestamp of minimum ``z`` nearby.

    For every marked timestamp, a window
    ``[mark - mapping_window_width[0], mark + mapping_window_width[1]]`` is
    built (clamped to the first/last index of ``df``) and the index label of
    the smallest ``z`` value inside that window is taken as the real moment of
    the event.

    The window is selected with a boolean mask over ``df.index``, so rows that
    are missing from the regular time grid do not break the lookup and the
    function does not depend on any notebook-level configuration.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by timestamp, with a ``z`` column.
    indexes : dict
        ``{speed_bump_id: iterable of timestamps}`` with the manual marks.
    mapping_window_width : sequence of two numbers
        Amount of time to look before (position 0) and after (position 1)
        each manual mark.
    mapping_window_freq : str
        Unit passed to ``pd.Timedelta`` for both window amounts
        (e.g. ``'milliseconds'``).
    verbose : bool, default False
        When true, print the window searched for each mark.

    Returns
    -------
    dict
        ``{speed_bump_id: {manual_timestamp: real_timestamp}}``. When a window
        contains no non-null ``z`` value, the manual timestamp is mapped to
        itself so the mark is kept where it was.
    """
    result = {}

    min_idx = df.index.min()
    max_idx = df.index.max()

    # The window offsets do not change between marks; build them once.
    look_before = pd.Timedelta(mapping_window_width[0], mapping_window_freq)
    look_after = pd.Timedelta(mapping_window_width[1], mapping_window_freq)

    for sb_id in indexes:
        result[sb_id] = {}
        for sb_idx in indexes[sb_id]:
            min_range = sb_idx - look_before
            max_range = sb_idx + look_after

            # Clamp the window to the time span covered by the frame.
            min_range = min_range if min_range >= min_idx else min_idx
            max_range = max_range if max_range <= max_idx else max_idx

            if verbose:
                print(f'\nsb_idx {sb_idx} -> [{min_range},{max_range}]')

            # Candidate rows: everything inside the window that has a z reading.
            in_window = (df.index >= min_range) & (df.index <= max_range)
            z_window = df.loc[in_window, 'z'].dropna()

            if z_window.empty:
                # Nothing to search: keep the manual mark instead of failing
                # or reusing the timestamp found for a previous mark.
                real_timestamp = sb_idx
                if verbose:
                    print(f'no z values found for sb_idx {sb_idx}; keeping the manual timestamp')
            else:
                # The timestamp of the minimum z in the window is the real event time.
                real_timestamp = z_window.idxmin()

            result[sb_id][sb_idx] = real_timestamp
    return result

def fix_manual_event_timestamp(df, mapping_window_width, mapping_window_freq, speed_bump_id=None, verbose=False):
    """Return a copy of ``df`` with speed-bump marks moved to their corrected timestamps.

    The manual marks are obtained from ``df_manipulator.get_speed_bumps_idx``
    and each one is mapped to a corrected timestamp by
    ``find_speed_bump_timestamps_relationship``. In the returned copy the
    ``speed_bump_id`` column is set to 0 at every manual timestamp and to the
    speed-bump id at every corrected timestamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by timestamp with ``speed_bump_id`` and ``z`` columns.
        It is not modified.
    mapping_window_width : sequence of two numbers
        Search window (before, after) around each manual mark.
    mapping_window_freq : str
        Time unit of ``mapping_window_width``.
    speed_bump_id : optional
        Forwarded to ``df_manipulator.get_speed_bumps_idx``
        (presumably restricts the correction to one id; confirm in the helper).
    verbose : bool, default False
        Forwarded to the timestamp search.

    Returns
    -------
    pandas.DataFrame
        The corrected copy.
    """
    _df = df.copy()

    sb_indexes = df_manipulator.get_speed_bumps_idx(_df, speed_bump_id)

    mapping_speed_bump_timestamps = find_speed_bump_timestamps_relationship(
        _df,
        sb_indexes,
        mapping_window_width=mapping_window_width,
        mapping_window_freq=mapping_window_freq,
        verbose=verbose,
    )

    # Pass 1: clear every manual mark. This runs before any corrected mark is
    # written so that a corrected timestamp that coincides with a manual one
    # (of the same or of another id) is not wiped out afterwards.
    for relationship in mapping_speed_bump_timestamps.values():
        if not relationship:
            continue
        manual_rows = _df.index.isin(list(relationship.keys()))
        # .loc writes into _df itself; chained indexing may write to a temporary.
        _df.loc[manual_rows, 'speed_bump_id'] = 0

    # Pass 2: write each id at its corrected timestamps.
    for sb_id, relationship in mapping_speed_bump_timestamps.items():
        if not relationship:
            continue
        corrected_rows = _df.index.isin(list(relationship.values()))
        _df.loc[corrected_rows, 'speed_bump_id'] = sb_id

    return _df

## Carrega dataframe

In [5]:
# Read the exported CSV using the file settings from input_config.
df = df_manipulator.load_dataframes(
    input_config['file']['filename'],
    input_config['file']['ref_dir'],
    input_config['file']['delimiter'],
)

# Parse the timestamp column and hand it to the project helper that sets the index.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df_manipulator.set_index(df, 'timestamp', True)

# Preview the first rows.
df.head(10)

Unnamed: 0_level_0,id,speedBumpId,x,y,z,lat,lng
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-05-17 20:14:32.715,b78d7d2f-3fd1-4d81-81ee-d00afc039814,0,0.843323,8.838654,3.714386,-22.919592,-42.473961
2020-05-17 20:14:32.816,1c59acca-2bb8-4686-9e93-95aa791f6f70,0,-2.240402,10.248825,3.108658,-22.919592,-42.473961
2020-05-17 20:14:32.918,de01a2a1-dd19-4081-921f-890907e5ca4d,0,-1.998596,10.84259,2.215622,-22.919592,-42.473961
2020-05-17 20:14:33.020,8864e0ac-54c0-4b94-b018-cd764c6dcef0,0,-2.326599,9.702957,2.493347,-22.919592,-42.473961
2020-05-17 20:14:33.218,f609cd0d-603f-4732-affc-517fab1da2d4,0,-0.231674,8.345444,2.601089,-22.919592,-42.473961
2020-05-17 20:14:33.318,b7ad14ef-2f9b-4086-952c-bc72747d81d4,0,-0.238846,9.590424,3.173294,-22.919592,-42.473961
2020-05-17 20:14:33.420,6a6758d1-c0b7-4a9d-8d61-ad886cd4d4bf,0,0.017334,8.21376,3.173294,-22.919592,-42.473961
2020-05-17 20:14:33.521,8355917c-1c52-4493-9b2f-5f1953eec118,0,0.874451,9.37973,3.443848,-22.919592,-42.473961
2020-05-17 20:14:33.621,cc883132-4cac-4753-80a1-9bac47f1d434,0,-0.260391,9.386917,2.984161,-22.919592,-42.473961
2020-05-17 20:14:33.821,56c4192c-c294-4db4-87f4-cd548d91fe7c,0,-1.919586,10.229675,3.017685,-22.919592,-42.473961


In [6]:
# Summarise the loaded frame with the project helper (the output below shows
# shape, column dtypes and per-column null counts / basic statistics).
df_manipulator.apply_describe(df)

Shape:
 (18104, 7)
Types:

float64 :  ['x' 'y' 'z' 'lat' 'lng']
object :  ['id']
int64 :  ['speedBumpId']





Unnamed: 0,null_sum,null_pct,dtypes,count,mean,median,min,max
id,0,0.0,object,18104,,,0004aec3-c5a7-4282-82eb-79cb3c18d2a7,fffc405f-2901-45ab-9e6f-1bce809e8977
lat,0,0.0,float64,18104,-22.887996,-22.872597,-22.9352,-22.8346
lng,0,0.0,float64,18104,-42.407432,-42.359461,-42.4957,-42.3168
speedBumpId,0,0.0,int64,18104,0.006131,0.0,0,2
x,0,0.0,float64,18104,-0.437377,-0.466293,-8.20197,5.52878
y,0,0.0,float64,18104,9.322921,9.389313,3.04707,16.038
z,0,0.0,float64,18104,2.955087,2.657356,-11.6588,15.6758


In [7]:
# Number of rows per speed-bump id in the raw data (0 appears to mean "no mark").
df['speedBumpId'].value_counts()

0    18003
1       91
2       10
Name: speedBumpId, dtype: int64

## Mapeia nome das colunas para snake_case

In [8]:
# Rename the columns to snake_case through the project helper
# (e.g. speedBumpId becomes speed_bump_id, as the later outputs show).
# NOTE(review): `df` is rebound here, so cells above that use `df.speedBumpId`
# will fail if they are re-run after this cell without reloading the data.
df = df_manipulator.rename_columns(df)

## Regulariza intervalos do index (timestamp)

In [9]:
# Regularise the index to a fixed step (df_resample + df_resample_freq, e.g. '100ms').
# Gaps are forward-filled for at most `max_consecutive_nan` consecutive steps
# (taken from input_config instead of a repeated literal); longer gaps stay
# empty and are dropped when every sensor/GPS column is missing.
df_out = (
    df.resample(str(input_config['df_resample']) + input_config['df_resample_freq'])
      .ffill(limit=input_config['max_consecutive_nan'])
      .dropna(how='all', subset=['x','y','z','lat','lng'])
)

# Resampling introduces NaN and turns the id column into float; cast it back.
# errors='ignore' keeps the original best-effort behaviour (column left as is
# if the cast is not possible).
df_out = df_out.assign(
    speed_bump_id=df_out['speed_bump_id'].astype('int64', copy=False, errors='ignore')
)

In [10]:
# Display the resampled frame (pandas truncates the rendering to head and tail rows).
df_out

Unnamed: 0_level_0,id,speed_bump_id,x,y,z,lat,lng
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-05-17 20:14:32.800,b78d7d2f-3fd1-4d81-81ee-d00afc039814,0,0.843323,8.838654,3.714386,-22.919592,-42.473961
2020-05-17 20:14:32.900,1c59acca-2bb8-4686-9e93-95aa791f6f70,0,-2.240402,10.248825,3.108658,-22.919592,-42.473961
2020-05-17 20:14:33.000,de01a2a1-dd19-4081-921f-890907e5ca4d,0,-1.998596,10.842590,2.215622,-22.919592,-42.473961
2020-05-17 20:14:33.100,8864e0ac-54c0-4b94-b018-cd764c6dcef0,0,-2.326599,9.702957,2.493347,-22.919592,-42.473961
2020-05-17 20:14:33.200,8864e0ac-54c0-4b94-b018-cd764c6dcef0,0,-2.326599,9.702957,2.493347,-22.919592,-42.473961
...,...,...,...,...,...,...,...
2020-06-20 14:41:23.800,9045a063-725e-4d4f-abf7-d73d8bba8a89,0,0.548843,8.189819,6.620956,-22.871247,-42.340474
2020-06-20 14:41:23.900,9045a063-725e-4d4f-abf7-d73d8bba8a89,0,0.548843,8.189819,6.620956,-22.871247,-42.340474
2020-06-20 14:41:24.000,145b6c26-33d5-4c20-b585-3e620dd871ec,0,0.146622,8.970322,5.715942,-22.871481,-42.340609
2020-06-20 14:41:24.100,22d48c68-3114-4d47-9e98-7aae0bb74ecb,0,0.589539,11.125107,4.638550,-22.871481,-42.340609


## Ajusta momento da marcação de quebra-mola

In [11]:
# Debug fixture: every 100 ms timestamp from 20:15:01.3 to 20:15:03.8 on 2020-05-17.
# NOTE(review): `teste` is not referenced by any other cell shown in this
# notebook; it looks like a leftover from debugging the window lookup and can
# probably be removed - confirm before deleting.
teste = pd.DatetimeIndex(['2020-05-17 20:15:01.300000', '2020-05-17 20:15:01.400000',
               '2020-05-17 20:15:01.500000', '2020-05-17 20:15:01.600000',
               '2020-05-17 20:15:01.700000', '2020-05-17 20:15:01.800000',
               '2020-05-17 20:15:01.900000',        '2020-05-17 20:15:02',
               '2020-05-17 20:15:02.100000', '2020-05-17 20:15:02.200000',
               '2020-05-17 20:15:02.300000', '2020-05-17 20:15:02.400000',
               '2020-05-17 20:15:02.500000', '2020-05-17 20:15:02.600000',
               '2020-05-17 20:15:02.700000', '2020-05-17 20:15:02.800000',
               '2020-05-17 20:15:02.900000',        '2020-05-17 20:15:03',
               '2020-05-17 20:15:03.100000', '2020-05-17 20:15:03.200000',
               '2020-05-17 20:15:03.300000', '2020-05-17 20:15:03.400000',
               '2020-05-17 20:15:03.500000', '2020-05-17 20:15:03.600000',
               '2020-05-17 20:15:03.700000', '2020-05-17 20:15:03.800000'])

In [12]:
# Debug fixture: same range as `teste` but without the whole-second
# timestamps 20:15:02 and 20:15:03.
# NOTE(review): `teste2` is not referenced by any other cell shown in this
# notebook; it looks like a leftover from debugging and can probably be
# removed - confirm before deleting.
teste2 = pd.DatetimeIndex(['2020-05-17 20:15:01.300000', '2020-05-17 20:15:01.400000',
               '2020-05-17 20:15:01.500000', '2020-05-17 20:15:01.600000',
               '2020-05-17 20:15:01.700000', '2020-05-17 20:15:01.800000',
               '2020-05-17 20:15:01.900000',
               '2020-05-17 20:15:02.100000', '2020-05-17 20:15:02.200000',
               '2020-05-17 20:15:02.300000', '2020-05-17 20:15:02.400000',
               '2020-05-17 20:15:02.500000', '2020-05-17 20:15:02.600000',
               '2020-05-17 20:15:02.700000', '2020-05-17 20:15:02.800000',
               '2020-05-17 20:15:02.900000',
               '2020-05-17 20:15:03.100000', '2020-05-17 20:15:03.200000',
               '2020-05-17 20:15:03.300000', '2020-05-17 20:15:03.400000',
               '2020-05-17 20:15:03.500000', '2020-05-17 20:15:03.600000',
               '2020-05-17 20:15:03.700000', '2020-05-17 20:15:03.800000'])

In [16]:
# Move every manual speed-bump mark to its corrected timestamp, using the
# window settings from output_config.
# NOTE(review): `df_out` is rebound to its own corrected version, so re-running
# this cell applies the correction again on already-corrected data.
df_out = fix_manual_event_timestamp(df_out, speed_bump_id=None, mapping_window_width=output_config['mapping_window_width'], mapping_window_freq=output_config['mapping_window_freq'], verbose=False)

In [17]:
# Number of rows per speed-bump id after the timestamp correction.
df_out['speed_bump_id'].value_counts()

0    21248
1       88
2        9
Name: speed_bump_id, dtype: int64

## Salva o dataframe de saída

In [18]:
# Build the output path from the configured prefix and directory plus the input file name.
out_filename = file_manipulator.get_out_filename(
    output_config['file']['prefix'],
    output_config['file']['ref_dir'],
    input_config['file']['filename'],
)

# Write the corrected frame using the configured delimiter and header option.
df_out.to_csv(
    out_filename,
    sep=output_config['file']['delimiter'],
    header=output_config['file']['with_header'],
)

print(f'O arquivo {out_filename} foi gerado!')

O arquivo ./out/2_export_lunar_20200517_20200620.csv foi gerado!
