## Imports

In [452]:
import pandas as pd
import sys, os, importlib
from datetime import datetime
import numpy as np

In [453]:
# Let rendered DataFrames show up to 100 rows before pandas truncates them.
pd.set_option('display.max_rows', 100)

# Register the shared helper folder on the import path. The membership check
# keeps the entry from being appended again when this cell is re-run.
if sys.path.count(os.path.join('..', '0_funcoes_base')) == 0:
    sys.path.append(os.path.join('..', '0_funcoes_base'))

# Project-local helper modules, resolved through the path registered above.
df_manipulator = importlib.import_module(name='df_manipulator')
date_manipulator = importlib.import_module(name='date_manipulator')
plot_manipulator = importlib.import_module(name='plot_manipulator')


## Dados de configuração

In [454]:
# Input side: where the stage-1 export lives and how the frame is resampled
# (df_resample + df_resample_freq are concatenated into the resample rule).
# 'max_consecutive_nan' is not referenced by the cells visible in this notebook.
input_config = dict(
    file=dict(
        ref_dir='./out/',
        filename='1_export_lunar_20200517.csv',
        delimiter=';',
    ),
    max_consecutive_nan=10,
    df_resample=100,
    df_resample_freq='ms',
)

# Output side: destination folder and CSV formatting of the generated file.
output_config = dict(
    file=dict(
        ref_dir='./out',
        delimiter=';',
        with_header=True,
    ),
)

## Funções

In [455]:
def get_speed_bumps_idx(df, speed_bump_id=None):
    """Collect the index labels of the rows carrying each speed-bump id.

    Args:
        df: DataFrame with a ``speedBumpId`` column.
        speed_bump_id: ``None`` reports every id present in the column
            (including 0); any other value restricts the result to that id.
            The value 0 is short-circuited and always yields an empty dict.

    Returns:
        dict mapping ``int(id)`` to the list of index labels of the rows whose
        ``speedBumpId`` equals that id.
    """
    if speed_bump_id == 0:
        return {}

    # Ids come from value_counts(), so missing values are never reported.
    wanted_ids = [
        int(found_id)
        for found_id in df.speedBumpId.value_counts().keys()
        if speed_bump_id is None or found_id == speed_bump_id
    ]

    indexes_by_id = {}
    for wanted_id in wanted_ids:
        matching_rows = df.loc[df.speedBumpId == wanted_id]
        indexes_by_id[wanted_id] = matching_rows.index.to_list()
    return indexes_by_id
    

def get_out_filename(prefix):
    """Build the output path for this stage from the configured input name.

    The stage-1 prefix ``'1_'`` is removed from the start of
    ``input_config['file']['filename']``, ``prefix`` is put in its place and
    the result is joined to ``output_config['file']['ref_dir']``.

    Args:
        prefix: Stage prefix for the generated file (e.g. ``'2_'``).

    Returns:
        Path of the output file inside the configured output directory.
    """
    input_name = input_config['file']['filename']
    # Strip only a leading '1_'. A blanket str.replace would also remove any
    # later '1_' inside the name (e.g. '1_run1_a.csv') and corrupt it.
    if input_name.startswith('1_'):
        input_name = input_name[len('1_'):]
    return os.path.join(output_config['file']['ref_dir'], prefix + input_name)

def find_speed_bump_timestamps_relationship(df, indexes, mapping_window_width, mapping_window_freq, verbose=False):
    """Map each manually marked timestamp to the lowest-z timestamp around it.

    For every marked timestamp a window of +/- ``mapping_window_width``
    (expressed in ``mapping_window_freq`` units) is taken, clamped to the
    bounds of ``df.index``, and the label where column ``z`` is smallest inside
    that window is reported as the real event timestamp. On ties the earliest
    label wins.

    The window is selected by label slicing on ``df`` itself, so the function
    does not depend on the notebook's resampling configuration and tolerates
    timestamps missing from the index.

    Args:
        df: DataFrame with a ``z`` column, indexed by timestamp. The index is
            expected to be sorted so that label slicing is well defined.
        indexes: dict ``{speed_bump_id: [marked_timestamp, ...]}``.
        mapping_window_width: Half-width of the search window.
        mapping_window_freq: ``pd.Timedelta`` unit of ``mapping_window_width``.
        verbose: When true, print the window searched for each marker.

    Returns:
        dict ``{speed_bump_id: {marked_timestamp: real_timestamp}}``.
    """
    result = {}

    min_idx = df.index.min()
    max_idx = df.index.max()
    half_window = pd.Timedelta(mapping_window_width, unit=mapping_window_freq)

    for sb_id, marked_timestamps in indexes.items():
        result[sb_id] = {}
        for sb_idx in marked_timestamps:
            # Clamp the window so it never reaches outside the recorded data.
            min_range = max(sb_idx - half_window, min_idx)
            max_range = min(sb_idx + half_window, max_idx)

            if verbose:
                print(f'\nsb_idx {sb_idx} -> [{min_range},{max_range}]')

            # The sample with the lowest z value inside the window is taken as
            # the real moment of the event.
            window_z = df.loc[min_range:max_range, 'z']
            result[sb_id][sb_idx] = window_z.idxmin()
    return result

def fix_manual_event_timestamp(df, speed_bump_id=None, verbose=False, mapping_window_width=2000, mapping_window_freq='milliseconds'):
    """Move manual speed-bump marks onto the lowest-z sample near each mark.

    Works on a copy of ``df``. Every row marked with a non-zero
    ``speedBumpId`` is mapped to the timestamp with the smallest ``z`` inside a
    window around it; the original marks are reset to 0 and the mapped
    timestamps receive the speed-bump id instead.

    Args:
        df: DataFrame indexed by timestamp with ``speedBumpId`` and ``z``
            columns.
        speed_bump_id: Restrict the adjustment to this id; ``None`` processes
            every non-zero id.
        verbose: Forwarded to the window search to print each window.
        mapping_window_width: Half-width of the search window around a mark.
        mapping_window_freq: ``pd.Timedelta`` unit of ``mapping_window_width``.

    Returns:
        A new DataFrame with the adjusted ``speedBumpId`` column.
    """
    _df = df.copy()

    sb_indexes = get_speed_bumps_idx(_df, speed_bump_id)
    # Id 0 means "no speed bump", so there is nothing to relocate for it.
    sb_indexes.pop(0, None)

    mapping_speed_bump_timestamps = find_speed_bump_timestamps_relationship(
        _df,
        sb_indexes,
        mapping_window_width=mapping_window_width,
        mapping_window_freq=mapping_window_freq,
        verbose=verbose,
    )

    # Pass 1: clear every manual mark. This must happen before any new mark is
    # written; otherwise a real timestamp that coincides with a marked row
    # would be reset to 0 right after being set and the event would be lost.
    for relocations in mapping_speed_bump_timestamps.values():
        marked_rows = _df.index.isin(list(relocations.keys()))
        # .loc writes into _df itself; chained indexing may write to a copy.
        _df.loc[marked_rows, 'speedBumpId'] = 0

    # Pass 2: stamp the speed-bump id on the real timestamps.
    for sb_id, relocations in mapping_speed_bump_timestamps.items():
        if not relocations:
            continue
        real_rows = _df.index.isin(list(relocations.values()))
        _df.loc[real_rows, 'speedBumpId'] = sb_id

    return _df

## Carrega dataframe

In [456]:
# Read the stage-1 export described by input_config['file'].
df = df_manipulator.load_dataframe(
    input_config['file']['filename'],
    input_config['file']['ref_dir'],
    input_config['file']['delimiter'],
)
# Parse the timestamp column into datetimes before it becomes the index.
df['timestamp'] = pd.to_datetime(df['timestamp'])
# NOTE(review): the return value is discarded, so this helper presumably sets
# the index on df in place - confirm against df_manipulator.set_index.
df_manipulator.set_index(df, 'timestamp', True)

In [457]:
# Preview the loaded frame; the rendered output is truncated by Jupyter.
df

Unnamed: 0_level_0,id,speedBumpId,x,y,z,lat,lng
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-05-17 20:10:48.346,d198e52b-bd5b-424a-8ea5-d391fbdb7b20,0,-3.399200,9.516205,1.573975,-22.921991,-42.473372
2020-05-17 20:10:48.543,a9df8305-e4ed-4530-ab78-e1aa3ca9ffbd,0,-2.987396,10.203339,1.291458,-22.921991,-42.473372
2020-05-17 20:10:48.743,74dedc33-8d9b-4aa6-a352-df1ee9364ed7,0,-4.376038,9.746048,1.806213,-22.921991,-42.473372
2020-05-17 20:10:48.843,4a64f1bf-ea98-49db-baff-d7d08baf2dce,0,-3.698471,9.317490,1.849304,-22.921991,-42.473372
2020-05-17 20:10:48.944,25a3976b-a812-4bf2-a2fe-113d7b01cd11,0,-5.039230,9.372559,0.522919,-22.921991,-42.473372
...,...,...,...,...,...,...,...
2020-05-17 20:35:51.161,0f15d4aa-c4c3-40ba-9d1f-1746e938a5eb,0,0.386032,10.105179,1.660172,-22.925816,-42.484631
2020-05-17 20:35:51.261,c044a484-bb8b-421b-8cc8-a6ad4713e28f,0,-1.378494,9.920822,2.608276,-22.925816,-42.484631
2020-05-17 20:35:51.361,c9d94f9a-6211-4198-b39b-70db0ef2e890,0,-1.215683,9.523392,2.505325,-22.925816,-42.484631
2020-05-17 20:35:51.461,2fa87037-71b8-42e0-bd89-64bff33cfead,0,-0.435181,8.999054,2.871628,-22.925816,-42.484631


In [458]:
# Project helper: prints shape and dtypes and shows per-column null counts and
# summary statistics for the raw frame.
df_manipulator.apply_describe(df)

Shape:
 (11084, 7)
Types:

float64 :  ['x' 'y' 'z' 'lat' 'lng']
int64 :  ['speedBumpId']
object :  ['id']





Unnamed: 0,null_sum,null_pct,dtypes,count,mean,median,min,max
id,0,0.0,object,11084,,,0000103f-58f2-4b67-956e-9fc49ea2a845,ffe9ed07-2030-4a2d-ad61-18af6217f012
lat,0,0.0,float64,11084,-22.915948,-22.917111,-22.9355,-22.8901
lng,0,0.0,float64,11084,-42.479699,-42.476972,-42.4957,-42.4683
speedBumpId,0,0.0,int64,11084,0.005594,0.0,0,2
x,0,0.0,float64,11084,-0.66446,-0.635094,-12.7126,5.52878
y,0,0.0,float64,11084,9.333137,9.382126,3.04707,15.6933
z,0,0.0,float64,11084,2.891374,2.695656,-8.12733,15.8889


In [459]:
# Number of rows per speed-bump id in the raw data (0 = no speed bump marked).
df.speedBumpId.value_counts()

0    11032
1       42
2       10
Name: speedBumpId, dtype: int64

## Regulariza intervalos do index (timestamp)

In [460]:
# Put the samples on a regular time grid (rule built from the config, e.g.
# '100ms'), forward-filling each grid point from the latest earlier sample,
# then discard grid rows that are still entirely empty.
df_out = (
    df
    .resample(f"{input_config['df_resample']}{input_config['df_resample_freq']}")
    .ffill()
    .dropna(how='all')
)

# Restore the integer dtype of the id column; errors='ignore' leaves the
# column untouched if the cast is not possible.
df_out['speedBumpId'] = df_out['speedBumpId'].astype('int64', copy=False, errors='ignore')

In [461]:
# Preview the resampled frame; the rendered output is truncated by Jupyter.
df_out

Unnamed: 0_level_0,id,speedBumpId,x,y,z,lat,lng
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-05-17 20:10:48.400,d198e52b-bd5b-424a-8ea5-d391fbdb7b20,0,-3.399200,9.516205,1.573975,-22.921991,-42.473372
2020-05-17 20:10:48.500,d198e52b-bd5b-424a-8ea5-d391fbdb7b20,0,-3.399200,9.516205,1.573975,-22.921991,-42.473372
2020-05-17 20:10:48.600,a9df8305-e4ed-4530-ab78-e1aa3ca9ffbd,0,-2.987396,10.203339,1.291458,-22.921991,-42.473372
2020-05-17 20:10:48.700,a9df8305-e4ed-4530-ab78-e1aa3ca9ffbd,0,-2.987396,10.203339,1.291458,-22.921991,-42.473372
2020-05-17 20:10:48.800,74dedc33-8d9b-4aa6-a352-df1ee9364ed7,0,-4.376038,9.746048,1.806213,-22.921991,-42.473372
...,...,...,...,...,...,...,...
2020-05-17 20:35:51.100,72df7adf-cb5f-4f91-b76d-f08fa6da21ed,0,-0.700928,9.250443,1.770294,-22.925816,-42.484631
2020-05-17 20:35:51.200,0f15d4aa-c4c3-40ba-9d1f-1746e938a5eb,0,0.386032,10.105179,1.660172,-22.925816,-42.484631
2020-05-17 20:35:51.300,c044a484-bb8b-421b-8cc8-a6ad4713e28f,0,-1.378494,9.920822,2.608276,-22.925816,-42.484631
2020-05-17 20:35:51.400,c9d94f9a-6211-4198-b39b-70db0ef2e890,0,-1.215683,9.523392,2.505325,-22.925816,-42.484631


In [462]:
# Same project-helper summary as for the raw frame, now on the resampled data.
df_manipulator.apply_describe(df_out)

Shape:
 (15032, 7)
Types:

float64 :  ['x' 'y' 'z' 'lat' 'lng']
int64 :  ['speedBumpId']
object :  ['id']





Unnamed: 0,null_sum,null_pct,dtypes,count,mean,median,min,max
id,0,0.0,object,15032,,,0000103f-58f2-4b67-956e-9fc49ea2a845,ffe9ed07-2030-4a2d-ad61-18af6217f012
lat,0,0.0,float64,15032,-22.916766,-22.921991,-22.9355,-22.8901
lng,0,0.0,float64,15032,-42.478807,-42.476058,-42.4957,-42.4683
speedBumpId,0,0.0,int64,15032,0.007118,0.0,0,2
x,0,0.0,float64,15032,-1.631164,-0.77037,-12.7126,5.52878
y,0,0.0,float64,15032,9.100066,9.252838,3.04707,15.6933
z,0,0.0,float64,15032,2.323634,2.409546,-8.12733,15.8889


In [463]:
# Rows per speed-bump id after resampling; forward-filling repeats marked rows,
# so the non-zero counts are higher than in the raw data.
df_out.speedBumpId.value_counts()

0    14941
1       75
2       16
Name: speedBumpId, dtype: int64

## Ajusta momento da marcação de quebra-mola

In [464]:
# Move every manual speed-bump mark to the lowest-z sample inside its window.
# NOTE(review): df_out is rebound to its own transformed value, so re-running
# only this cell applies the adjustment to already-adjusted data; re-run the
# resampling cell first to start from a clean frame.
df_out = fix_manual_event_timestamp(df_out, speed_bump_id=None, verbose=False)

In [465]:
# Rows per speed-bump id after the marks were moved.
df_out.speedBumpId.value_counts()

0    14979
1       43
2       10
Name: speedBumpId, dtype: int64

## Salva o dataframe de saída

In [466]:
# Write the adjusted frame as the stage-2 file, using the CSV settings from
# output_config, and report the generated path.
out_filename = get_out_filename(prefix='2_')
df_out.to_csv(
    out_filename,
    sep=output_config['file']['delimiter'],
    header=output_config['file']['with_header'],
)
print(f'O arquivo {out_filename} foi gerado!')

O arquivo ./out/2_export_lunar_20200517.csv foi gerado!
