# Imports

In [220]:
import pandas as pd
import sys, os, importlib
import numpy as np
from datetime import datetime

In [221]:
pd.set_option('display.max_rows', 100)

if os.path.join('..','0_funcoes_base') not in sys.path:
    sys.path.append(os.path.join('..','0_funcoes_base')) 

df_manipulator = importlib.import_module('df_manipulator')
date_manipulator = importlib.import_module('date_manipulator')
file_manipulator = importlib.import_module('file_manipulator')

# Dados de configuração

In [222]:
input_config = {
    'file': {
        'ref_dir':'../../1_dados_raw/',
        'filenames':['export_lunar_20200517.csv', 'export_lunar_20200620.csv'],
        'delimiter': ';',
        'header': ['id', 'speedBumpId','timestamp','x','y','z','lat','lng']
    }
}

output_config = {
    'file': {
        'ref_dir': './out',
        'prefix': '1_',
        'delimiter':';',
        'with_header': True
    },
    'speed_bump_after_seconds': 20,
    'speed_bump_before_seconds': 10
}

# Funções

In [223]:
def remove_invalid_timestamps(df):
    df = df_manipulator.filter_by_dates(df, start_date='2020-01-01', end_date=datetime.utcnow(), format='%Y-%m-%d')
    return df

def update_timestamp_ticks_to_datetime(df):
    df['new_timestamp'] = df.apply(lambda row: date_manipulator.ticks_to_datetime(row.timestamp), axis=1)

    df = df_manipulator.remove_column(df, 'timestamp', False)
    df.rename(columns={'new_timestamp':'timestamp'},inplace=True)

    df_manipulator.set_index(df, 'timestamp', True)
    df.sort_index(inplace=True)
    return df

def remove_invalid_accelerometer_values(df, filtered_columns, invalid_value = 0):
    _df = df.copy()
    
    if len(filtered_columns) > 0:
        for filtered_column in filtered_columns:
            _df = _df[_df[filtered_column] != invalid_value]
    return _df

def remove_duplicated_timestamp(df):
    _df = df.copy()

    # Set 'timestamp' column as index
    if _df.index.name != 'timestamp':
        _df.set_index('timestamp', inplace=True)
    
    # Put duplicated timestamp with speedBump event to last occurrence
    _df.sort_values(by=['timestamp', 'speedBumpId'], inplace=True)

    # Remove duplicated timestamp
    _df = _df.loc[~_df.index.duplicated(keep='last')]
    return _df

# Carrega dataframe bruto

In [224]:
df = df_manipulator.load_dataframes(input_config['file']['filenames'], input_config['file']['ref_dir'], input_config['file']['delimiter'], input_config['file']['header'])

df.head(10)

Unnamed: 0,id,speedBumpId,timestamp,x,y,z,lat,lng
0,d198e52b-bd5b-424a-8ea5-d391fbdb7b20,0,1589757048346,-3.3992,9.516205,1.573975,-22.921991,-42.473372
1,a9df8305-e4ed-4530-ab78-e1aa3ca9ffbd,0,1589757048543,-2.987396,10.203339,1.291458,-22.921991,-42.473372
2,74dedc33-8d9b-4aa6-a352-df1ee9364ed7,0,1589757048743,-4.376038,9.746048,1.806213,-22.921991,-42.473372
3,4a64f1bf-ea98-49db-baff-d7d08baf2dce,0,1589757048843,-3.698471,9.31749,1.849304,-22.921991,-42.473372
4,25a3976b-a812-4bf2-a2fe-113d7b01cd11,0,1589757048944,-5.03923,9.372559,0.522919,-22.921991,-42.473372
5,427b4485-2b4c-496e-b4f4-a8aa82809882,0,1589757049044,-4.332932,10.064484,0.788681,-22.921991,-42.473372
6,2ae693c7-78a7-409a-bbea-01fdb261fb98,0,1589757049144,-4.457428,8.809921,1.250763,-22.921991,-42.473372
7,93fa9e78-42f3-4a12-9749-399f589da9ac,0,1589757049245,-5.563553,9.022995,1.456665,-22.921991,-42.473372
8,f9704728-66b4-443a-8d87-b985409f6ba2,0,1589757049345,-4.095917,9.188202,1.528488,-22.921991,-42.473372
9,d3ef84b9-fdcd-4c40-b521-189985c69515,0,1589757049547,-4.591507,9.365372,2.237167,-22.921991,-42.473372


# Detalhes do dataframe bruto

In [225]:
df.speedBumpId.value_counts()

0    24933
1       91
2       10
Name: speedBumpId, dtype: int64

In [226]:
df_manipulator.apply_describe(df)

Shape:
 (25034, 8)
Types:

float64 :  ['x' 'y' 'z' 'lat' 'lng']
int64 :  ['speedBumpId' 'timestamp']
object :  ['id']





Unnamed: 0,null_sum,null_pct,dtypes,count,mean,median,min,max
id,0,0.0,object,25034,,,0000103f-58f2-4b67-956e-9fc49ea2a845,fffc405f-2901-45ab-9e6f-1bce809e8977
lat,0,0.0,float64,25034,-22.88493,-22.87186,-22.9355,-22.8342
lng,0,0.0,float64,25034,-42.40036,-42.35508,-42.4957,-42.3168
speedBumpId,0,0.0,int64,25034,0.00443397,0.0,0,2
timestamp,0,0.0,int64,25034,1591377000000.0,1592673000000.0,1589757048346,1592674891762
x,0,0.0,float64,25034,-0.3993261,-0.4040527,-12.7126,5.52878
y,0,0.0,float64,25034,9.315071,9.367767,3.04707,16.038
z,0,0.0,float64,25034,2.957983,2.759109,-11.6588,15.8889


# Remove NaN

In [227]:
df_out = df_manipulator.remove_nan(df)

# Remove timestamp inválido e muda formato

In [228]:
df_out = update_timestamp_ticks_to_datetime(df_out)
df_out = remove_invalid_timestamps(df_out)

# Remove valores de acelerômetro inválidos

In [229]:
df_out = remove_invalid_accelerometer_values(df_out, filtered_columns=['x','y','z'], invalid_value=0)

# Remove timestamp duplicado

In [230]:
df_out = remove_duplicated_timestamp(df_out)

# Remove registros distantes

In [231]:
speed_bump_timestamps = df_out.loc[(df_out['speedBumpId'] > 0) & (df_out['speedBumpId'] != df_out['speedBumpId'].shift(-1))].index

In [232]:
res = np.zeros(len(df_out['speedBumpId']))

i = 0
first_speed_bump_before = None
for idx, val in df_out['speedBumpId'].iloc[0:].iteritems():

    speed_bump_after_idx = np.argmax(speed_bump_timestamps>=idx)
    first_speed_bump_after  = speed_bump_timestamps[speed_bump_after_idx]
    first_speed_bump_before =  None if speed_bump_after_idx == 0 else speed_bump_timestamps[speed_bump_after_idx-1]
    
    if (first_speed_bump_after - idx).seconds > output_config['speed_bump_after_seconds'] and (first_speed_bump_before is None or (first_speed_bump_before is not None and (idx - first_speed_bump_before).seconds >= output_config['speed_bump_before_seconds'])):
        res[i] = True
    else:
        res[i] = False
    i +=1

df_out['far'] = res.astype(bool)
df_out = df_out[df_out['far'] != True]
df_out.drop(['far'], axis=1, inplace=True)

# Detalhes do dataframe filtrado

In [233]:
df_out.head(10)

Unnamed: 0_level_0,id,speedBumpId,x,y,z,lat,lng
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-05-17 20:14:32.715,b78d7d2f-3fd1-4d81-81ee-d00afc039814,0,0.843323,8.838654,3.714386,-22.919592,-42.473961
2020-05-17 20:14:32.816,1c59acca-2bb8-4686-9e93-95aa791f6f70,0,-2.240402,10.248825,3.108658,-22.919592,-42.473961
2020-05-17 20:14:32.918,de01a2a1-dd19-4081-921f-890907e5ca4d,0,-1.998596,10.84259,2.215622,-22.919592,-42.473961
2020-05-17 20:14:33.020,8864e0ac-54c0-4b94-b018-cd764c6dcef0,0,-2.326599,9.702957,2.493347,-22.919592,-42.473961
2020-05-17 20:14:33.218,f609cd0d-603f-4732-affc-517fab1da2d4,0,-0.231674,8.345444,2.601089,-22.919592,-42.473961
2020-05-17 20:14:33.318,b7ad14ef-2f9b-4086-952c-bc72747d81d4,0,-0.238846,9.590424,3.173294,-22.919592,-42.473961
2020-05-17 20:14:33.420,6a6758d1-c0b7-4a9d-8d61-ad886cd4d4bf,0,0.017334,8.21376,3.173294,-22.919592,-42.473961
2020-05-17 20:14:33.521,8355917c-1c52-4493-9b2f-5f1953eec118,0,0.874451,9.37973,3.443848,-22.919592,-42.473961
2020-05-17 20:14:33.621,cc883132-4cac-4753-80a1-9bac47f1d434,0,-0.260391,9.386917,2.984161,-22.919592,-42.473961
2020-05-17 20:14:33.821,56c4192c-c294-4db4-87f4-cd548d91fe7c,0,-1.919586,10.229675,3.017685,-22.919592,-42.473961


In [234]:
df_out.speedBumpId.value_counts()

0    18003
1       91
2       10
Name: speedBumpId, dtype: int64

In [235]:
df_manipulator.apply_describe(df_out)

Shape:
 (18104, 7)
Types:

float64 :  ['x' 'y' 'z' 'lat' 'lng']
int64 :  ['speedBumpId']
object :  ['id']





Unnamed: 0,null_sum,null_pct,dtypes,count,mean,median,min,max
id,0,0.0,object,18104,,,0004aec3-c5a7-4282-82eb-79cb3c18d2a7,fffc405f-2901-45ab-9e6f-1bce809e8977
lat,0,0.0,float64,18104,-22.887996,-22.872597,-22.9352,-22.8346
lng,0,0.0,float64,18104,-42.407432,-42.359461,-42.4957,-42.3168
speedBumpId,0,0.0,int64,18104,0.006131,0.0,0,2
x,0,0.0,float64,18104,-0.437377,-0.466293,-8.20197,5.52878
y,0,0.0,float64,18104,9.322921,9.389313,3.04707,16.038
z,0,0.0,float64,18104,2.955087,2.657356,-11.6588,15.6758


# Salva o dataframe de saída

In [236]:
out_filename = file_manipulator.get_out_filename(output_config['file']['prefix'], output_config['file']['ref_dir'], input_config['file']['filenames'])

df_out.to_csv(out_filename, sep=output_config['file']['delimiter'], header=output_config['file']['with_header'])

print(f'O arquivo {out_filename} foi gerado!')

O arquivo ./out/1_export_lunar_20200517_20200620.csv foi gerado!
