## Imports

In [3]:
import pandas as pd
import sys, os, importlib
from datetime import datetime

In [4]:
# Allow up to 100 rows before pandas truncates a displayed dataframe.
pd.set_option('display.max_rows', 100)

# Make the shared helper folder importable. The membership check keeps
# repeated runs of this cell from appending the same entry again.
_helpers_dir = os.path.join('..', '0_funcoes_base')
if _helpers_dir not in sys.path:
    sys.path.append(_helpers_dir)

# Project helper modules, resolvable now that their folder is on sys.path.
import df_manipulator
import date_manipulator
import file_manipulator

## Dados de configuração

In [5]:
# Location and parsing options of the raw export that this notebook reads.
input_config = {
    'file': {
        'ref_dir': '../../1_dados_raw/',
        'filename': 'export_lunar_20200606.csv',
        'delimiter': ';',
        # Column names handed to the dataframe loader.
        'header': [
            'id',
            'speedBumpId',
            'timestamp',
            'x',
            'y',
            'z',
            'lat',
            'lng',
        ],
    },
}

# Destination and CSV options used when the cleaned dataframe is saved.
output_config = {
    'file': {
        'ref_dir': './out',
        'prefix': '1_',
        'delimiter': ';',
        'with_header': True,
    },
}

## Funções

In [6]:
def remove_invalid_timestamps(df):
    """Filter ``df`` to the date window 2000-01-01 .. current UTC time.

    The filtering itself is delegated to ``df_manipulator.filter_by_dates``;
    this function only fixes the window bounds and the date format string.

    Args:
        df: Dataframe to filter. How the helper locates the dates (index or
            column) is not visible here -- in this notebook it is called after
            'timestamp' has become the index. TODO confirm against the helper.

    Returns:
        Whatever ``df_manipulator.filter_by_dates`` returns for that window.
    """
    # Local import: the notebook's import cell only brings in ``datetime``.
    from datetime import timezone

    # Naive UTC "now": the same value ``datetime.utcnow()`` produced, but
    # without calling the API that is deprecated since Python 3.12.
    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)

    return df_manipulator.filter_by_dates(df, start_date='2000-01-01', end_date=utc_now, format='%Y-%m-%d')

def update_timestamp_ticks_to_datetime(df):
    """Replace the tick-based 'timestamp' column with a sorted datetime index.

    Each value of the 'timestamp' column is converted with
    ``date_manipulator.ticks_to_datetime``; the converted values then replace
    the original column, become the index, and the frame is sorted by it.

    Args:
        df: Dataframe with a 'timestamp' column holding tick values. It is
            not modified; the work is done on a copy, matching the other
            cleaning functions in this notebook.

    Returns:
        A new dataframe indexed by the converted 'timestamp', in index order.
    """
    # Work on a copy so the caller's frame does not gain a 'new_timestamp' column.
    _df = df.copy()

    # Only the 'timestamp' column is needed, so map over that column instead
    # of building a full row object for every record.
    _df['new_timestamp'] = _df['timestamp'].map(date_manipulator.ticks_to_datetime)

    # Swap the tick column for the converted one under the original name.
    _df = df_manipulator.remove_column(_df, 'timestamp', False)
    _df.rename(columns={'new_timestamp':'timestamp'}, inplace=True)

    # The helper's return value was never used by this function, so it is
    # presumably an in-place operation (third argument True) -- TODO confirm.
    df_manipulator.set_index(_df, 'timestamp', True)
    _df.sort_index(inplace=True)

    return _df

def remove_invalid_accelerometer_values(df, filter_columns, invalid_value = 0):
    """Drop every row that holds ``invalid_value`` in any of ``filter_columns``.

    Args:
        df: Source dataframe; it is left untouched.
        filter_columns: Iterable of column names to check. An empty iterable
            means no filtering, and a copy of ``df`` is returned.
        invalid_value: Value that marks a reading as invalid (default 0).

    Returns:
        A new dataframe with only the rows whose checked columns all differ
        from ``invalid_value``.

    Raises:
        KeyError: If a name in ``filter_columns`` is not a column of ``df``.
    """
    _df = df.copy()

    # Iterating an empty collection is already a no-op, so no length check is
    # needed; dropping it also lets any iterable (not only sized ones) be used.
    for filter_column in filter_columns:
        _df = _df[_df[filter_column] != invalid_value]

    return _df

def remove_duplicated_timestamp(df):
    """Return a frame indexed by 'timestamp' with one row per timestamp.

    Rows are ordered by timestamp and then by 'speedBumpId'. Among rows that
    share a timestamp only the last one in that order is kept, i.e. the one
    with the highest 'speedBumpId'. The input frame is not modified.
    """
    # Use 'timestamp' as the index unless it already is.
    by_timestamp = df if df.index.name == 'timestamp' else df.set_index('timestamp')

    # Within equal timestamps the speed-bump row ends up last.
    ordered = by_timestamp.sort_values(by=['timestamp', 'speedBumpId'])

    # Keep only the last row of every group of equal timestamps.
    is_last_of_timestamp = ~ordered.index.duplicated(keep='last')
    return ordered.loc[is_last_of_timestamp]

## Carrega dataframe bruto

In [7]:
# Load the raw export described by input_config through the project loader.
df = df_manipulator.load_dataframe(
    input_config['file']['filename'],
    input_config['file']['ref_dir'],
    input_config['file']['delimiter'],
    input_config['file']['header'],
)

## Dataframe bruto

In [8]:
# Number of raw rows for each speedBumpId value.
df['speedBumpId'].value_counts()

0    1352
Name: speedBumpId, dtype: int64

In [9]:
# Show the raw dataframe as this cell's output.
df

Unnamed: 0,id,speedBumpId,timestamp,x,y,z,lat,lng
0,60e2430c-3460-4710-bbd7-5c5e8f615a06,0,1591439988292,-1.850143,9.758026,0.072815,-22.926679,-42.485770
1,b7d0f823-e3f0-47cb-a852-f903871da96c,0,1591439988489,-2.331390,10.428391,-0.674179,-22.926679,-42.485770
2,e715ebe0-31e5-4b55-af0f-577b0e24e657,0,1591439988590,-1.682556,10.327835,-0.714890,-22.926679,-42.485770
3,b6b55f66-c874-4a8b-98aa-ccee60dc7cdf,0,1591439988690,-1.876480,9.937592,0.518127,-22.926679,-42.485770
4,3dc60d1f-6ce4-4e73-b99d-ce89cdf334ee,0,1591439988793,-1.625092,9.683792,1.226822,-22.926679,-42.485770
...,...,...,...,...,...,...,...,...
1347,385c5e83-d5b8-407a-8c23-7959ee267794,0,1591440198096,-2.431946,9.611969,1.243576,-22.922163,-42.474801
1348,1ec0d72c-0edb-4659-a274-50e2d95b1383,0,1591440198197,-2.372086,9.506622,1.439896,-22.922163,-42.474801
1349,3a0fa63c-cfa3-4f4f-ba07-b58ffdea3fa9,0,1591440198298,-1.617905,9.518600,1.585938,-22.922163,-42.474801
1350,5b6d821c-14b3-452d-b72d-243b7b7e931c,0,1591440198499,0.261536,9.753235,1.535660,-22.922163,-42.474801


## Detalhes do dataframe bruto

In [10]:
# Summarise the raw dataframe with the project helper; the recorded output
# shows its shape, the columns grouped by dtype and per-column statistics.
df_manipulator.apply_describe(df)

Shape:
 (1352, 8)
Types:

float64 :  ['x' 'y' 'z' 'lat' 'lng']
int64 :  ['speedBumpId' 'timestamp']
object :  ['id']





Unnamed: 0,null_sum,null_pct,dtypes,count,mean,median,min,max
id,0,0.0,object,1352,,,0054b5b8-4f58-4456-ae0d-2c1dbe41cb58,fff333dc-429b-46a4-bff8-7efe787d31da
lat,0,0.0,float64,1352,-22.92203,-22.92213,-22.9267,-22.9172
lng,0,0.0,float64,1352,-42.47832,-42.47792,-42.4858,-42.474
speedBumpId,0,0.0,int64,1352,0.0,0.0,0,0
timestamp,0,0.0,int64,1352,1591440000000.0,1591440000000.0,1591439988292,1591440198599
x,0,0.0,float64,1352,-1.123985,-1.232437,-4.56038,3.13937
y,0,0.0,float64,1352,9.682023,9.683792,7.30635,12.0972
z,0,0.0,float64,1352,1.185883,1.147812,-2.48898,6.63052


## Remove NaN

In [11]:
# First cleaning step: df_out starts from the raw frame passed through the
# project helper. NOTE(review): presumably drops rows containing NaN and
# returns a new frame -- confirm against df_manipulator.remove_nan.
df_out = df_manipulator.remove_nan(df)

## Remove timestamp inválido e muda formato

In [12]:
# Convert the tick timestamps to datetimes first (this also turns 'timestamp'
# into the sorted index), then apply the date-window filter to the result.
df_out = remove_invalid_timestamps(
    update_timestamp_ticks_to_datetime(df_out)
)

## Remove valores de acelerômetro inválidos

In [13]:
# Discard readings where any accelerometer axis equals 0.
df_out = remove_invalid_accelerometer_values(
    df_out,
    filter_columns=['x', 'y', 'z'],
    invalid_value=0,
)

## Remove timestamp duplicado

In [14]:
# Keep a single row per timestamp; among duplicates the row sorted last by
# speedBumpId is the one that is kept.
df_out = remove_duplicated_timestamp(df_out)

## Dataframe filtrado

In [15]:
# Show the cleaned dataframe as this cell's output.
df_out

Unnamed: 0_level_0,id,speedBumpId,x,y,z,lat,lng
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-06-06 07:39:48.292,60e2430c-3460-4710-bbd7-5c5e8f615a06,0,-1.850143,9.758026,0.072815,-22.926679,-42.485770
2020-06-06 07:39:48.489,b7d0f823-e3f0-47cb-a852-f903871da96c,0,-2.331390,10.428391,-0.674179,-22.926679,-42.485770
2020-06-06 07:39:48.590,e715ebe0-31e5-4b55-af0f-577b0e24e657,0,-1.682556,10.327835,-0.714890,-22.926679,-42.485770
2020-06-06 07:39:48.690,b6b55f66-c874-4a8b-98aa-ccee60dc7cdf,0,-1.876480,9.937592,0.518127,-22.926679,-42.485770
2020-06-06 07:39:48.793,3dc60d1f-6ce4-4e73-b99d-ce89cdf334ee,0,-1.625092,9.683792,1.226822,-22.926679,-42.485770
...,...,...,...,...,...,...,...
2020-06-06 07:43:18.096,385c5e83-d5b8-407a-8c23-7959ee267794,0,-2.431946,9.611969,1.243576,-22.922163,-42.474801
2020-06-06 07:43:18.197,1ec0d72c-0edb-4659-a274-50e2d95b1383,0,-2.372086,9.506622,1.439896,-22.922163,-42.474801
2020-06-06 07:43:18.298,3a0fa63c-cfa3-4f4f-ba07-b58ffdea3fa9,0,-1.617905,9.518600,1.585938,-22.922163,-42.474801
2020-06-06 07:43:18.499,5b6d821c-14b3-452d-b72d-243b7b7e931c,0,0.261536,9.753235,1.535660,-22.922163,-42.474801


## Detalhes do dataframe filtrado

In [16]:
# Number of cleaned rows for each speedBumpId value.
df_out['speedBumpId'].value_counts()

0    1073
Name: speedBumpId, dtype: int64

In [17]:
# Same summary as for the raw frame, now on the cleaned dataframe, so the
# two recorded outputs can be compared.
df_manipulator.apply_describe(df_out)

Shape:
 (1073, 7)
Types:

float64 :  ['x' 'y' 'z' 'lat' 'lng']
object :  ['id']
int64 :  ['speedBumpId']





Unnamed: 0,null_sum,null_pct,dtypes,count,mean,median,min,max
id,0,0.0,object,1073,,,0054b5b8-4f58-4456-ae0d-2c1dbe41cb58,fff333dc-429b-46a4-bff8-7efe787d31da
lat,0,0.0,float64,1073,-22.92201,-22.922134,-22.9267,-22.9172
lng,0,0.0,float64,1073,-42.479375,-42.479985,-42.4858,-42.474
speedBumpId,0,0.0,int64,1073,0.0,0.0,0,0
x,0,0.0,float64,1073,-1.087616,-1.22287,-4.56038,3.13937
y,0,0.0,float64,1073,9.67677,9.683792,7.30635,12.0972
z,0,0.0,float64,1073,1.32769,1.250763,-2.48898,6.63052


## Salva o dataframe de saída

In [18]:
# Build the output path from the configured prefix and directory plus the
# name of the input file.
out_filename = file_manipulator.get_out_filename(
    output_config['file']['prefix'],
    output_config['file']['ref_dir'],
    input_config['file']['filename'],
)

# to_csv does not create missing folders, so make sure the destination exists
# (a fresh checkout may not have the output directory yet). A bare filename
# has no directory part and needs nothing created.
out_dir = os.path.dirname(out_filename)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Write the cleaned dataframe; the 'timestamp' index is written as well.
df_out.to_csv(
    out_filename,
    sep=output_config['file']['delimiter'],
    header=output_config['file']['with_header'],
)

print(f'O arquivo {out_filename} foi gerado!')

O arquivo ./out/1_export_lunar_20200606.csv foi gerado!
