## Imports

In [15]:
import pandas as pd
import sys, os, importlib
import numpy as np

from datetime import datetime
from pandas import DataFrame
from pandas import concat

In [16]:
# Show up to 100 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)

# Make the shared helper directory importable. The membership guard keeps
# re-runs of this cell from appending the same entry to sys.path again.
_base_functions_dir = os.path.join('..','0_funcoes_base')
if not (_base_functions_dir in sys.path):
    sys.path.append(_base_functions_dir)

# Project helper modules, loaded by name at runtime
# (presumably they live in the directory added above -- TODO confirm).
df_manipulator   = importlib.import_module('df_manipulator')
dt_manipulator   = importlib.import_module('date_manipulator')
plot_manipulator = importlib.import_module('plot_manipulator')
file_manipulator = importlib.import_module('file_manipulator')



## Dados de configuração

In [17]:
# Location and delimiter of the input file loaded further down.
input_config = dict(
    file=dict(
        ref_dir='./out/',
        filename='3_export_lunar_20200517.csv',
        delimiter=';',
    ),
)

# Output file settings plus the parameters of the feature-engineering steps.
output_config = dict(
    file=dict(
        ref_dir='./out',
        delimiter=';',
        with_header=True,
        prefix='4_',
    ),
    # Periods / window sizes used for the diff and rolling features.
    delimited_intervals=[5, 10, 20, 30, 40, 50],
    # Seconds before / after each speed bump that are labelled as its region.
    start_region=3,
    end_region=2,
    # K value handed to cusum().
    cusum_K=0.5,
)

## Funções

In [18]:
def map_region_into_df(df, speed_bump_id, region_id, relative_start_region, relative_end_region, verbose=False):
    """Label the rows that fall inside a window around each speed bump.

    For every speed-bump index value reported by
    ``df_manipulator.get_speed_bumps_idx`` a window
    ``[sb - relative_start_region, sb + relative_end_region]`` is requested from
    ``dt_manipulator.get_date_window`` (bounded by the first and last index
    value of the frame). Rows whose index is in any accepted window get
    ``region_id`` written to their ``'region_id'`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied and never modified. It must already contain
        a ``'region_id'`` column when at least one window is found.
    speed_bump_id
        Forwarded to ``df_manipulator.get_speed_bumps_idx``.
    region_id
        Value written to the matching rows. When it is ``0``, windows that
        contain any speed-bump timestamp are discarded.
    relative_start_region, relative_end_region
        Offsets subtracted from / added to each speed-bump index value
        (the notebook passes ``pd.Timedelta`` objects).
    verbose : bool
        Print every window that is requested.

    Returns
    -------
    pandas.DataFrame
        The labelled copy of ``df``.

    Raises
    ------
    KeyError
        If windows were found but ``df`` has no ``'region_id'`` column.
    """
    _df = df.copy()

    # Speed-bump index values grouped by speed-bump id.
    sb_indexes_by_id = df_manipulator.get_speed_bumps_idx(_df, speed_bump_id=speed_bump_id)

    available_min_idx = _df.index.min()
    available_max_idx = _df.index.max()

    # One window per speed-bump occurrence.
    # NOTE(review): get_date_window presumably returns the timestamps between
    # start_date and end_date clipped to `bound` -- confirm in date_manipulator.
    region_by_speed_bump_timestamp = []
    for sb_id, sb_indexes in sb_indexes_by_id.items():
        for sb_idx in sb_indexes:
            if verbose:
                print(f'Region {region_id}: [{sb_id}] {sb_idx} -> from {sb_idx-relative_start_region} to {sb_idx+relative_end_region}')

            region_by_speed_bump_timestamp.append(
                dt_manipulator.get_date_window(
                    start_date=sb_idx-relative_start_region,
                    end_date=sb_idx+relative_end_region,
                    bound=(available_min_idx, available_max_idx),
                )
            )

    if not region_by_speed_bump_timestamp:
        return _df

    # Rule for region 0 ("without speed bump"): drop every window that
    # contains a speed-bump timestamp. The lookup set is built once instead of
    # once per window.
    if region_id == 0:
        speed_bumps = {
            sb.to_pydatetime()
            for sb_indexes in sb_indexes_by_id.values()
            for sb in sb_indexes
        }
        region_by_speed_bump_timestamp = [
            region_timestamp
            for region_timestamp in region_by_speed_bump_timestamp
            if speed_bumps.isdisjoint(region_timestamp)
        ]

    # Union of all accepted windows (overlapping windows collapse here).
    final_region_timestamps = set()
    for region_timestamp in region_by_speed_bump_timestamp:
        final_region_timestamps.update(region_timestamp)

    if 'region_id' not in _df.columns:
        # .loc would silently create the column (NaN elsewhere); keep failing
        # loudly as the previous implementation did.
        raise KeyError('region_id')

    # Single .loc assignment: chained indexing (df[col][mask] = v) warns and is
    # a no-op when pandas Copy-on-Write is active.
    _df.loc[_df.index.isin(list(final_region_timestamps)), 'region_id'] = region_id

    return _df

def cusum(x,mean=0,K=0):
    """
    Tabular CUSUM per Montgomery,D. 1996 "Introduction to Statistical Process Control" p318

    Recursion, with both accumulators starting at zero:
        Cp[i] = max(0, x[i] - (mean + K) + Cp[i-1])
        Cm[i] = max(0, (mean - K) - x[i] + Cm[i-1])

    The first sample goes through the same recursion as every other one, so
    Cp and Cm are never negative. (Previously both were seeded with x[0].)

    x    : series to analyze (1-D array-like or pandas Series)
    mean : expected process mean
    K    : reference value, allowance, slack value-- suggest K=1/2 of the shift to be detected.
    Returns a dict with:
    x  Original series (the object that was passed in)
    Cp positive CUSUM
    Cm negative CUSUM
    Cp and Cm are float numpy arrays, or pandas Series sharing x's index when
    x is a Series. NaN values in x propagate to the following results.
    """
    # Work positionally on a float copy: integer input must not truncate the
    # accumulators, and a Series with a non-default index must not be looked
    # up by label.
    values = np.asarray(x, dtype=float)

    Cp = np.zeros(len(values), dtype=float)
    Cm = np.zeros(len(values), dtype=float)

    upper = 0.0
    lower = 0.0
    for ii, value in enumerate(values):
        # np.maximum (unlike the builtin max) lets NaN propagate.
        upper = np.maximum(0.0, (value - mean - K) + upper)
        lower = np.maximum(0.0, (mean - K) - value + lower)
        Cp[ii] = upper
        Cm[ii] = lower

    if isinstance(x, pd.Series):
        Cp = pd.Series(Cp, index=x.index)
        Cm = pd.Series(Cm, index=x.index)

    return {'x': x, 'Cp': Cp, 'Cm': Cm}

## Carrega dataframe

In [19]:
# Load the input file described by input_config.
df = df_manipulator.load_dataframe(
    input_config['file']['filename'],
    input_config['file']['ref_dir'],
    input_config['file']['delimiter'],
)

# Parse the timestamp column, then let the helper index the frame by it
# (the return value is not used; the frame displayed afterwards is indexed by timestamp).
df['timestamp'] = pd.to_datetime(df['timestamp'])
df_manipulator.set_index(df, 'timestamp', True)

In [20]:
# Display the loaded frame (the notebook shows a truncated head/tail view).
df

Unnamed: 0_level_0,speed_bump_id,y,z,lat,lng
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-05-17 20:10:48.400,0,9.516205,1.573975,-22.921991,-42.473372
2020-05-17 20:10:48.500,0,9.516205,1.573975,-22.921991,-42.473372
2020-05-17 20:10:48.600,0,10.203339,1.291458,-22.921991,-42.473372
2020-05-17 20:10:48.700,0,10.203339,1.291458,-22.921991,-42.473372
2020-05-17 20:10:48.800,0,9.746048,1.806213,-22.921991,-42.473372
...,...,...,...,...,...
2020-05-17 20:35:51.100,0,9.250443,1.770294,-22.925816,-42.484631
2020-05-17 20:35:51.200,0,10.105179,1.660172,-22.925816,-42.484631
2020-05-17 20:35:51.300,0,9.920822,2.608276,-22.925816,-42.484631
2020-05-17 20:35:51.400,0,9.523392,2.505325,-22.925816,-42.484631


## Adiciona regiões nos registros

Seja *sb(t)* o instante em que o veículo passou pelo quebra-mola.

Serão classificados **dois tipos** de regiões:

- Região 1 (`region_id = 0`)
    - Momentos da direção em que não constam quebra-molas próximos;
    - Intervalo:  \[sb(t-inf), sb(t-3)\] ∪ \[sb(t+2), sb(t+inf)\]
- Região 2 (`region_id = 1`)
    - Momentos que precedem o quebra-mola;
    - Inclui a ação do motorista de frear para o quebra-mola, passar por ele e acelerar;
    - Intervalo:  \[sb(t-3), sb(t+2)\] (limites definidos por `start_region` e `end_region` na configuração)


**Obs:** sb(t-x), onde x é dado em segundos.

In [21]:
# How many rows carry each speed_bump_id value.
df['speed_bump_id'].value_counts()

0    14078
1       45
Name: speed_bump_id, dtype: int64

In [22]:
# Start from a copy of the input in which every row belongs to region 0 ...
df_out = df.assign(region_id=0)

# ... then mark the window around each speed bump (id 1) as region 1.
# The window bounds come from the configuration, expressed in seconds.
df_out = map_region_into_df(
    df_out,
    speed_bump_id=1,
    region_id=1,
    relative_start_region=pd.Timedelta(output_config['start_region'], 's'),
    relative_end_region=pd.Timedelta(output_config['end_region'], 's'),
    verbose=False,
)

In [23]:
# How many rows ended up in each region.
df_out['region_id'].value_counts()

0    12105
1     2018
Name: region_id, dtype: int64

## Adiciona normalização, média móvel, variância e diff do eixo Z e Y

In [24]:
# Standardise both axes: subtract the column mean and divide by the column std.
df_out['z-norm'] = df_out['z'].sub(df_out['z'].mean()).div(df_out['z'].std())
df_out['y-norm'] = df_out['y'].sub(df_out['y'].mean()).div(df_out['y'].std())

# For every configured interval add the lagged difference, rolling mean and
# rolling std of the standardised z axis.
for window in output_config['delimited_intervals']:
    z_rolling = df_out['z-norm'].rolling(window)

    df_out[f'z-diff_{window}'] = df_out['z-norm'].diff(periods=window)
    df_out[f'z-mean_{window}'] = z_rolling.mean()
    df_out[f'z-std_{window}']  = z_rolling.std()

    # Rolling correlations are only added for the two shortest intervals.
    if window in (5, 10):
        df_out[f'z-corr-std_{window}'] = z_rolling.corr(df_out[f'z-std_{window}'])
        df_out[f'y-corr-z-mean_{window}'] = df_out['y-norm'].rolling(window).corr(df_out[f'z-mean_{window}'])

## Adiciona CuSum features

In [25]:
# Run the tabular CUSUM over the standardised z axis, using the series' own
# mean as the expected process mean and the configured K value.
result = cusum(
    df_out['z-norm'].values,
    mean=np.mean(df_out['z-norm'].values),
    K=output_config['cusum_K'],
)

# Store the positive / negative accumulators as feature columns.
df_out['cp'], df_out['cm'] = result['Cp'], result['Cm']

In [26]:
# Drop every row that has a NaN in any column (the diff / rolling features
# above are undefined for the leading rows). Reassigning instead of using
# inplace=True keeps the cell free of in-place mutation.
df_out = df_out.dropna(how='any')

## Salva o dataframe de saída

In [28]:
# Ask the helper for the output file name, built from the configured prefix,
# the output directory and the input file name.
out_filename = file_manipulator.get_out_filename(
    output_config['file']['prefix'],
    output_config['file']['ref_dir'],
    input_config['file']['filename'],
)

# Write the feature frame with the configured delimiter and header setting.
df_out.to_csv(
    out_filename,
    sep=output_config['file']['delimiter'],
    header=output_config['file']['with_header'],
)

print(f'O arquivo {out_filename} foi gerado!')

O arquivo ./out/4_export_lunar_20200517.csv foi gerado!
