In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime, timedelta
import time
import utils

import os

# Directory holding the CSV datasets, relative to this notebook.
root = '../../dataset'

# Three groups of dataset files. The meaning of the t/s/l prefixes is not
# explained in this notebook; later cells pick one file per group by index.
t_path = [
    'dataset_1790789260.csv',
    'dataset_1095394822.csv',
    'dataset_479911147.csv',
    'dataset_665254893.csv',
]

s_path = [
    'dataset_320725673.csv',
    'dataset_401881001.csv',
    'dataset_1992961554.csv',
    'dataset_852552682.csv',
]

l_path = [
    'dataset_327382690.csv',
    'dataset_395223984.csv',
    'dataset_1999618571.csv',
]

In [2]:
# `IPython.display` is the public import path; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Stretch the notebook container to the full browser width so wide figures fit.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Default [width, height] for every figure created below.
fig_size = [17, 7]
plt.rcParams["figure.figsize"] = fig_size

Try to train an ARIMA model on the previous time window to evaluate whether the current window contains outliers.

We want to keep a certain number of measurements to train the model.
For now, use an amount equal to the expected number of measurements per window - this way we lose only a partial amount of data.

In [3]:
from pmdarima import auto_arima

def ARIMA_slider(df, mins = 30, val_index = 2, ts_index = 1, tr_secs = 10):
    """Flag outliers window by window with an ARIMA model fitted on past data.

    The frame is consumed through ``utils.windows_generator(df, delta, ts_index)``,
    which is unpacked as ``(start, end, window)`` tuples (presumably consecutive
    windows of ``mins`` minutes - confirm against ``utils``). Windows are first
    accumulated into a history until it holds at least the expected number of
    measurements (``mins`` minutes / ``tr_secs`` seconds). From then on, for every
    non-empty window an ARIMA model is selected on the history with
    ``auto_arima``, one forecast per row of the window is produced, and a row
    is marked as inside (1) when its value lies strictly between the forecast
    confidence bounds, outside (-1) otherwise. The history then slides forward
    by the rows of the window just evaluated.

    NOTE(review): the i-th row of a window is compared with the i-th forecast
    step, so gaps in the measurements shift the alignment - confirm this is
    acceptable.

    Parameters
    ----------
    df : pandas.DataFrame
        Input measurements; ``df.iloc[0, 0]`` is used as the series id.
    mins : int
        Window length in minutes.
    val_index : column label
        Column holding the measured value (used both for training and scoring).
    ts_index : column label
        Column holding the timestamp; it is cast to ``datetime64[ms]``.
    tr_secs : int
        Expected sampling period in seconds.

    Returns
    -------
    (win_ret, point_ret) : tuple of pandas.DataFrame
        ``win_ret`` has one row per evaluated window with columns
        ``id, start, stop, #out, #in, #miss``; ``point_ret`` has one row per
        evaluated measurement with columns
        ``id, ts, val, pred, l_thresh, h_thresh, is_in``. Both use a fresh
        0..n-1 index and are empty (columns only) when nothing was evaluated.
    """
    win_col = ['id', 'start', 'stop', '#out', '#in', '#miss']
    point_col = ['id', 'ts', 'val', 'pred', 'l_thresh', 'h_thresh', 'is_in']

    # Nothing to evaluate (and no id to read) on an empty frame.
    if len(df) == 0:
        return pd.DataFrame(columns = win_col), pd.DataFrame(columns = point_col)

    delta = np.timedelta64(mins, 'm')
    tr = np.timedelta64(tr_secs, 's')
    expected_num_measurements = int(delta / tr)

    name = df.iloc[0, 0]

    # Only the value column is ever used for training, so the history is kept
    # as a plain array instead of a DataFrame grown row by row.
    history = np.empty(0, dtype = float)

    # Results are collected in lists and assembled once at the end:
    # DataFrame.append is quadratic in a loop and no longer exists in pandas 2.
    win_rows = []
    point_frames = []

    for start, end, window in utils.windows_generator(df, delta, ts_index):
        n_points = len(window)
        if n_points == 0:
            continue

        values = window[val_index].to_numpy()

        # Not enough data yet: keep accumulating training history.
        if len(history) < expected_num_measurements:
            history = np.concatenate([history, values])
            continue

        # auto_arima returns an already fitted model, so no extra fit() call.
        model = auto_arima(history, error_action="ignore", suppress_warnings=True)

        # Forecast exactly one step per row of the window; a fixed horizon
        # would overflow when a window holds more rows than expected.
        preds, conf_int = model.predict(n_periods = n_points, return_conf_int = True)
        preds = np.asarray(preds)
        conf_int = np.asarray(conf_int)
        low = conf_int[:, 0]
        high = conf_int[:, 1]

        # Strictly inside the confidence interval -> inlier (NaN compares as outside).
        inside = (low < values) & (values < high)
        n_in = int(inside.sum())
        n_out = n_points - n_in

        point_frames.append(pd.DataFrame({
            'id': name,
            'ts': window[ts_index].to_numpy().astype('datetime64[ms]'),
            'val': values,
            'pred': preds,
            'l_thresh': low,
            'h_thresh': high,
            'is_in': np.where(inside, 1, -1),
        }, columns = point_col))

        win_rows.append([name, start, end, n_out, n_in,
                         expected_num_measurements - n_in - n_out])

        # Slide the history: push the new rows, drop the same number of oldest ones.
        history = np.concatenate([history, values])[n_points:]

    win_ret = pd.DataFrame(win_rows, columns = win_col)
    if point_frames:
        point_ret = pd.concat(point_frames, ignore_index = True)
    else:
        point_ret = pd.DataFrame(columns = point_col)

    return win_ret, point_ret

# Run the detector on the first file of each group, using only the first day of data.
for file_name in [t_path[0], l_path[0], s_path[0]]:
    path = os.path.join(root, file_name)
    raw = pd.read_csv(path, header=None)

    # Keep only the first one-day window; fall back to the whole frame when
    # the generator yields nothing.
    day_windows = utils.windows_generator(raw, np.timedelta64(1, 'D'))
    _, _, first_day = next(iter(day_windows), (None, None, raw))

    windows, points = ARIMA_slider(first_day)

    # Figure 1: measured values with the forecast confidence band, outliers in red.
    fig, ax = plt.subplots()
    points.plot(ax = ax, x = 'ts', y = ['val', 'l_thresh', 'h_thresh'], kind = 'line', title = path)
    outliers = points[points['is_in'] == -1]
    if not outliers.empty:
        # One call for all outliers; markers only, so the points are not joined.
        ax.plot(outliers['ts'], outliers['val'], 'ro')
    ax.xaxis_date()

    # Figure 2: per-window count of outliers, inliers and missing measurements.
    fig, ax = plt.subplots()
    fig.tight_layout()
    fig.subplots_adjust(left = 0.05, bottom = 0.3)

    windows.plot(x = 'start', y = ['#out', '#in', '#miss'], ax = ax, kind = 'bar',
                 stacked = True, colormap = 'summer', title = path)

    ax.set_xlabel('window start time')
    ax.set_ylabel('#number of measurements')
    ax.xaxis_date()
    ax.legend()
    ax.grid()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
# Run the detector on the second file of each group, using only the first day of data.
for file_name in [t_path[1], l_path[1], s_path[1]]:
    path = os.path.join(root, file_name)
    raw = pd.read_csv(path, header=None)

    # Keep only the first one-day window; fall back to the whole frame when
    # the generator yields nothing.
    day_windows = utils.windows_generator(raw, np.timedelta64(1, 'D'))
    _, _, first_day = next(iter(day_windows), (None, None, raw))

    windows, points = ARIMA_slider(first_day)

    # Figure 1: measured values with the forecast confidence band, outliers in red.
    fig, ax = plt.subplots()
    points.plot(ax = ax, x = 'ts', y = ['val', 'l_thresh', 'h_thresh'], kind = 'line', title = path)
    outliers = points[points['is_in'] == -1]
    if not outliers.empty:
        # One call for all outliers; markers only, so the points are not joined.
        ax.plot(outliers['ts'], outliers['val'], 'ro')
    ax.xaxis_date()

    # Figure 2: per-window count of outliers, inliers and missing measurements.
    fig, ax = plt.subplots()
    fig.tight_layout()
    fig.subplots_adjust(left = 0.05, bottom = 0.3)

    windows.plot(x = 'start', y = ['#out', '#in', '#miss'], ax = ax, kind = 'bar',
                 stacked = True, colormap = 'summer', title = path)

    ax.set_xlabel('window start time')
    ax.set_ylabel('#number of measurements')
    ax.xaxis_date()
    ax.legend()
    ax.grid()



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>