# Data Preparation / Time Stamps Alignment
### - detect_time_step

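The function infers, i.e. automatically deduces from the input data, the time step (frequency) of the input time series. It returns the best guess, which is the most frequently detected time step, together with a table of all the time steps detected and how often each one occurs.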

### - align_time_grid

The function aligns the frequency of the input time series with the output frequency given as an argument using the specified aggregation function.

### - clean_ts_integrate

The function takes as input a time-indexed series with measurements of type cumulative or onChange and returns a time series converted to the instantaneous type. The input time series is transformed to an instantaneous metric based on the measurementReadingType, which is passed as the second argument (the examples below use the values `"counter"` and `"delta"`). For example, if the measurementReadingType is cumulative, the counter rollover (reset) must be taken into account in the transformation process.

In [36]:
# Import libraries 

import pandas as pd
from os import getcwd
from os.path import join
import matplotlib.pyplot as plt

from ai_toolbox.data_preparation import detect_time_step, align_time_grid, clean_ts_integrate

In [29]:
# Locate the dataset used in this notebook. It ships in the "datasets" folder
# next to the notebook (biggpy/ai_toolbox/notebooks/datasets), so the path is
# resolved from the current working directory and stays valid as long as the
# notebook file is not moved elsewhere.

datasets_dir = join(getcwd(), "datasets")
internal_temp_filename = join(datasets_dir, "internal_temperature_data.csv")

In [30]:
# Load the internal temperature measurements (irregularly sampled, nominally
# at 15-minute granularity). The first CSV column is used as the index and is
# parsed as datetimes; the index is then labelled "timestamp".
# NOTE: `infer_datetime_format` is deliberately not passed. It is deprecated
# since pandas 2.0 (a consistent format is inferred by default) and passing it
# only triggers a warning; the parsed result is the same without it.

df = pd.read_csv(
    internal_temp_filename,
    sep=',',
    parse_dates=True,
    index_col=0)
df.index.name = "timestamp"
df

Unnamed: 0_level_0,internal_temperature.actual
timestamp,Unnamed: 1_level_1
2019-03-15 15:00:00+00:00,22.7
2019-03-15 15:15:00+00:00,22.7
2019-03-15 15:45:00+00:00,22.7
2019-03-15 16:00:00+00:00,22.7
2019-03-15 16:15:00+00:00,22.7
...,...
2021-11-14 20:15:00+00:00,17.0
2021-11-14 21:00:00+00:00,17.0
2021-11-14 21:30:00+00:00,17.0
2021-11-14 22:00:00+00:00,16.9


In [31]:
# Detect time step of an irregular series of data.
# The call returns two values: the inferred time step and a table with the
# frequencies found in the data (both are displayed in the following cells).

time_step, frequencies = detect_time_step(df)

In [32]:
# Time step of the time series (best guess => most frequent time step detected).
# It is shown as a frequency string (see the output right below).

time_step

'30T'

In [33]:
# Analysis of the frequencies detected: one row per detected time delta, with
# the number of times it occurs (freq_count) and its frequency string (freqstr).
# The most frequent one is the value reported as the time step.

frequencies

Unnamed: 0,freq_count,freqstr
0 days 00:30:00,29343,30T
0 days 00:45:00,825,45T
0 days 01:00:00,117,H
0 days 00:15:00,90,15T
0 days 01:30:00,10,90T
0 days 01:15:00,7,75T
0 days 01:45:00,3,105T
0 days 03:00:00,2,3H
0 days 23:45:00,1,1425T
0 days 14:30:00,1,870T


In [34]:
# Let's align the data at 1 hour granularity using the median for example.
# Arguments: the input data, the output frequency ('1H') and the name of the
# aggregation function ("median").

df_aligned = align_time_grid(df, '1H', "median")
df_aligned

Unnamed: 0_level_0,internal_temperature.actual
timestamp,Unnamed: 1_level_1
2019-03-15 15:00:00+00:00,22.70
2019-03-15 16:00:00+00:00,22.70
2019-03-15 17:00:00+00:00,18.90
2019-03-15 18:00:00+00:00,17.40
2019-03-15 19:00:00+00:00,15.60
...,...
2021-11-14 18:00:00+00:00,17.15
2021-11-14 19:00:00+00:00,17.10
2021-11-14 20:00:00+00:00,17.00
2021-11-14 21:00:00+00:00,17.00


In [40]:
# Build a synthetic counter (cumulative) reading sampled every 15 minutes.
# The readings grow over time but drop back to a lower value several times
# (e.g. 44 -> 0), which plays the role of a counter reset / rollover.

counter = [
    30, 1, 20, 28,
    44, 0, 2, 11,
    56, 0, 23, 89,
    10, 32, 45, 19,
]
ts_counter = pd.Series(
    counter,
    index=pd.date_range(start='12/11/2021', periods=len(counter), freq='15min'),
)
ts_counter

2021-12-11 00:00:00    30
2021-12-11 00:15:00     1
2021-12-11 00:30:00    20
2021-12-11 00:45:00    28
2021-12-11 01:00:00    44
2021-12-11 01:15:00     0
2021-12-11 01:30:00     2
2021-12-11 01:45:00    11
2021-12-11 02:00:00    56
2021-12-11 02:15:00     0
2021-12-11 02:30:00    23
2021-12-11 02:45:00    89
2021-12-11 03:00:00    10
2021-12-11 03:15:00    32
2021-12-11 03:30:00    45
2021-12-11 03:45:00    19
Freq: 15T, dtype: int64

In [41]:
# Transform to the instantaneous measurement type.
# As the output below shows: where the counter increases, the result is the
# increase since the previous reading; where it drops (reset), the result is
# the new reading itself.

clean_ts_integrate(ts_counter, "counter")

2021-12-11 00:00:00    30.0
2021-12-11 00:15:00     1.0
2021-12-11 00:30:00    19.0
2021-12-11 00:45:00     8.0
2021-12-11 01:00:00    16.0
2021-12-11 01:15:00     0.0
2021-12-11 01:30:00     2.0
2021-12-11 01:45:00     9.0
2021-12-11 02:00:00    45.0
2021-12-11 02:15:00     0.0
2021-12-11 02:30:00    23.0
2021-12-11 02:45:00    66.0
2021-12-11 03:00:00    10.0
2021-12-11 03:15:00    22.0
2021-12-11 03:30:00    13.0
2021-12-11 03:45:00    19.0
Freq: 15T, dtype: float64

In [43]:
# Build a synthetic delta reading sampled every 15 minutes.
# Every entry is the change with respect to the previous record of the
# measurement (negative entries are decreases).

delta = [
    30.0, -29.0, 19.0, 8.0,
    16.0, -44.0, 2.0, 9.0,
    45.0, -56.0, 23.0, 66.0,
    -79.0, 22.0, 13.0, -26.0,
]
ts_delta = pd.Series(
    delta,
    index=pd.date_range(start='12/11/2021', periods=len(delta), freq='15min'),
)
ts_delta

2021-12-11 00:00:00    30.0
2021-12-11 00:15:00   -29.0
2021-12-11 00:30:00    19.0
2021-12-11 00:45:00     8.0
2021-12-11 01:00:00    16.0
2021-12-11 01:15:00   -44.0
2021-12-11 01:30:00     2.0
2021-12-11 01:45:00     9.0
2021-12-11 02:00:00    45.0
2021-12-11 02:15:00   -56.0
2021-12-11 02:30:00    23.0
2021-12-11 02:45:00    66.0
2021-12-11 03:00:00   -79.0
2021-12-11 03:15:00    22.0
2021-12-11 03:30:00    13.0
2021-12-11 03:45:00   -26.0
Freq: 15T, dtype: float64

In [44]:
# Transform to instantaneous.
# As the output below shows, the result is the running total of the deltas
# (30, 30-29=1, 1+19=20, ...).

clean_ts_integrate(ts_delta, "delta")

2021-12-11 00:00:00    30.0
2021-12-11 00:15:00     1.0
2021-12-11 00:30:00    20.0
2021-12-11 00:45:00    28.0
2021-12-11 01:00:00    44.0
2021-12-11 01:15:00     0.0
2021-12-11 01:30:00     2.0
2021-12-11 01:45:00    11.0
2021-12-11 02:00:00    56.0
2021-12-11 02:15:00     0.0
2021-12-11 02:30:00    23.0
2021-12-11 02:45:00    89.0
2021-12-11 03:00:00    10.0
2021-12-11 03:15:00    32.0
2021-12-11 03:30:00    45.0
2021-12-11 03:45:00    19.0
Freq: 15T, dtype: float64