# Import Modules

In [1]:
# Modules for Data Pre-processing #
import numpy as np
import pandas as pd
import os
import sys
from ev_utilities.db_utils import DataExtractor

In [63]:
def preprocess_data(dataframe, verbose=True):
    """Pack a log-time-indexed DataFrame into a 3D NumPy array.

    The index of ``dataframe`` is treated as the log time. Each distinct log
    time (in order of first appearance) becomes one slice along axis 0, each
    column other than 'logtime' becomes one slice along axis 1, and axis 2
    has one entry per input row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose index holds the log times. It is NOT modified. If it
        already has a 'logtime' column, that column is replaced by the index
        values in the internal working copy.
    verbose : bool, default True
        When True, print the parameter names followed by the number of unique
        log times, the number of parameters and the number of rows.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_unique_log_times, n_parameters, n_rows).

    Raises
    ------
    ValueError
        Raised by NumPy when a log time occurs on ``k`` rows with
        ``1 < k < n_rows``, because ``k`` values cannot be broadcast along an
        axis of length ``n_rows``.

    Notes
    -----
    NOTE(review): the values of the rows sharing a log time are broadcast
    along the whole last axis, so with a unique index every
    ``result[i, j, :]`` holds a single repeated value. The shape is kept
    for backward compatibility -- confirm whether
    (n_log_times, n_parameters) was the intended layout.
    """
    # Build the working frame as a new object. Assigning the 'logtime' column
    # directly on ``dataframe`` would leak that column into the caller's frame.
    frame = dataframe.assign(logtime=dataframe.index).reset_index(drop=True)

    # Unique log times, in order of first appearance
    log_times = frame['logtime'].unique()

    # Replace every log time by its position in ``log_times``
    log_time_codes = {log_time: code for code, log_time in enumerate(log_times)}
    frame['logtime'] = frame['logtime'].map(log_time_codes)

    # Every remaining column is a parameter
    parameters = [col for col in frame.columns if col != 'logtime']

    if verbose:
        print(parameters)
        print(len(log_times))
        print(len(parameters))
        print(len(frame))

    # Create a 3D tensor to hold the timeseries data
    timeseries_data = np.zeros((len(log_times), len(parameters), len(frame)))

    # Populate the tensor; the integer code of a log time equals its
    # enumeration index, so ``i`` selects the matching rows directly.
    for i in range(len(log_times)):
        rows_at_log_time = frame[frame['logtime'] == i]
        for j, parameter in enumerate(parameters):
            timeseries_data[i, j, :] = rows_at_log_time[parameter].values

    return timeseries_data

In [64]:
# Synthetic demo data: `length` evenly spaced timestamps with two random columns.
length = 1000

# Seeded generator so that Restart & Run All reproduces the same values.
SEED = 42
rng = np.random.default_rng(SEED)

# Column sizes follow `length` (previously hard-coded to 1000), so changing
# `length` no longer causes a length mismatch with the timestamp column.
dataframe = pd.DataFrame({'index': pd.date_range(start='2022/10/23',
                                                 end='2023/01/01',
                                                 periods=length),
                          'col_1': rng.random(length),
                          'col_2': rng.random(length)}).set_index('index', drop=True)

In [65]:
# Label the index level as 'LogTime'.
dataframe.index.names = ['LogTime']

In [66]:
# Build the tensor, then display the slice for the first log time.
timeseries_tensor = preprocess_data(dataframe)
timeseries_tensor[0]

['col_1', 'col_2']
1000
2
1000


array([[0.44290194, 0.44290194, 0.44290194, ..., 0.44290194, 0.44290194,
        0.44290194],
       [0.17598661, 0.17598661, 0.17598661, ..., 0.17598661, 0.17598661,
        0.17598661]])

In [22]:
# Difference between each index value and the previous one; the first row
# has no predecessor, so its delta is a missing value.
log_time_series = dataframe.index.to_series()
dataframe['delta_time'] = log_time_series.diff()

In [24]:
# Drop, in place, every row that contains at least one missing value.
dataframe.dropna(axis=0, how='any', inplace=True)

In [41]:
import copy  # NOTE(review): not used by this cell any more; kept in case other cells rely on the name

# Independent deep copy of the frame, so that later edits to the copy
# leave `dataframe` untouched.
dataframe_copy = dataframe.copy(deep=True)

In [42]:
# Standardize every column except the excluded ones:
# subtract the column mean and divide by the column's `std()`.
excluded_columns = ['delta_time', 'timestamp']
value_column_mask = ~dataframe_copy.columns.isin(excluded_columns)
standardized_values = dataframe_copy.loc[:, value_column_mask].apply(
    lambda column: (column - column.mean()) / column.std()
)
dataframe_copy.loc[:, value_column_mask] = standardized_values
dataframe_copy

Unnamed: 0_level_0,col_1,col_2,delta_time
LogTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-10-23 01:40:54.054054054,0.470907,0.878259,0 days 01:40:54.054054054
2022-10-23 03:21:48.108108108,-1.336770,1.246168,0 days 01:40:54.054054054
2022-10-23 05:02:42.162162162,-0.058357,-0.149075,0 days 01:40:54.054054054
2022-10-23 06:43:36.216216216,-1.461536,1.216868,0 days 01:40:54.054054054
2022-10-23 08:24:30.270270270,1.039255,0.647424,0 days 01:40:54.054054054
...,...,...,...
2022-12-31 17:16:23.783783783,0.410162,0.653638,0 days 01:40:54.054054054
2022-12-31 18:57:17.837837837,0.041411,-1.670965,0 days 01:40:54.054054054
2022-12-31 20:38:11.891891892,-0.922445,1.211489,0 days 01:40:54.054054055
2022-12-31 22:19:05.945945946,-1.534124,1.028006,0 days 01:40:54.054054054
