> Copyright 2022 University of Luxembourg
> 
> Licensed under the Apache License, Version 2.0 (the "License");  
> you may not use this file except in compliance with the License.  
> You may obtain a copy of the License at  
>
>    https://www.apache.org/licenses/LICENSE-2.0
>
> Unless required by applicable law or agreed to in writing, software  
> distributed under the License is distributed on an "AS IS" BASIS,  
> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
> See the License for the specific language governing permissions and  
> limitations under the License.  
>
***

Author: André Stemper (andre.stemper@uni.lu)

***

Dataset class

Use *%run dataset.ipynb* to include this notebook into another notebook.  

Possible datasets:  
["2022.02.09", "2022.02.16", "2022.02.25", "2022.03.09", "2022.03.16", "2022.03.23", "2022.04.06", "2022.05.13",  
 "2022.05.18", "2022.05.20", "2022.05.30", "2022.06.01", "2022.06.03", "2022.06.08", "2022.06.15", "2022.06.22"]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os, json, math, copy
import datetime as _datetime

%matplotlib inline

Abstract test class

In [None]:
class DatasetTest(object):
    """Base class for checks that can be executed against a dataset.

    Subclasses override `name` and `test`; callers are expected to use `run`.
    """

    @property
    def name(self):
        """Label describing the check (empty for the base class)."""
        return ""

    def test(self, dataset):
        """Evaluate the check on `dataset`; the base implementation always fails."""
        return False

    def run(self, dataset):
        """Execute `test` and re-raise any failure as a generic Exception."""
        try:
            outcome = self.test(dataset)
        except Exception as e:
            message = "Test failed to execute: {}".format(e)
            raise Exception(message)
        return outcome


Test that checks that the uptime is monotonic

In [None]:
class DatasetTest_UptimeMonotonicity(DatasetTest):
    """Check that the 'uptimeInS' column of a dataset never decreases."""

    @property
    def name(self):
        """Label used when reporting the result of this check."""
        return "uptime is monotonic"

    def test(self, dataset):
        """Return True when dataset.dataframe['uptimeInS'] is monotonically increasing.

        `Series.is_monotonic` was an alias of `is_monotonic_increasing` and has
        been removed in pandas 2.0, so the explicit attribute is used here.
        """
        uptime = dataset.dataframe['uptimeInS'].astype('float64')
        return uptime.is_monotonic_increasing
   

Test that checks for a uniform sample rate

In [None]:

class DatasetTest_IndexUniformity(DatasetTest):
    """Check that the spacing between consecutive index entries stays stable."""

    # largest tolerated increase between two consecutive sample intervals
    epsilon = _datetime.timedelta(milliseconds=300)

    @property
    def name(self):
        """Label used when reporting the result of this check."""
        label = "index is uniform (epsilon={})"
        return label.format(self.epsilon)

    def test(self, dataset):
        """Return True when no interval grows by more than `epsilon` over its predecessor.

        Only positive changes are compared against `epsilon`; an interval that
        shrinks does not fail the check by itself.
        """
        index = dataset.dataframe.index
        intervals = (index[1:] - index[0:-1]).fillna(0)
        interval_changes = intervals[1:] - intervals[0:-1]
        too_large = interval_changes > self.epsilon
        return not too_large.any()


Test sample period

In [None]:
class DatasetTest_SamplingPeriod(DatasetTest):
    """Check that the mean sampling period of a dataset is close to a target."""

    def __init__(self, period=5.0, epsilon=0.2):
        """Store the target `period` and the tolerated deviation `epsilon`.

        Both values are compared against the first element of `dataset.period`.
        """
        self.period = period
        self.epsilon = epsilon

    @property
    def name(self):
        """Label used when reporting the result of this check."""
        return "sampling period close to target period"

    def test(self, dataset):
        """Return True when the mean period deviates by less than `epsilon`."""
        mean_period, _spread = dataset.period
        deviation = abs(mean_period - self.period)
        return deviation < self.epsilon

Dataset

In [None]:
class Dataset(object):    
    def __init__(self, name, verbose=False, separate_angle_sin=True, datasets_folder=['.', 'datasets']):
        self.dataset_name = name  
        self.verbose = verbose
        self.separate_angle_sin = separate_angle_sin
        #
        self.__meta = {}    
        self.__dataframe = None        # active dataframe
        self.__loaded_dataframe = None # original unscaled dataframe
        self.__partial_dataframes={}   # partial dataframes merged into loaded dataframe
        #
        self._applied_scale = (0, 1) # a+b*x
        #
        self.contains_telemetry_data = False
        self.contains_table_data = False
        self.contains_sin_of_angle = False
        self.contains_room_temperature_data = False
        # 
        self.__load(name, datasets_folder=datasets_folder)

    @property
    def anomalies(self):
        """ list of anomalies """
        try:
            return self.__meta["anomalies"]["data"]
        except:
            return []

    @property
    def comment(self):
        try:
            return str(self.__meta["comment"])
        except:
            return ""

    @property
    def dataframe(self):
        """ pandas dataframe """
        return self.__dataframe

    @property
    def is_valid(self):
        return all(self.test(human_readable=False).values())

    @property
    def length(self):
        return len(self.__dataframe)

    @property
    def meta(self):
        """ meta data """
        return self.__meta

    @property
    def name(self):
        return self.dataset_name

    @property
    def period(self):
        delta = (self.__dataframe.index[1:]-self.__dataframe.index[0:-1]).fillna(0)
        return (delta.mean().total_seconds(), delta.std().total_seconds())

    @property
    def ranges(self):
        """ dict of ranges """
        try:
            return self.__meta["ranges"]
        except:
            return []

    @property 
    def scale_factor(self):
        return self._applied_scale

    def get_partial_dataframe(self, name):
        try:
            return self.__partial_dataframes[name]
        except:
            raise KeyError

    def plot(self, columns=['temp_0', 'temp_1', 'temp_2', 'temp_3', 'temp_4', 'temp_5', 'temp_6', 'temp_7', 'temp_8', 'angle', 'sin_of_angle'], loc='best', column=None, more_columns=None, not_columns=None,  **kwargs):                           
        """ plot data """
        if not column is None:
            columns=[column]
        if type(more_columns) == str:
            more_columns = list(more_columns)
        if not more_columns is None:
            columns = columns + more_columns
        if not not_columns is None:
            try:
                for v in not_columns:
                    columns.remove(v)
            except:
                pass
        if not self.contains_table_data:
            try:
                columns.remove('angle')
            except:
                pass
            try:
                columns.remove('sin_of_angle')
            except:
                pass
        if not self.separate_angle_sin:   
            try:
                columns.remove('sin_of_angle')
            except:
                pass
        if not self.contains_room_temperature_data:
            try:
                columns.remove('room_temperature')
            except:
                pass
        self.__dataframe.plot(y=columns, **kwargs).legend(loc=loc)
        return self

    def plot_anomalies(self, ax=None, **kwargs):
        """ plot anomalies """
        for anomaly in self.anomalies:
            # start = _datetime.datetime.strptime(anomaly['start'], '%d.%m.%Y %H:%M:%S.%f')
            start = anomaly['start']
            try:
                start = start + self.__meta['rebase_offset']
            except KeyError:                
                pass
            if ax is None:
                ax = plt
            ax.axvline(start, linestyle='dashed', color=(1, 0, 0, 0.5), **kwargs)     
        return self

    def plot_ranges(self, ranges=None, alpha=1.0, ax=None, **kwargs):
        if ranges is None:
            ranges = self.ranges.keys()

        __parse_time = lambda x: _datetime.datetime.strptime(x, '%d.%m.%Y %H:%M:%S.%f')

        for r in ranges:
            rr = self.ranges[r]
            try:
                color = rr['color']
            except:
                color = None
            try:
                metaalpha = rr['alpha']
            except:
                metaalpha = 0.2
            try:
                start = __parse_time(rr['start'])
            except:
                start = self.__dataframe.index[0]
            try:
                stop = __parse_time(rr['stop'])
            except:
                stop = self.__dataframe.index[-1]
            if ax is None:
                ax = plt
            ax.axvspan(start, stop, color=color, alpha=alpha*metaalpha, **kwargs)
        return self

    def plot_time(self, **kwargs):
        """ plot the index values of the active dataframe against their position, returns self """
        timestamps = self.__dataframe.index.values[0:]
        plt.plot(timestamps, **kwargs)
        return self

    def reset(self):
        self.__dataframe = self.__loaded_dataframe 
        self._applied_scale = (0, 1) 
        return self

    def save_dataframe_as_csv(self, filename='dataframe.csv', **kwargs):  
        """ save dataframe """
        if not self.__dataframe is None:  
            self.__dataframe.to_csv(filename, **kwargs)
        return self    

    def overwrite_dataframe_from_csv(self, filename='dataframe.csv', **kwargs):  
        """ load dataframe keeping current index """
        if not self.__dataframe is None:  
            index = self.__dataframe.index
            self.__dataframe = pd.read_csv(filename, **kwargs)
            self.__dataframe.index = index 
        return self    

    def test(self, human_readable=True, tests=None):
        """ run tests on dataset and format results """
        # run tests
        if tests is None:
            tests = [
                DatasetTest_UptimeMonotonicity(),
                DatasetTest_IndexUniformity()
                ]
        results = {}
        for test in tests:
            try:
                results[test.name] = test.run(self)
            except Exception as e:
                results[test.name] = e
        # format 
        if human_readable:
            readable_results = []
            for name, result in results.items():
                if type(result) is bool:
                    readable_results.append(" {} checking if {}: {}".format("+" if result else "-", name, "passed." if result else "failed."))
                else:
                    readable_results.append(" ! checking if {} caused an exception: {}".format(name, e))
            return '\r\n'.join(readable_results)
        return results

    def test_all_ranges(self):
        """ run tests over all known ranges """
        results=[]
        for r in dataset.ranges.keys():
            results.append("Testing range '{}'".format(r))
            results.append(dataset[r].test())
        return '\r\n'.join(results) 

    def extract_by_date(self, start=None, stop=None):
        """ returns a dataset between start and end dates. 
             - if start is None the beginning of this dataset will be used
             - if end is None the end of this dataset will be used 
            dates are in the format d.m.Y H:M:S.f
        """
        try:
            data_start_index = list(self.__dataframe.index >= _datetime.datetime.strptime(start, '%d.%m.%Y %H:%M:%S.%f')).index(True)                        
        except Exception as e:
            data_start_index = 0
        try:
            data_stop_index = list(self.__dataframe.index >= _datetime.datetime.strptime(stop, '%d.%m.%Y %H:%M:%S.%f')).index(True)                        
        except Exception as e:
            rows, columns = self.__dataframe.shape
            data_stop_index = rows
        dataset = copy.deepcopy(self)
        dataset.__dataframe = self.__dataframe.iloc[data_start_index:data_stop_index]
        return dataset
    
    def extract(self, label):
        """ extract a named range defined in meta.json """
        try:
            # get range by label 
            r = self.__meta["ranges"][label]       
            try:
                start = r['start']
                data_start_index = list(self.__dataframe.index >= _datetime.datetime.strptime(start, '%d.%m.%Y %H:%M:%S.%f')).index(True)                        
            except:
                data_start_index = 0
            try:
                stop = r['stop']
                data_stop_index = list(self.__dataframe.index >= _datetime.datetime.strptime(stop, '%d.%m.%Y %H:%M:%S.%f')).index(True)                        
            except:
                rows, columns = self.__dataframe.shape
                data_stop_index = rows
            dataset = copy.deepcopy(self)
            dataset.__dataframe = self.__dataframe.iloc[data_start_index:data_stop_index]
            return dataset
        except KeyError:
            raise(KeyError)

    def normalize(self, debug_plot=True, columns=['temp_0', 'temp_1', 'temp_2', 'temp_3', 'temp_4', 'temp_5', 'temp_6', 'temp_7', 'temp_8', 'angle']): 
        """ normalize the given columns on the min max of each column """           
        temperature_and_angle_norm = self.__dataframe.copy()
        for column in columns:            
            try:
                max_value = self.__dataframe[column].max(numeric_only=True)
                min_value = self.__dataframe[column].min(numeric_only=True)
                temperature_and_angle_norm[column] = (self.__dataframe[column] - min_value) / (max_value-min_value)                        
            except:
                pass                                
        dataset = copy.deepcopy(self)
        dataset.__dataframe = temperature_and_angle_norm
        return dataset

    def minmax(self, columns=['temp_0', 'temp_1', 'temp_2', 'temp_3', 'temp_4', 'temp_5', 'temp_6', 'temp_7', 'temp_8']):
        """ returns the min, max over all given columns """
        try:
            from_min = self.__dataframe[columns].min(numeric_only=True).min()
            from_max = self.__dataframe[columns].max(numeric_only=True).max()
        except Exception as e:
            raise (Exception("Cannot detect min,max over all columns for scaling:{}".format(e)))
        return (from_min, from_max)

    @staticmethod
    def normalize_multiple_datasets(datasets=[], scale_to=(-1, 1), columns=['temp_0', 'temp_1', 'temp_2', 'temp_3', 'temp_4', 'temp_5', 'temp_6', 'temp_7', 'temp_8']):
        """ Normalize multiple datasets to with a common factor """
        all_mins = []
        all_maxs = []
        for dataset in datasets:
            mi,ma = dataset.minmax(columns=columns)
            all_mins.append(mi)
            all_maxs.append(ma)
        scale_from = (min(all_mins), max(all_maxs))
        for dataset in datasets:
            dataset.scale(scale_from=scale_from, scale_to=scale_to, columns=columns)
        
    def scale(self, scale_from=None, scale_to=(-1, 1), columns=['temp_0', 'temp_1', 'temp_2', 'temp_3', 'temp_4', 'temp_5', 'temp_6', 'temp_7', 'temp_8']):
        """ scale values in columns from scale_from range to scale_to range 
            if scale_from=None: automatic scale to the min/max values over all columns 
            
            The difference between the normalize and scaling function is that 
                - normalize scales to the min,max of a single column
                - scale scales after looking at all columns (in the column=[] list) or a fixed value if provided
            """
        try:
            (to_lower, to_upper) = scale_to
        except Exception as e:
            raise Exception("Failed to unpack scale_to limits: {}".format(e))
        if scale_from is None:
            # find the min, max over all columns
            try:
                from_min = self.__dataframe[columns].min(numeric_only=False).min()
                from_max = self.__dataframe[columns].max(numeric_only=False).max()
            except Exception as e:
                raise (Exception("Cannot detect min,max over all columns for scaling:{}".format(e)))
        else:
            try:
                (from_min, from_max) = scale_from
            except Exception as e:
                raise Exception("Failed to unpack scale_from limits: {}".format(e))
        if from_min > from_max:
            raise(Exception("scale_from: lower limit larger then upper limit"))
        if to_lower > to_upper:
            raise(Exception("scale_to: lower limit larger then upper limit"))
        # print((from_min, from_max))
        dataset = copy.deepcopy(self)
        copyframe = self.__dataframe.copy()        
        a=to_lower
        b=(to_upper-to_lower)
        c=from_min
        d=(from_max - from_min)
        applied_scale = (a-(b*c)/d, b/d)
        for column in columns:
            try:
                # dataset.__dataframe[column] = dataset.__dataframe[column].apply(lambda x: to_lower + (to_upper-to_lower) * ((x - from_min) / (from_max - from_min)))
                copyframe[column] = dataset.__dataframe[column].apply(lambda x: applied_scale[0] + x * applied_scale[1])
            except Exception as e:
                raise(Exception("Cannot scale column '{}': {}".format(column, e)))
        dataset.__dataframe = copyframe
        dataset._applied_scale = (dataset._applied_scale[0]+applied_scale[0], dataset._applied_scale[1]*applied_scale[1])
        return dataset

    def rebase_time(self, date=None, now=True):
        """ change start time of dataset """
        copyframe = self.__dataframe.copy()        
        if now:      
            date = _datetime.datetime.today()              
        if date is None:
            date = _datetime.datetime(1970,0,0,0,0,0)                
        offset = date-copyframe.index[0]   
        copyframe.index = copyframe.index + offset
        dataset = copy.deepcopy(self)
        dataset.__dataframe = copyframe    
        dataset.meta['rebase_offset'] = offset
        return dataset

    def get_nearest_index(self, closest_to_time):
        """get the index closest to a given timestamp """
        return self.__dataframe.index[self.__dataframe.index.get_loc(closest_to_time, method='nearest')]

    def get_named_range(self, name, with_comment=False):
        """ get the meta data of a named range """
        ra = self.__meta['ranges'][name]
        if with_comment:
            return ra['start'], ra['stop'], ra['comment']
        return ra['start'], ra['stop']

    def __str__(self):  
        """ explain dataset """
        description = []
        try:
            description.append("Dataset: {}".format(self.__meta['datasets_folder']))
        except:
            pass
        try:
            if len(self.__meta['comment']) > 0:
                description.append("Comment: {}".format(self.__meta['comment']))
        except:
            pass
        description.append("Loaded parts:")
        description.append(" - Loaded telemetry data: {}".format(self.contains_telemetry_data))
        description.append(" - Loaded room temperature data: {}".format(self.contains_room_temperature_data))
        description.append(" - Loaded table data: {}".format(self.contains_table_data))
        description.append("Ranges:")
        for r, rr in self.ranges.items():
            try:
                comment = ''
                try: 
                    if len(rr["comment"]) > 0:
                        comment = ": {}".format(rr["comment"])
                except KeyError:
                    pass
                description.append(" - range '{}': start={}, stop:={}{}".format(r, rr['start'], rr['stop'], comment))
            except:
                pass
        description.append("Anomalies:")
        for rr in self.anomalies:
            try:
                comment = ''
                try: 
                    if len(rr["comment"]) > 0:
                        comment = ": {}".format(rr["comment"])
                except KeyError:
                    pass
                description.append(" - anomaly of {}% at '{}' during {}s{}".format(rr['power'], rr['start'], rr['duration'], comment))
            except:
                pass
        try:
            rows, columns = self.__dataframe.shape
            description.append("Dataframe has {} data points.".format(rows))
        except:
            pass
        try:
            description.append("Dataset index resolution is: {}".format(self.__dataframe.index.resolution))
        except:
            pass

        for r in dataset.ranges.keys():
            description.append("Testing range '{}'".format(r))
            description.append(dataset[r].test())

        try:
            description.append("Dataframe columns:\r\n"+', '.join(["{}".format(column) for column in self.__dataframe.columns]))
        except:
            pass
        return '\r\n'.join(description) 

    def __getitem__(self, k):
        if type(k) == str:
            return self.extract(k)
        if type(k) == slice:
            try:
                return self.extract_by_date(k.start, k.stop)
            except Exception as e:
                pass
        dataset = copy.deepcopy(self)
        dataset.__dataframe = self.__dataframe.iloc[k]
        return dataset               

    def __getslice__(self, start, end):
        if ((type(start) == str) or (type(start) == None)) and ((type(end)==str) or (type(end)==None)):
            return self.extract_by_date(start, end)
        else:
            dataset = copy.deepcopy(self)
            dataset.__dataframe = self.__dataframe.iloc[max(0, start):max(0, end)]
            return dataset                       

    def __load_metadata(self, dataset_name:str, meta_file='meta.json', datasets_folder=['.', 'datasets']):
        """ Load dataset meta data from meta.json """     
        try:
            datasets_path = os.path.join(*datasets_folder)
            with open(os.path.join(datasets_path, dataset_name, meta_file)) as metafile:
                self.__meta = json.load(metafile)
                self.__meta['datasets_folder'] = os.path.join(datasets_path, dataset_name)       
        except FileNotFoundError:
            raise(Exception("Cannot find meta file for dataset {}".format(dataset_name)))
        except json.JSONDecodeError as e:
            raise(Exception("Error in meta.json: {}".format(e)))
        self.__parse_anomalies()

    def __parse_anomalies(self):
        offset = self.__anomalies_time_offset(self.__meta["anomalies"])
        for anomaly in self.__meta["anomalies"]["data"]:
            try:
                time = _datetime.datetime.strptime(anomaly['start'], '%d.%m.%Y %H:%M:%S.%f')
                anomaly['start'] = time + offset 
            except:
                print("Failed to parse anomaly date: {}".format(anomaly))

    def __anomalies_time_offset(self, context:dict, label:str="anomalies"):
        """ find time offset correction for anomalies
            offset: {"days":0, "seconds":0, "microseconds":0, "milliseconds":0, "minutes":0, "hours":0, "weeks":0}
        """
        try:
            tc_map = context["time_correction"]
            try:
                if tc_map['offset'] is None:
                    raise TypeError
                delta_datetime = _datetime.timedelta(**tc_map['offset'])
                if self.verbose:
                    print("Corrected time for '{}' using given offset. {}".format(label, delta_datetime))
            except Exception:  #TypeError:
                try:
                    recorded_datetime = _datetime.datetime.strptime(tc_map['recorded_datetime'], '%d.%m.%Y %H:%M:%S.%f')
                except KeyError:
                    recorded_datetime = _datetime.datetime.fromtimestamp(float(tc_map['recorded_datetime']))
                try:
                    dataframe_datetime = _datetime.datetime.strptime(tc_map['dataframe_datetime'], '%d.%m.%Y %H:%M:%S.%f')
                except KeyError:
                    dataframe_datetime = _datetime.datetime.fromtimestamp(float(tc_map['dataframe_datetime']))
                delta_datetime =  dataframe_datetime - recorded_datetime
                if self.verbose:
                    print("Corrected {} time using given datetimes.".format(label))
            
            if self.verbose:
                print("Time offset corrected for '{}' by {}.".format(label.capitalize(), delta_datetime))

        except Exception as e:
            delta_datetime = _datetime.timedelta({'hours':0})
            if self.verbose:
                print("No time correction has been provided for {}. Assuming clocks to the synchronous.({})".format(label, e))
        return delta_datetime 

           
    def __load(self, dataset_name:str, sin_of_angle=True, datasets_folder=['.', 'datasets'], meta_file='meta.json'):
        """ load the dataset

        Reads the meta data first and then the telemetry, table and room
        temperature sources in this order. The resulting dataframe is
        remembered as the loaded dataframe that `reset` switches back to.

        dataset_name:    name of the dataset, forwarded to the meta data loader
        sin_of_angle:    not used by this method
        datasets_folder: path components of the folder holding all datasets
        meta_file:       file name of the meta data file
        """
        # the meta data has to be available first, the source loaders read their settings from it
        self.__load_metadata(dataset_name=dataset_name, meta_file=meta_file, datasets_folder=datasets_folder)
        self.__load_telemetry_data()
        self.__load_table_data()
        self.__load_room_temperature_data()
        # same object as the active dataframe, no copy is made here
        self.__loaded_dataframe = self.__dataframe

    def __merge_partial_dataframe(self, dataframe):
        """ merge new dataframe into this dataset """
        if not self.__dataframe is None:
            if self.verbose:
                print("Source resolution:{}, target resolution: {}".format(dataframe.index.resolution, self.__dataframe.index.resolution))
            # reindex on existing data
            reindexed_dataframe=(dataframe
                 # .reindex(self.__dataframe.index.union(self.__dataframe.index))
                .reindex(self.__dataframe.index.union(dataframe.index))
                .interpolate(method='time', limit_direction='both')                
                #.reindex(self.__dataframe.index)
            )
            self.__dataframe = self.__dataframe.join(reindexed_dataframe)
        else:
            self.__dataframe = dataframe


    def __time_correction(self, dataframe, context:dict, label:str):
        """ time offset correction 
            offset: {"days":0, "seconds":0, "microseconds":0, "milliseconds":0, "minutes":0, "hours":0, "weeks":0}
        """
        try:
            tc_map = context["time_correction"]
            try:
                if tc_map['offset'] is None:
                    raise TypeError
                delta_datetime = _datetime.timedelta(**tc_map['offset'])
                if self.verbose:
                    print("Corrected time for '{}' using given offset. {}".format(label, delta_datetime))
            except Exception:  #TypeError:
                try:
                    recorded_datetime = _datetime.datetime.strptime(tc_map['recorded_datetime'], '%d.%m.%Y %H:%M:%S.%f')
                except KeyError:
                    recorded_datetime = _datetime.datetime.fromtimestamp(float(tc_map['recorded_datetime']))
                try:
                    dataframe_datetime = _datetime.datetime.strptime(tc_map['dataframe_datetime'], '%d.%m.%Y %H:%M:%S.%f')
                except KeyError:
                    dataframe_datetime = _datetime.datetime.fromtimestamp(float(tc_map['dataframe_datetime']))
                delta_datetime =  dataframe_datetime - recorded_datetime
                if self.verbose:
                    print("Corrected time for '{}' using given datetimes.".format(label))

            # offset correct dataframe index
            index_as_list = dataframe.index
            index_as_list += delta_datetime 
            dataframe.index = index_as_list

            if self.verbose:
                print("Time offset corrected for '{}' by {}.".format(label.capitalize(), delta_datetime))

        except Exception as e:
            if self.verbose:
                print("No time correction has been provided for {}. Assuming clocks to the synchronous.({})".format(label, e))
        return dataframe

    def __get_csv_parameters(self, context, delimiter=';', decimal=',', encoding="unicode_escape"):
        """ returns csv information from the meta data or if not provided defaults """
        try:
            delimiter = context["delimiter"]
        except:
            delimiter = ";"
        try:
            decimal = context["decimal"]
        except:
            decimal = ","
        try:
            encoding = context["encoding"]
        except:
            encoding = "unicode_escape"
        return (delimiter, decimal, encoding)

    def __load_telemetry_data(self, label="telemetry data"):
        """ Load telemetry data

        Reads the settings of the 'eps_data' source from the meta data, collects
        every file called context['filename'] below the source directory, reads
        them as csv indexed by their 'timestamp' column and merges the result
        into the dataset.

        label: name of this source used in messages

        Raises Exception when the source directory does not exist or contains
        no telemetry file.
        """
        if self.verbose:
            print("Loading {}".format(label))
        try:
            context = self.__meta['sources']['eps_data']

            # do not load if this block is disabled
            try:
                if context['enabled'] == False:
                    return
            except:
                # no usable 'enabled' entry: treat the source as enabled
                pass
            
            # find telemetry data 
            satellite_logs = os.path.join(self.__meta['datasets_folder'], *context['directory'])
            if not os.path.exists(satellite_logs):                        
                raise(Exception("Dataset '{}' does not exist in {}".format(self.dataset_name, satellite_logs)))
            # every directory below satellite_logs that contains a file with the configured name
            telemetry_files = [os.path.join(x[0], context['filename']) for x in os.walk(satellite_logs) if os.path.exists(os.path.join(x[0], context['filename']))]
            telemetry_files.sort()
            if self.verbose:
                print("Detected telemetry files:")
                print(telemetry_files)
            if len(telemetry_files) == 0:            
                raise(Exception("Dataset is empty or path not found! ({})".format(self.dataset_name)))

            # timestamps are interpreted as seconds since the unix epoch
            __dateparser = lambda x: pd.to_datetime(x, unit='s', origin='unix', utc=False)# .round('S')

            (delimiter, decimal, encoding) = self.__get_csv_parameters(context, delimiter=",", decimal=".", encoding="unicode_escape")
            dataframes={}            
            for telemetry_file in telemetry_files:
                try:
                    # NOTE(review): the date_parser argument of read_csv is deprecated since pandas 2.0 - confirm the targeted pandas version
                    dataframes[telemetry_file] = pd.read_csv(telemetry_file,  delimiter=delimiter, decimal=decimal, encoding=encoding, index_col='timestamp', parse_dates=['timestamp'], date_parser=__dateparser)
                except Exception as e:
                    # a file that cannot be read is skipped
                    if self.verbose:
                        print("Failed to load telemetry file {}:{}".format(telemetry_file, e))

            # merge data into a single frame 
            dataframe = pd.concat(dataframes)
            # concat of a dict adds the file name as outer index level, drop it again
            dataframe = dataframe.droplevel(level=0)
            dataframe = dataframe.sort_index()
            # NOTE(review): drop_duplicates compares the column values only, not the index - confirm this is intended
            dataframe = dataframe.drop_duplicates()

            # apply time correction 
            dataframe = self.__time_correction(dataframe, context, label)

            # store dataframe unmerged
            self.__partial_dataframes['telemetry']=dataframe

            # merge dataframe
            self.__merge_partial_dataframe(dataframe)

            # mark as loaded
            self.contains_telemetry_data = True
        except FileNotFoundError:
            dataframe = None
            self.contains_telemetry_data = False

    def __load_table_data(self, label="table data"):
        """ Load table data """
        if self.verbose:
            print("Loading {}".format(label))
        try:
            context = self.__meta['sources']['table_data']
            
            # do not load if this block is disabled
            try:
                if context['enabled'] == False:
                    return
            except:
                pass

            (delimiter, decimal, encoding) = self.__get_csv_parameters(context, delimiter="; ", decimal=",", encoding="unicode_escape")
            __dateparser = lambda x: pd.to_datetime(x, unit='s', origin='unix', utc=False)# .round('S')
            angle_file = os.path.join(self.__meta['datasets_folder'], *context['directory'], context['filename'])
            dataframe = pd.read_csv(angle_file, delimiter=delimiter, decimal=decimal, encoding=encoding, index_col='timestamp', parse_dates=['timestamp'], date_parser=__dateparser, engine='python')
           
            # add start angle
            try:
                # print(dataframe['angle'])
                dataframe['angle'] = dataframe['angle'] + float(context['start_angle'])
                if self.verbose:
                    print("Corrected angle by {} degrees for {}".format( float(context['start_angle']), label))
                # print(dataframe['angle'])
            except Exception as e:
                if self.verbose:
                    print("No start angle provided for {}: {}".format(label, e))

            # remove unnecessary column
            try:
                del dataframe['target']
            except Exception as e:
                pass
            try:
                del dataframe['t_delta']
            except Exception as e:
                pass

            try:
                if context['wrap_angle']:
                    dataframe['angle'] = np.mod(dataframe['angle'], 360.0)
            except Exception:
                pass

            try:
                if context['add_sin_of_angle']:
                    angle_sin = np.sin(dataframe['angle']/360*2*math.pi)
                    if self.separate_angle_sin:
                        dataframe['sin_of_angle'] = angle_sin
                    else:
                        dataframe['angle'] = angle_sin
                    self.contains_sin_of_angle = True
            except KeyError:
                self.contains_sin_of_angle = False

            # apply time correction 
            dataframe = self.__time_correction(dataframe, context, label)

            # store dataframe unmerged
            self.__partial_dataframes['table']=dataframe

            # merge dataframe
            self.__merge_partial_dataframe(dataframe)
           
            # mark as loaded
            self.contains_table_data = True
        except FileNotFoundError:
            if self.verbose:
                print(e)
            self.contains_table_data = False

    def __load_room_temperature_data(self, label="room temperature data"):
        """Load the room temperature CSV and merge it into the dataset.

        The source description is read from the 'room_temperature_data'
        entry of the metadata 'sources'. Only the "Average" column of the
        CSV is kept; it is renamed to "room_temperature" and the index is
        named "Timestamp". The frame is then time-corrected, stored as the
        'room_temperature' partial dataframe and merged.

        ``self.contains_room_temperature_data`` is set to True on success
        and to False when any exception occurs (missing source entry,
        missing file, parse error, ...); the exception is printed in verbose
        mode and never re-raised. When the source is disabled the method
        returns early and leaves the flag untouched.

        Args:
            label: Human-readable name used in the verbose message and
                passed on to the time correction.
        """
        if self.verbose:
            print("Loading {}".format(label))
        try:
            context = self.__meta['sources']['room_temperature_data']

            # Do not load if this block is disabled. Only a missing (or
            # non-subscriptable) 'enabled' entry is tolerated here; a bare
            # `except:` would also swallow KeyboardInterrupt/SystemExit.
            try:
                enabled = context['enabled']
            except (KeyError, TypeError):
                enabled = True
            # `== False` (not `is False`) on purpose: a numeric 0 in the
            # metadata disables the block too, exactly as before.
            if enabled == False:  # noqa: E712
                return

            room_temperature_file = os.path.join(
                self.__meta['datasets_folder'], *context['directory'], context['filename'])
            (delimiter, decimal, encoding) = self.__get_csv_parameters(
                context, delimiter=";", decimal=",", encoding="unicode_escape")

            def parse_start_time(text):
                # Timestamps look like "day.month.2-digit-year H:M:S,fraction".
                return _datetime.datetime.strptime(text, '%d.%m.%y %H:%M:%S,%f')

            # NOTE: `date_parser` is deprecated in pandas >= 2.0; it is kept
            # because the rest of the notebook targets the older pandas API.
            dataframe = pd.read_csv(
                room_temperature_file,
                index_col=3,
                parse_dates=['Start Time'],
                date_parser=parse_start_time,
                delimiter=delimiter,
                decimal=decimal,
                encoding=encoding,
            )
            dataframe = dataframe[["Average"]]
            dataframe.index.names = ["Timestamp"]
            dataframe = dataframe.rename(columns={"Average": "room_temperature"})

            # apply time correction
            dataframe = self.__time_correction(dataframe, context, label)

            # store dataframe unmerged
            self.__partial_dataframes['room_temperature'] = dataframe

            # merge dataframe
            self.__merge_partial_dataframe(dataframe)

            # mark as loaded
            self.contains_room_temperature_data = True
        except Exception as e:
            # Best effort: room temperature data is optional, so any failure
            # only marks it as absent instead of aborting the whole load.
            if self.verbose:
                print(e)
            self.contains_room_temperature_data = False


Usage example:

In [None]:
# Run the usage examples below unless the including notebook has already
# defined `enable_example` (e.g. set it to False before `%run dataset.ipynb`).
if 'enable_example' not in locals():
    enable_example = True

Load a dataset (it is plotted with anomalies further below)

In [None]:
# Instantiate Dataset for '2022.06.15' (one of the dataset names listed at the
# top of this notebook) with verbose output; later cells reuse `dataset`.
if enable_example:
    dataset = Dataset('2022.06.15', verbose=True)

Describe the dataset

In [None]:
# Print the string representation of the loaded dataset.
if enable_example:
    print(dataset)

Detect sampling period

In [None]:
# `period` of the 'experiment' range unpacks into two values, reported below
# as the sampling period and its standard deviation (both in seconds).
if enable_example:
    period, deviation = dataset["experiment"].period
    print("Detected sampling period: {}[s] with a standard deviation of {}[s]".format(period, deviation))

#### Saving the merged dataframe to CSV

In [None]:
# Save the dataset's dataframe as CSV under the relative path "dataset.csv".
if enable_example:
    dataset.save_dataframe_as_csv("dataset.csv")

#### Plotting (with anomalies)

In [None]:
# The plotting calls are chained: plot(), then plot_ranges(), then
# plot_anomalies(), each invoked on the result of the previous call.
if enable_example:
    dataset.plot().plot_ranges().plot_anomalies()

Check linearity of time

In [None]:
# Call plot_time() on the full dataset; per the heading above, this is the
# visual check that time progresses linearly.
if enable_example:
    dataset.plot_time()

Room temperature

In [None]:
# Plot the 'room_temperature' column of the 'experiment' range. Any exception
# is printed instead of raised so the remaining cells keep running.
# NOTE(review): this cell passes `column=` (singular) while other cells pass
# `columns=[...]` -- confirm plot() accepts this keyword.
if enable_example:
    try:
        dataset["experiment"].plot(column='room_temperature')
    except Exception as e:
        print(e)

Plot the sine of the table angle

In [None]:
# Normalize the 'experiment' range and plot only the 'sin_of_angle' column.
# NOTE(review): the table-data loader creates 'sin_of_angle' only when the
# source sets 'add_sin_of_angle' and `separate_angle_sin` is true -- confirm
# the column exists for this dataset (this cell has no try/except).
if enable_example:
    dataset['experiment'].normalize().plot(columns=['sin_of_angle'])

Plot ranges

In [None]:
# Select the 'reboot' range, normalize it, then chain plot(), plot_ranges()
# and plot_anomalies() on the result.
if enable_example:
    dataset["reboot"].normalize().plot().plot_ranges().plot_anomalies()

In [None]:
# Plot the 'experiment' range with more_columns=['room_temperature'] and
# not_columns=['angle'] -- presumably adding the former to, and dropping the
# latter from, the default column selection; verify against plot().
if enable_example:
    dataset["experiment"].plot(more_columns=['room_temperature'], not_columns=['angle'])

In [None]:
# Print the shape of the 'experiment' dataframe, then plot its
# 'room_temperature' column; any exception is printed rather than raised.
if enable_example:
    try:
        print(dataset["experiment"].dataframe.shape)
        dataset["experiment"].plot(column='room_temperature')
    except Exception as e:
        print(e)

In [None]:
# Narrow 'experiment' further to the 'normal' range: print the shape via
# chained indexing, then plot 'room_temperature' via extract('normal').
# Any exception is printed rather than raised.
if enable_example:
    try:
        print(dataset["experiment"]['normal'].dataframe.shape)
        dataset["experiment"].extract('normal').plot(column='room_temperature')
    except Exception as e:
        print(e)

Data since last satellite reboot

In [None]:
# Plot the 'reboot' range with not_columns=['angle'] and
# more_columns=['room_temperature'], then overlay the anomalies.
if enable_example:
    dataset["reboot"].plot(not_columns=['angle'], more_columns=['room_temperature']).plot_anomalies()

#### Dataset extracting and slicing
Extract the experiment part (between start and stop), normalize the data and plot. Operations can be chained.

In [None]:
# Two alternative spellings of the same extraction: indexing by range name
# and extract(). Both results are bound to `experiment`, so the second
# assignment replaces the first. KeyError is reported as a missing range.
if enable_example:
    try:
        experiment = dataset['experiment'].normalize().plot().plot_anomalies()
        # or 
        experiment = dataset.extract('experiment').normalize().plot().plot_anomalies()
    except KeyError:
        print("Range does not exist on dataset")

Extracting by date:

In [None]:
# Select the data between two timestamps, either with a slice of date strings
# or with the alternative extract_by_date() call, then normalize and plot.
# KeyError is reported as an empty date range.
if enable_example:
    try:
        dataset['15.06.2022 10:30:00.00':'15.06.2022 17:00:00.00'].normalize().plot()
        # or
        dataset.extract_by_date('15.06.2022 10:30:00.00','15.06.2022 17:00:00.00').normalize().plot()
    except KeyError:
        print("No data for this date range")

Abnormal data:

In [None]:
# Extract, normalize and plot the 'abnormal' range with its anomalies.
# NOTE(review): on success this rebinds `experiment` (set earlier from the
# 'experiment' range) to the abnormal data, and the following cells keep
# using `experiment`; on KeyError the earlier value is kept. Confirm that
# this rebinding is intended.
if enable_example:
    try:
        experiment = dataset['abnormal'].normalize().plot().plot_anomalies()
    except KeyError:
        print("No abnormal data defined for this dataset")

Slicing dataset returns a new dataset

In [None]:
# Integer slice 0:100 of `experiment`; per the heading above, slicing yields a
# new dataset, which is plotted here.
if enable_example:
    experiment[0:100].plot()

Accessing pandas dataframe from dataset

In [None]:
# `.dataframe` exposes the underlying pandas dataframe; plot its 'temp_0'
# column with pandas' own plot().
if enable_example:
    experiment.dataframe['temp_0'].plot()

#### Rebase time 
(useful to overlay multiple datasets from different days)

In [None]:
# Call rebase_time() and plot the result.
if enable_example:
    experiment.rebase_time().plot()

#### Scaling the dataset
 1. fixed scaling from (-10, 10) to (-1, 1)
 2. scaling from the (min, max) over all columns to (-1, 1)

In [None]:
# Scale the 'normal' range twice, plot three temperature columns each time and
# print the resulting scale_factor:
#   1st call: explicit source range (-10, 10) mapped to (-1, 1)
#   2nd call: scale_from=None (per the text above: min/max over all columns)
if enable_example:
    print(dataset['normal'].scale(scale_from=(-10, 10), scale_to=(-1, 1)).plot(columns=['temp_0', 'temp_1', 'temp_2']).scale_factor)
    print(dataset['normal'].scale(scale_from=None, scale_to=(-1, 1)).plot(columns=['temp_0', 'temp_1', 'temp_2']).scale_factor)

### Test resetting the dataset
A range is selected and a scale is applied to the dataset; then the dataset is reset. The reset should revert the operations applied before.

In [None]:
# Select 'normal', scale it, then reset(); the plot and the printed
# scale_factor show the state after the reset.
if enable_example:    
    print(dataset['normal'].scale(scale_from=None, scale_to=(-1, 1)).reset().plot(columns=['temp_0', 'temp_1', 'temp_2']).scale_factor)

#### Tests and validations - Checking dataset validity
Run the tests on the dataset

In [None]:
# Report whether `experiment` passes its validity tests. The sentence's only
# full stop comes from the template, so neither verdict carries its own.
if enable_example:
    print("Dataset '{}' {} over the selected range.".format(experiment.name, "is valid" if experiment.is_valid else "is invalid"))

Explain all tests done.

In [None]:
# Print the result of test() called without arguments on `experiment`.
if enable_example:
    print(experiment.test())

Run the tests for all defined ranges

In [None]:
# Print the result of test_all_ranges() for the whole dataset.
if enable_example:
    print(dataset.test_all_ranges())

Run your own tests

In [None]:
if enable_example:
    class DatasetTest_SamplingPeriod(DatasetTest):
        """Example of a user-defined test: sampling period close to a target."""

        def __init__(self, period=5.0, epsilon=0.2):
            """Store the target period and the tolerated absolute deviation.

            Args:
                period: Expected sampling period (same unit as
                    ``dataset.period``, reported in seconds elsewhere in
                    this notebook).
                epsilon: Maximum allowed absolute difference between the
                    detected and the expected period.
            """
            self.period = period
            self.epsilon = epsilon

        @property
        def name(self):
            """Short description of the test."""
            return "sampling period close to target period"

        def test(self, dataset):
            """Return True if the detected period is within epsilon of the target."""
            # dataset.period yields (period, standard deviation); only the
            # period itself is needed here.
            detected_period, _ = dataset.period
            return abs(detected_period - self.period) < self.epsilon

    # Run only the custom test on the experiment range and print the result.
    print(experiment.test(tests=[DatasetTest_SamplingPeriod(period=5.0)]))

Display angle at anomaly

In [None]:
# For every anomaly of `experiment`, look up the index nearest to the
# anomaly's 'start' entry and print the 'angle' value found there.
# NOTE(review): `['angle'][idx]` treats idx as whatever get_nearest_index()
# returns (label or position) -- confirm which one it is.
if enable_example:
    for anomaly in experiment.anomalies:
        anomaly_time = anomaly['start']
        idx=experiment.get_nearest_index(anomaly_time)
        angle = experiment.dataframe['angle'][idx]
        print("@{} angle is {}".format(anomaly_time, angle))