# Dataset processors

## BLUED 

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import csv
import hashlib
import itertools
from datetime import datetime

class BLUED(object):
    """Reader for a BLUED dataset folder.

    The folder given to the constructor is expected to contain
    sub-directories named ``location_001_dataset_NNN``. Each of them holds
    the current/voltage files ``location_001_ivdata_NNN.txt`` and one
    ``location_001_eventslist.txt``.

    Attributes:
        file_prefix: Name prefix of the current/voltage data files.
        dir_prefix: Name prefix of the dataset sub-directories.
        boundaries_backup: Path, relative to ``path``, of the pickled
            boundaries table.
        samplerate: Set to 12000 (presumably samples per second; it is not
            used inside this class).
        path: Dataset root folder, or None if the folder was not found.
        boundaries: DataFrame with one ``Timestamp`` column holding the start
            time of every data file, indexed by file suffix minus one.
        events: DataFrame of all events indexed by timestamp with the columns
            ``Type`` and ``Phase`` (None if an events list was missing).

    NOTE(review): cached results are stored with ``pickle``. Only point this
    class at dataset folders whose ``cache`` directory you trust.
    """

    # Number of data files per dataset sub-directory; the directory number of
    # a file is derived from its suffix with this constant.
    _FILES_PER_DIR = 400

    def __init__(self, path):
        """Load (or build) the boundaries table and read all event lists.

        Args:
            path: Root folder of the dataset. If it is not a directory a
                message is printed and ``path``, ``boundaries`` and ``events``
                stay None.
        """
        self.file_prefix = "location_001_ivdata_"
        self.dir_prefix = "location_001_dataset_"
        self.boundaries_backup = os.path.join("cache", "BLUED_boundaries")
        self.samplerate = 12000

        # Defined up front so a failed construction still leaves the object
        # in a well-defined state instead of with missing attributes.
        self.path = None
        self.boundaries = None
        self.events = None

        if not os.path.isdir(path):
            print("Folder not found: "+path)
            return

        self.path = path
        self.boundaries = self._load_boundaries(path)
        self.events = self._import_events(path)

    @staticmethod
    def _ensure_dir(directory):
        """Create ``directory`` (and its parents) if it does not exist yet."""
        if not os.path.isdir(directory):
            os.makedirs(directory)

    def get_timeseries(self, start, stop, cache=False):
        """Return the samples recorded between ``start`` and ``stop``.

        A previously cached result for the same ``start``/``stop`` pair is
        returned directly when present in ``<path>/cache``.

        Args:
            start: First timestamp to include.
            stop: Last timestamp to include.
            cache: When True, store a freshly imported result in the cache.

        Returns:
            DataFrame indexed by ``X_Value``, or None if a required data file
            is missing.
        """
        # md5 needs bytes on Python 3; the digest of the encoded text is the
        # same value Python 2 produced for the plain str.
        key = (str(start) + "_" + str(stop)).encode("utf-8")
        cache_dir = os.path.join(self.path, "cache")
        cache_path = os.path.join(cache_dir, hashlib.md5(key).hexdigest())

        if os.path.isfile(cache_path):
            print("BLUED: Loading cached version of event " + str(start) + " - " + str(stop) + " .")
            with open(cache_path, 'rb') as fbackup:
                return pickle.load(fbackup)

        timeseries = self._import_timeseries(start, stop)
        # A failed import (None) must not be cached, otherwise every later
        # call would keep returning None even after the data is restored.
        if cache and timeseries is not None:
            self._ensure_dir(cache_dir)
            print(cache_path)
            with open(cache_path, 'wb') as fbackup:
                pickle.dump(timeseries, fbackup)

        return timeseries

    def get_events_by_typeid(self, type_id):
        """Return the rows of ``self.events`` whose ``Type`` equals ``type_id``."""
        return self.events[(self.events['Type'] == type_id)]

    def get_timeseries_by_events(self, events):
        """Collect a four second window of current and voltage per event.

        For every row of ``events`` the samples from two seconds before to
        two seconds after the event timestamp are loaded (and cached). The
        current of the event's phase and ``VoltageA`` are kept and renamed to
        ``i(t)`` and ``v(t)``.

        Args:
            events: DataFrame indexed by timestamp with a ``Phase`` column.

        Returns:
            A ``pandas.Panel`` keyed by event number where pandas still
            provides that class, otherwise a dict mapping event number to
            DataFrame. Both give the event's DataFrame through ``result[i]``.
        """
        window = pd.to_timedelta('2s')
        timeseries_dict = {}

        for i, (timestamp, row) in enumerate(events.iterrows()):
            iv_event = self.get_timeseries(timestamp - window,
                                           timestamp + window,
                                           cache=True)
            iv_event = iv_event.reset_index()
            # Copy so that renaming the columns never touches the source frame.
            selected = iv_event[['Current '+row['Phase'], 'VoltageA']].copy()
            selected.columns = ['i(t)', 'v(t)']
            timeseries_dict[i] = selected

        # pandas.Panel is gone from current pandas releases; fall back to the
        # plain dict there instead of failing with an AttributeError.
        panel_cls = getattr(pd, "Panel", None)
        if panel_cls is None:
            return timeseries_dict
        return panel_cls(timeseries_dict)

    def import_boundaries(self, path="."):
        """Force update of the boundaries."""
        self.boundaries = self._import_boundaries(path)
        return None

    def _import_boundaries(self, path="."):
        """Scan ``path`` recursively and read the start time of each data file.

        The date is taken from characters 5-14 of the 16th line and the time
        from characters 5-19 of the 17th line of every file whose name starts
        with ``file_prefix``.

        Returns:
            DataFrame with one ``Timestamp`` column. The row label is the
            numeric file suffix minus one; rows keep the order in which the
            files were found.
        """
        prefix_len = len(self.file_prefix)
        order = []    # row labels in discovery order
        stamps = {}   # row label -> start time (a later duplicate overwrites)

        for root, dirs, files in os.walk(path):

            # Progress
            print("Scanning diretory: "+root)

            for fname in files:
                if not fname.startswith(self.file_prefix):
                    continue

                # Drop the prefix and the four character extension.
                i = int(fname[prefix_len:-4]) - 1

                # Progress, as a percentage of the files in one directory.
                if ((i+1) % 100) == 0:
                    percent = ((i % self._FILES_PER_DIR) + 1) * 100 // self._FILES_PER_DIR
                    print("Scanned "+str(percent)+"% of current directory")

                filepath = os.path.join(root, fname)
                # Only the header is needed, so do not read the whole file.
                with open(filepath) as f:
                    lines = list(itertools.islice(f, 17))

                # NOTE: the time is used as written in the file, not rounded.
                stamp = pd.to_datetime(lines[15][5:15] + " " + lines[16][5:20])
                if i not in stamps:
                    order.append(i)
                stamps[i] = stamp

        column = pd.Series([stamps[i] for i in order], index=order,
                           dtype='datetime64[ns]')
        return pd.DataFrame({"Timestamp": column})

    def _import_events(self, path):
        """Read the events list of every dataset sub-directory of ``path``.

        Returns:
            DataFrame indexed by ``Timestamp`` with the columns ``Type`` and
            ``Phase``, or None (after printing a message) as soon as one
            dataset directory has no ``location_001_eventslist.txt``.
        """
        frames = []
        # os.listdir gives no ordering guarantee; sort so that the datasets
        # are concatenated in a stable, ascending order.
        for item in sorted(os.listdir(path)):
            curr_path = os.path.join(path, item)
            if not (os.path.isdir(curr_path) and item.startswith("location_001_dataset")):
                continue

            filename = os.path.join(curr_path, "location_001_eventslist.txt")
            if not os.path.isfile(filename):
                print("File not found: "+filename)
                return None

            data = pd.read_csv(filename)
            data['Timestamp'] = pd.to_datetime(data['Timestamp'])
            data.columns = ['Timestamp', 'Type', 'Phase']
            frames.append(data.set_index('Timestamp'))

        if not frames:
            return pd.DataFrame()
        return pd.concat(frames)

    def _load_boundaries(self, path):
        """Return the boundaries table, using the pickled backup if present.

        Without a backup the table is imported from ``path`` and then stored
        at ``<self.path>/<boundaries_backup>``.
        """
        backup_path = os.path.join(self.path, self.boundaries_backup)

        if os.path.isfile(backup_path):
            print("BLUED: Loading backup of boundaries.")
            with open(backup_path, 'rb') as fbackup:
                return pickle.load(fbackup)

        print("BLUED: Importing boundaries and creating a backup.")
        boundaries = self._import_boundaries(path)
        self._ensure_dir(os.path.dirname(backup_path))
        with open(backup_path, 'wb') as fbackup:
            pickle.dump(boundaries, fbackup)

        return boundaries

    def _import_timeseries(self, start, stop):
        """Read the samples between ``start`` and ``stop`` from the data files.

        The files to read are chosen through ``self.boundaries``: from the
        last file that starts at or before ``start`` up to and including the
        first file that starts at or after ``stop``.

        Returns:
            DataFrame indexed by ``X_Value`` (converted to timestamps), or
            None, after printing a message, when one of the files is missing.

        Raises:
            ValueError: Typically, when no file starts at or before ``start``
                or none starts at or after ``stop`` (raised by pandas for the
                empty selection).
        """
        stamps = self.boundaries['Timestamp']

        # detect necessary files
        # file index is the file suffix minus one !!!
        fstart_i = stamps[stamps <= start].idxmax()
        fstop_i = stamps[stamps >= stop].idxmin()

        # ex: fstart_i = 10 and fstop_i = 12 => suffixes [11 12 13]
        frange = np.arange(fstart_i, fstop_i+1)+1

        # X_Value is turned into a timestamp relative to the first row of
        # the boundaries table.
        origin = stamps.iloc[0]

        frames = []
        # check if each file is present and load the data
        for f_index in frange:
            f_index = int(f_index)
            # Integer division: the directory number must stay an int so it
            # formats as e.g. "001" on Python 3 as well.
            d_index = (f_index - 1) // self._FILES_PER_DIR + 1

            curr_path = os.path.join(
                self.path,
                self.dir_prefix + format(d_index, '03'),
                self.file_prefix + format(f_index, '03') + ".txt")
            print('Reading data from: '+curr_path)

            # check if csv exist
            if not os.path.isfile(curr_path):
                print('File '+ curr_path +' does not exist.')
                return None

            # read csv and load relevant data
            data = pd.read_csv(curr_path, header=22)
            # to_timedelta reads plain numbers as nanoseconds, hence the 1e9
            # scale (X_Value is presumably in seconds -- TODO confirm).
            data['X_Value'] = pd.to_timedelta(data['X_Value']*1e9) + origin
            data = data[(data['X_Value'] >= start) & (data['X_Value'] <= stop)]

            frames.append(data.set_index('X_Value'))

        if not frames:
            return pd.DataFrame()
        return pd.concat(frames)
    
        