# Datasetprocessors

## BLUED 

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import csv
import itertools
from datetime import datetime

class BLUED(object):
    
    def __init__(self, path):
        self.file_prefix = "location_001_ivdata_"
        self.dir_prefix = "location_001_dataset_"
        self.boundaries_backup = "BLUED_boundaries"
        self.samplerate = 12000
        
        if not os.path.isdir(path):
            print("Folder not found: "+path)
            return None
        else:
            self.path = path
            
        self.boundaries = self._load_boundaries(path)
        self.events = self._import_events(path)
        
    def get_timeseries(self, start, stop):
        
        timeseries = pd.DataFrame()
        
        #detect necessary directories and files
        # file index is the file suffix minus one !!!
        fstart_i = np.argmax(self.boundaries[:,0] >= start)+1
        fstop_i = (np.argmin(self.boundaries[:,1] <= stop)-1)+1
        dstart_i = (fstart_i-1)/400 + 1
        dstop_i = (fstop_i-1)/400 + 1
        
        frange = np.arange(fstart_i, fstop_i)
        drange = (frange-1)/400 + 1
        
        #check if each file is present and load the data
        for (i, f_index) in enumerate(frange):
            curr_path = self.path+"\\"+self.dir_prefix+format(drange[i], '03')+"\\"+self.file_prefix+format(f_index, '03')+".txt"
            
            print('Reading data from: '+curr_path)
            
            if not os.path.isfile(curr_path):
                print('File '+ curr_path +' does not exist.')
                return None
            data = pd.read_csv(curr_path, header=22)
            data['X_Value'] = pd.to_timedelta(data['X_Value']*10e8) + self.boundaries[frange[i]]
            
            data = data[(data['X_Value'] > start) & (data['X_Value'] < stop)]
            timeseries.append(data)

        return timeseries
                                  
    def get_timeseries_by_ids(self, ids):
        pass
    
    def import_boundaries(self, path="."):
        """Force update of the boundaries."""
        self.boundaries = self._import_boundaries(self, path)
        return None
    
    def _import_boundaries(self, path="."):
        n = len(os.listdir(path))
        boundaries = pd.DataFrame(columns={"Start datetime"}, dtype='datetime64[us]')
        for root, dirs, files in os.walk(path):
            
            #Progress
            print("Scanning diretory: "+root)
            
            for fname in files:
                if (fname[0:20] == self.file_prefix):
                    i  = int(fname[20:-4])-1
                    #Progress
                    if ((i+1) % 100) == 0:
                        print("Scanned "+str((i+1)/4 % 100)+"% of current directory") 
                    
                    filepath = os.path.join(root, fname)
                    f = open(filepath)
                    lines = f.readlines()
                    f.close()
                    boundaries[i] = pd.to_datetime(lines[15][5:15] + " " + lines[16][5:20], format="%Y/%m/%d %H:%M:%S.%f")
                    #!ALERT tijd wordt niet afgerond!!
                    #boundaries[i] = np.datetime64(start_dt)
        return boundaries
    
    
    def _import_events(self, path):
        events = []
        for (i, item) in enumerate(os.listdir(path)): #veronderstel dat de lijst met datasets gesorteerd is
            curr_path = os.path.join(path, item)
            if os.path.isdir(curr_path):
                thefilename = os.path.join(curr_path, "location_001_eventslist.txt")
                if not os.path.isfile(thefilename):
                    print("File not found: "+thefilename)
                    return None
                with open(thefilename) as f:
                    content = f.readlines()
                    f.close()
                    for eventstr in content[1:]:
                        timestamp = datetime.strptime(eventstr[:23], "%m/%d/%Y %H:%M:%S.%f")
                        device = int(eventstr[24:27])
                        channel = eventstr[28:29]
                        events.append((np.datetime64(timestamp, 'us'), device, channel))                                              
        return events
                                  
    def _load_boundaries(self, path):                     
        if(os.path.isfile(self.boundaries_backup)):
            print("BLUED: Loading backup of boundaries.")
            fbackup = open(self.boundaries_backup, 'rb')
            boundaries = pickle.load(fbackup)
            fbackup.close()
            
        else:
            print("BLUED: Importing boundaries and creating a backup.")
            boundaries = self._import_boundaries(path)
            fbackup = open(self.boundaries_backup, 'wb')
            pickle.dump(boundaries, fbackup)
            fbackup.close()
    
        return boundaries
        

In [None]:
test = BLUED("E:\BLUED")
test.boundaries[10:13]

BLUED: Importing boundaries and creating a backup.
Scanning diretory: E:\BLUED
Scanning diretory: E:\BLUED\location_001_dataset_001
Scanned 25% of current directory


In [9]:
data = test.get_timeseries(pd.to_datetime('2011-10-20T12:15:29.823'), pd.to_datetime('2011-10-20T12:18:48.125'))

Reading data from: E:\BLUED\location_001_dataset_001\location_001_ivdata_011.txt


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [111]:
import pandas as pd
data = pd.read_csv("E:\BLUED\location_001_dataset_001\location_001_ivdata_011.txt", header=22)

In [118]:
pd.offsets.Second()

ValueError: 'u' is a bad directive in format '%us'

In [90]:
test.boundaries[3,0]

numpy.datetime64('2011-10-20T12:03:39.723')

In [76]:
(test.boundaries[10,0] <= np.datetime64('2011-10-20T12:15:29.824'))

True

In [29]:
(np.datetime64('2011-10-20T12:18:48.123')-np.datetime64('2011-10-20T12:18:48.120')).item().total_seconds()

0.003

In [52]:
np.genfromtxt("E:\BLUED\location_001_dataset_001\location_001_ivdata_011.txt", skip_header=24, delimiter=",")

array([[  1.01720008e+03,   1.46385200e+00,  -2.01232100e+00,
          3.24614630e+01],
       [  1.01720017e+03,   1.42139800e+00,  -1.94906400e+00,
          3.76999250e+01],
       [  1.01720025e+03,   1.37894400e+00,  -2.05449200e+00,
          4.30961720e+01],
       ..., 
       [  1.11634975e+03,   9.54404000e-01,  -5.19626100e+00,
          1.69261058e+02],
       [  1.11634983e+03,   8.05815000e-01,  -5.25951800e+00,
          1.69639742e+02],
       [  1.11634992e+03,   6.57225000e-01,  -5.17517500e+00,
          1.69829084e+02]])

In [4]:
import pandas as pd

In [5]:
data = pd.read_csv("E:\BLUED\location_001_dataset_001\location_001_ivdata_011.txt", header=22)

In [6]:
data.tail()

Unnamed: 0,X_Value,Current A,Current B,VoltageA,Comment
1189795,1116.349583,1.145447,-4.85889,168.314348,
1189796,1116.349667,1.060539,-5.048661,168.945488,
1189797,1116.34975,0.954404,-5.196261,169.261058,
1189798,1116.349833,0.805815,-5.259518,169.639742,
1189799,1116.349917,0.657225,-5.175175,169.829084,


In [7]:
data.head()

Unnamed: 0,X_Value,Current A,Current B,VoltageA,Comment
0,1017.2,1.357717,-1.780378,26.591861,
1,1017.200083,1.463852,-2.012321,32.461463,
2,1017.200167,1.421398,-1.949064,37.699925,
3,1017.20025,1.378944,-2.054492,43.096172,
4,1017.200333,1.378944,-2.075578,48.429305,


In [25]:
pd.to_datetime('2011-10-20T12:15:29.823') + pd.to_timedelta(data['X_Value']*10e8)

0         2011-10-20 12:32:27.023000
1         2011-10-20 12:32:27.023083
2         2011-10-20 12:32:27.023167
3         2011-10-20 12:32:27.023250
4         2011-10-20 12:32:27.023333
5         2011-10-20 12:32:27.023417
6         2011-10-20 12:32:27.023500
7         2011-10-20 12:32:27.023583
8         2011-10-20 12:32:27.023667
9         2011-10-20 12:32:27.023750
10        2011-10-20 12:32:27.023833
11        2011-10-20 12:32:27.023917
12        2011-10-20 12:32:27.024000
13        2011-10-20 12:32:27.024083
14        2011-10-20 12:32:27.024167
15        2011-10-20 12:32:27.024250
16        2011-10-20 12:32:27.024333
17        2011-10-20 12:32:27.024417
18        2011-10-20 12:32:27.024500
19        2011-10-20 12:32:27.024583
20        2011-10-20 12:32:27.024667
21        2011-10-20 12:32:27.024750
22        2011-10-20 12:32:27.024833
23        2011-10-20 12:32:27.024917
24        2011-10-20 12:32:27.025000
25        2011-10-20 12:32:27.025083
26        2011-10-20 12:32:27.025167
2