# load dataset


Objective: load a dataset and run a pre-defined set of analyses.

Two options:
1. Load the entire dataset, calculate the absolute mean, and write/use it for modeling - see section Method 1
2. Load the aggregates - the absolute mean values for each sample (20480 events) - see section Method 2

This set of notebooks is based on:
1. NASA bearing dataset: http://data-acoustics.com/measurements/bearing-faults/bearing-4/
   Set number 2: 4 accelerometers, one on each bearing
2. Tutorial: https://towardsdatascience.com/machine-learning-for-anomaly-detection-and-condition-monitoring-d4614e7de770


Acknowledgement is made for the measurements used in this work, which were provided through the data-acoustics.com database.

Note: the dataset is 1 GB (of which only 85 MB is loaded); check whether someone else has already downloaded/loaded it!

In [1]:
import configparser
import logging
from ocs_sample_library_preview import *
import json
import pandas as pd
import os
import sys
from pathlib import Path
import datetime

# The root logger only reports CRITICAL messages; enable the DEBUG line below when troubleshooting.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
#logger.setLevel(logging.DEBUG)

# Connection settings are read from config.ini, resolved against the working directory.
config = configparser.ConfigParser()
config.read('config.ini')

# Create the OCS client; the [Access] options are passed first, followed by
# the [Credentials] options, all positionally and in the order listed here.
ocsClient = OCSClient(
    *(config.get('Access', option) for option in ('ApiVersion', 'Tenant', 'Resource')),
    *(config.get('Credentials', option) for option in ('ClientId', 'ClientSecret'))
)

# Namespace passed to every stream call in this notebook.
namespace_id = config.get('Configurations', 'Namespace')

In [2]:
# Print the last stored value of each of the 4 bearing streams; if they return values,
# check to see if the dataset has already been loaded.
ocsClient.acceptverbosity = True
for bearing_stream in ocsClient.Streams.getStreams(namespace_id, query='nasa.bearing*'):
    last_value = ocsClient.Streams.getLastValue(namespace_id, bearing_stream.Id)
    print(f'{bearing_stream.Id}: {last_value}')

NASA.Bearing1: None
NASA.Bearing4: None
NASA.Bearing2: None
NASA.Bearing3: None


# Method 1

Download the dataset from http://data-acoustics.com/measurements/bearing-faults/bearing-4/

Extract the contents of the 2nd_test.rar file into a folder, for example C:\data\IMS\2nd_test, and specify that folder in the variable data_dir in the next cell.

In [3]:
# Folder that holds the extracted 2nd_test files; point this at your local copy.
data_dir = Path("C:/data/IMS", "2nd_test")
# Only warn here; nothing is loaded in this cell.
if not data_dir.is_dir():
    print("Cannot find data directory")

In [4]:
# load the datafiles

# function to bulk load data by bearing
def write_to_sds(dataset):
    """Bulk-load the samples of one data file into the four bearing streams.

    For each bearing 1 through 4, every row of ``dataset`` becomes one event
    with the keys ``timestamp``, ``index`` and ``channel``; the events of a
    bearing are sent in a single ``insertValues`` call to the stream
    ``Nasa.bearing<N>``.

    Args:
        dataset: DataFrame whose index holds the sample timestamps and which
            has an ``index`` column plus the columns ``Bearing 1`` to
            ``Bearing 4``.

    Any exception raised while building or writing the events of a bearing is
    printed, and the remaining bearings are still attempted.
    """
    # for each bearing 1 through 4
    for bearing in range(1, 5):
        try:
            # tolist() yields plain Python numbers, which json.dumps can always
            # serialise; walking the columns also avoids building a Series for
            # every row as iterrows() does.
            readings = dataset[f'Bearing {bearing}'].tolist()
            # build a list of timestamp,index,bearing # channel data
            values = [
                {"timestamp": stamp.isoformat(), "index": int(position), "channel": reading}
                for stamp, position, reading in zip(dataset.index, dataset['index'], readings)
            ]
            # write it!
            ocsClient.Streams.insertValues(namespace_id, f'Nasa.bearing{bearing}', json.dumps(values))
        except Exception as e:
            # best effort: report the problem and carry on with the next bearing
            print(f'Error: {str(e)}')

# column names given to the four tab-separated channels of every data file
BEARING_COLUMNS = ['Bearing 1', 'Bearing 2', 'Bearing 3', 'Bearing 4']

# dataframe for loaded file
dataset = pd.DataFrame(columns=BEARING_COLUMNS)

# load datafiles in name order: the names are zero-padded timestamps
# (YYYY.MM.DD.HH.MM.SS), so sorting by name also sorts by time
for filename in sorted(os.listdir(data_dir)):
    #logging.debug(f'filename: {filename}')
    print(f'filename: {filename}')
    try:
        start_period = datetime.datetime.strptime(filename, "%Y.%m.%d.%H.%M.%S")
    except ValueError:
        # the name is not a timestamp, so this is not a measurement file;
        # skip it instead of aborting the whole load
        print(f'skipping {filename}: name is not a timestamp')
        continue
    logging.debug(f'filename: {filename}, start_period: {start_period}')
    dataset = pd.read_csv(os.path.join(data_dir, filename), sep='\t', header=None, names=BEARING_COLUMNS)
    # 1-based position of each sample within its file
    dataset['index'] = dataset.index + 1
    # one timestamp per sample, 50 microseconds apart, sized to the rows actually read
    dataset['timestamp'] = pd.date_range(start_period, periods=len(dataset), freq='50us')
    dataset.set_index('timestamp', inplace=True)
    write_to_sds(dataset)

filename: 2004.02.12.10.32.39
filename: 2004.02.12.10.42.39
filename: 2004.02.12.10.52.39
filename: 2004.02.12.11.02.39
filename: 2004.02.12.11.12.39
filename: 2004.02.12.11.22.39
filename: 2004.02.12.11.32.39
filename: 2004.02.12.11.42.39
filename: 2004.02.12.11.52.39
filename: 2004.02.12.12.02.39
filename: 2004.02.12.12.12.39
filename: 2004.02.12.12.22.39
filename: 2004.02.12.12.32.39
filename: 2004.02.12.12.42.39
filename: 2004.02.12.12.52.39
filename: 2004.02.12.13.02.39
filename: 2004.02.12.13.12.39
filename: 2004.02.12.13.22.39
filename: 2004.02.12.13.32.39
filename: 2004.02.12.13.42.39
filename: 2004.02.12.13.52.39
filename: 2004.02.12.14.02.39
filename: 2004.02.12.14.12.39
filename: 2004.02.12.14.22.39
filename: 2004.02.12.14.32.39
filename: 2004.02.12.14.42.39
filename: 2004.02.12.14.52.39
filename: 2004.02.12.15.02.39
filename: 2004.02.12.15.12.39
filename: 2004.02.12.15.22.39
filename: 2004.02.12.15.32.39
filename: 2004.02.12.15.42.39
filename: 2004.02.12.15.52.39
filename: 

In [5]:
# Print the last stored value of every stream matching 'nasa.bearing*'.
ocsClient.acceptverbosity = True
nasa_streams = ocsClient.Streams.getStreams(namespace_id, query='nasa.bearing*')
for stream in nasa_streams:
    print(f'{stream.Id}: {ocsClient.Streams.getLastValue(namespace_id, stream.Id)}')

NASA.Bearing1: {'timestamp': '2004-02-19T06:22:40.02395Z', 'index': 20480.0, 'channel': -0.002}
NASA.Bearing4: {'timestamp': '2004-02-19T06:22:40.02395Z', 'index': 20480.0, 'channel': -0.002}
NASA.Bearing2: {'timestamp': '2004-02-19T06:22:40.02395Z', 'index': 20480.0, 'channel': 0.0}
NASA.Bearing3: {'timestamp': '2004-02-19T06:22:40.02395Z', 'index': 20480.0, 'channel': 0.0}


In [81]:
# Retrieve mean values for each channel and sample
# 20kHz (20480) events were collected 984 times, one collection every 10 minutes
# this step retrieves the events of each collection and stores the mean of their absolute values

ocsClient.acceptverbosity=True

df_values = pd.DataFrame()  # not used in this cell; kept so the name stays defined
# leave the pandas float display format at None
pd.options.display.float_format = None

CYCLE_COUNT = 984                                # data collection cycles to aggregate
EVENTS_PER_CYCLE = 20480                         # events requested per bearing and cycle
CYCLE_INTERVAL = datetime.timedelta(minutes=10)  # time between two cycles
BEARINGS = range(1, 5)

# start of data collection
signal_datetime = datetime.datetime.strptime("2004-02-12T10:32:39", "%Y-%m-%dT%H:%M:%S")

# Rows are gathered in plain lists and turned into a DataFrame once at the end:
# DataFrame.append copied the whole frame on every call and was removed in pandas 2.0.
row_labels = []
rows = []
# for each data collection cycle
for _ in range(CYCLE_COUNT):
    #print(signal_datetime)
    signal_means = {}
    # Each bearing for a cycle
    for bearing in BEARINGS:
        values = ocsClient.Streams.getRangeValues(namespace_id, f'nasa.bearing{bearing}', start=signal_datetime, skip=0, count=EVENTS_PER_CYCLE, value_class=None, reverse=False, boundary_type=SdsBoundaryType.Exact)
        df_result = pd.DataFrame.from_dict(values)
        # the dtype is given by name because the import cell does not import numpy
        signal_means[f'bearing {bearing}'] = round(df_result['channel'].astype('float32').abs().mean(), 6)
    row_labels.append(signal_datetime.isoformat())
    rows.append(signal_means)
    signal_datetime += CYCLE_INTERVAL

# one row per cycle (labelled with the cycle start time), one column per bearing
merged_data = pd.DataFrame(rows, index=row_labels, columns=[f'bearing {bearing}' for bearing in BEARINGS], dtype='float64')
print(merged_data.shape)
print(merged_data.head())

(984, 4)
                     bearing 1  bearing 2  bearing 3  bearing 4
2004-02-12T10:32:39   0.058333   0.071832   0.083244   0.043066
2004-02-12T10:42:39   0.058997   0.074008   0.084439   0.044541
2004-02-12T10:52:39   0.060240   0.074224   0.083922   0.044443
2004-02-12T11:02:39   0.061454   0.073843   0.084462   0.045082
2004-02-12T11:12:39   0.061361   0.075607   0.082837   0.045119


In [181]:
# write the mean values to OCS

for bearing in range(1, 5):
    column = f'bearing {bearing}'
    # turn the index into a regular column and use the event field names
    events = merged_data[column].reset_index()
    events = events.rename(columns={'index': 'timestamp', column: 'channel'})
    # one JSON record per row, sent to the bearing's .agg stream
    payload = events.to_json(orient='records')
    ocsClient.Streams.updateValues(namespace_id, f'Nasa.bearing{bearing}.agg', payload)

In [38]:
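# write values to a csv file for folks who want to use Method 2 below
# Note: only run if the file does not exist
# (index_label names the index column 'timestamp', which is what Method 2 reads back with index_col='timestamp')
#merged_data.to_csv(path_or_buf='nasa.bearing.aggregates.csv', index_label='timestamp')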

# Method 2

Load the aggregate values from a CSV file into OCS.

In [None]:
# 
# read the aggregates from the csv file; its 'timestamp' column becomes the index
merged_data = pd.read_csv('nasa.bearing.aggregates.csv', index_col='timestamp')
# read values for each bearing and load into OCS
for bearing_no in range(1, 5):
    source_column = f'bearing {bearing_no}'
    field_names = {'index': 'timestamp', source_column: 'channel'}
    records = merged_data[source_column].reset_index().rename(columns=field_names)
    ocsClient.Streams.updateValues(
        namespace_id,
        f'Nasa.bearing{bearing_no}.agg',
        records.to_json(orient='records'),
    )

In [29]:
# display the (rows, columns) shape of merged_data
merged_data.shape

(984, 4)

In [37]:
# retrieve/check loaded data
import pprint as pprint
for i in range(1,5):
    stream_id = f'Nasa.bearing{i}.agg'
    # the timestamp of the first stored event is used as the start of the range
    first_event = ocsClient.Streams.getFirstValue(namespace_id, stream_id, None)
    signal_starttime = first_event['timestamp']
    # fetch 10 events from that start and pretty-print them
    leading_events = ocsClient.Streams.getRangeValues(
        namespace_id,
        stream_id,
        start=signal_starttime,
        count=10,
        value_class=None,
        skip=0,
        reverse=False,
        boundary_type=SdsBoundaryType.Inside,
    )
    pprint.pprint(leading_events)
    # retrieve summary information
    #signal_starttime = ocsClient.Streams.getFirstValue(namespace_id,f'Nasa.bearing{i}.agg',None)['timestamp']
    #signal_endtime = ocsClient.Streams.getLastValue(namespace_id,f'Nasa.bearing{i}.agg',None)['timestamp']
    #pprint.pprint(ocsClient.Streams.getSummaries(namespace_id, f'Nasa.bearing{i}.agg', value_class=None, start=signal_starttime, end=signal_endtime, count=1))

[{'channel': 0.058333, 'timestamp': '2004-02-12T10:32:39Z'},
 {'channel': 0.058997, 'timestamp': '2004-02-12T10:42:39Z'},
 {'channel': 0.06024, 'timestamp': '2004-02-12T10:52:39Z'},
 {'channel': 0.061454, 'timestamp': '2004-02-12T11:02:39Z'},
 {'channel': 0.061361, 'timestamp': '2004-02-12T11:12:39Z'},
 {'channel': 0.061669, 'timestamp': '2004-02-12T11:22:39Z'},
 {'channel': 0.061944, 'timestamp': '2004-02-12T11:32:39Z'},
 {'channel': 0.061232, 'timestamp': '2004-02-12T11:42:39Z'},
 {'channel': 0.062282, 'timestamp': '2004-02-12T11:52:39Z'},
 {'channel': 0.059893, 'timestamp': '2004-02-12T12:02:39Z'}]
[{'channel': 0.071832, 'timestamp': '2004-02-12T10:32:39Z'},
 {'channel': 0.074008, 'timestamp': '2004-02-12T10:42:39Z'},
 {'channel': 0.074224, 'timestamp': '2004-02-12T10:52:39Z'},
 {'channel': 0.073843, 'timestamp': '2004-02-12T11:02:39Z'},
 {'channel': 0.075607, 'timestamp': '2004-02-12T11:12:39Z'},
 {'channel': 0.073281, 'timestamp': '2004-02-12T11:22:39Z'},
 {'channel': 0.074593, 't