# explore dataset


This set of notebooks is based upon 
1. NASA bearing dataset: http://data-acoustics.com/measurements/bearing-faults/bearing-4/
   Set number 2: 4 accelerometers, one on each bearing
2. Tutorial: https://towardsdatascience.com/machine-learning-for-anomaly-detection-and-condition-monitoring-d4614e7de770

Acknowledgement is made for the measurements used in this work, which were provided through the data-acoustics.com database, and for the Python code, which comes from a tutorial by Vegard Flovik.

In [1]:
import configparser
import logging
from ocs_sample_library_preview import *
import json
import pandas as pd
import pprint

# Raise the root logger threshold to CRITICAL so lower-severity log records
# do not clutter the cell output.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

# Connection settings and credentials are read from config.ini rather than
# being written into the notebook.
config = configparser.ConfigParser()
config.read('config.ini')

# OCSClient receives the three [Access] options followed by the two
# [Credentials] options, in this order.
ocsClient = OCSClient(
    *(config.get('Access', option) for option in ('ApiVersion', 'Tenant', 'Resource')),
    *(config.get('Credentials', option) for option in ('ClientId', 'ClientSecret'))
)

# Namespace that every stream query in this notebook runs against.
namespace_id = config.get('Configurations', 'Namespace')
print(namespace_id)

kchudnamespace


In [48]:
# Determine the dataset start and end timestamps of each aggregate stream.
# They should all be the same, so the values left in signal_starttime and
# signal_endtime after the loop are reused by later cells.
for i in range(1,5):
    stream = f'Nasa.bearing{i}.agg'
    # First and last stored events mark the boundaries of the stream's data.
    signal_starttime = ocsClient.Streams.getFirstValue(namespace_id,stream,None)['timestamp']
    signal_endtime = ocsClient.Streams.getLastValue(namespace_id,stream,None)['timestamp']
    print(f'{stream},{signal_starttime},{signal_endtime}')

Nasa.bearing1.agg,2004-02-12T10:32:39Z,2004-02-19T06:22:39Z
Nasa.bearing2.agg,2004-02-12T10:32:39Z,2004-02-19T06:22:39Z
Nasa.bearing3.agg,2004-02-12T10:32:39Z,2004-02-19T06:22:39Z
Nasa.bearing4.agg,2004-02-12T10:32:39Z,2004-02-19T06:22:39Z


In [49]:
# Get the first 10 values from each of the aggregate streams to explore what the data looks like.
# pprint is already imported in the first cell, so it is not re-imported here.
for i in range(1,5):
    stream_id = f'Nasa.bearing{i}.agg'
    # Start reading at the stream's first stored event.
    signal_starttime = ocsClient.Streams.getFirstValue(namespace_id,stream_id,None)['timestamp']
    first_values = ocsClient.Streams.getRangeValues(
        namespace_id,
        stream_id,
        start=signal_starttime,
        count=10,
        value_class=None,
        skip=0,
        reverse=False,
        boundary_type=SdsBoundaryType.Inside,
    )
    pprint.pprint(first_values)

[{'channel': 0.058333, 'timestamp': '2004-02-12T10:32:39Z'},
 {'channel': 0.058997, 'timestamp': '2004-02-12T10:42:39Z'},
 {'channel': 0.06024, 'timestamp': '2004-02-12T10:52:39Z'},
 {'channel': 0.061454, 'timestamp': '2004-02-12T11:02:39Z'},
 {'channel': 0.061361, 'timestamp': '2004-02-12T11:12:39Z'},
 {'channel': 0.061669, 'timestamp': '2004-02-12T11:22:39Z'},
 {'channel': 0.061944, 'timestamp': '2004-02-12T11:32:39Z'},
 {'channel': 0.061232, 'timestamp': '2004-02-12T11:42:39Z'},
 {'channel': 0.062282, 'timestamp': '2004-02-12T11:52:39Z'},
 {'channel': 0.059893, 'timestamp': '2004-02-12T12:02:39Z'}]
[{'channel': 0.071832, 'timestamp': '2004-02-12T10:32:39Z'},
 {'channel': 0.074008, 'timestamp': '2004-02-12T10:42:39Z'},
 {'channel': 0.074224, 'timestamp': '2004-02-12T10:52:39Z'},
 {'channel': 0.073843, 'timestamp': '2004-02-12T11:02:39Z'},
 {'channel': 0.075607, 'timestamp': '2004-02-12T11:12:39Z'},
 {'channel': 0.073281, 'timestamp': '2004-02-12T11:22:39Z'},
 {'channel': 0.074593, 't

In [50]:
# define a function to format the getSummaries output into a DataFrame based upon a stream query to help understand the data
def sds_summaries_format(query,start=None,end=None,property=None):
    """Format getSummaries output for every stream matching a query as a DataFrame.

    Parameters
    ----------
    query : str
        Stream search query passed to ``ocsClient.Streams.getStreams``.
    start, end
        Bounds of the summary window, passed through unchanged to
        ``ocsClient.Streams.getSummaries``. Both must be supplied.
    property : str
        Name of the stream property whose value is read from each summary
        statistic (for example ``"channel"``). The name shadows the builtin
        but is kept so existing keyword calls continue to work.

    Returns
    -------
    pandas.DataFrame or None
        One row per stream, labelled with the stream's ``Name``, and one
        column per summary statistic. ``None`` is returned (after printing a
        message) when ``start`` or ``end`` is missing.

    Notes
    -----
    Uses the notebook-level ``ocsClient`` and ``namespace_id``. An error while
    processing one stream is printed and the remaining streams are still
    processed.
    """
    # Identity check: `== None` would defer to a custom __eq__ on the bounds.
    if start is None or end is None:
        print("not implemented, specify start and end parameters")
        return None

    df_summaries = pd.DataFrame(columns=['Count', 'Minimum', 'Maximum', 'Range', 'Total', 'Mean', 'StandardDeviation', 'PopulationStandardDeviation',
                               'WeightedMean', 'WeightedStandardDeviation', 'WeightedPopulationStandardDeviation', 'Skewness', 'Kurtosis'])

    # Key used to pick this property's value out of each statistic's mapping.
    property_key = f'{property}'

    for stream in ocsClient.Streams.getStreams(namespace_id,query=query):
        try:
            # count=1 requests a single summary interval covering start..end.
            summary = ocsClient.Streams.getSummaries(namespace_id,stream.Id,start=start,end=end,count=1,
                                                     value_class=None)
            for key,value in summary[0]['Summaries'].items():
                df_summaries.loc[stream.Name,key] = value[property_key]
        except Exception as e:
            # Best effort: report the failing stream and carry on with the rest.
            print(f'getSummaries error: {str(e)}')

    return df_summaries

# Summarise the 'channel' property of every aggregate bearing stream over the
# full signal window, then order the rows by stream name for display.
df = sds_summaries_format(
    "nasa.bearing*.agg",
    start=signal_starttime,
    end=signal_endtime,
    property="channel",
)
df = df.sort_index()
df

Unnamed: 0,Count,Minimum,Maximum,Range,Total,Mean,StandardDeviation,PopulationStandardDeviation,WeightedMean,WeightedStandardDeviation,WeightedPopulationStandardDeviation,Skewness,Kurtosis
bearing1 aggregate,983,0.001169,0.45332,0.452151,79.5811,0.0809573,0.0394991,0.039479,0.0809573,0.0394991,0.039479,4.02395,22.1821
bearing2 aggregate,983,0.000767,0.161011,0.160244,77.239,0.0785748,0.0112868,0.0112811,0.0785748,0.0112868,0.0112811,3.41145,17.9829
bearing3 aggregate,983,0.000716,0.151295,0.150579,80.0123,0.0813961,0.0111132,0.0111075,0.0813961,0.0111132,0.0111075,2.90515,13.8691
bearing4 aggregate,983,0.001699,0.119042,0.117343,47.0356,0.0478491,0.00931572,0.00931098,0.0478491,0.00931572,0.00931098,3.91832,20.2148


In [51]:
# create a dataframe of the aggregate dataset that could be used for later analysis work
# (one column per bearing, indexed by timestamp)

ocsClient.acceptverbosity=True
dfagg = None
for bearing in range(1,5):
    values = ocsClient.Streams.getRangeValues(namespace_id,f'nasa.bearing{bearing}.agg',start= signal_starttime,skip=0,count=985,value_class=None,reverse=False,boundary_type=SdsBoundaryType.Exact)
    # Index by timestamp and name the value column after its bearing so the
    # per-bearing frames can be combined side by side.
    df_temp = (
        pd.DataFrame(values)
        .set_index('timestamp')
        .rename(columns={'channel': f'bearing {bearing}'})
    )
    if dfagg is None:
        # DataFrame.append was removed in pandas 2.0; the first bearing's frame
        # simply seeds the result.
        dfagg = df_temp
    else:
        # Inner join on the shared 'timestamp' index level. Seeding with None
        # (rather than testing .empty) keeps an empty join result empty instead
        # of silently restarting from the next bearing.
        dfagg = dfagg.merge(df_temp,on='timestamp')

In [52]:
# Preview the first rows of the combined frame: one 'bearing N' column per
# stream, indexed by timestamp.
dfagg.head()

Unnamed: 0_level_0,bearing 1,bearing 2,bearing 3,bearing 4
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-02-12T10:32:39Z,0.058333,0.071832,0.083244,0.043066
2004-02-12T10:42:39Z,0.058997,0.074008,0.084439,0.044541
2004-02-12T10:52:39Z,0.06024,0.074224,0.083922,0.044443
2004-02-12T11:02:39Z,0.061454,0.073843,0.084462,0.045082
2004-02-12T11:12:39Z,0.061361,0.075607,0.082837,0.045119
