In [7]:
# [1] Common functions and discrete statistics

from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
from IPython.display import display
import math

# Plotly template name; the plotting helpers in this cell pass it as `template=`.
theme = 'gridon'

def convertTimeFloat(time):
    """Turn a Unix epoch value (e.g. 1666427168) into a pandas timestamp.

    The epoch is decoded by ``datetime.fromtimestamp`` with ``tz=None``, so the
    result is a naive timestamp in the local time zone of the running machine.
    """
    localMoment = datetime.fromtimestamp(time, tz=None)
    return pd.to_datetime(localMoment, format='%Y-%m-%d %H:%M:%S')

def convertTimeShortString(time):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` string (e.g. 2022-10-21 13:24:12).

    Returns the value as a pandas timestamp; ``datetime.strptime`` raises
    ``ValueError`` when the text does not match that layout.
    """
    parsed = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
    return pd.to_datetime(parsed)

def convertTimeLongString(time):
    """Parse an ISO-like timestamp such as 2022-10-21T13:23:56.7964196Z.

    The fractional seconds (everything after the first '.') are discarded, so
    the result has whole-second precision. A trailing 'Z' is stripped as well,
    which lets timestamps without a fractional part (2022-10-21T13:23:56Z)
    parse instead of failing in ``strptime``.

    Returns a naive pandas timestamp; no time-zone conversion is applied.
    """
    wholeSeconds = time.split('.')[0].rstrip('Z')
    return pd.to_datetime(datetime.strptime(wholeSeconds, '%Y-%m-%dT%H:%M:%S'))

def dataFrameFromParserResultType1(folder, file):
    """Load a parsed-log CSV whose ``Time`` column is converted with convertTimeFloat.

    The file at ``folder + file`` must provide the columns ``Time``, ``EventId``
    and ``ParameterList``. The returned frame has ``TimeStamp``, ``Event``,
    ``Parameter0`` (a copy of the event id), ``Parameter1``..``Parameter6``,
    a constant ``Value`` of 1 and a running ``Row`` number.
    """
    parsed = pd.read_csv(folder + file, delimiter=',')
    parameterLists = parsed['ParameterList']

    frame = pd.DataFrame()
    frame['TimeStamp'] = parsed['Time'].apply(convertTimeFloat)
    frame['Event'] = parsed['EventId']
    frame['Parameter0'] = parsed['EventId']
    # Parameter positions are 1-based; six of them are kept.
    for position in range(1, 7):
        frame['Parameter' + str(position)] = extractParameters(position, parameterLists)
    frame['Value'] = 1
    frame['Row'] = np.arange(len(frame))
    return frame

def dataFrameFromParserResultType2(folder, file):
    """Load a parsed-log CSV whose ``Time`` column is converted with convertTimeLongString.

    The file at ``folder + file`` must provide the columns ``Time``, ``EventId``
    and ``ParameterList``. The returned frame has ``TimeStamp``, ``Event``,
    ``Parameter0`` (a copy of the event id), ``Parameter1``..``Parameter6``,
    a constant ``Value`` of 1 and a running ``Row`` number.
    """
    parsed = pd.read_csv(folder + file, delimiter=',')
    parameterLists = parsed['ParameterList']

    frame = pd.DataFrame()
    frame['TimeStamp'] = parsed['Time'].apply(convertTimeLongString)
    frame['Event'] = parsed['EventId']
    frame['Parameter0'] = parsed['EventId']
    # Parameter positions are 1-based; six of them are kept.
    for position in range(1, 7):
        frame['Parameter' + str(position)] = extractParameters(position, parameterLists)
    frame['Value'] = 1
    frame['Row'] = np.arange(len(frame))
    return frame

def dataFrameFromParserResultType3(folder, file):
    """Load a parsed-log CSV whose ``Time`` column is converted with convertTimeShortString.

    The file at ``folder + file`` must provide the columns ``Time``, ``EventId``
    and ``ParameterList``. The returned frame has ``TimeStamp``, ``Event``,
    ``Parameter0`` (a copy of the event id), ``Parameter1``..``Parameter6``,
    a constant ``Value`` of 1 and a running ``Row`` number.
    """
    parsed = pd.read_csv(folder + file, delimiter=',')
    parameterLists = parsed['ParameterList']

    frame = pd.DataFrame()
    frame['TimeStamp'] = parsed['Time'].apply(convertTimeShortString)
    frame['Event'] = parsed['EventId']
    frame['Parameter0'] = parsed['EventId']
    # Parameter positions are 1-based; six of them are kept.
    for position in range(1, 7):
        frame['Parameter' + str(position)] = extractParameters(position, parameterLists)
    frame['Value'] = 1
    frame['Row'] = np.arange(len(frame))
    return frame

def extractParameters(index, column):
    """Apply extractParameter to every entry of ``column`` for the 1-based ``index``."""
    return column.apply(lambda rawList: extractParameter(rawList, index))

def extractParameter(parameter, index):
    """Return the ``index``-th (1-based) item of a stringified parameter list.

    ``parameter`` is text such as ``"['a', 'b']"``. It is split on commas, each
    piece is stripped of surrounding whitespace, and the characters
    ``[ ] " '`` are removed from it.

    Returns '' when ``index`` is past the end of the list, when ``index`` is
    smaller than 1 (which previously picked an element counted from the end),
    or when ``parameter`` is not a string (e.g. NaN for an empty CSV cell).
    """
    if not isinstance(parameter, str) or index < 1:
        return ''

    # NOTE(review): a plain comma split also cuts parameters that themselves
    # contain a comma; kept as-is because callers rely on the current layout.
    unwanted = str.maketrans('', '', '[]"' + "'")
    cleaned = [piece.strip().translate(unwanted) for piece in parameter.split(",")]

    if index > len(cleaned):
        return ''
    return cleaned[index - 1]

def mountAggregation(data):
    """Build the per-column aggregation mapping for one time window.

    Every label yielded by ``list(data)`` is mapped to 'sum'; the 'TimeStamp'
    entry is then set to 'min' (and added if it was not present).
    """
    plan = {column: 'sum' for column in list(data)}
    plan['TimeStamp'] = 'min'
    return plan

def convertToVector(data):
    """Collapse one time window (two-level columns) into a single aggregated row.

    The 'Event', 'Row' and 'TimeStamp' groups are removed from the outer column
    level, that level is dropped, and the index is turned back into a column
    before aggregating with the mapping from mountAggregation.
    """
    trimmed = data
    for label in ('Event', 'Row', 'TimeStamp'):
        trimmed = trimmed.drop(label, axis=1, level=0)
    flattened = trimmed.droplevel(level=0, axis=1).reset_index()
    return flattened.agg(mountAggregation(flattened))

def convertToVectorComplete(data):
    """Collapse one time window (flat columns) into a single aggregated row.

    The 'Event', 'Row' and 'TimeStamp' columns are removed and the index is
    turned back into a column before aggregating with the mapping from
    mountAggregation.
    """
    trimmed = data
    for label in ('Event', 'Row', 'TimeStamp'):
        trimmed = trimmed.drop(label, axis=1)
    flattened = trimmed.reset_index()
    return flattened.agg(mountAggregation(flattened))

def segmentTimeWindows(dataFrame, segment):
    """Split a datetime-indexed frame into consecutive windows of length ``segment``.

    Parameters:
        dataFrame: frame indexed by a DatetimeIndex that also carries a
            'TimeStamp' column.
        segment: resampling rule passed to ``DataFrame.resample``.

    Returns:
        A list of row slices of ``dataFrame``, one per window, with the last
        window left out (presumably because it is usually incomplete - TODO
        confirm). An empty frame yields an empty list.

    The windows are anchored on the first sample: the bins are first computed
    with the default alignment, and the distance between the first sample and
    the first bin label is then used as the resampling ``offset``.
    """
    if len(dataFrame) == 0:
        return []

    # First pass, only used to learn where the default bins would start.
    defaultBins = dataFrame.resample(segment).groups
    firstBinLabel = next(iter(defaultBins))

    # .iloc makes the positional access explicit; an integer key on a
    # datetime-indexed Series relies on deprecated positional fallback.
    firstSample = dataFrame['TimeStamp'].iloc[0]
    shift = firstSample - firstBinLabel

    # Second pass: each value of `groups` is used as the end row of its bin.
    shiftedBins = dataFrame.resample(segment, offset=shift).groups

    windows = []
    start = 0
    for end in shiftedBins.values():
        windows.append(dataFrame.iloc[start:end])
        start = end
    return windows[:-1]

def transposeData(data, timewindow):
    """Aggregate events per time window, one output row per window.

    ``data`` is pivoted so that each distinct ``Parameter0`` value becomes a
    column of ``Value``; the result is indexed by time stamp, cut into windows
    by segmentTimeWindows, and each window is reduced with convertToVector.
    """
    wide = pd.pivot(
        data,
        index=['Row', 'TimeStamp', 'Event'],
        columns=['Parameter0'],
        values=['Value'],
    ).reset_index()
    wide = wide.set_index(pd.DatetimeIndex(wide['TimeStamp']))

    perWindow = [convertToVector(window) for window in segmentTimeWindows(wide, timewindow)]

    # Each vector is a column after concat; transpose to get one row per window.
    return pd.concat(perWindow, axis=1).transpose()

def transposeDataComplete(data, timewindow):
    """Aggregate event/parameter combinations per time window, one row per window.

    ``data`` is pivoted on ``Parameter0``..``Parameter6`` so that each distinct
    combination becomes a column of ``Value``; the column labels are flattened
    to tuples, the frame is indexed by time stamp, cut into windows by
    segmentTimeWindows, and each window is reduced with convertToVectorComplete.
    """
    parameterColumns = ['Parameter0','Parameter1','Parameter2','Parameter3','Parameter4','Parameter5','Parameter6']
    wide = pd.pivot(
        data,
        index=['Row', 'TimeStamp', 'Event'],
        columns=parameterColumns,
        values=['Value'],
    )
    wide.columns = wide.columns.to_flat_index()
    wide = wide.reset_index()
    wide = wide.set_index(pd.DatetimeIndex(wide['TimeStamp']))

    perWindow = [convertToVectorComplete(window) for window in segmentTimeWindows(wide, timewindow)]

    # Each vector is a column after concat; transpose to get one row per window.
    return pd.concat(perWindow, axis=1).transpose()

def plotLineEvents(name, data, timewindow):
    """Show a line chart of the per-window event sums, one line per event column.

    ``name`` becomes the figure title; the x axis is the 'TimeStamp' column of
    the frame returned by transposeData.
    """
    grouped = transposeData(data, timewindow)
    seriesNames = grouped.columns.drop('TimeStamp')
    chart = px.line(grouped, x='TimeStamp', y=seriesNames, title=name, template=theme)
    chart.show()

def plotBoxPlot(name, data, timeWindow):
    """Show one box plot per event column of the per-window event sums.

    The first column of the frame returned by transposeData is skipped (the
    code treats it as the time stamp column); every other column gets its own
    box, coloured with hues evenly spaced between 0 and 360.
    """
    grouped = transposeData(data, timeWindow)
    seriesCount = len(grouped.columns) - 1  # all columns except the first one
    palette = ['hsl('+str(hue)+',50%'+',50%)' for hue in np.linspace(0, 360, seriesCount)]

    boxes = []
    for position in range(int(seriesCount)):
        columnPosition = position + 1
        boxes.append(go.Box(
            y=grouped.iloc[:, columnPosition],
            marker_color=palette[position],
            name=grouped.columns[columnPosition],
            boxpoints='suspectedoutliers',
        ))

    fig = go.Figure(data=boxes)
    fig.update_layout(
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=True),
        yaxis=dict(zeroline=False, gridcolor='white'),
        title=name,
        template=theme,
    )
    fig.show()

def plotHeapMap(name, data, timeWindow, max = None):
    """Show the per-window event sums as an image (events on rows, windows on columns).

    When ``max`` is given, the colour range is fixed to ``[0, max]``; otherwise
    plotly chooses the range.
    """
    grouped = transposeData(data, timeWindow)
    counts = grouped.drop(['TimeStamp'], axis=1)

    options = dict(color_continuous_scale='solar', template=theme)
    if max is not None:
        options['range_color'] = [0, max]

    fig = px.imshow(counts.T, **options)
    fig.update_layout(title=name)
    fig.show()


In [8]:
# [2] Common functions and discrete statistics

from datetime import datetime
from IPython.display import display
import math

# Plotly template name; the plotting helpers in this cell pass it as `template=`.
theme = 'gridon'

def convertTimeFloat(time):
    """Turn a Unix epoch value (e.g. 1666427168) into a pandas timestamp.

    The epoch is decoded by ``datetime.fromtimestamp`` with ``tz=None``, so the
    result is a naive timestamp in the local time zone of the running machine.
    """
    localMoment = datetime.fromtimestamp(time, tz=None)
    return pd.to_datetime(localMoment, format='%Y-%m-%d %H:%M:%S')

def convertTimeShortString(time):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` string (e.g. 2022-10-21 13:24:12).

    Returns the value as a pandas timestamp; ``datetime.strptime`` raises
    ``ValueError`` when the text does not match that layout.
    """
    parsed = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
    return pd.to_datetime(parsed)

def convertTimeLongString(time):
    """Parse an ISO-like timestamp such as 2022-10-21T13:23:56.7964196Z.

    The fractional seconds (everything after the first '.') are discarded, so
    the result has whole-second precision. A trailing 'Z' is stripped as well,
    which lets timestamps without a fractional part (2022-10-21T13:23:56Z)
    parse instead of failing in ``strptime``.

    Returns a naive pandas timestamp; no time-zone conversion is applied.
    """
    wholeSeconds = time.split('.')[0].rstrip('Z')
    return pd.to_datetime(datetime.strptime(wholeSeconds, '%Y-%m-%dT%H:%M:%S'))

def dataFrameFromParserResultType1(folder, file):
    """Load a parsed-log CSV whose ``Time`` column is converted with convertTimeFloat.

    The file at ``folder + file`` must provide the columns ``Time``, ``EventId``
    and ``ParameterList``. The returned frame has ``TimeStamp``, ``Event``,
    ``Parameter0`` (a copy of the event id), ``Parameter1``..``Parameter6``,
    a constant ``Value`` of 1 and a running ``Row`` number.
    """
    parsed = pd.read_csv(folder + file, delimiter=',')
    parameterLists = parsed['ParameterList']

    frame = pd.DataFrame()
    frame['TimeStamp'] = parsed['Time'].apply(convertTimeFloat)
    frame['Event'] = parsed['EventId']
    frame['Parameter0'] = parsed['EventId']
    # Parameter positions are 1-based; six of them are kept.
    for position in range(1, 7):
        frame['Parameter' + str(position)] = extractParameters(position, parameterLists)
    frame['Value'] = 1
    frame['Row'] = np.arange(len(frame))
    return frame

def dataFrameFromParserResultType2(folder, file):
    """Load a parsed-log CSV whose ``Time`` column is converted with convertTimeLongString.

    The file at ``folder + file`` must provide the columns ``Time``, ``EventId``
    and ``ParameterList``. The returned frame has ``TimeStamp``, ``Event``,
    ``Parameter0`` (a copy of the event id), ``Parameter1``..``Parameter6``,
    a constant ``Value`` of 1 and a running ``Row`` number.
    """
    parsed = pd.read_csv(folder + file, delimiter=',')
    parameterLists = parsed['ParameterList']

    frame = pd.DataFrame()
    frame['TimeStamp'] = parsed['Time'].apply(convertTimeLongString)
    frame['Event'] = parsed['EventId']
    frame['Parameter0'] = parsed['EventId']
    # Parameter positions are 1-based; six of them are kept.
    for position in range(1, 7):
        frame['Parameter' + str(position)] = extractParameters(position, parameterLists)
    frame['Value'] = 1
    frame['Row'] = np.arange(len(frame))
    return frame

def dataFrameFromParserResultType3(folder, file):
    """Load a parsed-log CSV whose ``Time`` column is converted with convertTimeShortString.

    The file at ``folder + file`` must provide the columns ``Time``, ``EventId``
    and ``ParameterList``. The returned frame has ``TimeStamp``, ``Event``,
    ``Parameter0`` (a copy of the event id), ``Parameter1``..``Parameter6``,
    a constant ``Value`` of 1 and a running ``Row`` number.
    """
    parsed = pd.read_csv(folder + file, delimiter=',')
    parameterLists = parsed['ParameterList']

    frame = pd.DataFrame()
    frame['TimeStamp'] = parsed['Time'].apply(convertTimeShortString)
    frame['Event'] = parsed['EventId']
    frame['Parameter0'] = parsed['EventId']
    # Parameter positions are 1-based; six of them are kept.
    for position in range(1, 7):
        frame['Parameter' + str(position)] = extractParameters(position, parameterLists)
    frame['Value'] = 1
    frame['Row'] = np.arange(len(frame))
    return frame

def extractParameters(index, column):
    """Apply extractParameter to every entry of ``column`` for the 1-based ``index``."""
    return column.apply(lambda rawList: extractParameter(rawList, index))

def extractParameter(parameter, index):
    """Return the ``index``-th (1-based) item of a stringified parameter list.

    ``parameter`` is text such as ``"['a', 'b']"``. It is split on commas, each
    piece is stripped of surrounding whitespace, and the characters
    ``[ ] " '`` are removed from it.

    Returns '' when ``index`` is past the end of the list, when ``index`` is
    smaller than 1 (which previously picked an element counted from the end),
    or when ``parameter`` is not a string (e.g. NaN for an empty CSV cell).
    """
    if not isinstance(parameter, str) or index < 1:
        return ''

    # NOTE(review): a plain comma split also cuts parameters that themselves
    # contain a comma; kept as-is because callers rely on the current layout.
    unwanted = str.maketrans('', '', '[]"' + "'")
    cleaned = [piece.strip().translate(unwanted) for piece in parameter.split(",")]

    if index > len(cleaned):
        return ''
    return cleaned[index - 1]

def mountAggregation(data):
    """Build the per-column aggregation mapping for one time window.

    Every label yielded by ``list(data)`` is mapped to 'sum'; the 'TimeStamp'
    entry is then set to 'min' (and added if it was not present).
    """
    plan = {column: 'sum' for column in list(data)}
    plan['TimeStamp'] = 'min'
    return plan

def convertToVector(data):
    """Collapse one time window (two-level columns) into a single aggregated row.

    The 'Event', 'Row' and 'TimeStamp' groups are removed from the outer column
    level, that level is dropped, and the index is turned back into a column
    before aggregating with the mapping from mountAggregation.
    """
    trimmed = data
    for label in ('Event', 'Row', 'TimeStamp'):
        trimmed = trimmed.drop(label, axis=1, level=0)
    flattened = trimmed.droplevel(level=0, axis=1).reset_index()
    return flattened.agg(mountAggregation(flattened))

def convertToVectorComplete(data):
    """Collapse one time window (flat columns) into a single aggregated row.

    The 'Event', 'Row' and 'TimeStamp' columns are removed and the index is
    turned back into a column before aggregating with the mapping from
    mountAggregation.
    """
    trimmed = data
    for label in ('Event', 'Row', 'TimeStamp'):
        trimmed = trimmed.drop(label, axis=1)
    flattened = trimmed.reset_index()
    return flattened.agg(mountAggregation(flattened))

def segmentTimeWindows(dataFrame, segment):
    """Split a datetime-indexed frame into consecutive windows of length ``segment``.

    Parameters:
        dataFrame: frame indexed by a DatetimeIndex that also carries a
            'TimeStamp' column.
        segment: resampling rule passed to ``DataFrame.resample``.

    Returns:
        A list of row slices of ``dataFrame``, one per window, with the last
        window left out (presumably because it is usually incomplete - TODO
        confirm). An empty frame yields an empty list.

    The windows are anchored on the first sample: the bins are first computed
    with the default alignment, and the distance between the first sample and
    the first bin label is then used as the resampling ``offset``.
    """
    if len(dataFrame) == 0:
        return []

    # First pass, only used to learn where the default bins would start.
    defaultBins = dataFrame.resample(segment).groups
    firstBinLabel = next(iter(defaultBins))

    # .iloc makes the positional access explicit; an integer key on a
    # datetime-indexed Series relies on deprecated positional fallback.
    firstSample = dataFrame['TimeStamp'].iloc[0]
    shift = firstSample - firstBinLabel

    # Second pass: each value of `groups` is used as the end row of its bin.
    shiftedBins = dataFrame.resample(segment, offset=shift).groups

    windows = []
    start = 0
    for end in shiftedBins.values():
        windows.append(dataFrame.iloc[start:end])
        start = end
    return windows[:-1]

def transposeData(data, timewindow):
    """Aggregate events per time window, one output row per window.

    ``data`` is pivoted so that each distinct ``Parameter0`` value becomes a
    column of ``Value``; the result is indexed by time stamp, cut into windows
    by segmentTimeWindows, and each window is reduced with convertToVector.
    """
    wide = pd.pivot(
        data,
        index=['Row', 'TimeStamp', 'Event'],
        columns=['Parameter0'],
        values=['Value'],
    ).reset_index()
    wide = wide.set_index(pd.DatetimeIndex(wide['TimeStamp']))

    perWindow = [convertToVector(window) for window in segmentTimeWindows(wide, timewindow)]

    # Each vector is a column after concat; transpose to get one row per window.
    return pd.concat(perWindow, axis=1).transpose()

def transposeDataComplete(data, timewindow):
    """Aggregate event/parameter combinations per time window, one row per window.

    ``data`` is pivoted on ``Parameter0``..``Parameter6`` so that each distinct
    combination becomes a column of ``Value``; the column labels are flattened
    to tuples, the frame is indexed by time stamp, cut into windows by
    segmentTimeWindows, and each window is reduced with convertToVectorComplete.
    """
    parameterColumns = ['Parameter0','Parameter1','Parameter2','Parameter3','Parameter4','Parameter5','Parameter6']
    wide = pd.pivot(
        data,
        index=['Row', 'TimeStamp', 'Event'],
        columns=parameterColumns,
        values=['Value'],
    )
    wide.columns = wide.columns.to_flat_index()
    wide = wide.reset_index()
    wide = wide.set_index(pd.DatetimeIndex(wide['TimeStamp']))

    perWindow = [convertToVectorComplete(window) for window in segmentTimeWindows(wide, timewindow)]

    # Each vector is a column after concat; transpose to get one row per window.
    return pd.concat(perWindow, axis=1).transpose()

def getTimeWindowList(data, timewindow):
    """Return the list of time windows of ``data`` without aggregating them.

    ``data`` is pivoted on ``Parameter0``..``Parameter6``, the column labels
    are flattened to tuples, the frame is indexed by time stamp, and the
    result of segmentTimeWindows is returned as-is.
    """
    parameterColumns = ['Parameter0','Parameter1','Parameter2','Parameter3','Parameter4','Parameter5','Parameter6']
    wide = pd.pivot(
        data,
        index=['Row', 'TimeStamp', 'Event'],
        columns=parameterColumns,
        values=['Value'],
    )
    wide.columns = wide.columns.to_flat_index()
    wide = wide.reset_index()
    timeIndexed = wide.set_index(pd.DatetimeIndex(wide['TimeStamp']))
    return segmentTimeWindows(timeIndexed, timewindow)

def plotLineEvents(name, data, timewindow):
    """Show a line chart of the per-window event sums, one line per event column.

    ``name`` becomes the figure title; the x axis is the 'TimeStamp' column of
    the frame returned by transposeData.
    """
    grouped = transposeData(data, timewindow)
    seriesNames = grouped.columns.drop('TimeStamp')
    chart = px.line(grouped, x='TimeStamp', y=seriesNames, title=name, template=theme)
    chart.show()

def plotBoxPlot(name, data, timeWindow):
    """Show one box plot per event column of the per-window event sums.

    The first column of the frame returned by transposeData is skipped (the
    code treats it as the time stamp column); every other column gets its own
    box, coloured with hues evenly spaced between 0 and 360.
    """
    grouped = transposeData(data, timeWindow)
    seriesCount = len(grouped.columns) - 1  # all columns except the first one
    palette = ['hsl('+str(hue)+',50%'+',50%)' for hue in np.linspace(0, 360, seriesCount)]

    boxes = []
    for position in range(int(seriesCount)):
        columnPosition = position + 1
        boxes.append(go.Box(
            y=grouped.iloc[:, columnPosition],
            marker_color=palette[position],
            name=grouped.columns[columnPosition],
            boxpoints='suspectedoutliers',
        ))

    fig = go.Figure(data=boxes)
    fig.update_layout(
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=True),
        yaxis=dict(zeroline=False, gridcolor='white'),
        title=name,
        template=theme,
    )
    fig.show()

def plotHeapMap(name, data, timeWindow, max = None):
    """Show the per-window event sums as an image (events on rows, windows on columns).

    When ``max`` is given, the colour range is fixed to ``[0, max]``; otherwise
    plotly chooses the range.
    """
    grouped = transposeData(data, timeWindow)
    counts = grouped.drop(['TimeStamp'], axis=1)

    options = dict(color_continuous_scale='solar', template=theme)
    if max is not None:
        options['range_color'] = [0, max]

    fig = px.imshow(counts.T, **options)
    fig.update_layout(title=name)
    fig.show()


In [9]:
# Loading data

# Folder holding the IPLoM-parsed structured CSV files.
IPLoMFolder = '/home/javarotti/Data/ParsedData/IPLoM/'

IPLoMFile1 = 'abb-edge-17-broker-2.log_structured.csv'
IPLoMFile2 = 'abb-edge-18-broker-1.log_structured.csv'
IPLoMFile3 = 'abb-edge-18-broker-2.log_structured.csv'
IPLoMFile4 = 'abb-edge-19-broker-1.log_structured.csv'
IPLoMFile5 = 'abb-edge-19-broker-2.log_structured.csv'
IPLoMFile6 = 'abb-edge-19-broker-3.log_structured.csv'
IPLoMFile7 = 'abb-edge-19-broker-5.log_structured.csv'
IPLoMFile8 = 'abb-edge-20-broker-3.log_structured.csv'
IPLoMFile9 = 'step2-abb-edge-18-proxy-1.log_structured.csv'
IPLoMFile10 = 'step2-abb-edge-18-proxy-2.log_structured.csv'
IPLoMFile11 = 'step2-abb-edge-19-proxy-1.log_structured.csv'
IPLoMFile12 = 'step2-abb-edge-19-proxy-2.log_structured.csv'
IPLoMFile13 = 'step2-abb-edge-19-proxy-3.log_structured.csv'
IPLoMFile14 = 'step2-abb-edge-19-proxy-5.log_structured.csv'
IPLoMFile15 = 'step1-abb-edge-18-csconnect-1.log_structured.csv'
IPLoMFile16 = 'step1-abb-edge-19-csconnect-1.log_structured.csv'
IPLoMFile17 = 'step1-abb-edge-18-csconnectrouter-1.log_structured.csv'
IPLoMFile18 = 'step1-abb-edge-19-csconnectrouter-1.log_structured.csv'
IPLoMFile19 = 'step1-abb-edge-19-csconnectsimulator-1.log_structured.csv'
IPLoMFile20 = 'step1-abb-edge-kuber-01-csconnect-1.log_structured.csv'

# One (loader, file, display name) entry per log. The loader is chosen by the
# time format of the file: Type1 = epoch number, Type2 = long ISO string,
# Type3 = short date-time string.
# The csconnectsimulator log (IPLoMFile19) used to be appended twice; it is
# listed once here so it does not appear twice in dfList.
_IPLoMSources = [
    (dataFrameFromParserResultType1, IPLoMFile1, 'abb-edge-17-broker-2'),
    (dataFrameFromParserResultType1, IPLoMFile2, 'abb-edge-18-broker-1'),
    (dataFrameFromParserResultType1, IPLoMFile3, 'abb-edge-18-broker-2'),
    (dataFrameFromParserResultType1, IPLoMFile4, 'abb-edge-19-broker-1'),
    (dataFrameFromParserResultType1, IPLoMFile5, 'abb-edge-19-broker-2'),
    (dataFrameFromParserResultType1, IPLoMFile6, 'abb-edge-19-broker-3'),
    (dataFrameFromParserResultType1, IPLoMFile7, 'abb-edge-19-broker-5'),
    (dataFrameFromParserResultType1, IPLoMFile8, 'abb-edge-20-broker-3'),
    (dataFrameFromParserResultType2, IPLoMFile9, 'abb-edge-18-proxy-1'),
    (dataFrameFromParserResultType2, IPLoMFile10, 'abb-edge-18-proxy-2'),
    (dataFrameFromParserResultType2, IPLoMFile11, 'abb-edge-19-proxy-1'),
    (dataFrameFromParserResultType2, IPLoMFile12, 'abb-edge-19-proxy-2'),
    (dataFrameFromParserResultType2, IPLoMFile13, 'abb-edge-19-proxy-3'),
    (dataFrameFromParserResultType2, IPLoMFile14, 'abb-edge-19-proxy-5'),
    (dataFrameFromParserResultType3, IPLoMFile15, 'abb-edge-18-csconnect-1'),
    (dataFrameFromParserResultType3, IPLoMFile16, 'abb-edge-19-csconnect-1'),
    (dataFrameFromParserResultType3, IPLoMFile17, 'abb-edge-18-csconnectrouter-1'),
    (dataFrameFromParserResultType3, IPLoMFile18, 'abb-edge-19-csconnectrouter-1'),
    (dataFrameFromParserResultType3, IPLoMFile19, 'abb-edge-19-csconnectsimulator-1'),
    (dataFrameFromParserResultType3, IPLoMFile20, 'abb-edge-kuber-01-csconnect-1'),
]

# Each entry of dfList is {'data': <loaded frame>, 'name': <display name>}.
dfList = [
    {'data': loader(IPLoMFolder, fileName), 'name': displayName}
    for loader, fileName, displayName in _IPLoMSources
]

In [19]:
# Common functions and PCA

import itertools
from pandas import DataFrame
from pca import pca as pcaLibrary

def nameFeature(column):
    """Build a readable feature name from a column label.

    A string label is returned unchanged. Any other iterable label (e.g. a
    flattened column tuple) has its 'Value' and empty-string entries removed
    and the remaining entries joined with ':'.
    """
    if isinstance(column, str):
        return column
    parts = [part for part in column if part != 'Value' and part != '']
    return ':'.join(parts)

def plotComponentBars(name, data, timeWindow):
    """Show the explained-variance ratio of the first 10 principal components.

    The per-window frame from transposeDataComplete (without 'TimeStamp') is
    fitted with a 10-component PCA. Bars show each component's ratio; an
    overlaid line shows the cumulative ratio, labelled as a percentage.
    """
    grouped = transposeDataComplete(data, timeWindow)
    features = grouped.drop('TimeStamp', axis=1)

    model = PCA(n_components=10)
    scores = pd.DataFrame(data=model.fit_transform(features))
    ratios = model.explained_variance_ratio_
    cumulative = np.cumsum(ratios)

    fig = px.bar(
        x=scores.columns + 1,  # component numbers start at 1
        y=ratios,
        labels=dict(x='Component', y='Explained variances'),
    )
    fig.add_trace(go.Scatter(
        x=list(range(1, len(ratios) + 1)),
        y=cumulative,
        name='Cumulative',
        text=[str(round(share * 100, 2)) + '%' for share in cumulative],
        mode="lines+markers+text",
        textposition='top center',
    ))
    fig.update_layout(autosize=False, title=name, width=1000, height=500)
    fig.show()

def plotScatterPlot(name, data, timeWindow, componentLimit = 0, pca_x = 1, pca_y = 2, scale = None, factorVectors = False):
    """Show the time windows projected on two principal components.

    Parameters:
        name: figure title.
        data, timeWindow: passed to transposeDataComplete.
        componentLimit: a loading vector is drawn only when its length on the
            two plotted components is greater than this value.
        pca_x, pca_y: 1-based numbers of the components shown on each axis
            (a 6-component PCA is fitted).
        scale: optional ``[[xmin, xmax], [ymin, ymax]]`` axis ranges.
        factorVectors: when true, overlay the feature loading vectors.

    Every feature column is considered for the overlay; the loop previously
    stopped one column early and never drew the last feature.
    """
    eventsGrouped = transposeDataComplete(data, timeWindow)
    pcaBase = eventsGrouped.drop('TimeStamp', axis=1)
    pca = PCA(n_components=6)
    components = pca.fit_transform(pcaBase)
    # One row per feature, one column per component.
    loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

    fig = px.scatter(
        components,
        x=pca_x-1,
        y=pca_y-1,
        hover_name=['id:' + str(row) for row in range(np.shape(components)[0])]
    )

    if factorVectors:
        for i, column in enumerate(pcaBase.columns):
            factorX = loadings[i, pca_x-1]
            factorY = loadings[i, pca_y-1]
            if math.sqrt(factorX**2 + factorY**2) <= componentLimit:
                continue

            featureName = nameFeature(column)
            fig.add_shape(
                type='line',
                line=dict(color='rgba(190, 0, 0, 0.5)', width = 2),
                x0=0, y0=0,
                x1=factorX,
                y1=factorY,
                name = featureName
            )
            # The annotation shows the feature position; the name is in the hover text.
            fig.add_annotation(
                x=factorX,
                y=factorY,
                ax=0, ay=0,
                xanchor="center",
                yanchor="bottom",
                text=i,
                hovertext = featureName
            )

    if scale is not None:
        fig.update_xaxes(range=[scale[0][0], scale[0][1]])
        fig.update_yaxes(range=[scale[1][0], scale[1][1]])

    fig.update_traces(marker_size=15, marker_color='rgba(31, 81, 255, 0.2)', marker_line=dict(width=1, color='rgba(255, 255, 255, 0.6)'))
    fig.update_layout(autosize=False, title=name, width=1000, height=500, xaxis_title='PC' + str(pca_x), yaxis_title='PC' + str(pca_y))
    fig.show()

def plotVectors(name, data, timeWindow, minimum = 0, pca_x = 1, pca_y = 2, mode = 'complete'):
    """Show the feature loading vectors on two principal components.

    Parameters:
        name: figure title.
        data, timeWindow: passed to transposeDataComplete (mode 'complete')
            or transposeData (mode 'events').
        minimum: a vector is drawn only when its length on the two plotted
            components is greater than this value.
        pca_x, pca_y: 1-based numbers of the components shown on each axis
            (a 6-component PCA is fitted).
        mode: 'complete' or 'events'.

    Raises:
        ValueError: if ``mode`` is neither 'complete' nor 'events' (the
            function previously printed a message and then failed on an
            unassigned variable).

    Every feature column is considered; the loop previously stopped one
    column early and never drew the last feature.
    """
    if mode == 'complete':
        eventsGrouped = transposeDataComplete(data, timeWindow)
    elif mode == 'events':
        eventsGrouped = transposeData(data, timeWindow)
    else:
        raise ValueError("Invalid mode: " + repr(mode) + " (expected 'complete' or 'events')")

    pcaBase = eventsGrouped.drop('TimeStamp', axis=1)
    pca = PCA(n_components=6)
    pca.fit(pcaBase)
    # One row per feature, one column per component.
    loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

    fig = px.scatter()
    for i, column in enumerate(pcaBase.columns):
        xFactor = loadings[i, pca_x-1]
        yFactor = loadings[i, pca_y-1]
        if math.sqrt(xFactor**2 + yFactor**2) <= minimum:
            continue

        featureName = nameFeature(column)
        fig.add_shape(
            type='line',
            line=dict(color='rgba(190, 0, 0, 0.5)', width = 2),
            x0=0, y0=0,
            x1=xFactor,
            y1=yFactor,
            name = featureName
        )
        # The annotation shows the feature position; the name is in the hover text.
        fig.add_annotation(
            bgcolor='rgba(255, 255, 255, 1)',
            borderwidth=2,
            x=xFactor,
            y=yFactor,
            ax=0, ay=0,
            xanchor="center",
            yanchor="bottom",
            text=i,
            hovertext = featureName
        )
    fig.update_layout(autosize=False, title=name, width=1000, height=500, xaxis_title='PC' + str(pca_x), yaxis_title='PC' + str(pca_y))
    fig.show()

def plot3DScatterPlot(name, data, timeWindow, scale = None, mode = 'complete'):
    """Show the time windows projected on the first three principal components.

    Parameters:
        name: figure title.
        data, timeWindow: passed to transposeDataComplete (mode 'complete')
            or transposeData (mode 'events').
        scale: optional ``[[xmin, xmax], [ymin, ymax], [zmin, zmax]]`` axis ranges.
        mode: 'complete' or 'events'.

    Raises:
        ValueError: if ``mode`` is neither 'complete' nor 'events' (the
            function previously printed a message and then failed on an
            unassigned variable).
    """
    if mode == 'complete':
        eventsGrouped = transposeDataComplete(data, timeWindow)
    elif mode == 'events':
        eventsGrouped = transposeData(data, timeWindow)
    else:
        raise ValueError("Invalid mode: " + repr(mode) + " (expected 'complete' or 'events')")

    pca = PCA(n_components=3)
    pcaBase = eventsGrouped.drop('TimeStamp', axis=1)
    components = pca.fit_transform(pcaBase)

    fig = px.scatter_3d(
        components, x=0, y=1, z=2,
        labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'},
        opacity=0.4,
        hover_name=['id:' + str(row) for row in range(np.shape(components)[0])]
    )

    layout = dict(autosize=False, title=name, width=800, height=700)
    if scale is not None:
        layout['scene'] = dict(
            xaxis = dict(range=[scale[0][0],scale[0][1]]),
            yaxis = dict(range=[scale[1][0],scale[1][1]]),
            zaxis = dict(range=[scale[2][0],scale[2][1]])
        )
    fig.update_layout(**layout)
    fig.show()

def extractComponents(name, data, timeWindow, components = 3, mode = 'complete'):
    """Return the PCA scores of the time windows.

    Parameters:
        name: not used by this function; kept so existing calls keep working.
        data, timeWindow: passed to transposeDataComplete (mode 'complete')
            or transposeData (mode 'events').
        components: number of principal components to keep.
        mode: 'complete' or 'events'.

    Returns:
        The result of ``PCA.fit_transform`` on the per-window frame without
        its 'TimeStamp' column.

    Raises:
        ValueError: if ``mode`` is neither 'complete' nor 'events' (the
            function previously printed a message and then failed on an
            unassigned variable).
    """
    if mode == 'complete':
        eventsGrouped = transposeDataComplete(data, timeWindow)
    elif mode == 'events':
        eventsGrouped = transposeData(data, timeWindow)
    else:
        raise ValueError("Invalid mode: " + repr(mode) + " (expected 'complete' or 'events')")

    pca = PCA(n_components = components)
    pcaBase = eventsGrouped.drop('TimeStamp', axis=1)
    return pca.fit_transform(pcaBase)

def componentFactors(data, timeWindow, pca_x = 1, pca_y = 2, orderBy='Factor', head=50, mode='complete'):
    """Tabulate the feature loadings on two principal components.

    Parameters:
        data, timeWindow: passed to transposeDataComplete (mode 'complete')
            or transposeData (mode 'events').
        pca_x, pca_y: 1-based numbers of the two components
            (a 6-component PCA is fitted).
        orderBy: column used for the descending sort.
        head: number of rows kept after sorting.
        mode: 'complete' or 'events'.

    Returns:
        A frame with the columns 'Feature', 'Factor' (length of the loading
        vector on the two components), 'PC<pca_x>' and 'PC<pca_y>'.

    Raises:
        ValueError: if ``mode`` is neither 'complete' nor 'events' (the
            function previously printed a message and then failed on an
            unassigned variable).

    Every feature column is listed; the loop previously stopped one column
    early and left out the last feature.
    """
    if mode == 'complete':
        eventsGrouped = transposeDataComplete(data, timeWindow)
    elif mode == 'events':
        eventsGrouped = transposeData(data, timeWindow)
    else:
        raise ValueError("Invalid mode: " + repr(mode) + " (expected 'complete' or 'events')")

    pcaBase = eventsGrouped.drop('TimeStamp', axis=1)
    pca = PCA(n_components=6)
    pca.fit(pcaBase)
    # One row per feature, one column per component.
    loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

    rows = []
    for i, column in enumerate(pcaBase.columns):
        xFactor = loadings[i, pca_x-1]
        yFactor = loadings[i, pca_y-1]
        rows.append([
            nameFeature(column),
            math.sqrt(xFactor**2 + yFactor**2),
            xFactor, yFactor
        ])

    df = DataFrame(rows, columns=['Feature', 'Factor', 'PC' + str(pca_x), 'PC' + str(pca_y)])
    return df.sort_values(by = [orderBy], ascending=False).head(head)

def listTimeWindows(df, timeWindow, file):
    """Write one CSV line per time window: its index, first and last 'TimeStamp'.

    df['data'] and timeWindow are handed to getTimeWindowList; `file` is the
    path of the CSV that gets (over)written.
    """
    windows = getTimeWindowList(df['data'], timeWindow)

    with open(file, 'w') as out:
        out.write('index,start,end' + "\n")
        for position, _ in enumerate(windows):
            # First and last row of the window, reduced to the TimeStamp column
            bounds = windows[position].take([0, -1]).loc[:, 'TimeStamp']
            fields = [str(position), str(bounds.iloc[0]), str(bounds.iloc[1])]
            out.write(",".join(fields) + '\n')

In [11]:
# Analysis with parameters
# https://plotly.com/python/pca-visualization/

# Fit a 3-component PCA on the '10T' table of one broker and plot the result.
dfBroker17_2 = [df for df in dfList if df['name'] == 'abb-edge-17-broker-2'][0]
eventsGrouped = transposeDataComplete(dfBroker17_2['data'], '10T')
eventsGrouped.to_csv('/home/javarotti/Temp/temp4.csv')

pca = PCA(n_components=3)
pcaBase = eventsGrouped.drop('TimeStamp', axis=1)
components = pca.fit_transform(pcaBase)
inverse_transform = pca.inverse_transform(components)

dfPCA = pd.DataFrame(data=components)
dfPCA.columns = ['PC' + str(col+1) for col in dfPCA.columns]

# Bar chart of the explained-variance ratio per component
fig = px.bar(x = dfPCA.columns, y = pca.explained_variance_ratio_, labels = dict(x = 'Component', y = 'Variance'))
fig.update_layout(autosize=False, width=500, height=500)
fig.show()

# 2D scatter of the first two components, with loading vectors drawn on top
fig = px.scatter(components, x=0, y=1, hover_name=list(map(lambda x: 'id:' + str(x), range(0, np.shape(components)[0]))))

loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
total_var = pca.explained_variance_ratio_.sum() * 100

# The context manager closes the dump file even if the loop raises; the
# previous open()/close() pair leaked the handle in that case.
with open("/home/javarotti/Temp/temp1.txt", "w") as temp:
    # NOTE(review): the range stops one short of len(pcaBase.columns), so the
    # last feature column is never drawn or written — confirm this is intended.
    for i, featureIndex in enumerate(range(0, len(pcaBase.columns) - 1)):
        xFactor = loadings[i, 0]
        yFactor = loadings[i, 1]
        factorNorm = math.sqrt(xFactor**2 + yFactor**2)
        # Only loadings longer than 3 are drawn; every loading is written to the file
        if (factorNorm > 3):
            fig.add_shape(
                type='line',
                line=dict(color="firebrick", width=2),
                x0=0, y0=0,
                x1=xFactor,
                y1=yFactor
            )
            fig.add_annotation(
                x=xFactor,
                y=yFactor,
                ax=0, ay=0,
                xanchor="center",
                yanchor="bottom",
                text=nameFeature(pcaBase.columns[featureIndex]),
            )
        temp.write(nameFeature(pcaBase.columns[featureIndex]) + ': ' + str(factorNorm) + '\n')

fig.update_layout(autosize=False, width=1500, height=500, xaxis_title='PCA1', yaxis_title='PCA2')
fig.show()

# 3D scatter of the three components
fig = px.scatter_3d(
    components, x=0, y=1, z=2,
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'},
    opacity=0.4,
    hover_name=list(map(lambda x: 'id:' + str(x), range(0, np.shape(components)[0])))
)
fig.update_layout(autosize=False, width=500, height=500)
fig.show()

# Bubble chart: how often each pair of values of two selected columns occurs
df = eventsGrouped[[('Value','b318a04a','','','','','',''),('Value','a32dc633','','','','','','')]]
df.columns = ['b318a04a', 'a32dc633']

df2 = df.groupby(['b318a04a','a32dc633']).size().reset_index(name='count')
fig = px.scatter(df2, x = 'b318a04a', y = 'a32dc633', size='count', color="count", color_continuous_scale='solar')
fig.update_layout(autosize=False, width=600, height=500)
fig.show()


In [12]:
# 1. Broker
# Time window passed to the plot and factor helpers in the broker cells below.
# NOTE(review): '10T' looks like a pandas offset alias (10 minutes) — confirm
# against how the transpose helpers consume it.
timeWindowBroker = '10T'

In [13]:
# Run the plot helpers and show the factor table for 'abb-edge-17-broker-2'.
dfEdge17Broker2 = [entry for entry in dfList if entry['name'] == 'abb-edge-17-broker-2'][0]
plotComponentBars(
    dfEdge17Broker2['name'], dfEdge17Broker2['data'], timeWindowBroker
)
plotScatterPlot(
    dfEdge17Broker2['name'], dfEdge17Broker2['data'], timeWindowBroker, 3,
    factorVectors=True
)
plot3DScatterPlot(
    dfEdge17Broker2['name'], dfEdge17Broker2['data'], timeWindowBroker
)
# Loading vectors for the component pairs (1, 2) and (2, 3)
plotVectors(
    dfEdge17Broker2['name'], dfEdge17Broker2['data'], timeWindowBroker,
    minimum=0.1, pca_x=1, pca_y=2, mode='complete'
)
plotVectors(
    dfEdge17Broker2['name'], dfEdge17Broker2['data'], timeWindowBroker,
    minimum=0.1, pca_x=2, pca_y=3, mode='complete'
)
display(
    componentFactors(dfEdge17Broker2['data'], timeWindowBroker, mode='complete')
)

Unnamed: 0,Feature,Factor,PC1,PC2
14,a32dc633,4.673352,4.672569,0.085548
65,2186d001:subscriber_edgeassethubapi,0.610667,0.219886,-0.569706
66,2186d001:publisher_edgeassethubapi,0.610667,0.219886,-0.569706
62,72fc4c37:publisher_edgeassethubapi:c0:k90:uedg...,0.596268,0.264095,-0.534593
61,72fc4c37:subscriber_edgeassethubapi:c0:k90:ued...,0.596268,0.264095,-0.534593
13,b318a04a,0.227597,-0.225764,-0.028826
1,ed44beee,0.08963,0.086341,0.024057
59,72fc4c37:edgefilestorage:c1:k90:uedgefilestorage,0.08963,0.086341,0.024057
58,72fc4c37:edgemethodinvocation:c1:k90:uedgemeth...,0.08963,0.086341,0.024057
64,72fc4c37:subscriber_edgedashboardapi:c0:k90:ue...,0.08963,0.086341,0.024057


In [14]:
# Axis ranges [x, y, z] handed to the first 3D scatter plot below.
# NOTE(review): the z range is written high-to-low ([1, -1]) — confirm the
# reversed axis is intended.
scale = [[-20, 50], [-1, 10], [1, -1]]

# Run the plot helpers and show the factor table for 'abb-edge-18-broker-1'.
dfEdge18Broker1 = [entry for entry in dfList if entry['name'] == 'abb-edge-18-broker-1'][0]
plotComponentBars(
    dfEdge18Broker1['name'], dfEdge18Broker1['data'], timeWindowBroker
)
plotScatterPlot(
    dfEdge18Broker1['name'], dfEdge18Broker1['data'], timeWindowBroker, 0.5,
    factorVectors=True
)
# 3D scatter twice: first with the fixed ranges, then with default ranges
plot3DScatterPlot(
    dfEdge18Broker1['name'], dfEdge18Broker1['data'], timeWindowBroker, scale
)
plot3DScatterPlot(
    dfEdge18Broker1['name'], dfEdge18Broker1['data'], timeWindowBroker
)
plotVectors(
    dfEdge18Broker1['name'], dfEdge18Broker1['data'], timeWindowBroker,
    mode='complete'
)
display(
    componentFactors(dfEdge18Broker1['data'], timeWindowBroker, mode='complete')
)

Unnamed: 0,Feature,Factor,PC1,PC2
19,b318a04a,24.118385,24.118385,-0.003064
13,a32dc633,0.609272,0.08702,0.603025
2,f7c14a2e:event/encryption_key/model,0.087039,0.012431,0.086146
10,afc0e2cb:ipv4,0.087039,0.012431,0.086146
11,afc0e2cb:ipv6,0.087039,0.012431,0.086146
18,72fc4c37:edgerouterIncoming:c0:k20:uedgerouter,0.087039,0.012431,0.086146
17,72fc4c37:publisher_controlsystemconnect:c0:k90...,0.087039,0.012431,0.086146
16,72fc4c37:subscriber_controlsystemconnect:c0:k9...,0.087039,0.012431,0.086146
15,72fc4c37:edgerouterOutgoing:c1:k20:uedgerouter,0.087039,0.012431,0.086146
14,72fc4c37:csconnectbroker:c0:k90:ucsconnectbroker,0.087039,0.012431,0.086146


In [15]:
# Run the plot helpers and compute the factor table for 'abb-edge-17-broker-2'.
dfEdge17Broker2 = [df for df in dfList if df['name'] == 'abb-edge-17-broker-2'][0]
plotComponentBars(dfEdge17Broker2['name'], dfEdge17Broker2['data'], timeWindowBroker)
plotScatterPlot(dfEdge17Broker2['name'], dfEdge17Broker2['data'], timeWindowBroker, 0.4)
plot3DScatterPlot(dfEdge17Broker2['name'], dfEdge17Broker2['data'], timeWindowBroker)
# Stored under the edge-17 name; it was previously assigned to
# factorsEdge18Broker2, a copy-paste slip that mislabelled this result.
factorsEdge17Broker2 = componentFactors(dfEdge17Broker2['data'], timeWindowBroker)

In [16]:
# Run the plot helpers and compute the factor table for 'abb-edge-18-broker-1'.
dfEdge18Broker1 = [df for df in dfList if df['name'] == 'abb-edge-18-broker-1'][0]
plotComponentBars(dfEdge18Broker1['name'], dfEdge18Broker1['data'], timeWindowBroker)
plotScatterPlot(dfEdge18Broker1['name'], dfEdge18Broker1['data'], timeWindowBroker, 0.4)
plot3DScatterPlot(dfEdge18Broker1['name'], dfEdge18Broker1['data'], timeWindowBroker)
# Stored under the broker-1 name; it was previously assigned to
# factorsEdge18Broker2, a copy-paste slip that mislabelled this result.
factorsEdge18Broker1 = componentFactors(dfEdge18Broker1['data'], timeWindowBroker)

In [17]:
# Run the plot helpers and compute the factor table for 'abb-edge-18-broker-2'.
dfEdge18Broker2 = [entry for entry in dfList if entry['name'] == 'abb-edge-18-broker-2'][0]
plotComponentBars(
    dfEdge18Broker2['name'], dfEdge18Broker2['data'], timeWindowBroker
)
plotScatterPlot(
    dfEdge18Broker2['name'], dfEdge18Broker2['data'], timeWindowBroker, 0.4
)
plot3DScatterPlot(
    dfEdge18Broker2['name'], dfEdge18Broker2['data'], timeWindowBroker
)
factorsEdge18Broker2 = componentFactors(
    dfEdge18Broker2['data'], timeWindowBroker
)

In [20]:
# Run the plot helpers and show the factor table for 'abb-edge-19-broker-1'.
dfEdge19Broker1 = [entry for entry in dfList if entry['name'] == 'abb-edge-19-broker-1'][0]
plotComponentBars(
    dfEdge19Broker1['name'], dfEdge19Broker1['data'], timeWindowBroker
)
plotScatterPlot(
    dfEdge19Broker1['name'], dfEdge19Broker1['data'], timeWindowBroker, 0.8,
    factorVectors=False
)
plot3DScatterPlot(
    dfEdge19Broker1['name'], dfEdge19Broker1['data'], timeWindowBroker
)
factorsEdge19Broker1 = componentFactors(
    dfEdge19Broker1['data'], timeWindowBroker
)
display(
    factorsEdge19Broker1.sort_values(by=['Factor'], ascending=False)
)

Unnamed: 0,Feature,Factor,PC1,PC2
7,a32dc633,1.632993,1.632958,-0.010754
2,f7c14a2e:event/encryption_key/model,0.816497,0.816479,-0.005377
3,f7c14a2e:modules/local/#,0.816497,0.816479,-0.005377
8,54884714:edge-proxy_in,0.816497,0.816479,-0.005377
9,54884714:edge-proxy_out,0.816497,0.816479,-0.005377
17,f7c14a2e:modules/+/status,0.612372,0.612359,-0.004033
18,f7c14a2e:system/#,0.612372,0.612359,-0.004033
14,f7c14a2e:cold/#,0.612372,0.612359,-0.004033
16,f7c14a2e:hot/#,0.612372,0.612359,-0.004033
15,f7c14a2e:warm/#,0.612372,0.612359,-0.004033


In [21]:
# Run the plot helpers for 'abb-edge-19-broker-2' on components 1 and 3,
# then show the factor table ordered by the PC3 loading.
dfEdge19Broker2 = [entry for entry in dfList if entry['name'] == 'abb-edge-19-broker-2'][0]
plotComponentBars(
    dfEdge19Broker2['name'], dfEdge19Broker2['data'], timeWindowBroker
)
plotScatterPlot(
    dfEdge19Broker2['name'], dfEdge19Broker2['data'], timeWindowBroker,
    factorVectors=False, pca_x=1, pca_y=3
)
# plotScatterByPCALibrary(dfEdge19Broker2['name'], dfEdge19Broker2['data'], timeWindowBroker)
plotVectors(
    dfEdge19Broker2['name'], dfEdge19Broker2['data'], timeWindowBroker,
    pca_x=1, pca_y=3
)
plot3DScatterPlot(
    dfEdge19Broker2['name'], dfEdge19Broker2['data'], timeWindowBroker
)
factorsEdge19Broker2 = componentFactors(
    dfEdge19Broker2['data'], timeWindowBroker, pca_x=1, pca_y=3
)
display(
    factorsEdge19Broker2.sort_values(by=['PC3'], ascending=False)
)

Unnamed: 0,Feature,Factor,PC1,PC3
21,e7695eba:edge-proxy_in,0.195134,0.020214,0.194085
7,a32dc633,1.201645,1.191165,0.15836
20,e7695eba:edge-proxy_out,0.125037,0.015619,0.124058
8,54884714:edge-proxy_in,0.608687,0.59788,0.114193
10,41f93083:edge-proxy_out,0.453428,0.448869,0.064139
9,54884714:edge-proxy_out,0.594927,0.593285,0.044166
12,b318a04a,0.097334,-0.089831,0.037473
1,ed44beee,0.145791,0.144416,-0.019973
6,601f8224:running,0.145791,0.144416,-0.019973
5,afc0e2cb:ipv6,0.145791,0.144416,-0.019973


In [22]:
# Run the plot helpers and compute the factor table for 'abb-edge-19-broker-3'.
dfEdge19Broker3 = [entry for entry in dfList if entry['name'] == 'abb-edge-19-broker-3'][0]
plotComponentBars(
    dfEdge19Broker3['name'], dfEdge19Broker3['data'], timeWindowBroker
)
plotScatterPlot(
    dfEdge19Broker3['name'], dfEdge19Broker3['data'], timeWindowBroker, 0.5,
    factorVectors=True
)
plot3DScatterPlot(
    dfEdge19Broker3['name'], dfEdge19Broker3['data'], timeWindowBroker
)
plotVectors(
    dfEdge19Broker3['name'], dfEdge19Broker3['data'], timeWindowBroker,
    pca_x=1, pca_y=3
)
factorsEdge19Broker3 = componentFactors(
    dfEdge19Broker3['data'], timeWindowBroker
)
# display(factorsEdge19Broker3.sort_values(by = ['Factor'], ascending=False))

In [23]:
# Run the plot helpers and show the factor table for 'abb-edge-19-broker-5'.
dfEdge19Broker5 = [entry for entry in dfList if entry['name'] == 'abb-edge-19-broker-5'][0]
plotComponentBars(
    dfEdge19Broker5['name'], dfEdge19Broker5['data'], timeWindowBroker
)
plotScatterPlot(
    dfEdge19Broker5['name'], dfEdge19Broker5['data'], timeWindowBroker, 0.5
)
plot3DScatterPlot(
    dfEdge19Broker5['name'], dfEdge19Broker5['data'], timeWindowBroker
)
# Vectors are drawn for components 3 and 4; the factor table uses the defaults
plotVectors(
    dfEdge19Broker5['name'], dfEdge19Broker5['data'], timeWindowBroker,
    pca_x=3, pca_y=4
)
factorsEdge19Broker5 = componentFactors(
    dfEdge19Broker5['data'], timeWindowBroker
)
display(
    factorsEdge19Broker5.sort_values(by=['Factor'], ascending=False)
)

Unnamed: 0,Feature,Factor,PC1,PC2
7,a32dc633,0.982907,0.873742,0.450201
3,f7c14a2e:modules/local/#,0.7219,0.720319,-0.047763
2,f7c14a2e:event/encryption_key/model,0.7219,0.720319,-0.047763
16,f7c14a2e:hot/#,0.696193,0.689736,-0.0946
19,f7c14a2e:$SYS/#,0.696193,0.689736,-0.0946
15,f7c14a2e:warm/#,0.696193,0.689736,-0.0946
14,f7c14a2e:cold/#,0.696193,0.689736,-0.0946
13,a0a40a28,0.696193,0.689736,-0.0946
17,f7c14a2e:modules/+/status,0.696193,0.689736,-0.0946
18,f7c14a2e:system/#,0.696193,0.689736,-0.0946


In [24]:
# Run the plot helpers and show the factor table for 'abb-edge-20-broker-3'.
dfEdge20Broker3 = [entry for entry in dfList if entry['name'] == 'abb-edge-20-broker-3'][0]
plotComponentBars(
    dfEdge20Broker3['name'], dfEdge20Broker3['data'], timeWindowBroker
)
plotScatterPlot(
    dfEdge20Broker3['name'], dfEdge20Broker3['data'], timeWindowBroker, 0.5,
    factorVectors=False
)
plot3DScatterPlot(
    dfEdge20Broker3['name'], dfEdge20Broker3['data'], timeWindowBroker
)
factorsEdge20Broker3 = componentFactors(
    dfEdge20Broker3['data'], timeWindowBroker
)
display(
    factorsEdge20Broker3.sort_values(by=['Factor'], ascending=False)
)

Unnamed: 0,Feature,Factor,PC1,PC2
13,a32dc633,286.241803,286.239275,1.203222
30,2186d001:tsmodule,10.870918,-3.787169,10.189908
19,a0a40a28,10.56642,-3.248484,10.054679
9,f7c14a2e:$SYS/#,10.49726,-3.158278,10.010881
2,f7c14a2e:event/encryption_key/model,10.49726,-3.158278,10.010881
5,f7c14a2e:warm/#,10.49726,-3.158278,10.010881
7,f7c14a2e:modules/+/status,10.49726,-3.158278,10.010881
8,f7c14a2e:system/#,10.49726,-3.158278,10.010881
6,f7c14a2e:hot/#,10.49726,-3.158278,10.010881
3,f7c14a2e:modules/local/#,10.49726,-3.158278,10.010881


In [25]:
# Proxy
# Time window used by the proxy cells below that do not define their own.
# NOTE(review): '30T' looks like a pandas offset alias (30 minutes) — confirm
# against how the transpose helpers consume it.
timeWindowProxy = '30T'

In [26]:
# Run the plot helpers and show the factor table for 'abb-edge-18-proxy-1'.
dfEdge18Proxy1 = [entry for entry in dfList if entry['name'] == 'abb-edge-18-proxy-1'][0]
plotComponentBars(
    dfEdge18Proxy1['name'], dfEdge18Proxy1['data'], timeWindowProxy
)
plotScatterPlot(
    dfEdge18Proxy1['name'], dfEdge18Proxy1['data'], timeWindowProxy, 0.5
)
plot3DScatterPlot(
    dfEdge18Proxy1['name'], dfEdge18Proxy1['data'], timeWindowProxy
)
factorsEdge18Proxy1 = componentFactors(
    dfEdge18Proxy1['data'], timeWindowProxy
)
display(
    factorsEdge18Proxy1.sort_values(by=['Factor'], ascending=False)
)

Unnamed: 0,Feature,Factor,PC1,PC2
98,f78b4b4e,2.717158,2.651399,-0.594164
97,368e49f2,1.374872,1.340548,-0.305295
21,c0a5a082:abb.ability.device:edge-proxy,1.192461,0.532621,1.066902
1,73b979dd,0.596231,0.26631,0.533451
0,f6acc8b0,0.596231,0.26631,0.533451
75,e8d80cca,0.447173,0.199733,0.400088
55,c8b5a8a5,0.447173,0.199733,0.400088
57,33a808ef,0.298115,0.133155,0.266725
50,467075b0:model.updated:csconnectbroker,0.298115,0.133155,0.266725
90,5aae11d0,0.298115,0.133155,0.266725


In [27]:
# Run the plot helpers and show the factor table for 'abb-edge-18-proxy-2'.
dfEdge18Proxy2 = [entry for entry in dfList if entry['name'] == 'abb-edge-18-proxy-2'][0]
plotComponentBars(
    dfEdge18Proxy2['name'], dfEdge18Proxy2['data'], timeWindowProxy
)
plotScatterPlot(
    dfEdge18Proxy2['name'], dfEdge18Proxy2['data'], timeWindowProxy, 0.5
)
plot3DScatterPlot(
    dfEdge18Proxy2['name'], dfEdge18Proxy2['data'], timeWindowProxy
)
factorsEdge18Proxy2 = componentFactors(
    dfEdge18Proxy2['data'], timeWindowProxy
)
display(
    factorsEdge18Proxy2.sort_values(by=['Factor'], ascending=False)
)

Unnamed: 0,Feature,Factor,PC1,PC2
98,f78b4b4e,2.961585,-2.959855,-0.101191
97,368e49f2,1.484651,-1.483801,-0.05022
21,c0a5a082:abb.ability.device:edge-proxy,0.905454,-0.117263,0.897829
0,f6acc8b0,0.452727,-0.058631,0.448914
1,73b979dd,0.452727,-0.058631,0.448914
75,e8d80cca,0.339545,-0.043974,0.336686
55,c8b5a8a5,0.339545,-0.043974,0.336686
83,7c846fa0,0.226363,-0.029316,0.224457
50,467075b0:model.updated:csconnectbroker,0.226363,-0.029316,0.224457
57,33a808ef,0.226363,-0.029316,0.224457


In [28]:
# This cell uses its own time window instead of timeWindowProxy.
time_window_proxy_19_1 = '3T'

# Run the plot helpers and show the factor table for 'abb-edge-19-proxy-1'.
dfEdge19Proxy1 = [entry for entry in dfList if entry['name'] == 'abb-edge-19-proxy-1'][0]
plotComponentBars(
    dfEdge19Proxy1['name'], dfEdge19Proxy1['data'], time_window_proxy_19_1
)
# Same fixed ranges for both 2D scatters: components (1, 2), then (1, 3)
plotScatterPlot(
    dfEdge19Proxy1['name'], dfEdge19Proxy1['data'], time_window_proxy_19_1, 0.5,
    scale=[[-1, 8], [-1, 5]], pca_x=1, pca_y=2
)
plotScatterPlot(
    dfEdge19Proxy1['name'], dfEdge19Proxy1['data'], time_window_proxy_19_1, 0.5,
    scale=[[-1, 8], [-1, 5]], pca_x=1, pca_y=3
)

plot3DScatterPlot(
    dfEdge19Proxy1['name'], dfEdge19Proxy1['data'], time_window_proxy_19_1
)
plotVectors(
    dfEdge19Proxy1['name'], dfEdge19Proxy1['data'], time_window_proxy_19_1,
    pca_x=1, pca_y=3
)
display(
    componentFactors(
        dfEdge19Proxy1['data'], time_window_proxy_19_1, pca_x=1, pca_y=3
    )
)

Unnamed: 0,Feature,Factor,PC1,PC3
1,73b979dd,0.444444,0.443825,-0.023461
0,f6acc8b0,0.444444,0.443825,-0.023461
26,368e49f2,0.2691,0.091635,0.253018
15,3cdc0eb3:abb.ability.device,0.222222,0.221912,-0.01173
5,954da85d,0.222222,0.221912,-0.01173
2,98905022,0.111111,0.110956,-0.005865
16,9888cde4,0.111111,0.110956,-0.005865
25,d441c4e8:Edges device model updated:8,0.111111,0.110956,-0.005865
24,01d5ed8b:edge-proxy : abbiapcpdev.azurecr.io/e...,0.111111,0.110956,-0.005865
23,8c997508:edge-proxy,0.111111,0.110956,-0.005865


In [29]:
# This cell uses its own time window instead of timeWindowProxy.
time_window_proxy_19_2 = '3T'

# Run the plot helpers and show the factor table for 'abb-edge-19-proxy-2'.
dfEdge19Proxy2 = [entry for entry in dfList if entry['name'] == 'abb-edge-19-proxy-2'][0]
plotComponentBars(
    dfEdge19Proxy2['name'], dfEdge19Proxy2['data'], time_window_proxy_19_2
)
plotScatterPlot(
    dfEdge19Proxy2['name'], dfEdge19Proxy2['data'], time_window_proxy_19_2, 0.5,
    pca_x=1, pca_y=3
)
plot3DScatterPlot(
    dfEdge19Proxy2['name'], dfEdge19Proxy2['data'], time_window_proxy_19_2,
    scale=[[-1, 8], [-1, 5], [-1, 5]]
)
plotVectors(
    dfEdge19Proxy2['name'], dfEdge19Proxy2['data'], time_window_proxy_19_2,
    pca_x=1, pca_y=3
)
display(
    componentFactors(
        dfEdge19Proxy2['data'], time_window_proxy_19_2, pca_x=1, pca_y=3
    )
)

#factorsEdge19Proxy2 = componentFactors(dfEdge19Proxy2['data'], time_window_proxy_19_2)
#display(factorsEdge19Proxy2.sort_values(by = ['Factor'], ascending=False))

Unnamed: 0,Feature,Factor,PC1,PC3
0,f6acc8b0,0.313499,0.311152,-0.03829
1,73b979dd,0.313499,0.311152,-0.03829
26,368e49f2,0.199905,0.099104,-0.173609
9,4c70812a,0.187417,0.109909,0.151807
14,50f6770d,0.187417,0.109909,0.151807
33,b217e024,0.164545,0.032121,0.16138
30,5f911ecf,0.164545,0.032121,0.16138
29,9ae61561,0.164545,0.032121,0.16138
5,954da85d,0.156749,0.155576,-0.019145
15,3cdc0eb3:abb.ability.device,0.156749,0.155576,-0.019145


In [30]:
# Run the plot helpers and show the factor table for 'abb-edge-19-proxy-3'.
dfEdge19Proxy3 = [entry for entry in dfList if entry['name'] == 'abb-edge-19-proxy-3'][0]
plotComponentBars(
    dfEdge19Proxy3['name'], dfEdge19Proxy3['data'], timeWindowProxy
)
plotScatterPlot(
    dfEdge19Proxy3['name'], dfEdge19Proxy3['data'], timeWindowProxy, 0.5
)
plot3DScatterPlot(
    dfEdge19Proxy3['name'], dfEdge19Proxy3['data'], timeWindowProxy
)
factorsEdge19Proxy3 = componentFactors(
    dfEdge19Proxy3['data'], timeWindowProxy
)
display(
    factorsEdge19Proxy3.sort_values(by=['Factor'], ascending=False)
)

Unnamed: 0,Feature,Factor,PC1,PC2
1,73b979dd,0.640513,0.476704,-0.427796
0,f6acc8b0,0.640513,0.476704,-0.427796
14,50f6770d,0.502356,0.430503,0.258899
9,4c70812a,0.502356,0.430503,0.258899
30,5f911ecf,0.480384,0.311327,0.365848
29,9ae61561,0.480384,0.311327,0.365848
33,b217e024,0.480384,0.311327,0.365848
7,97c1b109,0.354268,0.326728,0.136949
5,954da85d,0.320256,0.238352,-0.213898
15,3cdc0eb3:abb.ability.device,0.320256,0.238352,-0.213898


In [31]:
# This cell uses its own time window instead of timeWindowProxy.
time_window_proxy_19_5 = '3T'

# Run the plot helpers and show the factor table for 'abb-edge-19-proxy-5'.
dfEdge19Proxy5 = [df for df in dfList if df['name'] == 'abb-edge-19-proxy-5'][0]
plotComponentBars(dfEdge19Proxy5['name'], dfEdge19Proxy5['data'], time_window_proxy_19_5)
plotScatterPlot(dfEdge19Proxy5['name'], dfEdge19Proxy5['data'], time_window_proxy_19_5, 0.5)
plot3DScatterPlot(dfEdge19Proxy5['name'], dfEdge19Proxy5['data'], time_window_proxy_19_5)
# The last two calls previously used the Proxy2 frame and/or the Proxy2 time
# window (copy-paste from the Proxy2 cell), so the vectors shown here belonged
# to another dataset and the cell only ran after the Proxy2 cell.
plotVectors(dfEdge19Proxy5['name'], dfEdge19Proxy5['data'], time_window_proxy_19_5)
display(componentFactors(dfEdge19Proxy5['data'], time_window_proxy_19_5))

Unnamed: 0,Feature,Factor,PC1,PC2
56,206fbea4:model.updated to target controlsystem...,3.921581,3.921117,0.060267
129,07d3be8d:model.create,3.844262,3.843333,-0.084489
182,07d3be8d:reference.create,1.809064,1.808627,-0.039759
126,07d3be8d:type.create,0.904532,0.904314,-0.01988
16,c0a5a082:abb.ability.device:edge-proxy,0.787422,0.161704,0.77064
150,206fbea4:type.create done to controlsystemconnect,0.452266,0.452157,-0.00994
128,206fbea4:type.create failure to controlsystemc...,0.452266,0.452157,-0.00994
71,e8d80cca,0.365618,0.15443,0.331403
1,73b979dd,0.292865,0.00485,0.292825
45,c8b5a8a5,0.292865,0.00485,0.292825


In [32]:
# 3. CSConnect
# Time window passed to the plot and factor helpers in the CSConnect cells below.
# NOTE(review): '5T' looks like a pandas offset alias (5 minutes) — confirm
# against how the transpose helpers consume it.
timeWindowCSConnect = '5T'

In [33]:
# Run the plot helpers and show the factor table for 'abb-edge-18-csconnect-1'.
# The frame was previously stored in dfEdgeKuber01CSConnect1, which disagreed
# with both the selected name and factorsEdge18CSConnect1 and overwrote the
# variable that the kuber-01 cells use.
dfEdge18CSConnect1 = [df for df in dfList if df['name'] == 'abb-edge-18-csconnect-1'][0]
plotComponentBars(dfEdge18CSConnect1['name'], dfEdge18CSConnect1['data'], timeWindowCSConnect)
plotScatterPlot(dfEdge18CSConnect1['name'], dfEdge18CSConnect1['data'], timeWindowCSConnect, 0.5, factorVectors = False)
plot3DScatterPlot(dfEdge18CSConnect1['name'], dfEdge18CSConnect1['data'], timeWindowCSConnect)
factorsEdge18CSConnect1 = componentFactors(dfEdge18CSConnect1['data'], timeWindowCSConnect)
display(factorsEdge18CSConnect1.sort_values(by = ['Factor'], ascending=False))

TypeError: plotScatterPlot() got an unexpected keyword argument 'factorLabels'

In [None]:
# Component bar chart for 'abb-edge-19-csconnect-1'.
dfEdge19CSConnect1 = [entry for entry in dfList if entry['name'] == 'abb-edge-19-csconnect-1'][0]
plotComponentBars(
    dfEdge19CSConnect1['name'], dfEdge19CSConnect1['data'], timeWindowCSConnect
)

In [None]:
# Factor table of 'abb-edge-19-csconnect-1' for components 2 and 3,
# displayed in descending order of PC2, then PC3.
dfEdge19CSConnect1 = [entry for entry in dfList if entry['name'] == 'abb-edge-19-csconnect-1'][0]
x = componentFactors(
    dfEdge19CSConnect1['data'], timeWindowCSConnect, pca_x=2, pca_y=3
)
display(
    x.sort_values(by=['PC2', 'PC3'], ascending=False)
)

In [None]:
dfEdge19CSConnect1 = [entry for entry in dfList if entry['name'] == 'abb-edge-19-csconnect-1'][0]

# Analysis based on events and states
plotComponentBars(
    dfEdge19CSConnect1['name'], dfEdge19CSConnect1['data'], timeWindowCSConnect
)

#plotScatterPlot(dfEdge19CSConnect1['name'], dfEdge19CSConnect1['data'], timeWindowCSConnect, pca_x=1, pca_y=2)
#plotScatterPlot(dfEdge19CSConnect1['name'], dfEdge19CSConnect1['data'], timeWindowCSConnect, pca_x=1, pca_y=3)
plotScatterPlot(
    dfEdge19CSConnect1['name'], dfEdge19CSConnect1['data'], timeWindowCSConnect,
    pca_x=2, pca_y=3
)

plot3DScatterPlot(
    dfEdge19CSConnect1['name'], dfEdge19CSConnect1['data'], timeWindowCSConnect
)

plotVectors(
    dfEdge19CSConnect1['name'], dfEdge19CSConnect1['data'], timeWindowCSConnect,
    minimum=0.1, pca_x=2, pca_y=3
)
display(
    componentFactors(dfEdge19CSConnect1['data'], timeWindowCSConnect)
)

# Analysis based only on events
plot3DScatterPlot(
    dfEdge19CSConnect1['name'], dfEdge19CSConnect1['data'], timeWindowCSConnect,
    mode='events'
)
plotVectors(
    dfEdge19CSConnect1['name'], dfEdge19CSConnect1['data'], timeWindowCSConnect,
    minimum=0.1, pca_x=2, pca_y=3, mode='events'
)
display(
    componentFactors(dfEdge19CSConnect1['data'], timeWindowCSConnect, mode='events')
)

In [None]:
# 3D scatter and loading vectors for 'abb-edge-kuber-01-csconnect-1' in 'events' mode.
dfEdgeKuber01CSConnect1 = [entry for entry in dfList if entry['name'] == 'abb-edge-kuber-01-csconnect-1'][0]
#plotComponentBars(dfEdgeKuber01CSConnect1['name'], dfEdgeKuber01CSConnect1['data'], timeWindowCSConnect)
#plotScatterPlot(dfEdgeKuber01CSConnect1['name'], dfEdgeKuber01CSConnect1['data'], timeWindowCSConnect, pca_x=1, pca_y=2)
plot3DScatterPlot(
    dfEdgeKuber01CSConnect1['name'], dfEdgeKuber01CSConnect1['data'], timeWindowCSConnect,
    mode='events'
)
plotVectors(
    dfEdgeKuber01CSConnect1['name'], dfEdgeKuber01CSConnect1['data'], timeWindowCSConnect,
    minimum=0.1, pca_x=1, pca_y=2, mode='events'
)

In [None]:
# Run the plot helpers for 'abb-edge-kuber-01-csconnect-1' in 'complete' mode.
dfEdgeKuber01CSConnect1 = [entry for entry in dfList if entry['name'] == 'abb-edge-kuber-01-csconnect-1'][0]
plotComponentBars(
    dfEdgeKuber01CSConnect1['name'], dfEdgeKuber01CSConnect1['data'], timeWindowCSConnect
)
plotScatterPlot(
    dfEdgeKuber01CSConnect1['name'], dfEdgeKuber01CSConnect1['data'], timeWindowCSConnect,
    pca_x=1, pca_y=2
)
plot3DScatterPlot(
    dfEdgeKuber01CSConnect1['name'], dfEdgeKuber01CSConnect1['data'], timeWindowCSConnect
)
plotVectors(
    dfEdgeKuber01CSConnect1['name'], dfEdgeKuber01CSConnect1['data'], timeWindowCSConnect,
    minimum=0.1, pca_x=1, pca_y=2, mode='complete'
)

#factorsEdgeKuber01CSConnect1 = componentFactors(dfEdgeKuber01CSConnect1['data'], timeWindowCSConnect)
#display(factorsEdgeKuber01CSConnect1.sort_values(by = ['Factor'], ascending=False))

In [None]:
# Export the start/end of every time window of 'abb-edge-kuber-01-csconnect-1' to CSV.
dfEdgeKuber01CSConnect1 = [entry for entry in dfList if entry['name'] == 'abb-edge-kuber-01-csconnect-1'][0]
listTimeWindows(
    dfEdgeKuber01CSConnect1,
    timeWindowCSConnect,
    '/home/javarotti/Data/Analysis/Statistics/Temp/dfEdgeKuber01CSConnect1.csv'
)


In [None]:
# Events-only vectors and factor table (top 100 rows) for dfEdge19CSConnect1,
# which must already have been selected by an earlier cell.
plotVectors(
    dfEdge19CSConnect1['name'], dfEdge19CSConnect1['data'], timeWindowCSConnect,
    minimum=0.1, pca_x=1, pca_y=2, mode='events'
)
aa = componentFactors(
    dfEdge19CSConnect1['data'], timeWindowCSConnect,
    mode='events', head=100
)

In [None]:
# 4. CSConnect Router
# Time window passed to the plot and factor helpers in the router cells below.
# NOTE(review): '3T' looks like a pandas offset alias (3 minutes) — confirm
# against how the transpose helpers consume it.
timeWindowCSConnectRouter = '3T'

In [None]:
# Run the plot helpers and show the factor table for 'abb-edge-18-csconnectrouter-1'.
dfEdge18CSConnectRouter1 = [entry for entry in dfList if entry['name'] == 'abb-edge-18-csconnectrouter-1'][0]
plotComponentBars(
    dfEdge18CSConnectRouter1['name'], dfEdge18CSConnectRouter1['data'], timeWindowCSConnectRouter
)
plotScatterPlot(
    dfEdge18CSConnectRouter1['name'], dfEdge18CSConnectRouter1['data'], timeWindowCSConnectRouter
)
plot3DScatterPlot(
    dfEdge18CSConnectRouter1['name'], dfEdge18CSConnectRouter1['data'], timeWindowCSConnectRouter
)
plotVectors(
    dfEdge18CSConnectRouter1['name'], dfEdge18CSConnectRouter1['data'], timeWindowCSConnectRouter
)
display(
    componentFactors(dfEdge18CSConnectRouter1['data'], timeWindowCSConnectRouter)
)

In [None]:
# Run the plot helpers and show the factor table for 'abb-edge-19-csconnectrouter-1'.
dfEdge19CSConnectRouter1 = [entry for entry in dfList if entry['name'] == 'abb-edge-19-csconnectrouter-1'][0]
plotComponentBars(
    dfEdge19CSConnectRouter1['name'], dfEdge19CSConnectRouter1['data'], timeWindowCSConnectRouter
)
plotScatterPlot(
    dfEdge19CSConnectRouter1['name'], dfEdge19CSConnectRouter1['data'], timeWindowCSConnectRouter
)
plot3DScatterPlot(
    dfEdge19CSConnectRouter1['name'], dfEdge19CSConnectRouter1['data'], timeWindowCSConnectRouter
)
plotVectors(
    dfEdge19CSConnectRouter1['name'], dfEdge19CSConnectRouter1['data'], timeWindowCSConnectRouter
)
display(
    componentFactors(dfEdge19CSConnectRouter1['data'], timeWindowCSConnectRouter)
)

In [None]:
# 5. CSConnect Simulator
# Time window passed to the plot and factor helpers in the simulator cell below.
# NOTE(review): '10T' looks like a pandas offset alias (10 minutes) — confirm
# against how the transpose helpers consume it.
timeWindowCSConnectSimulator = '10T'

In [None]:
# Run the plot helpers and show the factor table for 'abb-edge-19-csconnectsimulator-1'.
dfEdge19CSConnectSimulator1 = [entry for entry in dfList if entry['name'] == 'abb-edge-19-csconnectsimulator-1'][0]
plotComponentBars(
    dfEdge19CSConnectSimulator1['name'], dfEdge19CSConnectSimulator1['data'], timeWindowCSConnectSimulator
)
plotScatterPlot(
    dfEdge19CSConnectSimulator1['name'], dfEdge19CSConnectSimulator1['data'], timeWindowCSConnectSimulator
)
plot3DScatterPlot(
    dfEdge19CSConnectSimulator1['name'], dfEdge19CSConnectSimulator1['data'], timeWindowCSConnectSimulator
)
plotVectors(
    dfEdge19CSConnectSimulator1['name'], dfEdge19CSConnectSimulator1['data'], timeWindowCSConnectSimulator,
    minimum=5
)
display(
    componentFactors(dfEdge19CSConnectSimulator1['data'], timeWindowCSConnectSimulator)
)

#### DBSCAN

In [None]:
# Common function for DBSCAN

from sklearn.cluster import DBSCAN # for building a clustering model
from sklearn import metrics # for calculating Silhouette score

def trackData(data, timeWindow):
    """Return the result of transposeDataComplete(data, timeWindow) unchanged."""
    return transposeDataComplete(data, timeWindow)

def dbscanPlot3D(name, data, timeWindow, epsilon, minPoints, pca_x = 1, pca_y = 2, pca_z = 3, trainModel = 3, mode = 'complete'):
    """Cluster principal components with DBSCAN and show them in a 3D scatter plot.

    Ten principal components are extracted with extractComponents. DBSCAN is
    fitted on the first `trainModel` of them, and the three components chosen
    by pca_x / pca_y / pca_z are plotted, coloured by cluster label.

    Parameters:
        name: used as the plot title and forwarded to extractComponents.
        data, timeWindow, mode: forwarded to extractComponents.
        epsilon: DBSCAN eps (neighbourhood radius).
        minPoints: DBSCAN min_samples.
        pca_x, pca_y, pca_z: 1-based indices of the plotted components (1..10).
        trainModel: number of leading components DBSCAN is fitted on.

    Returns:
        None; the figure is displayed with fig.show().
    """
    x_pca_name = 'pca' + str(pca_x)
    y_pca_name = 'pca' + str(pca_y)
    z_pca_name = 'pca' + str(pca_z)

    components = extractComponents(name, data, timeWindow, mode = mode, components = 10)
    # pd.DataFrame (the alias imported at the top of the file) instead of a bare DataFrame
    df = pd.DataFrame(components)
    df.columns = ['pca' + str(i) for i in range(1, len(df.columns) + 1)]

    model = DBSCAN(
        eps=epsilon, # default=0.5, The maximum distance between two samples for one to be considered as in the neighborhood of the other.
        min_samples=minPoints, # default=5, The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
        metric='euclidean', # default='euclidean'. The metric to use when calculating distance between instances in a feature array.
        metric_params=None, # default=None, Additional keyword arguments for the metric function.
        algorithm='auto', # {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto', The algorithm to be used by the NearestNeighbors module to
                          # compute pointwise distances and find nearest neighbors.
        leaf_size=30, # default=30, Leaf size passed to BallTree or cKDTree.
        p=None, # default=None, The power of the Minkowski metric to be used to calculate distance between points. If None, then p=2
        n_jobs=None, # default=None, The number of parallel jobs to run. None means 1 unless in a joblib.parallel_backend context. -1 means using
                     # all processors.
    )

    # Fit the model on the first `trainModel` components only
    trainColumns = ['pca' + str(i) for i in range(1, trainModel + 1)]
    clm = model.fit(df.loc[:, trainColumns])

    # Take the plotted axes from the full component table. The previous version
    # sliced the training columns, so any plotted axis above trainModel raised
    # KeyError. The explicit copy keeps the label assignment off a slice of df.
    df3D = df.loc[:, [x_pca_name, y_pca_name, z_pca_name]].copy()
    df3D['label'] = ['Outlier' if c == -1 else 'Cluster ' + str(c) for c in clm.labels_]

    fig = px.scatter_3d(df3D,
        x=x_pca_name, y=y_pca_name, z=z_pca_name, color='label',
        opacity=0.3, color_discrete_sequence = px.colors.qualitative.Plotly,
        hover_name=['id:' + str(i) for i in range(len(df3D))],
        height=1000, width=1000
    )
    fig.update_traces(marker_size=10, marker_line=dict(width=1, color='rgba(255, 255, 255, 0.6)'))
    fig.update_layout(title = name,  scene=dict(xaxis_title=x_pca_name, yaxis_title=y_pca_name, zaxis_title=z_pca_name))
    fig.show()

def dbscanPlot2D(name, data, timeWindow, epsilon, minPoints, pca_x = 1, pca_y = 2, trainModel = 2, mode = 'complete'):
    """Cluster PCA components with DBSCAN and show a 2D scatter plot of the result.

    Ten components are requested from ``extractComponents``; DBSCAN is fitted on
    the first ``trainModel`` of them, and the two components selected by
    ``pca_x`` / ``pca_y`` are plotted, coloured by cluster label.

    Parameters:
        name: plot title; also forwarded to ``extractComponents``.
        data: forwarded unchanged to ``extractComponents``.
        timeWindow: forwarded unchanged to ``extractComponents``.
        epsilon: DBSCAN ``eps`` (neighbourhood radius).
        minPoints: DBSCAN ``min_samples``.
        pca_x: 1-based index of the component drawn on the x axis.
        pca_y: 1-based index of the component drawn on the y axis.
        trainModel: number of leading components (pca1..pcaN) used for fitting.
        mode: forwarded unchanged to ``extractComponents``.

    Returns:
        None. The figure is displayed with ``fig.show()``.

    Raises:
        KeyError: if ``pca_x``, ``pca_y`` or ``trainModel`` refers to a component
            that is not present in the extracted component table.
    """
    x_pca_name = 'pca' + str(pca_x)
    y_pca_name = 'pca' + str(pca_y)

    components = extractComponents(name, data, timeWindow, mode = mode, components = 10)
    df = DataFrame(components)
    df.columns = ['pca' + str(i) for i in range(1, len(df.columns) + 1)]

    model = DBSCAN(
        eps=epsilon, # default=0.5, The maximum distance between two samples for one to be considered as in the neighborhood of the other.
        min_samples=minPoints, # default=5, The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
        metric='euclidean', # default='euclidean'. The metric to use when calculating distance between instances in a feature array.
        metric_params=None, # default=None, Additional keyword arguments for the metric function.
        algorithm='auto', # {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto', The algorithm to be used by the NearestNeighbors module to
                          # compute pointwise distances and find nearest neighbors.
        leaf_size=30, # default=30, Leaf size passed to BallTree or cKDTree.
        p=None, # default=None, The power of the Minkowski metric to be used to calculate distance between points. If None, then p=2
        n_jobs=None, # default=None, The number of parallel jobs to run. None means 1 unless in a joblib.parallel_backend context. -1 means using
                     # all processors.
    )

    # Fit the model on the first `trainModel` components only.
    trainColumns = ['pca' + str(i) for i in range(1, trainModel + 1)]
    clm = model.fit(df.loc[:, trainColumns])
    labels = ['Outlier' if c == -1 else 'Cluster ' + str(c) for c in clm.labels_]

    # Build the plotting frame from the full component table rather than from the
    # training subset: the plotted axes do not have to be among the components the
    # model was trained on (e.g. pca_y=3 with trainModel=2). A fresh frame also
    # avoids assigning columns onto a slice of `df`.
    df2D = DataFrame({
        x_pca_name: df[x_pca_name],
        y_pca_name: df[y_pca_name],
        'label': labels,
    })

    fig = px.scatter(df2D,
        x=x_pca_name, y=y_pca_name, color='label',
        opacity=0.3, color_discrete_sequence = px.colors.qualitative.Plotly,
        hover_name=['id:' + str(i) for i in range(len(df2D))],
    )
    fig.update_traces(marker_size=15, marker_line=dict(width=1, color='rgba(255, 255, 255, 0.6)'))
    fig.update_layout(autosize=False, title=name, width=1000, height=500, xaxis_title=x_pca_name, yaxis_title=y_pca_name)
    fig.show()

In [None]:
# Time-window setting passed as the `timeWindow` argument of the DBSCAN plots
# for the csconnect data.
# NOTE(review): '5T' looks like a pandas offset alias (5 minutes); the 'T' alias
# is deprecated in pandas >= 2.2 in favour of 'min' - confirm how the helpers
# consume this string before changing it.
timeWindowCSConnect = '5T'

In [None]:
# Select the dfList entry named 'abb-edge-19-csconnect-1'
# (indexing [0] raises IndexError when no entry matches).
dfEdge19CSConnect1 = list(filter(
    lambda entry: entry['name'] == 'abb-edge-19-csconnect-1',
    dfList,
))[0]

# 3D view, model fitted on the first 3 components.
dbscanPlot3D(
    dfEdge19CSConnect1['name'],
    dfEdge19CSConnect1['data'],
    timeWindowCSConnect,
    epsilon=3,
    minPoints=2,
    trainModel=3,
    mode='complete',
)

# 2D view of pca2 vs pca3, model fitted on the first 4 components.
dbscanPlot2D(
    dfEdge19CSConnect1['name'],
    dfEdge19CSConnect1['data'],
    timeWindowCSConnect,
    epsilon=3,
    minPoints=2,
    pca_x=2,
    pca_y=3,
    trainModel=4,
    mode='complete',
)

# Same 2D view, model fitted on the first 3 components.
dbscanPlot2D(
    dfEdge19CSConnect1['name'],
    dfEdge19CSConnect1['data'],
    timeWindowCSConnect,
    epsilon=3,
    minPoints=2,
    pca_x=2,
    pca_y=3,
    trainModel=3,
    mode='complete',
)


In [None]:
# Time-window setting passed as the `timeWindow` argument of the DBSCAN plots
# (and of listTimeWindows) for the broker data.
# NOTE(review): '10T' looks like a pandas offset alias (10 minutes); the 'T'
# alias is deprecated in pandas >= 2.2 in favour of 'min' - confirm how the
# helpers consume this string before changing it.
timeWindowBroker = '10T'

In [None]:
# Select the dfList entry named 'abb-edge-17-broker-2'
# (indexing [0] raises IndexError when no entry matches).
dfEdge17Broker2 = list(filter(
    lambda entry: entry['name'] == 'abb-edge-17-broker-2',
    dfList,
))[0]

# 3D view, model fitted on the first 5 components.
dbscanPlot3D(
    dfEdge17Broker2['name'],
    dfEdge17Broker2['data'],
    timeWindowBroker,
    epsilon=2,
    minPoints=2,
    trainModel=5,
    mode='complete',
)

# 2D view of pca2 vs pca3 with epsilon=2.
dbscanPlot2D(
    dfEdge17Broker2['name'],
    dfEdge17Broker2['data'],
    timeWindowBroker,
    epsilon=2,
    minPoints=2,
    pca_x=2,
    pca_y=3,
    trainModel=5,
    mode='complete',
)

# Same 2D view with a wider neighbourhood radius (epsilon=3).
dbscanPlot2D(
    dfEdge17Broker2['name'],
    dfEdge17Broker2['data'],
    timeWindowBroker,
    epsilon=3,
    minPoints=2,
    pca_x=2,
    pca_y=3,
    trainModel=5,
    mode='complete',
)

In [None]:
# Select the dfList entry named 'abb-edge-19-broker-5'
# (indexing [0] raises IndexError when no entry matches).
dfEdge19Broker5 = list(filter(
    lambda entry: entry['name'] == 'abb-edge-19-broker-5',
    dfList,
))[0]

# 3D view, model fitted on the first 5 components.
dbscanPlot3D(
    dfEdge19Broker5['name'],
    dfEdge19Broker5['data'],
    timeWindowBroker,
    epsilon=2,
    minPoints=2,
    trainModel=5,
    mode='complete',
)

# 2D view of pca1 vs pca2, same model settings.
dbscanPlot2D(
    dfEdge19Broker5['name'],
    dfEdge19Broker5['data'],
    timeWindowBroker,
    epsilon=2,
    minPoints=2,
    pca_x=1,
    pca_y=2,
    trainModel=5,
    mode='complete',
)

# NOTE(review): the whole dfList entry is passed here, whereas the plot calls
# pass its 'name' and 'data' fields separately - confirm this matches the
# listTimeWindows signature. The output path is a hard-coded absolute path.
listTimeWindows(
    dfEdge19Broker5,
    timeWindowBroker,
    '/home/javarotti/Data/Analysis/Statistics/Temp/dfEdge19Broker5.csv',
)

 