In [5]:
## Load Dataframe
import pandas as pd
import plotly.express as px
import warnings
import fnmatch
import os

# Opt in to pandas' future "no silent downcasting" behaviour (see the pandas
# options documentation for 'future.no_silent_downcasting').
# NOTE(review): presumably set because of the fillna("master") call on the
# node_ip column further down — confirm.
pd.set_option('future.no_silent_downcasting', True)

def loadPrometheusData(root, fileRegex, metricsName, fileAggFunc, fileExtn, aggfunction):
    """Load one exported metric and aggregate it per appliance, timestamp and node.

    Files are located under ``root`` with the glob pattern
    ``fileRegex + metricsName + '-' + fileAggFunc + '*' + fileExtn`` (despite its
    name, ``fileRegex`` is used as a plain file-name prefix).

    Parameters
    ----------
    root : str
        Directory that is searched recursively for matching files.
    fileRegex : str
        File-name prefix placed in front of the metric name.
    metricsName : str
        Metric name; also selects the relabelling rules applied below.
    fileAggFunc : str
        Aggregation suffix in the file name (e.g. ``'max'``, ``'avg'``); it is
        appended to every value written to the ``metrics`` column.
    fileExtn : str
        File extension, including the dot.
    aggfunction : str or callable
        Aggregation passed to ``groupby(...).agg`` for the ``value`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``appliance_id``, ``ts`` (datetime, parsed from epoch seconds),
        ``node_ip``, ``metrics`` and ``value``. If no data was found, an empty
        frame with these same columns is returned instead of raising.
    """
    filePattern = fileRegex+metricsName+'-'+fileAggFunc+'*'+fileExtn
    print("processing "+filePattern)
    df1 = loadDataFrameFromFileRegex(root, filePattern, metrics=metricsName+'_'+fileAggFunc)

    if df1.empty:
        # The loader hands back a column-less frame when nothing matched;
        # indexing 'node_ip' / 'metrics_name' on it would raise KeyError.
        # Return a typed, empty result so callers can still concat it.
        return pd.DataFrame({
            'appliance_id': pd.Series(dtype='object'),
            'ts': pd.Series(dtype='datetime64[ns]'),
            'node_ip': pd.Series(dtype='object'),
            'metrics': pd.Series(dtype='object'),
            'value': pd.Series(dtype='float64'),
        })

    # Substring of 'metrics_name' -> prefix of the new 'metrics' label.
    # Rules are applied in order, so a later rule wins when several match.
    # NOTE(review): the first two task-queue substrings both mention the
    # "downloader" queue — confirm they really target different queues.
    relabelRules = {
        'task_queue_length': [
            ('securiti-appliance-downloader-tasks-queue', 'taskq_'),
            ('t-appliance-downloader-tasks-queue', 'downloadq_'),
            ('securiti-appliance-linker', 'linkerq_'),
        ],
        'infra_access_latency': [
            ('appliance_es_access_latency', 'esLatency_'),
            ('appliance_postgres_access_latency', 'pgLatency_'),
            ('appliance_redis_access_latency', 'redisLatency_'),
        ],
    }
    for needle, labelPrefix in relabelRules.get(metricsName, []):
        df1.loc[df1['metrics_name'].str.contains(needle, regex=False), 'metrics'] = labelPrefix+fileAggFunc

    # Rows without a node IP are attributed to a pseudo node called "master".
    df1['node_ip'] = df1['node_ip'].fillna("master")
    df1 = df1.groupby(['appliance_id', 'ts', 'node_ip', 'metrics']).agg(value=('value', aggfunction)).reset_index()
    df1['ts'] = pd.to_datetime(df1['ts'], unit='s')
    return df1[['appliance_id', 'ts', 'node_ip', 'metrics', 'value']]

def loadDataFrameFromFileRegex(root, regex, **kwargs):
    """Read every non-empty CSV below ``root`` whose file name matches ``regex``.

    ``regex`` is matched with ``fnmatch`` (shell-style wildcards) against the
    bare file name. Each file becomes a DataFrame with an extra ``metrics``
    column inserted at position 1, holding the ``metrics`` keyword argument
    (``None`` when not supplied). All frames are concatenated with a fresh
    index.

    Returns an empty, column-less DataFrame (after emitting a warning) when no
    file qualifies.
    """
    metricLabel = kwargs.get('metrics')
    frames = []

    for dirPath, _dirNames, fileNames in os.walk(root):
        for fileName in fileNames:
            if not fnmatch.fnmatch(fileName, regex):
                continue
            csvPath = os.path.join(dirPath, fileName)
            # Zero-byte files are skipped before they reach read_csv.
            if os.path.getsize(csvPath) <= 0:
                continue
            frame = pd.read_csv(csvPath)
            frame.insert(1, 'metrics', metricLabel)
            frames.append(frame)

    if frames:
        # FutureWarnings raised while concatenating are silenced on purpose.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=FutureWarning)
            return pd.concat(frames, ignore_index=True)

    warnings.warn("No matching file found in "+root+" for regex: "+regex+". Empty dataframe will be returned." )
    return pd.DataFrame()

In [None]:
# --- Where the exported metric files live and how they are named ---
root, filePrefix, fileExtn = 'dataDir', 'securiti_appliance_', '.csv'

# Metrics loaded in this run. Currently left out:
# 'infra_access_latency', 'pod_cpu_usage', 'pod_memory_usage'.
metricsArr = ['cpu_used', 'download_workers_count', 'memory_used', 'task_queue_length']

# One frame per (metric, file aggregation) pair; files exported as 'max' are
# re-aggregated with max, everything else with mean.
df_arr = []
for metricsName in metricsArr:
    for fileAggFunc in ('max', 'avg'):
        aggfunction = 'max' if fileAggFunc == 'max' else 'mean'
        df_tmp = loadPrometheusData(root, filePrefix, metricsName, fileAggFunc, fileExtn, aggfunction)
        df_arr.append(df_tmp)
df = pd.concat(df_arr, ignore_index=True)

# --- Scan statistics from the UNSTRUCTURED-*.csv files ---
print("loading Unstrctured Data from file")
df9 = loadDataFrameFromFileRegex(root, 'UNSTRUCTURED-*.csv', metrics='dataScanned')
# The 'pod' column is treated as the appliance id, and every row is
# attributed to the pseudo node "master".
df9 = df9.rename(columns={'pod': 'appliance_id'}).assign(node_ip="master")
df9 = (
    df9.groupby(['appliance_id', 'ts', 'node_ip'])
    .agg(
        dataScanned=('dataScannedInGB', 'sum'),
        scanTime=('processingTimeinHrs', 'sum'),
        numFilesScanned=('numberOfFilesScanned', 'sum'),
        scannerIdleTime=('IdleTimeInHrs', 'sum'),
        uniqPodCount=('uniqPodCount', 'max'),
    )
    .reset_index()
)
# Here 'ts' is parsed as epoch milliseconds; the average file size is derived
# from the GB total (x1000) divided by the number of files.
df9 = df9.assign(
    ts=pd.to_datetime(df9['ts'], unit='ms'),
    avgFileSizeInMB=df9['dataScanned']*1000/df9['numFilesScanned'],
)
# Reshape to the same long layout as df (one row per metric value) and append.
df9 = df9.melt(id_vars=['appliance_id', 'ts', 'node_ip'], var_name='metrics', value_name='value')
df = pd.concat([df, df9], ignore_index=True)

display(df)


In [None]:
# Total 'dataScanned' per appliance inside the window, largest first.
# Both window bounds are inclusive; a bare date string compares as midnight.
fromDt, toDt = '2024-08-04', '2024-08-19'
dfds = df.loc[(df['metrics'] == 'dataScanned') & df['ts'].between(fromDt, toDt)]
dfds = (
    dfds.groupby('appliance_id')
    .agg(value=('value', 'sum'))
    .reset_index()
    .sort_values('value', ascending=False)
)
print(dfds.to_string())

In [None]:
# Line chart of every metric for one appliance: one facet row per metric,
# one coloured line per node_ip.
appliance_id = '0af83074-d88b-4990-b033-92f28b161d2c'
fromDt, toDt = '2024-08-05', '2024-08-07'
dfp = df.loc[(df['appliance_id'] == appliance_id) & df['ts'].between(fromDt, toDt)]
fig = px.line(
    dfp,
    x="ts",
    y="value",
    color='node_ip',
    facet_row='metrics',
    height=7000,
    facet_row_spacing=0.005,
    # 'dataScanned' and 'scanTime' facets are listed first.
    category_orders={"metrics": ["dataScanned", "scanTime"]},
)
# Give every facet its own y-axis range instead of a shared one.
fig = fig.update_yaxes(matches=None)
fig.show()

In [None]:
# Correlation heat map between metrics for one appliance and time window.
appliance_id = '37286f5a-9f8d-4f05-829a-2e9a8f25c5e4'
fromDt = '2024-08-04'
toDt = '2024-08-06'
dfp = df[(df['appliance_id'] == appliance_id) & (df['ts'] >= fromDt) & (df['ts'] <= toDt) ]
# Pivot the *filtered* slice: previously the unfiltered df was pivoted, so
# appliance_id / fromDt / toDt had no effect on the heat map.
# One row per (appliance_id, ts), one column per metric; duplicates collapse via max.
dfc = dfp.pivot_table(index=['appliance_id','ts'], columns='metrics', values='value', aggfunc='max').reset_index()
# appliance_id is an identifier, not a measurement, so it is removed before corr().
# NOTE(review): 'ts' is still a column at this point — confirm whether it
# should take part in the correlation matrix.
dfc = dfc.drop('appliance_id', axis=1)
dfc = dfc.corr()
fig1 = px.imshow(dfc, text_auto=True, height=1000)
fig1.show()