In [None]:
## Load Dataframe
import pandas as pd
pd.options.mode.chained_assignment = None
import os

# Read every file found below `root` and stack them into one consolidated frame.
# NOTE(review): every file is handed to read_csv regardless of extension, so a stray
# non-CSV file in the tree will fail here -- confirm the directory only holds CSVs.
df_arr = []
root = ".dataDir"
for path, subdirs, files in os.walk(root):
    # os.walk yields entries in filesystem-dependent order; sorting the directory
    # list in place and the file names makes the row order of df_cons reproducible.
    subdirs.sort()
    for name in sorted(files):
        df = pd.read_csv(os.path.join(path, name))
        df_arr.append(df)

# pd.concat([]) would fail with the opaque "No objects to concatenate"; report the
# real cause (missing or empty data directory) instead.
if not df_arr:
    raise FileNotFoundError(f"No input files found under {os.path.abspath(root)!r}")

df_cons = pd.concat(df_arr, ignore_index=True)
# 'ts' is parsed as epoch milliseconds.
df_cons['ts'] = pd.to_datetime(df_cons['ts'], unit='ms')

display(df_cons)

In [None]:
## Create tenant-specific time-series plots
import plotly.express as px
from plotly.subplots import make_subplots

## Filters on tenant and date
customer_name='exl'
# Plain substring match on the tenant name (regex disabled).
tenant_mask = df_cons['tenant'].str.contains(customer_name, regex=False)
# Series.between is inclusive on both ends; the bounds are date-only strings.
date_mask = df_cons['ts'].between('2024-7-5', '2024-7-11')
df_tenant = df_cons[tenant_mask & date_mask]
# Human-readable range used in the plot titles: earliest and latest timestamp
# actually present after filtering (the upper end previously repeated min()).
dateStr = str(df_tenant['ts'].min())+' and '+str(df_tenant['ts'].max())

# Total scanned volume and processing hours per file format for the selected tenant,
# plus the derived throughput in GB per 24 hours of processing, sorted ascending by it.
df_app = (
    df_tenant.groupby(['fileFormat'])
    .agg({'dataScannedInGB': 'sum', 'processingTimeinHrs': 'sum'})
    .add_suffix('_Count')
    .reset_index()
    .assign(
        scanSpeedinGBperDay=lambda totals: (
            totals['dataScannedInGB_Count'] * 24 / totals['processingTimeinHrs_Count']
        )
    )
    .sort_values('scanSpeedinGBperDay')
)

# Horizontal bar chart, one bar per file format, on a log-scaled x axis.
fig3 = px.bar(
    df_app,
    y='fileFormat',
    x='scanSpeedinGBperDay',
    log_x=True,
    orientation='h',
    title='Unstructured ScanSpeed by FileFormat between '+dateStr+ ' for '+customer_name,
)
fig3.show()

# Aggregate the filtered rows per (timestamp, tenant): sums for volume, file count,
# idle and processing hours; mean for the pod count.
df_tenn = (
    df_tenant.groupby(['ts', 'tenant'])
    .agg({
        'dataScannedInGB': 'sum',
        'numberOfFilesScanned': 'sum',
        'IdleTimeInHrs': 'sum',
        'processingTimeinHrs': 'sum',
        'uniqPodCount': 'mean',
    })
    .add_suffix('_Count')
    .reset_index()
)

# Rescale for plotting: file counts in thousands, hour totals in days. All assigned
# columns already exist, so the column order (relied on downstream) is unchanged.
df_tenn = df_tenn.assign(
    ts=pd.to_datetime(df_tenn['ts'], unit='ms'),
    numberOfFilesScanned_Count=(df_tenn['numberOfFilesScanned_Count'] / 1000).astype(float),
    processingTimeinHrs_Count=(df_tenn['processingTimeinHrs_Count'] / 24).astype(float),
    IdleTimeInHrs_Count=(df_tenn['IdleTimeInHrs_Count'] / 24).astype(float),
    uniqPodCount_Count=df_tenn['uniqPodCount_Count'].astype(float),
).rename(columns={
    'dataScannedInGB_Count': "dataScannedInGB",
    'processingTimeinHrs_Count': 'processingTimeinDays',
    'IdleTimeInHrs_Count': 'IdleTimeInDays',
    'numberOfFilesScanned_Count': 'numberOfFilesScanned(x1000)',
})

# Combined figure: the four volume/time metrics as bars on the primary y axis and
# the mean pod count as a line on the secondary y axis.
# Columns are named explicitly instead of sliced by position (columns[2:6]) so the
# plot does not silently change if the aggregation gains or reorders columns.
bar_metrics = ['dataScannedInGB', 'numberOfFilesScanned(x1000)', 'IdleTimeInDays', 'processingTimeinDays']
subfig = make_subplots(specs=[[{"secondary_y": True}]])
fig = px.bar(df_tenn, x='ts', y=bar_metrics, log_y=True, title=customer_name)
fig2 = px.line(df_tenn, x='ts', y='uniqPodCount_Count')
fig2.update_traces(yaxis="y2",showlegend=True,name='uniqDWCount_avg')
subfig.add_traces(fig.data + fig2.data)
# Only the traces are copied into subfig; the log scale requested via log_y lives
# in fig's layout and is not carried over, so it is re-applied on the primary axis.
subfig.layout.yaxis.type = "log"
subfig.layout.yaxis2.title="dw-count"
# NOTE(review): the primary axis is labelled "GB" but also carries file counts
# (x1000) and day totals -- confirm whether a more general label is wanted.
subfig.layout.yaxis.title="GB"
subfig.layout.title='Unstructured Scan performance between '+dateStr+' for '+customer_name
subfig.show()



In [None]:
## Run Aggregations
# Number of tenants shown in the ranking chart.
top_n = 15

# Per-tenant totals over the whole consolidated data set, the derived throughput
# (GB per 24 processing hours), sorted ascending by scanned volume so that
# .tail(top_n) yields the largest tenants.
df_aggr = (
    df_cons.groupby(['tenant'])
    .agg({'dataScannedInGB': 'sum', 'processingTimeinHrs': 'sum'})
    .assign(
        AvgDataInGBperdayByDw=lambda totals: (
            totals['dataScannedInGB'] * 24 / totals['processingTimeinHrs']
        )
    )
    .add_suffix('_Count')
    .reset_index()
    .sort_values('dataScannedInGB_Count')
    # Scanned volume divided by 1000 for display.
    .assign(dataScannedinTib=lambda ranked: (ranked['dataScannedInGB_Count'] / 1000).astype(float))
)

fig4 = px.bar(
    df_aggr.tail(top_n),
    y='tenant',
    x='dataScannedinTib',
    orientation='h',
    title='top '+str(top_n)+' customer by datavolume scanned',
)
fig4.show()

# Summary statistics of the per-tenant aggregates (last expression, shown as cell output).
df_aggr.describe(percentiles=[0.25, 0.75])

In [None]:
## Create Plots
# Throughput (GB per 24 processing hours) per (ds, tenant, pod) combination,
# shown as one box per 'ds' value on a log-scaled y axis.
df_aggr = df_cons.groupby(['ds','tenant','pod']).agg({'dataScannedInGB': 'sum', 'processingTimeinHrs':'sum'})
df_aggr['AvgDataInGBperdayByDw']=df_aggr.dataScannedInGB*24/df_aggr.processingTimeinHrs
df_aggr = df_aggr.add_suffix('_Count').reset_index()
fig = px.box(df_aggr, x='ds', y='AvgDataInGBperdayByDw_Count', color='ds', points=False, log_y=True)
fig.update_layout(
    font=dict(family="Didact Gothic"),
    # Bold markup needs a closing </b>; the titles previously ended with a second
    # opening <b>, leaving the tag unclosed.
    yaxis_title="<b>GB/day/dw</b>",
    # NOTE(review): the x axis plots the 'ds' column but is labelled "fileFormat" --
    # confirm the intended label.
    xaxis_title="<b>fileFormat</b>",
    title_x=0.5,
    title_font=dict(size=24,color='black'),
    plot_bgcolor='rgba(0, 0, 0, 0)',   # transparent plot area
    paper_bgcolor='rgba(0, 0, 0, 0)',  # transparent surrounding paper
    width=1200,
    height=900
)
fig.update_traces(quartilemethod="exclusive")
fig.update_yaxes(nticks=10, minor=dict(showgrid=True, gridwidth=2, gridcolor='Black'))
fig.show()