In [None]:
import datetime
import json
import os
import glob
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from rich import print

In [None]:
# Directory whose immediate sub-folders are searched for log files
folder = 'logs'

# Update with your logfile name. The value is used as a regular expression
# and matched against every file name with re.match.
scenario = 'scenario01_DOCDB_5371_9h55h92f.json'

# Multiple logfiles that share the same suffix can be analyzed at once by
# using a wildcard in the pattern.
# For example, to analyze all files ending in _rv9ft04i at once:
# scenario = r'scenario01_DOCDB_.*_rv9ft04i\.json'

# Aggregated statistics, keyed by a 'YYYY-MM-DD HH:MM:SS' timestamp string
data = {}

def get_folders(folder):
    """Return the names of the immediate sub-directories of ``folder``.

    Only the entry names are returned (not full paths), in the order that
    ``os.listdir`` reports them.
    """
    names = os.listdir(folder)
    paths = (os.path.join(folder, name) for name in names)
    return [name for name, path in zip(names, paths) if os.path.isdir(path)]

def get_files(folder):
    """Return the names of the regular files directly inside ``folder``.

    Only the entry names are returned (not full paths), in the order that
    ``os.listdir`` reports them. Sub-directories are skipped.
    """
    names = os.listdir(folder)
    paths = (os.path.join(folder, name) for name in names)
    return [name for name, path in zip(names, paths) if os.path.isfile(path)]

def _average(total_time, count):
    """Return ``total_time / count``, or 0 when ``count`` is 0."""
    return total_time / count if count else 0


def _merge_stats(aggregated, key, stats):
    """Fold one log record into ``aggregated[key]``.

    ``stats`` must provide 'count', 'query_time', 'min_time' and 'max_time'.
    A new entry is created when ``key`` is not present yet; otherwise the
    counts and query times are summed, min/max are widened, and the average
    is recomputed from the combined totals.
    """
    entry = aggregated.get(key)
    if entry is None:
        aggregated[key] = {
            'count': stats['count'],
            'query_time': stats['query_time'],
            'min_time': stats['min_time'],
            'max_time': stats['max_time'],
            'avg_time': _average(stats['query_time'], stats['count']),
        }
        return

    entry['count'] += stats['count']
    entry['query_time'] += stats['query_time']
    entry['min_time'] = min(entry['min_time'], stats['min_time'])
    entry['max_time'] = max(entry['max_time'], stats['max_time'])
    # The combined count can still be 0 (e.g. two idle records for the same
    # second), so the average needs the same zero guard as a new entry.
    entry['avg_time'] = _average(entry['query_time'], entry['count'])


# Compile once instead of re-parsing the pattern for every file name.
# NOTE: re.match anchors only at the start of the name, so file names that
# merely begin with the pattern are picked up as well.
scenario_pattern = re.compile(scenario)

boxes = get_folders(folder)
for box in boxes:
    files = get_files(os.path.join(folder, box))
    for file in files:
        if not scenario_pattern.match(file):
            continue
        with open(os.path.join(folder, box, file), 'r') as f:
            file_data = json.load(f)
        # Keys are epoch seconds (as strings); records from different files
        # that fall in the same second are merged into a single entry.
        for timestamp, stats in file_data.items():
            datetime_str = datetime.datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H:%M:%S')
            _merge_stats(data, datetime_str, stats)


# Order the data by timestamp. This is useful when several logfiles are
# combined, because their entries are inserted file by file rather than in
# chronological order. The keys are 'YYYY-MM-DD HH:MM:SS' strings, so a plain
# string sort is also a chronological sort.
dataKeys = sorted(data)
sortedData = {}
for key in dataKeys:
    sortedData[key] = data[key]
data = sortedData

In [None]:
# Convert the dictionary to a Pandas DataFrame: one row per timestamp key,
# one column per statistic (count, query_time, min_time, max_time, avg_time)
df = pd.DataFrame.from_dict(data, orient='index')

# Convert the index from timestamp strings to a DatetimeIndex
df.index = pd.to_datetime(df.index)

# Open a new 8x6 inch figure. plt.plot() has no figsize option, so the size
# has to be set on the figure itself.
plt.figure(figsize=(8, 6))
plt.plot(df.index, df['count'], label='count')
# plt.plot(df.index, df['query_time'], label='query_time')

# Add labels and title
# plt.ylim(0, 20000)
plt.xlabel('Time')
plt.ylabel('Value')
plt.title('DocumentDB + EC Requests/s (count)')
plt.xticks(rotation=90)

# Show the legend
plt.legend()

# Show the plot
plt.show()

In [None]:
# Plot the average response time per second on the current axes
ax = plt.gca()
# Optional: clamp the y-axis or overlay the min/max series
# ax.set_ylim(0, 0.02)
# ax.plot(df.index, df['min_time'], label='min_time')
# ax.plot(df.index, df['max_time'], label='max_time')
ax.plot(df.index, df['avg_time'], label='avg_time')

# Axis labels and title
ax.set_xlabel('Time')
ax.set_ylabel('Value')
ax.set_title('DocumentDB + EC Response Time (sec)')

# Rotate the timestamp tick labels so they do not overlap
plt.xticks(rotation=90)

# Legend for the plotted series
ax.legend()

# Render the figure
plt.show()