In [None]:
import duckdb
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns
import sys

# Make the helper modules that live in ../utils (relative to the current
# working directory) importable, adding the directory to sys.path only once
# so re-running this cell does not pile up duplicate entries.
directory_path = os.path.abspath('../utils/')
if directory_path not in sys.path:
    sys.path.append(directory_path)
# Project-local helpers and constants used by the cells below.
from functions import *
from constant import *

# Experiment parameters, need to be set before running this notebook.
# The two lists are parallel: entry i of EXPERIMENT_START_TIME is the start
# time passed along with entry i of EXPERIMENT_ID when events are retrieved.
EXPERIMENT_ID = [
    "spark_del_sf_1000",
    "spark_ib_cow_w3m_sf_1000",
    "spark_ib_mor_w3m_sf_1000",
    "spark_ib_cow_w3_sf_1000",
]
EXPERIMENT_START_TIME = [
    "2023-06-29T14:39:03.335772Z",
    "2023-06-30T17:40:51.588244Z",
    "2023-07-01T19:34:38.242127Z",
    "2023-07-04T10:07:29.985447Z",
]

In [None]:
# --- Check input validity and create DB connection --- #
# Every experiment id needs a matching start time: the cells below pair the
# two lists by position. Report both lengths so a mismatch is easy to fix.
assert len(EXPERIMENT_ID) == len(EXPERIMENT_START_TIME), (
    f"EXPERIMENT_ID has {len(EXPERIMENT_ID)} entries but "
    f"EXPERIMENT_START_TIME has {len(EXPERIMENT_START_TIME)}; "
    "they must match one-to-one."
)

# Connect to database.
# Opened with read_only=True; the connection is reused by the cells below.
con = duckdb.connect(database=DUCKDB_PATH, read_only=True)

In [None]:
# Compute phase time.
# Load the events of every experiment, tag each frame with its experiment id,
# and combine them with a single concat. Growing a DataFrame with pd.concat
# inside the loop re-copies all accumulated rows on every iteration.
# The loop variable is deliberately not called `id`: at notebook top level
# that would overwrite the builtin id() for every later cell.
exp_frames = []
for exp_id, exp_start_time in zip(EXPERIMENT_ID, EXPERIMENT_START_TIME):
    exp_df = retrieve_event_df(con, exp_id, exp_start_time)
    exp_df["exp_name"] = exp_id
    exp_frames.append(exp_df)
# pd.concat raises on an empty list, so fall back to an empty frame when no
# experiments are configured.
EXP_DATA = pd.concat(exp_frames) if exp_frames else pd.DataFrame()

# Keep only the events of type "EXEC_PHASE".
EXP_DATA = filterByEventType(EXP_DATA, "EXEC_PHASE")
# Calculate latency for each element.
EXP_DATA['time_diff_in_mins'] = EXP_DATA.apply(
    lambda row: time_diff_in_minutes(row['event_start_time'], row['event_end_time']),
    axis=1,
)

# Sum of time_diff_in_mins per experiment; kept as the last expression so the
# notebook renders the resulting table.
EXP_DATA.groupby(['exp_name'])['time_diff_in_mins'].sum().reset_index()

In [None]:
# Compute statement time.
# Load the events of every experiment, tag each frame with its experiment id,
# and combine them with a single concat. Growing a DataFrame with pd.concat
# inside the loop re-copies all accumulated rows on every iteration.
# The loop variable is deliberately not called `id`: at notebook top level
# that would overwrite the builtin id() for every later cell.
exp_frames = []
for exp_id, exp_start_time in zip(EXPERIMENT_ID, EXPERIMENT_START_TIME):
    exp_df = retrieve_event_df(con, exp_id, exp_start_time)
    exp_df["exp_name"] = exp_id
    exp_frames.append(exp_df)
# pd.concat raises on an empty list, so fall back to an empty frame when no
# experiments are configured.
EXP_DATA = pd.concat(exp_frames) if exp_frames else pd.DataFrame()

# Keep only the events of type "EXEC_STATEMENT".
EXP_DATA = filterByEventType(EXP_DATA, "EXEC_STATEMENT")
# Calculate latency for each element.
EXP_DATA['time_diff_in_mins'] = EXP_DATA.apply(
    lambda row: time_diff_in_minutes(row['event_start_time'], row['event_end_time']),
    axis=1,
)

# Sum of time_diff_in_mins per experiment; kept as the last expression so the
# notebook renders the resulting table.
EXP_DATA.groupby(['exp_name'])['time_diff_in_mins'].sum().reset_index()