In [1]:
import gzip
import json
import pandas as pd
import numpy as np
from os.path import join, isfile
from os import listdir
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def get_json(fname, fpath='../../wos_paper/wos_db_benchmark/benchmarking/'):
    """Load and return the parsed contents of one JSON file.

    Parameters
    ----------
    fname : str
        File name, joined onto ``fpath``.
    fpath : str
        Directory containing the file.

    Returns
    -------
    Whatever ``json.load`` produces for the file (dict, list, ...).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    full_path = join(fpath, fname)
    # Decode explicitly as UTF-8: without it open() falls back to the
    # locale's preferred encoding, which is not UTF-8 on every platform.
    with open(full_path, 'r', encoding='utf-8') as fp:
        return json.load(fp)

def get_json_fnames(sourcepath='../../wos_paper/wos_db_benchmark/benchmarking/'):
    """List the names of the ``.json`` files directly inside ``sourcepath``.

    Parameters
    ----------
    sourcepath : str
        Directory to scan (not recursive).

    Returns
    -------
    list of str
        Bare file names (no directory prefix), in ``os.listdir`` order,
        i.e. unsorted. Directories are skipped even if their name ends in
        ``.json``.
    """
    # Match on the full '.json' extension; comparing only the last four
    # characters to 'json' would also accept names such as 'notjson'.
    return [
        f for f in listdir(sourcepath)
        if f.endswith('.json') and isfile(join(sourcepath, f))
    ]

def get_content(sql_list, fpath, how='sql'):
    """Load each result file and tag it with its query key and row limit.

    A file name is split on ``'_'``. The first token is used as the key.
    If the name has fewer than three tokens the limit is ``-1``; otherwise
    the limit is the integer formed from the last token up to its first
    ``'.'`` (e.g. ``'query1_results_125.json'`` -> ``125``).

    Parameters
    ----------
    sql_list : iterable of str
        File names to load.
    fpath : str
        Directory passed through to ``get_json``.
    how : str
        Accepted for the caller's convenience; every value currently
        yields the same token threshold, so it does not affect the result.

    Returns
    -------
    list of tuple
        ``(key, limit, file_name, parsed_json)`` per input file, in input
        order.

    Raises
    ------
    ValueError
        If a name has three or more tokens but its trailing token does not
        start with an integer.
    """
    # Minimum number of '_'-separated tokens for a name to carry a limit.
    limit_tokens = 3

    report = []
    for fname in sql_list:
        tokens = fname.split('_')
        if len(tokens) < limit_tokens:
            limit = -1
        else:
            limit = int(tokens[-1].split('.')[0])
        payload = get_json(fname, fpath)
        report.append((tokens[0], limit, fname, payload))
    return report

def get_mysql_times(sql_report):
    """Reduce each report entry to its elapsed-time value.

    Parameters
    ----------
    sql_report : iterable of 4-tuples
        ``(key, limit, file_name, content)`` where ``content`` supports
        ``in`` and item lookup (a dict in practice).

    Returns
    -------
    list of tuple
        ``(key, limit, file_name, elapsed)``. ``elapsed`` is
        ``content['elapsed']`` when that key exists, otherwise
        ``content['elapsed_1978']``.

    Raises
    ------
    KeyError
        If neither key is present.
    """
    def _elapsed(content):
        # Prefer the plain key; fall back to the '_1978'-suffixed variant.
        if 'elapsed' in content:
            return content['elapsed']
        return content['elapsed_1978']

    return [
        (query, limit, fname, _elapsed(content))
        for query, limit, fname, content in sql_report
    ]

def get_arango_times(reports):
    """Summarise repeated runs of each report entry as mean and std.

    Parameters
    ----------
    reports : iterable of sequences
        The last element of each entry is an iterable of mappings (one per
        run, presumably per-stage timings; verify against the profile
        files). All preceding elements are carried through unchanged.

    Returns
    -------
    list of tuple
        ``(*leading_fields, mean, std)`` where each run is first collapsed
        to the sum of its mapping's values, and ``mean``/``std`` are
        ``np.mean``/``np.std`` over those per-run sums.
    """
    # First pass: collapse every run to a single total per entry.
    per_entry_totals = []
    for entry in reports:
        totals = [sum(run.values()) for run in entry[-1]]
        per_entry_totals.append((entry[:-1], totals))

    # Second pass: aggregate the totals of each entry.
    times_stat = []
    for leading, totals in per_entry_totals:
        times_stat.append((*leading, np.mean(totals), np.std(totals)))
    return times_stat

In [3]:
# Gather the SQL benchmark result files from the helper's default directory,
# load them, and extract one elapsed time per file.
sql_list = get_json_fnames()
sql_list.sort()
sql_report = get_content(sql_list, fpath='../../wos_paper/wos_db_benchmark/benchmarking/')
sql_times = get_mysql_times(sql_report)
sql_times[:4]

[('query1', -1, 'query1_results.json', 5.173050403594971),
 ('query1', 125, 'query1_results_125.json', 12.609336137771606),
 ('query2', -1, 'query2_results.json', 100.15733695030212),
 ('query3', -1, 'query3_results.json', 838.3496978282928)]

In [4]:
# Gather the ArangoDB profile files, load them, and reduce the repeated runs
# of each file to a mean and standard deviation.
arango_list = get_json_fnames(sourcepath='../results/arango/')
arango_list.sort()
arango_report = get_content(arango_list, fpath='../results/arango/')
arango_times = get_arango_times(arango_report)
arango_times[:3]

[('query1',
  -1,
  'query1_profile.json',
  102.45896935462952,
  0.47149495004183867),
 ('query1',
  10,
  'query1_profile_limit_10.json',
  0.0007680257161458334,
  4.134865280457959e-05),
 ('query1',
  1280,
  'query1_profile_limit_1280.json',
  9.039621829986572,
  4.916712782442367)]

In [17]:
# Tabulate both sets of timings, one row per result file, ordered by query
# and then by limit.
dfa = pd.DataFrame(
    arango_times,
    columns=['query', 'limit', 'fname', 'delta', 'std'],
).sort_values(by=['query', 'limit'])
dfs = pd.DataFrame(
    sql_times,
    columns=['query', 'limit', 'fname', 'delta'],
).sort_values(by=['query', 'limit'])

In [18]:
# Display the ArangoDB timings frame (columns: query, limit, fname, delta, std).
dfa

Unnamed: 0,query,limit,fname,delta,std
0,query1,-1,query1_profile.json,102.458969,0.471495
8,query1,5,query1_profile_limit_5.json,0.000692,0.000162
1,query1,10,query1_profile_limit_10.json,0.000768,4.1e-05
4,query1,20,query1_profile_limit_20.json,0.017741,0.006473
7,query1,40,query1_profile_limit_40.json,0.024754,0.014577
11,query1,80,query1_profile_limit_80.json,0.088174,0.051521
3,query1,160,query1_profile_limit_160.json,0.239687,0.121746
6,query1,320,query1_profile_limit_320.json,1.003525,0.65502
10,query1,640,query1_profile_limit_640.json,3.161221,1.671771
2,query1,1280,query1_profile_limit_1280.json,9.039622,4.916713


In [11]:
# Inner join: keep only the (query, limit) pairs that appear in both frames,
# with the two timings side by side as delta_sql / delta_arango.
df0 = (
    dfs[['query', 'limit', 'delta']]
    .merge(
        dfa[['query', 'limit', 'delta']],
        how='inner',
        on=['query', 'limit'],
        suffixes=('_sql', '_arango'),
    )
    .sort_values(by=['query', 'limit'])
)

In [12]:
# Display the inner-joined comparison of SQL and ArangoDB timings.
df0

Unnamed: 0,query,limit,delta_sql,delta_arango
0,query1,-1,5.17305,102.458969
1,query2,-1,100.157337,128.315026
2,query3,-1,838.349698,22.170851
3,query4,-1,3548.872995,88.873273
4,query5,-1,93.625098,248.488283
6,query5,5,212.950017,31.872162
5,query5,10,590.33232,42.69021


In [13]:
# Outer join: keep every (query, limit) pair from either frame; a timing
# that is missing on one side shows up as NaN.
df00 = (
    dfs[['query', 'limit', 'delta']]
    .merge(
        dfa[['query', 'limit', 'delta']],
        how='outer',
        on=['query', 'limit'],
        suffixes=('_sql', '_arango'),
    )
    .sort_values(by=['query', 'limit'])
)

In [16]:
# Raise pandas' display limits so the outer-joined frame is rendered without
# row/column truncation or line wrapping. These settings are global and stay
# in effect for every later cell.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
df00

Unnamed: 0,query,limit,delta_sql,delta_arango
0,query1,-1,5.17305,102.458969
17,query1,5,,0.000692
10,query1,10,,0.000768
13,query1,20,,0.017741
16,query1,40,,0.024754
20,query1,80,,0.088174
1,query1,125,12.609336,
12,query1,160,,0.239687
15,query1,320,,1.003525
19,query1,640,,3.161221
