In [60]:
import gzip
import json
import pandas as pd
import numpy as np
from os.path import join, isfile
from os import listdir
import matplotlib.pyplot as plt
import seaborn as sns
import string

In [28]:
def get_json(fname, fpath='../../wos_paper/wos_db_benchmark/benchmarking/'):
    fname = join(fpath, fname)
    if fname[-2:] == 'gz':
        with gzip.open(fname, "rb") as f:
            data = json.loads(f.read(), encoding="utf-8")
    else:
        with open(fname, 'r') as fp:
            data = json.load(fp)
    return data

def get_json_fnames(sourcepath='../../wos_paper/wos_db_benchmark/benchmarking/'):
    only_json = [f for f in listdir(sourcepath) if 
                     isfile(join(sourcepath, f)) 
                     and f[-4:] == 'json']
    return only_json


def get_result_fnames(sourcepath='../results/arango/'):
    r = [f for f in listdir(sourcepath) if 
                     isfile(join(sourcepath, f)) 
                     and f[-7:] == 'json.gz' and 'result' in f and 'limit' not in f]
    return r

def get_content(sql_list, fpath, how='sql'):
    report = []
    if how == 'sql':
        limit_tokens = 3
    elif how == 'arango':
        limit_tokens = 3
    else:
        limit_tokens = 3
    for f in sql_list:
        coded = f.split('_')
        limit = -1 if len(coded) < limit_tokens else int(coded[-1].split('.')[0])
        content = get_json(f, fpath)
        report += [(coded[0], limit, f, content)]
    return report

def get_report_gzipped(flist, fpath):
    acc = []
    for fname in flist:
        qname = fname.split('/')[0]
        data = get_json(fname, fpath)
        acc += [(qname, data)] 
    return acc

def get_mysql_times(sql_report):
    times_stat = []
    for item in sql_report:
        key, limit, f, content = item
        if 'elapsed' in content:
            times_stat += [(*item[:-1], content['elapsed'])]
        else:
            times_stat += [(*item[:-1], content['elapsed_1978'])]
    return times_stat

def get_arango_times(reports):
    times = [(*item[:-1], [sum(x.values()) for x in item[-1]]) for item in reports]
    times_stat = [(*item[:-1], np.mean(item[-1]), np.std(item[-1])) for item in times]
    return times_stat

In [29]:
sql_list = sorted(get_json_fnames())
sql_report = get_content(sql_list, '../../wos_paper/wos_db_benchmark/benchmarking/')
# sql_report[:2]

In [30]:
flist = sorted(get_result_fnames())
arango_data = get_report_gzipped(flist, '../results/arango/')

In [51]:
# q1
q1_sql = pd.DataFrame(sql_report[1][-1]['results_1978'], columns=['year', 'title', 'count'])
q1_arango = pd.DataFrame([{**item['journal'], **{'count': item['number_pubs']}} for item in arango_data[0][1]])

# q1_arango groupby by title for comparison

q1_cmp = pd.merge(q1_sql, q1_arango, on='title', how='outer', suffixes=['_sql', '_arango'])
q1_cmp[['title', 'issn', 'isbn', 'count_sql', 'count_arango']].head(15)

Unnamed: 0,title,issn,isbn,count_sql,count_arango
0,FEDERATION PROCEEDINGS,0014-9446,,7287.0,7287
1,BULLETIN OF THE AMERICAN PHYSICAL SOCIETY,0003-0503,,5895.0,5895
2,CLINICAL RESEARCH,0009-9279,,5208.0,5208
3,ABSTRACTS OF PAPERS OF THE AMERICAN CHEMICAL S...,0065-7727,,4514.0,4514
4,BRITISH MEDICAL JOURNAL,0959-8138,,3597.0,3581
5,BRITISH MEDICAL JOURNAL,0959-535X,,3597.0,12
6,BRITISH MEDICAL JOURNAL,1756-1833,,3597.0,4
7,LANCET,0140-6736,,3189.0,3189
8,TLS-THE TIMES LITERARY SUPPLEMENT,0307-661X,,3062.0,3062
9,NATURE,0028-0836,,2990.0,2990


In [67]:
puncts = list(string.punctuation)
from nltk.corpus import stopwords
stop_words_nltk = set(stopwords.words('english'))
numerics = set([str(i) for i in range(100)])
all_stops = list(set(puncts) | stop_words_nltk | numerics)
q2_sql = pd.DataFrame(sql_report[2][-1]['frequent_words'], columns=['uword', 'count'])
q2_sql_mod = q2_sql.loc[~q2_sql.uword.isin(all_stops)]
q2_arango = pd.DataFrame(arango_data[1][1])
q2_cmp = pd.merge(q1_sql_mod, q1_arango, on='uword', how='outer', suffixes=['_sql', '_arango'])

KeyError: 'uword'