In [8]:
import glob
import itertools as it
import pandas as pd
#import mako

def get_secs(timestring):
    """Convert a colon-separated time string to a number of seconds.

    Accepts ``h:mm:ss`` or ``m:ss`` (or a bare seconds value); each field is
    parsed with ``float`` so fractional values are allowed.

    Args:
        timestring: text such as ``"1:02:03"`` or ``"2:30.5"``.

    Returns:
        float: the total number of seconds.

    Raises:
        ValueError: if any colon-separated field is not a valid float.
    """
    # Walk the fields from least to most significant so the i-th field is
    # weighted by 60**i.  enumerate/reversed work on both Python 2 and 3,
    # unlike len(map(...)), xrange and itertools.izip.
    fields = reversed(timestring.split(":"))
    return sum(float(field) * 60 ** power for power, field in enumerate(fields))

def parse_filename(filename, d):
    """Record the experiment parameters encoded in a log file name.

    The name is split on underscores: the first token is stored as
    ``d["dataset"]``; the second and third tokens have their first character
    dropped and are stored as integers under ``d["k"]`` and
    ``d["frequency"]``.

    Args:
        filename: underscore-separated file name, e.g. ``ecoli_k28_f1_...``.
        d: dict updated in place.

    Returns:
        The same dict ``d``.
    """
    tokens = filename.split("_")
    d["dataset"] = tokens[0]
    # The numeric fields carry a one-character prefix that is stripped before conversion.
    for key, position in (("k", 1), ("frequency", 2)):
        d[key] = int(tokens[position][1:])
    return d

def parse_info(filename, d):
    """Read the DSK size from the first line of an info log.

    The fifth whitespace-separated token of the first line is taken, its last
    character is dropped (presumably a unit suffix -- confirm against the log
    format), and the remainder is stored as a float in ``d["dsk size"]``.

    Args:
        filename: path of the info log to read.
        d: dict updated in place.

    Returns:
        The same dict ``d``.
    """
    with open(filename) as handle:
        first_line = handle.readline()
    size_token = first_line.split()[4]
    d["dsk size"] = float(size_token[:-1])
    return d

def parse_bench(filename, d):
    """Parse a benchmark log into ``d["fixed"]`` or ``d["variable"]``.

    The target sub-dict is ``"variable"`` when ``"varord"`` occurs in
    ``filename`` and ``"fixed"`` otherwise; it is created if missing.

    For every line containing one of the known field names, the
    second-to-last whitespace token is stored as a float under that field
    name (trailing whitespace stripped from the key).  Operation timings are
    multiplied by 0.001 (unit conversion to microseconds); size fields are
    stored unscaled.  For ``longer_*`` fields the following line supplies a
    per-node timing (fourth token, scaled like the timing) and a node count
    (last token with its first and last characters removed).

    Finally ``"Total size"`` is set to the DBG size, plus the LCS size for
    the variable-order run.

    Args:
        filename: path of the benchmark log.
        d: dict updated in place.

    Returns:
        The same dict ``d``.

    Raises:
        KeyError: if the log has no ``DBG size`` line (or no ``LCS size``
            line for a variable-order log).
        IndexError: if a ``longer_*`` line is the last line of the file.
    """
    order = "variable" if "varord" in filename else "fixed"
    results = d.setdefault(order, {})

    size_fields = ["DBG size", "LCS size"]
    # "maxlen " keeps its trailing space so that it does not also match "maxlen*".
    op_fields = ["forward", "backward", "lastchar", "shorter_1", "shorter_2", "shorter_4", "shorter_8",
                 "longer_1", "longer_2", "longer_4", "longer_8", "maxlen ", "maxlen*"]

    with open(filename) as f:
        lines = f.readlines()

    for i, line in enumerate(lines):
        for field in size_fields + op_fields:
            if field not in line:
                continue
            # convert to microsec if time measurement
            factor = 0.001 if field in op_fields else 1
            results[field.rstrip()] = float(line.split()[-2]) * factor
            if "longer" in field:
                detail = lines[i + 1].split()
                results[field + " per node"] = float(detail[3]) * factor
                # A node count is not a time measurement, so it must not be
                # scaled by the time-unit factor.
                results[field + " nodes"] = int(detail[-1][1:-1])

    results["Total size"] = results["DBG size"] + (results["LCS size"] if order == "variable" else 0)
    return d

def parse_build(filename, d):
    """Parse a build log, appending one measurement per run to ``d``.

    The target sub-dict is ``"variable"`` when ``"varord"`` occurs in
    ``filename`` and ``"fixed"`` otherwise.  The lists ``"peak disk"``,
    ``"peak mem"`` and ``"wall time"`` in that sub-dict are created if
    missing and extended with the values found in this log, so repeated
    calls accumulate runs.

    Recognised lines (first match wins):
      * ``Building from``    -> ``d["# kmers"]`` = seventh token, halved.
      * ``Peak disk allocs`` -> second-to-last token / 1024 (MB -> GB).
      * ``Maximum resident`` -> last token / 1024**2 (KB -> GB).
      * ``wall clock``       -> last token converted by ``get_secs`` and
        divided by 60 (minutes).

    Args:
        filename: path of the build log.
        d: dict updated in place.

    Returns:
        The same dict ``d``.
    """
    order = "variable" if "varord" in filename else "fixed"
    run = d.setdefault(order, {})
    for key in ("peak disk", "peak mem", "wall time"):
        run.setdefault(key, [])

    with open(filename) as f:
        for line in f:
            if "Building from" in line:
                # Floor division keeps the count an integer on Python 3 as well
                # (plain "/" on two ints already floors on Python 2).
                d["# kmers"] = int(line.split()[6]) // 2
            elif "Peak disk allocs" in line:
                run["peak disk"].append(float(line.split()[-2]) / 1024)  # MB -> GB
            elif "Maximum resident" in line:
                run["peak mem"].append(float(line.split()[-1]) / 1024 ** 2)  # KB -> GB
            elif "wall clock" in line:
                run["wall time"].append(get_secs(line.split()[-1]) / 60.0)
    return d

def parse(filename, d=None):
    """Parse one log file into the accumulator dict ``d``.

    The file name is always decoded with ``parse_filename``.  The contents
    are then handled by the first parser whose keyword occurs in the name:
    ``"info"`` -> ``parse_info``, ``"bench"`` -> ``parse_bench``,
    ``"build"`` -> ``parse_build``.  Files matching none of these only
    contribute their file-name fields.

    Args:
        filename: path of the log file.
        d: dict to update in place; a new dict is created when ``None``.

    Returns:
        The dict that was updated.
    """
    # Test for None explicitly: "d or dict()" would discard a caller-supplied
    # empty dict and leave the caller's object unpopulated.
    if d is None:
        d = {}
    parse_filename(filename, d)
    if "info" in filename:
        parse_info(filename, d)
    elif "bench" in filename:
        parse_bench(filename, d)
    elif "build" in filename:
        parse_build(filename, d)
    return d

def parse_group(files, d=None):
    """Parse every file of one experiment group into a single dict.

    Each file is passed to ``parse`` together with the result of the previous
    call, so all files contribute to the same accumulator.

    Args:
        files: iterable of log file paths.
        d: optional starting accumulator.

    Returns:
        The result of the last ``parse`` call, or ``d`` unchanged when
        ``files`` is empty.
    """
    accumulated = d
    for path in files:
        accumulated = parse(path, accumulated)
    return accumulated

def generate_table_rows(d):
    """Yield one flat row dict per recorded build run.

    For each order (``"fixed"`` then ``"variable"``) one row is produced for
    every entry of ``d[order]["peak disk"]``.  A row contains:
      * ``"order"``;
      * every top-level entry of ``d`` except the per-order sub-dicts;
      * every scalar entry of ``d[order]`` (these override top-level entries
        with the same key);
      * the i-th ``"peak disk"``, ``"peak mem"`` and ``"wall time"`` values.

    No averaging is done here; each run becomes its own row.

    Args:
        d: accumulator produced by ``parse_group``.

    Yields:
        dict: one table row.

    Raises:
        KeyError: if ``d`` lacks an order or one of the per-run lists.
        IndexError: if ``"peak mem"`` or ``"wall time"`` has fewer entries
            than ``"peak disk"``.
    """
    per_run = ["peak disk", "peak mem", "wall time"]
    skipped = per_run + ["fixed", "variable"]
    for order in ["fixed", "variable"]:
        # range (not xrange) so the generator also runs on Python 3.
        for i in range(len(d[order]["peak disk"])):
            new_d = {"order": order}
            for key, data in d.items():
                if key not in skipped:
                    new_d[key] = data
            for key, data in d[order].items():
                if key not in skipped:
                    new_d[key] = data
            for x in per_run:
                new_d[x] = d[order][x][i]
            yield new_d

def get_dataframe(groups):
    """Build one DataFrame from several groups of log files.

    Each group is parsed with ``parse_group`` and expanded into rows by
    ``generate_table_rows``; the rows of all groups are concatenated in order.

    Args:
        groups: iterable of file-path lists, one list per experiment.

    Returns:
        pandas.DataFrame with one row per generated table row.
    """
    rows = []
    for group in groups:
        rows.extend(generate_table_rows(parse_group(group)))
    return pd.DataFrame(rows)
            
def get_files():
    """Collect the ``*.log`` files of the working directory, grouped by experiment.

    Files whose name contains ``"dsk"`` are skipped.  Two files belong to the
    same experiment when the first three underscore-separated fields of their
    names are equal.

    Returns:
        list of lists of file names, one inner list per experiment, ordered
        by experiment key and then by file name.
    """
    def group_key(name):
        # A tuple (rather than the joined string) keeps e.g. ("a", "bc") and
        # ("ab", "c") apart.
        return tuple(name.split("_")[:3])

    log_files = [x for x in glob.glob("*.log") if "dsk" not in x]  # ignore DSK timings for now
    # itertools.groupby only merges *adjacent* items and glob returns names in
    # arbitrary order, so the list has to be sorted by the grouping key first.
    log_files.sort(key=lambda name: (group_key(name), name))
    return [list(g) for _, g in it.groupby(log_files, key=group_key)]

def get_data():
    """Parse all grouped log files found by ``get_files`` into a DataFrame.

    Returns:
        The DataFrame built by ``get_dataframe`` from those groups.
    """
    return get_dataframe(get_files())

# Entry point when executed directly: parse all logs once.
# The returned DataFrame is not kept here.
if __name__ == "__main__":
    get_data()

In [22]:
# Parse the log files in the working directory into one DataFrame
# (one row per recorded build run).
df = get_data()

# Datasets to show, in the column order used for the displayed tables.
datasets = ["ecoli", "chrom14", "human", "parrot"]

# Display floats with two decimal places.
pd.options.display.float_format = '{:.2f}'.format

# Describe the DSK input data per (dataset, k).
# These values are expected to be constant within a group, so min() simply
# picks that value; the result is transposed so each experiment is a column.
df.groupby(["dataset", "k"])[["dsk size", "k", "# kmers", "frequency"]].min().transpose()[datasets]

#df

dataset,ecoli,chrom14,chrom14,chrom14,chrom14,chrom14,human,parrot
k,28,16,32,48,56,64,56,56
dsk size,1.6,4.2,7.1,7.2,6.9,6.4,27.0,71.0
k,28.0,16.0,32.0,48.0,56.0,64.0,56.0,56.0
# kmers,204098902.0,561434000.0,951050150.0,482664928.0,461445333.0,427654657.0,1794522954.0,4716731435.0
frequency,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0


In [26]:
# One group per (dataset, k, order) combination.
grouped = df.groupby(["dataset", "k", "order"])
# Rows of the summary table: fixed headline metrics first, then every
# maxlen*/shorter*/longer* column present in df.
# The "longer" columns are sorted by their reversed names, which orders them
# by suffix first: the plain timings, then "... per node", then "... nodes".
fields = ["Total size", "wall time", "peak mem", "peak disk", "forward", "backward", "lastchar"] \
    + [col for col in list(df) if col.startswith("maxlen")] \
    + [col for col in list(df) if col.startswith("shorter")] \
    + sorted([col for col in list(df) if col.startswith("longer")],key=lambda x:x[::-1])
# Minimum over the runs of each group, transposed so each experiment is a column.
grouped.min()[fields].transpose()[datasets]

dataset,ecoli,ecoli,chrom14,chrom14,chrom14,chrom14,chrom14,chrom14,chrom14,chrom14,chrom14,chrom14,human,human,parrot,parrot
k,28,28,16,16,32,32,48,48,56,56,64,64,56,56,56,56
order,fixed,variable,fixed,variable,fixed,variable,fixed,variable,fixed,variable,fixed,variable,fixed,variable,fixed,variable
Total size,162.65,420.14,584.59,778.11,885.68,1541.3,391.58,1271.49,413.94,1415.54,436.97,1562.59,1706.21,5547.89,4297.58,13917.22
wall time,3.93,5.09,6.85,8.79,13.42,18.36,13.53,17.21,14.37,18.72,15.26,19.92,64.45,83.85,162.58,225.73
peak mem,3.16,3.16,2.78,2.78,3.21,3.21,3.23,3.22,3.22,3.22,3.22,3.22,7.65,9.31,15.3,15.29
peak disk,12.17,12.17,33.47,33.47,56.69,56.69,57.54,57.54,56.68,56.68,67.35,67.35,248.37,248.37,562.28,562.28
forward,6.0,17.03,6.68,25.72,5.66,18.49,6.29,16.5,6.24,16.17,6.01,15.39,7.07,18.31,7.77,19.39
backward,8.23,59.77,10.48,90.15,8.57,65.27,8.5,58.65,8.47,55.63,8.18,53.43,9.27,62.85,10.46,63.87
lastchar,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.01,0.01,0.01,0.01
maxlen,,5.41,,6.49,,5.93,,5.91,,5.98,,5.75,,6.71,,7.49
maxlen*,,1.43,,2.26,,2.16,,1.55,,1.56,,1.55,,2.02,,2.46
shorter_1,,14.65,,12.49,,15.75,,18.0,,17.72,,17.84,,19.54,,19.84


In [27]:
# Same per-group minimum table, restricted to the chrom14 columns
# (relies on grouped, fields and datasets defined in the earlier cells).
grouped.min()[fields].transpose()[datasets]["chrom14"]

k,16,16,32,32,48,48,56,56,64,64
order,fixed,variable,fixed,variable,fixed,variable,fixed,variable,fixed,variable
Total size,584.59,778.11,885.68,1541.3,391.58,1271.49,413.94,1415.54,436.97,1562.59
wall time,6.85,8.79,13.42,18.36,13.53,17.21,14.37,18.72,15.26,19.92
peak mem,2.78,2.78,3.21,3.21,3.23,3.22,3.22,3.22,3.22,3.22
peak disk,33.47,33.47,56.69,56.69,57.54,57.54,56.68,56.68,67.35,67.35
forward,6.68,25.72,5.66,18.49,6.29,16.5,6.24,16.17,6.01,15.39
backward,10.48,90.15,8.57,65.27,8.5,58.65,8.47,55.63,8.18,53.43
lastchar,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02
maxlen,,6.49,,5.93,,5.91,,5.98,,5.75
maxlen*,,2.26,,2.16,,1.55,,1.56,,1.55
shorter_1,,12.49,,15.75,,18.0,,17.72,,17.84
