## Results

There are two trials:

* In-memory: in Evaluation-with-vis notebook (to be renamed)
* PostgreSQL 100M records: in Postgre100M notebook

To reproduce the results you need the two result files (times and memory) produced by each of the notebooks above.

In [None]:
import sys
import os
import time
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from IPython.display import clear_output

from utils import (generate_data,
                   plot_data,
                   compute)

In [None]:
def format_num(num):
    """Render ``num`` as short text with an "m" or "k" suffix.

    Values of at least 1,000,000 are divided by 1,000,000 and suffixed with
    "m"; values of at least 1,000 are divided by 1,000 and suffixed with "k";
    anything smaller is returned as its plain string form. The division is a
    true division, so whole multiples keep a trailing ".0" (1000 -> "1.0k").
    """
    # Largest unit first so that millions are not reported as thousands.
    for threshold, suffix in ((1000000, "m"), (1000, "k")):
        if num >= threshold:
            return f"{num / threshold}{suffix}"
    return f"{num}"

def factors_of(x):
    """Return 0.1%, 0.5%, 1%, 5% and 10% of ``x`` as a list, in that order."""
    fractions = (0.001, 0.005, 0.01, 0.05, 0.1)
    return list(map(lambda frac: frac * x, fractions))

# Labels for the three data patterns; the cells below match them against the
# 'pattern' column of the results frames with str.contains.
PM = 'planned missing'
GM = 'general missing'
SET = 'sets'
PATTERNS = [SET, GM, PM]
# Labels for the two packages; compared against the 'library' column below.
SETVIS = 'setvis'
UPSET = 'upsetplot'
PACKAGES = [UPSET, SETVIS]
# upset not possible with 100000 rows & 500 cols on 16gb machine
UPSET_COL_LIMIT = 500
UPSET_ROW_LIMIT = 100000
# setvis not possible with planned missing 500000 x 1000 44gb machine
# Row / column counts associated with the general-missing pattern
# (presumably the fixed data size of that trial -- TODO confirm).
GM_ROW = 100000
GM_COL = 50
# 0.1%, 0.5%, 1%, 5% and 10% of GM_ROW, as computed by factors_of.
GM_INTS = factors_of(GM_ROW)
# Row and column counts; they are also joined into the name of the results
# files that are read further down.
ROWS = [10000, 25000,50000, GM_ROW, 500000] 
COLS = [10, GM_COL, 100, 500, 700]
# NOTE(review): same value as UPSET_COL_LIMIT and not referenced in the
# visible cells -- possibly redundant, confirm before removing.
UPSET_LIMIT = 500
def size_mb(object):
    """Return ``sys.getsizeof(object)`` converted to MB as text.

    The byte count is divided by 1024 twice and formatted with three
    decimals followed by "MB" (for example "0.000MB").
    """
    n_bytes = sys.getsizeof(object)
    megabytes = n_bytes / 1024 / 1024
    return f"{megabytes:.3f}MB"

In [None]:
# Show the values computed by factors_of(GM_ROW).
GM_INTS

In [None]:
# df = generate_data(GM, GM_ROW, GM_COL, 2 * GM_COL - 1)
# print("df size", size_mb(df))
# upset_df = compute(df, UPSET, False)
# print("obj size", size_mb(upset_df))
# upset_df.head()

In [None]:
import seaborn as sns

def plot_df(d, time=True, y='seconds', compute=True, x='colxrow',
            hue='pattern', markertext=True, show_title=False):
    """Draw one seaborn line plot of ``y`` against ``x``, coloured by ``hue``.

    Parameters
    ----------
    d : pandas.DataFrame
        Results to plot. Must contain a 'pattern' column plus the columns
        named by ``x``, ``y`` and ``hue``. The frame is copied first, so the
        caller's data is never modified.
    time : bool
        True labels the y axis "Time (seconds)", False "Memory (MB)".
        Note that inside this function the name shadows the ``time`` module.
    y, x, hue : str
        Column names handed to ``sns.relplot``.
    compute, markertext :
        Not used by the body; kept so existing calls keep working.
    show_title : bool
        When True, the descriptive title built from the 'library' column
        (and the 'rows' column for set-type data) is drawn on the axes.
        Defaults to False, which leaves the figure untitled.

    Relies on the module-level ``GM`` label and on ``sns`` / ``plt`` being
    imported. Returns None; the figure is shown with ``plt.show()``.
    """
    ylabel = "Time (seconds)" if time else "Memory (MB)"
    xlabel = "Millions of cells"
    # Legend title mapping
    legend_map = {
        "upsetplot-sets": "upsetplot (set-type data)",
        "setvis-sets": "setvis (set-type data)",
        "upsetplot-planned missing": "upsetplot (missing values)",
        "setvis-planned missing": "setvis (missing values)"
    }
    # Relabel on a private copy: callers pass slices of the results frames
    # and writing into those would change (or warn about) their data.
    d = d.copy()
    # Values without an entry in legend_map keep their original text.
    d[hue] = d[hue].map(legend_map).fillna(d[hue])

    # Is every row general-missing data? Compare against the GM label that
    # the 'pattern' column actually contains, not the literal text 'GM'.
    gm_plt = d['pattern'].str.contains(GM).all()
    # Set-type rows are recognised by their relabelled legend text.
    set_plt = d['pattern'].str.contains('set-type data').all()

    # Only describe the x axis as combinations when that column is plotted.
    if gm_plt and x == 'combinations':
        xlabel = "Number of missing combinations."
    if set_plt:
        xlabel = "Number of set intersections"

    title = None
    if show_title:
        # Frames without a 'library' column are titled 'SETVIS'.
        if 'library' in d.columns:
            title = ' & '.join(d['library'].unique())
        else:
            title = 'SETVIS'
        title += " - times" if time else " - memory"
        if set_plt:
            rows_str = ','.join(map(str, d['rows'].unique().tolist()))
            title += f" For {rows_str} rows and 2 column set data."

    # relplot creates its own figure, so no plt.figure() call is needed
    # (an extra one only produces an empty figure in the output).
    g = sns.relplot(data=d, x=x, y=y, hue=hue, kind='line', height=3, aspect=1)
    for ax in g.axes.flat:
        ax.set_xlabel(xlabel, fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        # Add markers to the line plot
        for line in ax.lines:
            line.set_marker('o')
        if title is not None:
            ax.set_title(f"Figure: {title}")
    sns.move_legend(g, "upper left", bbox_to_anchor=(.55, .65), title="")
    # NOTE(review): kept from the original; relplot normally draws the
    # legend already -- confirm this extra call is still wanted.
    g.add_legend()

    plt.show()


## Results

In [None]:
# eye opener
# df = generate_data(GM, 1000000, 10, 2 * 10 -1)
# f"{sys.getsizeof(df)/1024/1024:.3f}MB"
# '381.470MB'

In [None]:
# The result files are read relative to the current working directory;
# change into the folder that holds them first if necessary, e.g.
# os.chdir("/home/layik/Documents/papers/setvis")
# File stem: the column counts joined by '-', an 'X', then the row counts
# joined by '-'.
file = 'X'.join('-'.join(str(size) for size in sizes) for sizes in (COLS, ROWS))
times_df = pd.read_csv(f"{file}-times.csv")
mem_df = pd.read_csv(f"{file}-mems.csv")

In [None]:
############ modify dataframe for plotting ####################
def _prefix_pattern_with_library(frame):
    """Rename library 'upset' to UPSET and prefix 'pattern' with the library.

    After the call 'pattern' reads "<library>-<pattern>". Rows whose pattern
    already starts with "<library>-" are left alone, so running the cell a
    second time does not stack the prefix ("setvis-setvis-sets").
    The frame is modified in place.
    """
    frame.loc[frame['library'] == 'upset', 'library'] = UPSET
    prefix = frame['library'] + "-"
    # True where the prefix still has to be added; non-string (missing)
    # values are passed through the concatenation unchanged in behaviour.
    needs_prefix = np.array(
        [not (isinstance(pat, str) and isinstance(pre, str) and pat.startswith(pre))
         for pat, pre in zip(frame['pattern'], prefix)],
        dtype=bool)
    frame['pattern'] = (prefix + frame['pattern']).where(needs_prefix, frame['pattern'])


# Same treatment for the memory and the timing results.
_prefix_pattern_with_library(mem_df)
_prefix_pattern_with_library(times_df)

In [None]:
# mem_df[(mem_df['pattern'].str.contains(GM))]

### Memory results

In [None]:
# mem_df[(mem_df['pattern'].str.contains(SET)) & (mem_df['library'] == SETVIS)]

In [None]:
# Set-type data: memory against the number of combinations, restricted to
# the rows whose colxrow equals 1,000,000.
plot_df(mem_df.loc[mem_df['pattern'].str.contains(SET) & mem_df['colxrow'].eq(1000000)],
        time=False, x='combinations', y='memory')
# Planned-missing data: express colxrow in millions before plotting, in line
# with plot_df's default x label ("Millions of cells").
mem_pm_df = mem_df.loc[mem_df['pattern'].str.contains(PM)].assign(
    colxrow=lambda frame: frame['colxrow'] / 1e6)
plot_df(mem_pm_df, time=False, y='memory', markertext=False)
# General-missing data.
# NOTE(review): this uses the default x='colxrow', although the original
# remark here said the GM x-axis should be the number of combinations -- confirm.
plot_df(mem_df.loc[mem_df['pattern'].str.contains(GM)],
        time=False, y='memory', markertext=False)

# plotting setvis two objects
# plot_df(mem_df[(mem_df['pattern'].str.contains(GM)) & (mem_df['library'] == SETVIS)], False, 'memory_col')
# plot_df(mem_df[(mem_df['pattern'].str.contains(GM)) & (mem_df['library'] == SETVIS)], False, 'memory_row')


In [46]:
# Rows behind the set-type memory plot: pattern contains SET and colxrow
# equals 1,000,000.
mem_df[(mem_df['pattern'].str.contains(SET)) & 
               (mem_df['colxrow'] == 1000000)]

Unnamed: 0,rows,columns,colxrow,memory,pattern,library,memory_df,memory_col,memory_row,combinations,output_rows,output_cols
56,500000,2,1000000,56.28199,upsetplot-sets,upsetplot,55.7424,0.0,0.0,100,500000,1
57,500000,2,1000000,266.15157,upsetplot-sets,upsetplot,56.54349,0.0,0.0,500,500000,1
58,500000,2,1000000,7.64068,setvis-sets,setvis,91.4575,0.01127,7.62941,100,100,110
59,500000,2,1000000,7.8955,setvis-sets,setvis,93.05968,0.26609,7.62941,500,500,550
60,500000,2,1000000,8.6861,setvis-sets,setvis,93.25995,1.05669,7.62941,1000,1000,1100
61,500000,2,1000000,33.89362,setvis-sets,setvis,94.94604,26.26421,7.62941,5000,5000,5500
62,500000,2,1000000,112.60989,setvis-sets,setvis,95.15681,104.98048,7.62941,10000,10000,11000


In [None]:
# memory for upset stats
# mem_df[mem_df['library'] == UPSET].describe()

In [None]:
# memory for setvis stats
# mem_df[mem_df['library'] == SETVIS].describe()

In [None]:
# upsetplot rows with the highest memory (displayed first) and the lowest
# memory (cell output). Series.max()/.min() are vectorised and skip NaN,
# unlike the builtins, whose result depends on row order when NaN is present.
upset_df = mem_df[mem_df['library'] == UPSET]
display(upset_df[upset_df['memory'] == upset_df['memory'].max()])
upset_df[upset_df['memory'] == upset_df['memory'].min()]

In [None]:
# setvis rows with the highest memory (displayed first) and the lowest
# memory (cell output). Series.max()/.min() are vectorised and skip NaN,
# unlike the builtins, whose result depends on row order when NaN is present.
setvis_df = mem_df[mem_df['library'] == SETVIS]
display(setvis_df[setvis_df['memory'] == setvis_df['memory'].max()])
setvis_df[setvis_df['memory'] == setvis_df['memory'].min()]

### Times results

In [None]:
# Overall fastest row(s) (displayed first) and slowest row(s) (cell output).
# Series.min()/.max() are vectorised and skip NaN, unlike the builtins.
display(times_df[times_df['seconds'] == times_df['seconds'].min()])
times_df[times_df['seconds'] == times_df['seconds'].max()]

In [None]:
# Timing plots for the rows whose 'compute' flag is False: first every
# pattern except general missing, then general missing on its own.
plot_df(times_df.loc[times_df['compute'].eq(False) & ~times_df['pattern'].str.contains(GM)],
        compute=False)
plot_df(times_df.loc[times_df['compute'].eq(False) & times_df['pattern'].str.contains(GM)],
        compute=False)

In [None]:
# upsetplot row(s) with the longest time. Series.max() is vectorised and
# skips NaN, unlike the builtin max.
upset_df = times_df[times_df['library'] == UPSET]
upset_df[upset_df['seconds'] == upset_df['seconds'].max()]

In [None]:
# setvis row(s) with the longest time (the filter selects the maximum, not
# the minimum). Series.max() is vectorised and skips NaN, unlike the builtin.
setvis_df = times_df[times_df['library'] == SETVIS]
setvis_df[setvis_df['seconds'] == setvis_df['seconds'].max()]
# setvis_df[setvis_df['pattern'] == GM]
# 100	0.021	general missing	setvis	True

## PSQL 100M records

In [None]:
# Timing and memory results of the PostgreSQL 100M-record trial, read from
# the current working directory.
times_100m = pd.read_csv("100M-psql-times.csv")
mem_100m = pd.read_csv("100M-psql-mems.csv")

In [None]:
# Display the PSQL memory results frame.
mem_100m

In [None]:
# Memory of the PSQL trial for the general-missing rows, plotted against
# the number of combinations.
plot_df(mem_100m.loc[mem_100m['pattern'].str.contains(GM)],
        time=False, y='memory', x="combinations")
