# Performance Upset/PACE


We need to evaluate three aspects:
1. Missingness computation (time in sec)
2. Visualisation (time in sec) (only implemented for `upset` at the moment)
3. RAM (`psutil.virtual_memory()`, not implemented yet)


The results of the evaluation/profiling are written to a csv file in the `results` directory. The filename contains the visualisation package that was used and a timestamp: `<filename>_<package>_<yyyymmdd-HHMMSS>.csv`, where `<filename>` is taken from the config file. The config file that was used for the evaluation is saved in the same directory with the same timestamp as `config_<yyyymmdd-HHMMSS>.yml`. The saved copy contains the parameters required to run the evaluation and a comment with the `git commit id`.

Use the variable `package` to switch between "pace" and "upset".


In [40]:
from pace.membership import Membership
from pace.plots import PlotSession
from utils import generate_pattern, eval_data, set_seed
from datetime import datetime, time
import psutil
import yaml
import pandas as pd
import csv
from datetime import datetime
from pathlib import Path
import subprocess
# Yaml loaders and dumpers
from ruamel.yaml.main import round_trip_load as yaml_load, round_trip_dump as yaml_dump
import io
import cProfile
import pstats
import pandas as pd
import re

In [41]:
# just for debugging
import upsetplot

## Save copy of config file

Add a comment line with the git commit hash and then copy the content of `config.yaml`.

In [42]:
# Timestamp (yyyymmdd-HHMMSS) that is embedded in the names of this run's output files.
now = datetime.now()
timestamp = now.strftime('%Y%m%d-%H%M%S')

# Hash of the commit that is currently checked out; written into the config copy.
git_head_cmd = ['git', 'rev-parse', 'HEAD']
raw_hash = subprocess.check_output(git_head_cmd)
commit = raw_hash.decode().strip()

In [43]:
# Source config and the timestamped copy that documents this run.
config_in = Path.cwd() / "config.yaml"
results_dir = Path.cwd() / "results"
config_out = results_dir / f"config_{timestamp}.yml"

# Load with ruamel's round-trip loader; the same content is dumped again below.
with open(config_in, 'r') as f:
    config_content = yaml_load(f)

# Create the output directory when it is missing. exist_ok=True replaces the
# separate exists() check, which could race with another process creating it.
results_dir.mkdir(exist_ok=True)

# First line of the copy is a comment with the commit hash, then the config itself.
with open(config_out, "w") as f:
    f.write(f"# git commit hash: {commit} \n")
    f.write(yaml_dump(config_content))


## Load simulation parameters

In [44]:
# Parse the simulation parameters. The context manager makes sure the file
# handle is closed once parsing is done.
# NOTE: FullLoader is acceptable for this local, trusted config file; it must
# not be used on untrusted input.
with open("config.yaml") as config_yaml:
    config = yaml.load(config_yaml, Loader=yaml.FullLoader)

In [45]:
# Parameters of the generated datasets; each of the four values below is
# iterated over in the evaluation loop.
dataset_cfg = config["dataset"]
num_rows = dataset_cfg["num_rows"]
num_cols = dataset_cfg["num_cols"]
num_int = dataset_cfg["intersections"]  # number of intersections
data_type = dataset_cfg["datatype"]

# Only the first configured seed is used.
seed = config["seed"][0]
patterns = config["patterns"]

# Stem of the results file name (package and timestamp are appended later).
filename = config["output"]["filename"]

# Package under evaluation: change this to "upset" if required.
package = "pace"

## Files

In [46]:
# All paths are anchored at the current working directory.
working_dir = Path.cwd()
path = working_dir.joinpath("evaluation")
# Results csv of this run: <filename>_<package>_<timestamp>.csv inside results_dir.
output_file = results_dir.joinpath(f"{filename}_{package}_{timestamp}.csv")
# Scratch file that receives the profiler table of each evaluation.
profile_file = working_dir.joinpath("profile.csv")

## Functions

In [47]:
def profile_to_csv(profile, file="profile.csv"):
    """Writes the statistics table of a profiling run to a csv file.

    The report printed by ``pstats.Stats.print_stats`` is captured as text,
    everything in front of the ``ncalls`` table header is discarded and each
    remaining line is split on whitespace into at most six fields (the last
    field keeps any embedded spaces).

    Parameters
    ----------
    profile : cProfile.Profile
        profiler object holding the collected statistics
    file : str or path-like
        path to csv file to which we write the profiling results
    """
    buffer = io.StringIO()
    pstats.Stats(profile, stream=buffer).print_stats()
    report = buffer.getvalue()

    # Keep only the table. The header is the first occurrence of 'ncalls';
    # searching from the front keeps the cut correct even when a profiled
    # function or file name contains that text as well.
    header_at = report.find('ncalls')
    table = report[header_at:] if header_at != -1 else 'ncalls' + report

    # Blank lines become empty rows, which are written as empty lines.
    rows = [line.rstrip().split(None, 5) for line in table.split('\n')]

    # csv.writer quotes fields that contain the delimiter (e.g. a comma in a
    # file path); a plain ','.join would turn them into extra columns.
    with open(file, 'w', newline='') as f:
        csv.writer(f, lineterminator='\n').writerows(rows)


    

In [48]:
def extract_function_name(string):
    """Returns the text inside the first pair of parentheses of a string.

    Meant for the values of the profile column 'filename:lineno(function)',
    where the function name is the part enclosed in parentheses.

    Parameters
    ----------
    string : str
        value taken from the column described above

    Returns
    -------
    function_name : str
        the text between the first '(' and the next ')', or the
        placeholder "-900" when the string has no such pair
    """
    match = re.search(r"\((.*?)\)", string)
    if match is None:
        # placeholder for entries without a parenthesised name
        return "-900"
    return match.group(1)

## Generate data and evaluate

In [49]:
from itertools import product

# NOTE(review): the "Tims (s) ..." header labels look like typos for
# "Time (s) ..."; they are kept as they are so the csv layout does not change.
header = [
    "Package",
    "Pattern",
    "Num Rows",
    "Num Cols",
    "Num Intersections",
    "Tims (s) Missingness",
    "Tims (s) Total",
    "RAM",
]

# newline="" is what the csv module asks for; without it, platforms that
# translate "\n" end up with an extra blank line after every row.
with open(output_file, "w", newline="") as csvfile:
    w = csv.writer(csvfile, delimiter=",")
    w.writerow(header)
    # set seed once, before the sweep starts
    set_seed(seed)
    # run evaluation: same iteration order as nesting the loops
    # pattern > dtype > inter > row > col
    for pattern, dtype, inter, row, col in product(
        patterns, data_type, num_int, num_rows, num_cols
    ):
        # step 1: generate data
        data = generate_pattern(pattern, row, col, inter, dtype)
        # step 2: evaluate data and profile performance
        pr = cProfile.Profile()
        pr.enable()
        # the return value is kept but not written to the results yet
        vir_mem = eval_data(data, package)
        pr.disable()
        profile_to_csv(pr, file=profile_file)
        # step 3: load profile and extract timings we're interested in
        df = pd.read_csv(profile_file)
        df["function"] = df["filename:lineno(function)"].apply(extract_function_name)
        # step 4: add timings to result (first matching profile row of each function)
        time_total = df.loc[df["function"] == "eval_data", "cumtime"].values[0]
        time_missingness = df.loc[
            df["function"] == "compute_missingness", "cumtime"
        ].values[0]
        # the trailing 1 is a placeholder for the RAM column
        results = [package, pattern, row, col, inter, time_missingness, time_total, 1]
        w.writerow(results)

In [50]:
# Display the most recently loaded profile table, largest cumulative time first.
df.sort_values("cumtime", ascending=False)

Unnamed: 0,ncalls,tottime,percall,cumtime,percall.1,filename:lineno(function),function
468,1,0.0,0.0,0.015,0.015,/home/layik/miniconda3/lib/python3.8/site-pack...,from_data_frame
31,1,0.0,0.0,0.015,0.015,/home/layik/code/python/visualising-data-profi...,eval_data
32,1,0.0,0.0,0.015,0.015,/home/layik/code/python/visualising-data-profi...,compute_missingness
41,1,0.0,0.0,0.007,0.007,/home/layik/miniconda3/lib/python3.8/site-pack...,ngroup
377,8/6,0.0,0.0,0.004,0.001,/home/layik/miniconda3/lib/python3.8/site-pack...,wrapper
...,...,...,...,...,...,...,...
192,6,0.0,0.0,0.000,0.000,/home/layik/miniconda3/lib/python3.8/site-pack...,_get_axis
191,20,0.0,0.0,0.000,0.000,/home/layik/miniconda3/lib/python3.8/site-pack...,_get_axis_number
190,2,0.0,0.0,0.000,0.000,/home/layik/miniconda3/lib/python3.8/site-pack...,_construct_axes_from_arguments
189,2,0.0,0.0,0.000,0.000,/home/layik/miniconda3/lib/python3.8/site-pack...,<dictcomp>


In [51]:
# Display the last row that was written to the results csv.
results

['pace', 'monotone', 10, 5, 10, 0.015, 0.015, 1]