# Performance of PACE (paper - technical evaluation)

PACE was designed/implemented to be used in a notebook environment. Hence, a notebook should be used to evaluate its performance.

We need to evaluate three aspects:
1. Missingness computation (time in sec)
2. Visualisation (time in sec)
3. RAM (`psutil.virtual_memory()`)

The results of the evaluation are written to a csv file. 

<line>


In [None]:
from pace.membership import Membership
from pace.plots import PlotSession
from utils import generate_pattern, eval_data, set_seed
from datetime import datetime, time
import psutil
import yaml
import pandas as pd
import csv
from datetime import datetime
from pathlib import Path

## Load config file

In [None]:
# Read the benchmark configuration. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open for the rest
# of the notebook session.
with open("config.yaml") as config_yaml:
    # NOTE(review): FullLoader is kept unchanged; if config.yaml only contains
    # plain scalars/lists/mappings, yaml.safe_load would be the safer choice.
    config = yaml.load(config_yaml, Loader=yaml.FullLoader)

In [None]:
# "evaluation" folder located under the current working directory.
path = Path.cwd().joinpath("evaluation")

In [None]:
# Benchmark parameters taken from the loaded configuration.
num_rows = config["dataset"]["num_rows"]
num_cols = config["dataset"]["num_cols"]
num_int = config["dataset"]["intersections"]  # number of intersections
data_type = config["dataset"]["datatype"]
seed = config["seed"][0]  # only the first configured seed is used
patterns = config["patterns"]
filename = config["output"]["filename"]
package = "pace"
# Timestamp the result file so repeated runs get distinct names.
# %H (24-hour clock) is used instead of %I (12-hour clock): with %I and no
# AM/PM marker, a run at 01:15:00 and one at 13:15:00 on the same day would
# produce the same file name and the later run would overwrite the earlier.
output_file = (
    path / f"{filename}_{package}_{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv"
)

In [None]:
# Bare expression: shows the resolved output path as the cell's result.
output_file

## Generate data and evaluate

TODO:

- [x] import eval from utils.py
- [ ] write plots/figures to a tmp file and take a timestamp before that to make sure we get a comparable timing.
- [x] Fix seed thing -> all values within a row are the same
- [ ] Save config file with git commit id
- [ ] Write script to write dataset from csv to postgres

In [None]:
from itertools import product

# Make sure the target directory exists; otherwise open() below fails with
# FileNotFoundError on a fresh checkout.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# Generate one dataset per parameter combination, evaluate it and append the
# resulting rows to the CSV file.
# newline="" is the mode the csv module documents for files handed to
# csv.writer (the writer emits its own line terminators).
with open(output_file, "w", newline="") as csvfile:
    w = csv.writer(csvfile, delimiter=",")
    # Header row.
    # NOTE(review): "Tims (s)" looks like a typo for "Time (s)"; it is left
    # unchanged because downstream analysis may select the column by name.
    w.writerow(
        [
            "Package",
            "Pattern",
            "Num_rows",
            "Num_cols",
            "Num_intersections",
            "Stage",
            "Tims (s)",
            "RAM",
        ]
    )
    # set seed
    set_seed(seed)
    # run evaluation: product() walks the grid in the same order as the
    # equivalent nested loops (pattern outermost, col innermost).
    for pattern, dtype, inter, row, col in product(
        patterns, data_type, num_int, num_rows, num_cols
    ):
        # step 1: generate data
        df = generate_pattern(pattern, row, col, inter, dtype)
        # step 2: evaluate data
        results = eval_data(df, package, pattern, row, col, inter, dtype)
        # step 3: write result to file
        w.writerows(results)

## Functions

In [None]:
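# NOTE: legacy in-notebook version, kept commented out for reference only; the
# evaluation loop in this notebook calls `eval_data` imported from `utils`.
# def eval_pace(df, package, pattern, num_rows, num_cols):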
#     """
#     Evaluates the performance of PACE by timing the 
#     missingness computation and the visualisation of the provided data.

#     Parameters
#     ----------
#     df : pd.DataFrame
#         data frame
#     package : str
#         name of the evaluated visualisation package 
#     pattern : str
#         name of the pattern used to generate data
#     num_rows : int
#         number of rows in the dataset (records)
#     num_cols : int
#         number of columns in the dataset
#     Returns 
#     -------
#     """
#     try:
#         results = [
#             [
#                 package,
#                 pattern,
#                 num_rows,
#                 num_cols,
#                 "START",
#                 None,
#                 psutil.virtual_memory(),
#             ]
#         ]
#         # compute missingness
#         start_time = datetime.now()
#         data_missing = Membership.from_data_frame(df)
#         time2 = datetime.now()
#         td = time2 - start_time
#         results.append(
#             [
#                 package,
#                 pattern,
#                 num_rows,
#                 num_cols,
#                 "COMPUTE",
#                 td.seconds + td.microseconds / 1e6,
#                 psutil.virtual_memory(),
#             ]
#         )
#         # visualisations
#         time3 = datetime.now()
#         session = PlotSession(df)
#         time4 = datetime.now()
#         session.add_plot("a")
#         time5 = datetime.now()
#         td = time5 - time3 # decide what's a fair comparison
#         results.append(
#             [
#                 package,
#                 pattern,
#                 num_rows,
#                 num_cols,
#                 "VISUALIZE",
#                 td.seconds + td.microseconds / 1e6,
#                 psutil.virtual_memory(),
#             ]
#         )
#         return results
#     except:
#         raise
    