In [None]:
# -*- coding: utf-8 -*-
"""
Created on 19 Jan 2023

Exploratory code for running technical evaluation for missing data and set visualization with UpSetPlot/PACE

@author: Roy Ruddle
"""
import sys
import os
import csv
from datetime import datetime
import string
import random

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import upsetplot
from pace.membership import Membership

from utils import generate_general_missing, generate_planned_missing, compute_missingness, get_dataframe_sets, generate_set_data

In [None]:
# constants
# Pattern identifiers understood by evaluate().
PM = 'planned missing'
GM = 'general missing'
SET = 'sets'
# Column names of the results CSV; every result row follows this order.
HEADER_ROW = [
    'Package',
    'Pattern',
    'Mode',
    'Num_rows',
    'Num_cols',
    'Num_intersections',
    'Stage',
    'Time (s)',
    'RAM',
]
# Data-frame sizes swept by the evaluation.
ROWS_RANGE = [100, 1000]
COLS_RANGE = [10, 100]
# Results file; both names refer to the same path.
FILENAME = 'tmp_tech_eval_results.csv'
output_filename = FILENAME

In [None]:
def evaluate(package, pattern, set_mode, num_rows, num_cols, num_int):
    """
    Generate a data frame and measure the time/memory needed to prepare it
    for visualizing its sets with the given package.

    Parameters
    ----------
    package : string
        'upset' selects the UpSetPlot code path; any other value (e.g. 'pace')
        selects the PACE ``Membership`` code path.
    pattern : string
        One of the module constants PM ('planned missing'),
        GM ('general missing') or SET ('sets').
    set_mode : boolean
        False (missingness) or True (set-typed data)
    num_rows : int
        The number of rows in the data frame
    num_cols : int
        The number of columns in the data frame. For the SET pattern this
        value is passed to generate_set_data as the number of values.
    num_int : int or None
        The number of intersections or combinations of missing values in the
        data frame. Only passed to the generator for the GM pattern.

    Returns
    -------
    list of list
        One row per measured stage, laid out as
        [package, pattern, mode, num_rows, num_cols, num_int, stage, time, ram].
        The 'DATA_FRAME' and 'SET_MEMBERS' rows carry None as time; the
        'COMPUTE' row carries the elapsed seconds since the data frame was
        generated. ``ram`` is ``sys.getsizeof`` of the measured object.
        An empty list is returned when ``pattern`` is not recognised.

    """
    # Local import so the block does not depend on a new file-level import.
    # perf_counter is monotonic, unlike datetime.now(), and needs no
    # days/seconds/microseconds arithmetic.
    from time import perf_counter

    if pattern == PM:
        df = generate_planned_missing(num_rows, num_cols)
    elif pattern == GM:
        df = generate_general_missing(num_rows, num_cols, num_int)
    elif pattern == SET:
        # The column count doubles as the number of values for set data.
        df = generate_set_data(num_rows, num_cols)
    else:
        # Unknown pattern: nothing to evaluate.
        return []

    mode = 'set_mode' if set_mode else 'missingness'

    def _row(stage, seconds, measured):
        # Build one result row; RAM is the getsizeof of the measured object.
        return [package, pattern, mode, num_rows, num_cols, num_int,
                stage, seconds, sys.getsizeof(measured)]

    results = [_row('DATA_FRAME', None, df)]
    start = perf_counter()

    if set_mode:
        # NOTE(review): the SET_MEMBERS row records the size of df (not of the
        # extracted set contents) for both packages -- confirm this is intended.
        if package == 'upset':
            contents = get_dataframe_sets(df)
            results.append(_row('SET_MEMBERS', None, df))
            data = upsetplot.from_contents(contents)
        else:
            results.append(_row('SET_MEMBERS', None, df))
            data = Membership.from_data_frame(df)
    else:
        data = compute_missingness(df, package)

    # The elapsed time is evaluated before the size of `data` is measured.
    results.append(_row('COMPUTE', perf_counter() - start, data))
    return results


def _evaluation_cases(option):
    """Return the (pattern, set_mode, num_rows, num_cols, num_int) cases for an option."""
    if option == 1:
        # Planned missingness: each row is missing the value for one variable.
        # There are no intersections, so num_int is not used.
        return [(PM, False, num_rows, num_cols, None)
                for num_rows in ROWS_RANGE
                for num_cols in COLS_RANGE]
    if option == 2:
        # General missingness: fixed frame size, varying number of intersections.
        return [(GM, False, num_rows, num_cols, num_int)
                for num_rows in [1000]
                for num_cols in [10]
                for num_int in ROWS_RANGE]
    if option == 10:
        # Set-typed data.
        return [(SET, True, num_rows, num_cols, None)
                for num_rows in ROWS_RANGE
                for num_cols in COLS_RANGE]
    # Unsupported option: no cases.
    return []


def run_evaluation(option, package="upset"):
    """
    Run the technical evaluation and append the results to the output CSV.

    Parameters
    ----------
    option : int
        The options are:
        - 1   Planned missing data; each row is missing a value in one column;
              no intersections. Sweeps ROWS_RANGE x COLS_RANGE.
        - 2   General missing data; 1000 rows, 10 columns, with the number of
              intersections taken from ROWS_RANGE.
        - 10  Set-typed data. Sweeps ROWS_RANGE x COLS_RANGE.
        Any other value evaluates nothing (only the header may be written).
    package : str
        'upset' for UpSetPlot; any other value (e.g. 'pace') is handled by
        evaluate() as the PACE code path. Defaults to upset.

    Returns
    -------
    None.

    """
    print('Outputting results of ', package, ' to: ', output_filename)

    # newline='' is the form the csv module documents for file objects; the
    # writer emits its own line terminators either way.
    with open(output_filename, 'a', newline='') as csvfile:
        w = csv.writer(csvfile, delimiter=',')
        # Only add the header if the file we just opened is still empty.
        if os.stat(output_filename).st_size == 0:
            w.writerow(HEADER_ROW)
        for pattern, set_mode, num_rows, num_cols, num_int in _evaluation_cases(option):
            results = evaluate(package, pattern, set_mode, num_rows, num_cols, num_int)
            # Write after every case so earlier rows are kept if a later case fails.
            w.writerows(results)

In [None]:
###########
# Evaluate for both upset & pace:
# 1. Planned missing
# 2. General missing
# 3. set mode
###########
# Remove any existing results file, because run_evaluation appends to it.
if os.path.exists(FILENAME):
    os.remove(FILENAME)

def tech_evaluation():
    """Run every evaluation option for each package, option by option."""
    runs = [(option, package)
            for option in [1, 2, 10]
            for package in ['upset', 'pace']]
    for option, package in runs:
        run_evaluation(option, package)

tech_evaluation()

## Below to be deleted 

In [None]:
# Scratch: generate set-typed data with 500,000 rows and 10,000 values.
sd = generate_set_data(500_000, 10_000)

In [None]:
# Size reported by sys.getsizeof for the generated data, in MiB.
print(sys.getsizeof(sd) / (1024 * 1024))
# Same measurement for the sets extracted from the data frame.
ds = get_dataframe_sets(sd)
print(sys.getsizeof(ds) / (1024 * 1024))

In [None]:
# ud = upsetplot.from_contents(ds)
# ud

# crashed at generate_set_data(1e6, 1e4)
# crashed at generate_set_data(5e5, 1e4)
# crashed at generate_set_data(250000, 1e4)
# crashed at generate_set_data(125000, 1e4)
# crashed at generate_set_data(50000,10000) after 8 minutes


In [None]:
from pace.membership import Membership
import pandas as pd

In [None]:
def pace_intersections():
    """Build a PACE Membership from the global `ds` and query its intersections."""
    # Keep the Membership object: the previous version discarded it and then
    # called .intersections() on an undefined name `m`.
    membership = Membership.from_data_frame(pd.DataFrame(ds))
    membership.intersections()

%timeit pace_intersections()

In [None]:
# IPython is not imported by any earlier cell, so import it here before use.
import IPython

IPython.sys_info()