# Introduction

What kind of question would we like to answer ... 

 Overall, these experiments will help us estimate by how much the use of the lower bound in computing the age latency can be beneficial. 
 
 There are two cases to consider: optimal and approximate. 
 First, the optimal case: by how much can using the lower bound accelerate the search (in terms of iterations or, more generally, in terms of execution time)? 
 Second, by introducing the lower bound into our mechanism, we also enable approximate solutions with guarantees in delta. Considering several levels of acceptable error (such as 1%, 5%, 10%, 15%), by how much can the computation of the age latency be improved? 
 

In [None]:
!pip install seaborn cairosvg

## Generate sample data files and loading them... 

In [None]:
# Directory containing the compiled command-line tools used by the cells below.
EXEC_PATH="../cmake-build-release/src/"
# Benchmark executable; the shell cells call it to write the CSV logs under data/.
BENCHMARK = EXEC_PATH + "/benchmarkAgelatency"
# Analysis executable; used to render and inspect a single instance.
ANALYSE = EXEC_PATH + "/lig-analyse"
# Kind names accepted by getLET and iterated over when producing per-kind figures.
ALL_KIND=["automotive", "generic", "harmonic"]

In [None]:
!mkdir -p data
!if [ ! -e data/automotive.csv ]; then echo Not found; fi

In [None]:
%%script env BENCHMARK="$BENCHMARK" bash

# For every kind, produce two logs unless they already exist on disk:
# data/<kind>.csv and, with -DiEqualTi, data/<kind>diti.csv.
for kind in automotive harmonic generic; do 
    if [ ! -e data/$kind.csv ]; then 
        $BENCHMARK  -kind $kind -begin_n 10 -end_n 30  -step_n 10 -sample_count 5 -iter_count 1 -detailed -logfile data/"$kind".csv; 
    fi

    # Same parameters as above, plus the -DiEqualTi flag.
    if [ ! -e data/"$kind"diti.csv ]; then  
        $BENCHMARK  -kind $kind -begin_n 10 -end_n 30  -step_n 10 -sample_count 5 -iter_count 1 -DiEqualTi -detailed -logfile data/"$kind"diti.csv; 
    fi
done

In [None]:
%%script env BENCHMARK="$BENCHMARK" bash

# Large samples (10000 per size) of very small generic instances; each log is
# only generated when its CSV is missing.
if [ ! -e data/containsAnomalies.csv ]; then $BENCHMARK -begin_n 4 -end_n 5 -kind generic -step_n 1 -detailed -iter_count 1 -sample_count 10000 -logfile data/containsAnomalies.csv; fi
# Same run with the -DiEqualTi flag.
if [ ! -e data/containsAnomalies_diti.csv ]; then $BENCHMARK -begin_n 4 -end_n 5 -kind generic -step_n 1 -detailed -iter_count 1 -sample_count 10000 -logfile data/containsAnomalies_diti.csv -DiEqualTi; fi
# Same as the first run but starting at n = 3.
if [ ! -e data/seek3.csv ]; then $BENCHMARK -begin_n 3 -end_n 5 -kind generic -step_n 1 -detailed -iter_count 1 -sample_count 10000 -logfile data/seek3.csv; fi

## Tools to load the files

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def openLog(filename, with_gain=None):
    """Load one benchmark log (';'-separated CSV) and derive analysis columns.

    The list-valued columns (``ExpansionVertex``, ``ExpansionEdges``,
    ``LowerBounds``, ``UpperBounds`` and the ``*_time`` columns) are parsed
    from their ``"[a,b,c]"`` text form. Rows with ``n == 0`` are dropped.
    ``LowerBounds``/``UpperBounds`` are divided by the row's ``AgeLatency``
    and ``ExpansionVertex`` by ``sum_n + 2``, so a bound equal to 1 means
    "equal to the final age latency".

    Args:
        filename: Path of the CSV log to read.
        with_gain: Optional sequence of accepted errors, in percent. When
            given and non-empty, the columns ``NewIterationCount<d>``,
            ``time_gain<d>`` and ``space_gain<d>`` are added for every value
            ``d``, plus ``*_time_new`` and ``Iterations saved``. These last
            columns are computed from ``NewIterationCount0``, so the sequence
            must contain 0.

    Returns:
        The enriched DataFrame with a fresh index (the previous index is kept
        in an ``index`` column, as ``reset_index()`` does).
    """
    def list_parser(cast):
        # The log stores per-iteration series as "[a,b,c]"; "[]" is an empty series.
        def parse(cell):
            return [cast(item) for item in cell.strip("[]").split(",")] if cell != "[]" else []
        return parse

    to_int_list = list_parser(int)
    to_float_list = list_parser(float)
    df = pd.read_csv(
        filename,
        sep=";",
        converters={
            "ExpansionVertex": to_int_list,
            "ExpansionEdges": to_int_list,
            "LowerBounds": to_int_list,
            "UpperBounds": to_int_list,
            "gen_time": to_float_list,
            "lbp_time": to_float_list,
            "ubp_time": to_float_list,
        },
    )

    # Check data is as expected
    assert "seed" in df.columns

    # Remove zero-task cases, annoying and useless. The explicit copy makes the
    # column assignments below act on an independent frame instead of a slice.
    df = df[df["n"] > 0].copy()

    # Add extra data about filename
    df["filename"] = filename

    # Classify the edge density m / (n * (n - 1)) into three labelled classes.
    density = df["m"] / (df["n"] * (df["n"] - 1))
    df["edge_density"] = density.apply(
        lambda x: "1.High" if x > 0.3 else "2.Medium" if x > 0.2 else "3.Low"
    )

    # Rename kinds
    kind_name = {1: "generic", 2: "harmonic", 3: "automotive"}
    df["kind"] = df["kind"].apply(lambda x: kind_name[x])

    # Normalize lists
    df.LowerBounds = df.apply(lambda x: [y / x.AgeLatency for y in x.LowerBounds], axis=1)
    df.UpperBounds = df.apply(lambda x: [y / x.AgeLatency for y in x.UpperBounds], axis=1)
    df.ExpansionVertex = df.apply(lambda x: [y / (x.sum_n + 2) for y in x.ExpansionVertex], axis=1)

    # Keep the per-iteration timings in "<label>_detail"; "<label>" becomes the total.
    for label in ["gen_time", "lbp_time", "ubp_time"]:
        df[label + "_detail"] = df[label].copy()
        df[label] = df[label].apply(sum)

    def bounds_shift(row):
        # Iteration gap between the first lower bound at 1 and the first upper
        # bound at 1; None when the lower bound never reaches 1.
        if not row["LowerBounds"].count(1):
            return None
        return row["LowerBounds"].index(1) - row["UpperBounds"].index(1)

    def early_bounds(row):
        # True when both bounds reach 1 before their last recorded iteration.
        lower, upper = row["LowerBounds"], row["UpperBounds"]
        if not lower.count(1):
            return None
        return (lower.index(1) + 1 < len(lower)) and (upper.index(1) + 1 < len(upper))

    df["BoundsDistances"] = df.apply(
        lambda x: [abs(l - r) for (l, r) in zip(x.LowerBounds, x.UpperBounds)], axis=1
    )
    df["bounds_shift"] = df.apply(bounds_shift, axis=1)
    df["early_bounds"] = df.apply(early_bounds, axis=1)
    df['remain_time'] = (df["total_time"] - df["gen_time"] - df["lbp_time"] - df["ubp_time"])

    # len() rather than truthiness: with_gain may be a numpy array.
    if with_gain is not None and len(with_gain) > 0:

        # Reference cost: generation + upper bound only, over all iterations.
        df["ori_time"] = df["gen_time"] + df["ubp_time"]
        df["ori_space"] = df.apply(lambda x: x.ExpansionVertex[x.IterationCount - 1], axis=1)

        def last_needed_iteration(row, tolerance):
            # 0-based index of the first iteration at which the best upper
            # bound seen so far and the best lower bound seen so far are
            # within `tolerance`; the last iteration when that never happens.
            last = len(row.UpperBounds) - 1
            for i in range(last):
                if (min(row.UpperBounds[:i + 1]) - max(row.LowerBounds[:i + 1])) <= tolerance:
                    return i
            return last

        def time_until(label, count_col):
            # Time spent in `label` during the first `count_col` iterations.
            return df.apply(lambda x: sum(x[label + "_detail"][:x[count_col]]), axis=1)

        for delta in with_gain:
            deltaStr = str(delta)
            count_col = "NewIterationCount" + deltaStr
            tolerance = delta / 100.0

            df[count_col] = df.apply(lambda row: last_needed_iteration(row, tolerance), axis=1) + 1

            new_time = (
                time_until("gen_time", count_col)
                + time_until("lbp_time", count_col)
                + time_until("ubp_time", count_col)
            )
            gain = (df["ori_time"] - new_time) / df["ori_time"]
            df["time_gain" + deltaStr] = gain.fillna(0.0)

            new_space = df.apply(lambda x: x.ExpansionVertex[x[count_col] - 1], axis=1)
            gain = (df["ori_space"] - new_space) / df["ori_space"]
            df["space_gain" + deltaStr] = gain.fillna(0.0)

        # Timings of the exact (0% error) early-stopping run.
        for label in ["gen_time", "lbp_time", "ubp_time"]:
            df[label + "_new"] = time_until(label, "NewIterationCount0")

        df["Iterations saved"] = (df["IterationCount"] - df["NewIterationCount0"])
        df["Iterations saved"] = pd.Categorical(df["Iterations saved"], ordered=True)

    return df.reset_index()

In [None]:
def openLogs(filelist, **kwargs):
    """Load several logs with openLog and stack them into one DataFrame.

    Args:
        filelist: Iterable of log file paths.
        **kwargs: Forwarded unchanged to openLog (e.g. ``with_gain``).

    Returns:
        The concatenation of all loaded frames with a fresh index, or None
        when ``filelist`` is empty.
    """
    frames = [openLog(path, **kwargs) for path in filelist]
    if not frames:
        return None
    # One concat over all frames instead of re-copying the accumulated frame
    # once per file.
    return pd.concat(frames, ignore_index=True)

In [None]:
# The three logs backing the paper figures; they are loaded under several configurations.
PAPER_LOGS = ["data/automotive.paper.csv", "data/harmonic.paper.csv", "data/generic.paper.csv"]

data_base_df = openLogs(["data/automotive.csv", "data/harmonic.csv", "data/generic.csv"])
data_full_df = openLogs(PAPER_LOGS + ["data/automotive.csv", "data/automotivediti.csv","data/harmonic.csv","data/harmonicditi.csv", "data/containsAnomalies.csv", "data/generic.csv", "data/containsAnomalies_diti.csv"])

paper_df = openLogs(PAPER_LOGS)
paper_df_withGain = openLogs(PAPER_LOGS, with_gain=np.arange(0,151,5))
# Same files and same arguments as paper_df_withGain: copy it rather than
# parsing the logs and recomputing every gain column a second time.
paper_df_withGain1 = paper_df_withGain.copy()




##  What is the maximum and minimum error we get from the first lower bound compared with the final latency ?

In [None]:
def lower_upper_detailled(df, alpha=None):
    """Plot every instance's bounds against its expansion ratio.

    One red line per row for ``LowerBounds`` and one blue line per row for
    ``UpperBounds``, both drawn over the row's ``ExpansionVertex`` values on
    the current matplotlib axes.

    Args:
        df: Frame with list-valued ``ExpansionVertex``, ``LowerBounds`` and
            ``UpperBounds`` columns.
        alpha: Line opacity. When falsy (None or 0), it defaults to
            ``max(0.02, 1 / len(df))`` so dense plots stay readable.
    """
    if not alpha:
        # max(len(df), 1) keeps an empty frame from raising ZeroDivisionError.
        alpha = max(0.02, 1.0 / max(len(df), 1))
    for column, color in (("LowerBounds", "r"), ("UpperBounds", "b")):
        for xs, ys in zip(df["ExpansionVertex"], df[column]):
            plt.plot(xs, ys, marker="o", color=color, alpha=alpha)
    _ = plt.xlabel("Expansion Ratio to maximal")
    # Both bounds are drawn, so the label names both (as the other bound plots do).
    _ = plt.ylabel("Lower/Upper bounds ratio to optimal")
    _ = plt.title("Lower and Upper bounds progress over iterations")
df = data_base_df.copy()    
lower_upper_detailled (df)

In [None]:
import seaborn as sns
# Set the font size
#sns.set_context("notebook", font_scale=1.5)  # Adjust font_scale as needed

def summarizePlots(df) :
    """Draw a seaborn pairplot of n, m, sum_n and IterationCount, coloured by n."""
    # The 'white' axes style is only applied while the pairplot is being built.
    with sns.axes_style('white'):
        #_ = sns.jointplot("n", "m", data=df, kind='hex')
        _ = sns.pairplot(data=df[[ "n","m",  "sum_n", "IterationCount"]], hue="n")
summarizePlots(df)

## Study of the bounds

In [None]:
def plotBoundsByIteration(df, itercount):
    """Violin-plot the bounds of the instances that took exactly `itercount` iterations.

    For each iteration, one violin shows the distribution of the lower bounds
    and another the distribution of the upper bounds, on the current axes.
    """
    selected = df[df.IterationCount == itercount]
    iteration_ids = range(itercount)
    for bounds in (selected.LowerBounds, selected.UpperBounds):
        _ = plt.violinplot(pd.DataFrame(bounds.to_list(), columns=iteration_ids))
    _ = plt.xlabel("Iteration")
    _ = plt.ylabel("Lower/Upper bounds ratio to optimal")
    
plotBoundsByIteration(df, 5)
_ = plt.title("Lower and Upper bounds progress over iterations for 5-iterations cases")

In [None]:
for it in range(1,df.IterationCount.max() + 1) :
    if len(df[df.IterationCount == it]) > 1:
        plotBoundsByIteration(df, it)
_ = plt.title("Lower and Upper bounds progress over iterations for all cases")

In [None]:
def plotBounds(df, title=None):
    """Violin-plot lower and upper bounds per iteration, over all instances.

    Lower bounds are replaced by their running maximum, and every series is
    extended to the longest iteration count in `df` by repeating its last
    value. The plot is drawn in a new figure.
    """
    horizon = df.IterationCount.max()

    def running_max(values):
        return [max(values[:k + 1]) for k in range(len(values))]

    def pad(values, count):
        # Repeat the final value until the series spans `horizon` iterations.
        return values + (horizon - count) * values[-1:]

    work = df.copy()
    work.LowerBounds = work.LowerBounds.apply(running_max)
    work.LowerBounds = work.apply(lambda row: pad(row.LowerBounds, row.IterationCount), axis=1)
    work.UpperBounds = work.apply(lambda row: pad(row.UpperBounds, row.IterationCount), axis=1)

    plt.figure()
    for series in (work.LowerBounds, work.UpperBounds):
        _ = plt.violinplot(pd.DataFrame(series.to_list(), columns=range(horizon)))
    _ = plt.xlabel("Iteration")
    _ = plt.ylabel("Lower/Upper bounds ratio to optimal")
    heading = "Lower and Upper bounds progress over iterations for all cases" if title is None else title
    _ = plt.title(heading)

In [None]:
plotBounds(df) 

In [None]:
def plotBoundsDistances(df):
    """Violin-plot the distance between the two bounds at every iteration.

    Each row's ``BoundsDistances`` series is extended to the longest iteration
    count in `df` by repeating its last value before plotting on the current axes.
    """
    work = df.copy()
    horizon = work.IterationCount.max()
    work.BoundsDistances = work.apply(
        lambda row: row.BoundsDistances + (horizon - row.IterationCount) * row.BoundsDistances[-1:],
        axis=1,
    )
    per_iteration = pd.DataFrame(work.BoundsDistances.to_list(), columns=range(horizon))
    _ = plt.violinplot(per_iteration)
    _ = plt.title("Bounds distance for every iterations")
    _ = plt.xlabel("Iteration")
    _ = plt.ylabel("Bounds distance ratio to optimal")
plotBoundsDistances(df)

In [None]:
def plotMinMaxBoundDistance(df, label1="Minimal distance", label2="Maximal distance"):
    """Plot, per graph size n, the smallest and largest worst-case bound distance.

    For every row the largest value of ``BoundsDistances`` is taken; these
    values are then grouped by ``n`` and their minimum and maximum are drawn
    as two lines on the current axes.

    Args:
        df: Frame with ``n`` and list-valued ``BoundsDistances`` columns.
        label1: Legend label of the per-size minimum line.
        label2: Legend label of the per-size maximum line.
    """
    if len(df) == 0:
        # Nothing to draw (e.g. an iteration-count filter matched no row).
        return

    # Aggregate only the column that is plotted: a frame-wide groupby min/max
    # would also have to compare the list and string columns.
    worst_distance = df.BoundsDistances.apply(max)
    per_size = worst_distance.groupby(df["n"]).agg(["min", "max"])

    _ = plt.plot(per_size.index, per_size["min"], label=label1)
    _ = plt.plot(per_size.index, per_size["max"], label=label2)
    _ = plt.title("Min and Maximum distance between bounds per graph size")
    _ = plt.xlabel("Graph size (N)")
    _ = plt.ylabel("Distance between bounds")
    _ = plt.legend()
plotMinMaxBoundDistance(df[df.IterationCount > 5], label1="Min Distance (iter > 5)", label2="Max Distance (iter > 5)")
plotMinMaxBoundDistance(df[df.IterationCount <= 5], label1="Min Distance (iter <= 5)", label2="Max Distance (iter <= 5)")

In [None]:
import seaborn as sns
    
def plotDataFrame(_df, header="Untitled"):
    """Draw the full set of summary plots for one loaded log.

    Args:
        _df: Frame produced by openLog/openLogs.
        header: Text appended to the figure title.
    """
    fig, axes = plt.subplots(3, 2, figsize=(10, 10))
    fig.suptitle(f'Summary from {header}')
    plt.sca(axes[0, 0])
    # NOTE(review): plotBounds calls plt.figure() itself, so its violins land in
    # a separate figure and not in axes[0, 0] -- confirm whether that is intended.
    plotBounds(_df)
    plt.sca(axes[1, 0])
    plotBoundsDistances(_df)
    plt.sca(axes[0, 1])
    lower_upper_detailled(_df)
    plt.sca(axes[1, 1])
    plotMinMaxBoundDistance(_df)
    plt.sca(axes[2, 0])
    # Use the frame passed in, not the notebook-level `df`.
    _df["bounds_shift"].hist()
    plt.tight_layout()
    summarizePlots(_df)
    _ = _df[["n", "m", "sum_n", "IterationCount"]].hist()
    
def plotDataFile(filename) :
    """Load the log at `filename` with openLog and draw its summary with plotDataFrame."""
    _df = openLog(filename)
    # The file name doubles as the figure header.
    plotDataFrame(_df, filename)
   

In [None]:
plotDataFile("data/generic.csv")

In [None]:
plotDataFile("data/automotivediti.csv")

## Drawing tools

In [None]:
!mkdir figures -p
!rm -f figures/*

In [None]:
class Instance:
    """Parameters identifying one generated instance: size, seed, kind and DiEqTi flag."""

    def __init__(self, n, m, seed, kind, DiEqTi):
        self.n, self.m = n, m
        self.seed = seed
        self.kind = kind
        self.DiEqTi = DiEqTi

    def __str__(self):
        fields = (self.n, self.m, self.seed, self.kind, self.DiEqTi)
        return "Instance (" + ", ".join(f"{value}" for value in fields) + ")"

    def __repr__(self):
        return str(self)

def getLET(inst):
    """Run the analysis tool with -outputsvg on `inst` and return its output as an SVG.

    A `kind` of 1 is treated as "generic"; any kind outside ALL_KIND is
    reported and then rejected by an assertion. The command line is echoed on
    stderr before it runs.
    """
    import subprocess
    import sys
    from IPython import display

    kind = "generic" if inst.kind == 1 else inst.kind
    if kind not in ALL_KIND:
        print (f"invalid kind '{kind}'")
    assert kind in ALL_KIND

    cmd = [
        ANALYSE,
        "-n", str(inst.n),
        "-m", str(inst.m),
        "-seed", str(inst.seed),
        "-kind", kind,
        "-outputsvg",
    ]
    if inst.DiEqTi:
        cmd.append("-DiEqualTi")
    sys.stderr.write(" " + " ".join(cmd))

    completed = subprocess.run([str(arg) for arg in cmd], stdout=subprocess.PIPE)
    return display.SVG(completed.stdout)

def getPEG(inst, upper=None, lower=None):
    """Run the analysis tool on `inst` and return one section of its output as an SVG.

    The tool's output is read as sections, each introduced by a line starting
    with ``//`` or ``%``. Lines belonging to the section whose header starts
    with ``// Upper bound`` (when `upper` is given) or ``// Lower bound``
    (when `lower` is given) are concatenated and wrapped in an SVG object.

    Args:
        inst: Object with ``n``, ``m``, ``seed``, ``kind`` and ``DiEqTi``
            attributes; a ``kind`` of 1 is treated as "generic".
        upper: K values passed through ``-peg`` to select the upper-bound view.
        lower: K values passed through ``-peg`` to select the lower-bound view.
            At most one of `upper` / `lower` may be given.

    When neither is given, ``-peg`` is omitted, every section header is
    printed and all remaining lines are kept. The original code raised
    UnboundLocalError here, before reaching its own handling of that case.
    """
    import subprocess
    import sys
    from IPython import display

    assert upper is None or lower is None  # Need only one of them

    kind = "generic" if inst.kind == 1 else inst.kind

    # Empty sequences count as "not given", as in the original truthiness tests.
    if lower:
        k_values, required = lower, "// Lower bound"
    elif upper:
        k_values, required = upper, "// Upper bound"
    else:
        k_values, required = None, ""

    cmd = [ANALYSE, "-n", str(inst.n), "-m", str(inst.m), "-seed", str(inst.seed),
           "-kind", kind, "-outputsvg"]
    if k_values:
        cmd += ["-peg", " ".join(str(x) for x in k_values)]
    if inst.DiEqTi:
        cmd.append("-DiEqualTi")
    sys.stderr.write(" " + " ".join(cmd))

    completed = subprocess.run([str(x) for x in cmd], stdout=subprocess.PIPE)

    res = ""
    current = ""  # header of the section currently being read
    print (f"<!-- {required} -->")
    for line in completed.stdout.decode("utf-8").split("\n"):
        if line[:2] == "//" or line[:1] == "%":
            current = line
            if required == "":
                print (line)
        elif current.startswith(required):
            res += line
    return display.SVG(res)

def saveSVGIntoPNG(obj, filename):
    """Render `obj.data` (SVG source) to a PNG written at `filename`, using cairosvg."""
    # Imported lazily so cairosvg is only needed when this function is called.
    from cairosvg import svg2png
    svg2png(bytestring=obj.data,write_to=filename)
        
def saveInto(obj, filename):
    """Write the text `obj` to `filename`, replacing any existing content.

    The file is always encoded as UTF-8 so the result does not depend on the
    platform's default encoding.
    """
    with open(filename, 'w', encoding="utf-8") as fdesc:
        fdesc.write(obj)

import subprocess

def execute_and_process(df, cmd_template, process):
    """
    Run one shell command per dataframe row and collect the processed outputs.

    Args:
        df (pd.DataFrame): DataFrame containing columns 'n', 'm', and 'seed'.
        cmd_template (str): Command with ``{n}``, ``{m}`` and ``{seed}`` placeholders,
            e.g., "cmd -n {n} -m {m} -seed {seed}". It is executed through the shell.
        process (function): Called as ``process(row, stdout)`` for every command that
            exits successfully.

    Returns:
        list: One entry per row, in row order: the value returned by ``process``,
        or None when the command exited with a non-zero status.
    """
    def run_one(row):
        # Fill the placeholders with this row's parameters.
        command = cmd_template.format(n=row['n'], m=row['m'], seed=row['seed'])
        try:
            completed = subprocess.run(
                command, shell=True, capture_output=True, text=True, check=True
            )
            return process(row, completed.stdout)
        except subprocess.CalledProcessError as failure:
            print(f"Error executing command: {command}\n{failure.stderr}")
            return None  # A failed command contributes None

    return [run_one(row) for _, row in df.iterrows()]
        

# Looking for a sample

I'm looking for a sample that showcases pimin/pimax, and lower/upper.

In [None]:
samples_df = data_full_df.copy()

# small instances but with interesting properties
samples_df = samples_df[samples_df["sum_n"] > 100]
samples_df = samples_df[samples_df["n"] == 4]
samples_df = samples_df[samples_df["m"] == 3]
samples_df = samples_df[samples_df["IterationCount"] == 3]

# Keep instances whose lower bound reaches 1.0 (i.e. the age latency, since
# openLog normalises the bounds by AgeLatency) at some iteration...
samples_df = samples_df[samples_df.apply(lambda x : x.LowerBounds.count(1.0) != 0,axis=1)]
#samples_df = samples_df[samples_df.apply(lambda x : x.LowerBounds.index(1.0) < x.UpperBounds.index(1.0),axis=1)]
# ...and whose lower bound strictly increases from one iteration to the next.
samples_df = samples_df[samples_df.apply(lambda x: all(x.LowerBounds[i] < x.LowerBounds[i+1] for i in range(len(x.LowerBounds) - 1)), axis=1)]

# Output instances
print(samples_df.columns)
samples_df[["seed","kind","n", "m", "DiEqTi","sum_n", "LowerBounds", "UpperBounds", "filename"]]


In [None]:
def sample_process(row, output):
    """Return `row` when the tool output contains "len(UP): 3", otherwise None."""
    matched = "len(UP): 3" in output.strip()
    return row if matched else None

# Command line run for every candidate; {n}, {m} and {seed} are filled per row.
command_template = ANALYSE + " -kind generic  -agelatency -n {n} -m {m} -seed {seed}"

# Run the tool on each candidate and list the rows kept by sample_process.
results = execute_and_process(samples_df, command_template, sample_process)
for kept in (entry for entry in results if entry is not None):
    print(kept["n"],kept["m"],kept["seed"])

In [None]:
GENERATION_PARAMETERS=ANALYSE + " -n 4 -m 3 -seed 818 -kind generic " 

# Figures and Tables for the example

## Figure 2

DAG and r,D,T

In [None]:
%%script env CMD="$GENERATION_PARAMETERS" bash
$CMD  -outputtikzdag -outputtabularLET

## Figure 3

Schedule view

In [None]:
%%script env CMD="$GENERATION_PARAMETERS" bash
$CMD   -outputtikzschedule -schedule_duration 25

## Figure 4

Alphas

In [None]:
%%script env CMD="$GENERATION_PARAMETERS" bash
$CMD   -outputalphas 1 -agelatency

## Figure 5

PEG of the first iteration

In [None]:
%%script env CMD="$GENERATION_PARAMETERS" bash
$CMD   -outputtikzPEG  -agelatency

## Table 1

List of iterations for Kiter

In [None]:
%%script env CMD="$GENERATION_PARAMETERS" bash
$CMD      -outputtabularAlgo

# Use-cases

## Use-case 1: When Lower bound reach optimality first

This example shows a situation where the lower bound can help to interrupt the computation earlier.

In [None]:
samples_df = data_full_df.copy()

# Size filters are disabled here: every loaded instance is a candidate.
#samples_df = samples_df[samples_df["sum_n"] > 100]
#samples_df = samples_df[samples_df["n"] == 4]
#samples_df = samples_df[samples_df["m"] == 3]
#samples_df = samples_df[samples_df["IterationCount"] == 3]

# Keep instances whose lower bound equals 1.0 (the age latency after openLog's
# normalisation) at exactly one iteration, and reaches it at an earlier
# iteration than the upper bound does.
samples_df = samples_df[samples_df.apply(lambda x : x.LowerBounds.count(1.0) == 1,axis=1)]
samples_df = samples_df[samples_df.apply(lambda x : x.LowerBounds.index(1.0) < x.UpperBounds.index(1.0),axis=1)]
#samples_df = samples_df[samples_df.apply(lambda x: all(x.LowerBounds[i] < x.LowerBounds[i+1] for i in range(len(x.LowerBounds) - 1)), axis=1)]

# Output instances
print(samples_df.columns)
samples_df[["seed","kind","n", "m", "DiEqTi","sum_n", "LowerBounds", "UpperBounds", "filename"]]


In [None]:
%%script env ANALYSE="$ANALYSE" bash
$ANALYSE -n 4 -m 4 -seed 3783 -kind generic  -agelatency  -outputtikzschedule -schedule_duration 30 -outputtabularAlgo

## Use-case 2: When lower bound does not reach optimality

It is very interesting to note the existence of instances where, considering Algorithm 1, the lower bound is not able to reach optimality. 
These very rare cases are possible, and force us to verify the original condition from Ning in addition to comparing the lower and upper bounds. 

The following example is one of these cases.

In [None]:
samples_df = data_full_df.copy()

# small instances but with interesting properties
samples_df = samples_df[samples_df["sum_n"] > 100]
samples_df = samples_df[samples_df["n"] == 4]
samples_df = samples_df[samples_df["m"] == 3]
samples_df = samples_df[samples_df["IterationCount"] == 3]

# Keep instances whose lower bound never equals 1.0, i.e. never reaches the
# age latency (bounds are normalised by AgeLatency in openLog).
samples_df = samples_df[samples_df.apply(lambda x : x.LowerBounds.count(1.0) == 0,axis=1)]
#samples_df = samples_df[samples_df.apply(lambda x : x.LowerBounds.index(1.0) < x.UpperBounds.index(1.0),axis=1)]

# Output instances
print(samples_df.columns)
samples_df[["seed","kind","n", "m", "DiEqTi","sum_n", "LowerBounds", "UpperBounds", "filename"]]


In [None]:
%%script env ANALYSE="$ANALYSE" bash
$ANALYSE -n 4 -m 3 -DiEqualTi -seed 7929 -kind generic -agelatency  -outputtikzschedule -schedule_duration 30 -outputtabularAlgo

## Use-case 3: When the lower bound decreases while K increases. 

Since Bodin2016, it is accepted that increasing the values of K arbitrarily does not necessarily improve estimation results.
However, a dominant subset has been identified that consistently improves the upper bound. 
Meanwhile, we found examples where an update of K can degrade the lower bound despite improving the upper bound.

In this example, the upper bound critical path for K=1,1,1,1 indicates a new vector K=[8,3,24,1,3]. This new vector K reaches optimality.
However, the same update of K has the opposite effect on the lower bound: while the lower bound was 19 initially, it went down to 12 after this update of K.



In [None]:
filtered_df = data_full_df.copy()
# Instances with at least three iterations, so LowerBounds[1] and
# LowerBounds[2] are read below; restricted to a small expansion (sum_n < 60)
# with at least 4 tasks and 4 edges.
filtered_df = filtered_df[filtered_df["IterationCount"] >=  3]
filtered_df = filtered_df[filtered_df["sum_n"] < 60]
filtered_df = filtered_df[filtered_df["n"] >= 4]
filtered_df = filtered_df[filtered_df["m"] >= 4]
# Keep the cases where the lower bound drops between the second and third iteration.
filtered_df = filtered_df[filtered_df.apply(lambda x : x.LowerBounds[1] > x.LowerBounds[2],axis=1)]

filtered_df[["seed","n", "m", "sum_n", "LowerBounds", "UpperBounds"]]

# Figures for experiments 

## Generate DataSet analysis

In [None]:
%%script env BENCHMARK="$BENCHMARK" bash

# One dataset log per kind (n from 1 to 500, -dryrun), generated only when the
# CSV is not already present.
for kind in automotive harmonic generic ; do 
    if [ ! -e data/$kind.dataset.csv ]; then 
        $BENCHMARK -kind $kind -begin_n 1 -end_n 500  -step_n 1 -sample_count 1 -iter_count 3 -dryrun -detailed -logfile data/$kind.dataset.csv;
    fi
done


In [None]:
dataset_df = openLogs(["data/automotive.dataset.csv", "data/harmonic.dataset.csv", "data/generic.dataset.csv"])

In [None]:
dataset_df = dataset_df[["kind","edge_density", "n", "m", "sum_n"]]

In [None]:
dataset_df

In [None]:
# Ensure only numeric columns are included
numeric_columns = dataset_df.select_dtypes(include='number')

# Group and compute the mean
result = numeric_columns.groupby([dataset_df["n"], dataset_df["kind"]]).mean().transpose()

print(result)

In [None]:
# Edge count against task count, one cubic regression per edge-density class.
g = sns.lmplot(x="n", y="m", hue="edge_density", data=dataset_df, legend=False, height=4, aspect= 2, order=3)

# Manually set specific font sizes (optional)
#g.ax.set_title("", fontsize=16)
g.ax.set_xlabel("Task count", fontsize=16)
g.ax.set_ylabel("Edge count", fontsize=16)
g.ax.tick_params(axis='both', which='major', labelsize=16)
# Build the legend once: a second ax.legend() call replaces the first one and
# would drop its loc=2 placement.
_ = g.ax.legend(loc=2, fontsize=16, title_fontsize=16)

plt.savefig("figures/dataset_size.pdf", bbox_inches='tight')

In [None]:
# Expansion size against task count, one linear regression per kind.
g = sns.lmplot(x="n", y="sum_n", hue="kind", data=dataset_df, legend=False, height=4, aspect= 2)

#g.ax.set_title("", fontsize=16)
g.ax.set_xlabel("Task count", fontsize=16)
g.ax.set_ylabel("Expansion size", fontsize=16)
g.ax.tick_params(axis='both', which='major', labelsize=16)
# Build the legend once: a second ax.legend() call replaces the first one and
# would drop its loc=2 placement.
_ = g.ax.legend(loc=2, fontsize=16, title_fontsize=16)


plt.savefig("figures/dataset_complexity.pdf", bbox_inches='tight')

## Generating timing analysis

In [None]:
# One figure per kind: mean time per graph-size bucket, with the lower bound
# (left, early stop at 0% error) and without it (right).
# The group-by that used to open this cell mixed the notebook-level `df` with
# dataset_df keys and its result was never read, so it is gone.

# Same colour for a phase in both panels.
bar_colors = {
    "gen_time_new": "tab:blue",
    "lbp_time_new": "tab:orange",
    "ubp_time_new": "tab:green",
    "gen_time": "tab:blue",
    "lbp_time": "tab:orange",
    "ubp_time": "tab:green"
}
new_columns = ["gen_time_new", "lbp_time_new", "ubp_time_new"]
old_columns = ["gen_time", "ubp_time"]

for kind in ALL_KIND:
    # Local name so the loop does not overwrite the notebook-level `df`.
    kind_df = paper_df_withGain1[paper_df_withGain1["kind"] == kind].copy()

    # Ensure 'n' is numeric for pd.cut
    kind_df['n'] = pd.to_numeric(kind_df['n'], errors='coerce')

    # Create 'NRange' column: buckets of 50 tasks up to 500
    kind_df['NRange'] = pd.cut(kind_df['n'], range(0, 501, 50))

    # Compute mean for numeric columns grouped by 'NRange'. observed=False keeps
    # empty buckets on the axis and states the categorical behaviour explicitly.
    df_mean = kind_df.select_dtypes(include='number').groupby(kind_df["NRange"], observed=False).mean()

    # Create subplots for new and old timings
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6), sharex=False, sharey=True)

    # Plot new timings
    df_mean.plot(
        kind='bar',
        y=new_columns,
        stacked=True,
        ax=axes[0],
        color=[bar_colors[col] for col in new_columns],
        legend=False
    )
    axes[0].set_title(f"Using the lower bound - {kind}", fontsize=14)
    axes[0].set_ylabel("Time (seconds)", fontsize=12)
    axes[0].set_xlabel("Graph Size", fontsize=12)
    axes[0].tick_params(axis='x', rotation=45)

    # Plot old timings
    df_mean.plot(
        kind='bar',
        y=old_columns,
        stacked=True,
        ax=axes[1],
        color=[bar_colors[col] for col in old_columns],
        legend=False
    )
    axes[1].set_title(f"Without using the lower bound - {kind}", fontsize=14)
    axes[1].set_ylabel("")  # Remove redundant Y-axis label
    axes[1].set_xlabel("Graph Size", fontsize=12)
    axes[1].tick_params(axis='x', rotation=45)

    # Add a single legend for the entire figure
    fig.legend(
        ["Graph Generation Time", "Lower Bound Processing", "Upper Bound Processing"],
        loc="upper center",
        ncol=3,
        fontsize=12
    )

    # Adjust layout and save
    plt.tight_layout(rect=[0, 0, 1, 0.92])  # Leave space for the legend
    plt.savefig(f"figures/comparison_timings_{kind}.pdf", bbox_inches='tight')
    plt.close()





## Generate Improvement analysis

### Execution time analysis

In [None]:
## This is so brutal, I should find how to do this properly !!! 
df = paper_df_withGain.copy()
tmp = df.groupby(["kind","edge_density","n"]).min()[[x for x in df.columns if "time_gain" in x]].reset_index()

res = None
for gainVal in np.arange(0,151,5) :
    start = tmp[["kind","edge_density","n"]].copy()
    start["Accepted error"] = gainVal
    start["Computational gain"] = tmp["time_gain" + str(gainVal)] * 100
    start
    res = pd.concat([res,start], ignore_index=True)
res

In [None]:
# Create the relplot without the default legend

# Create the line plot
# Create the line plot
fig, ax = plt.subplots(figsize=(8, 4))  # Adjust the figure size (aspect ratio = 2)

sns.lineplot(
    x="Accepted error",
    y="Computational gain",
    hue="kind",
    data=res,
    ax=ax
)

# Move the legend inside the plot and set its font sizes in the same call:
# calling ax.legend() afterwards would rebuild the legend and undo the move.
sns.move_legend(ax, "lower right", bbox_to_anchor=(1, 0), fontsize=16, title_fontsize=16)


# Manually set specific font sizes (optional)
ax.set_title("Computational Gain vs Accepted Error", fontsize=16)
ax.set_xlabel("Accepted Error", fontsize=16)
ax.set_ylabel("Computational Gain", fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=16)

plt.savefig("figures/computational_gain.pdf", bbox_inches='tight')

### What is the time spent on the first iteration over N compared to the time spent on the total?

### Space analysis

In [None]:
# Create a copy of the DataFrame
df = paper_df_withGain.copy()

# Ensure numeric columns for space gain
space_gain_cols = [col for col in df.columns if "space_gain" in col]

# Group by "kind" and "edge_density", and calculate the mean for space gain columns
tmp = df.groupby(["kind", "edge_density"])[space_gain_cols].mean().reset_index()

# Initialize the result DataFrame
res = []

# Iterate over gain values and construct the result DataFrame
for gainVal in np.arange(0, 151, 5):
    temp = tmp[["kind", "edge_density"]].copy()  # Preserve 'kind' and 'edge_density'
    temp["Accepted error"] = gainVal
    temp["Spacial gain"] = tmp[f"space_gain{gainVal}"] * 100  # Dynamically select the column
    res.append(temp)

# Concatenate all the intermediate results
res = pd.concat(res, ignore_index=True)

# Display or process `res` further
print(res)


In [None]:
# Create the line plot
fig, ax = plt.subplots(figsize=(8, 4))  # Adjust the figure size (aspect ratio = 2)

sns.lineplot(
    x="Accepted error",
    y="Spacial gain",
    hue="kind",
    data=res,
    ax=ax
)

# Move the legend inside the plot and set its font sizes in the same call:
# calling ax.legend() afterwards would rebuild the legend and undo the move.
sns.move_legend(ax, "lower right", bbox_to_anchor=(1, 0), fontsize=16, title_fontsize=16)


# Manually set specific font sizes (optional)
ax.set_title("Spacial Gain vs Accepted Error", fontsize=16)
ax.set_xlabel("Accepted Error", fontsize=16)
ax.set_ylabel("Spacial Gain", fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=16)



plt.savefig("figures/spacial_gain.pdf", bbox_inches='tight')

In [None]:
# Create the line plot
# Same plot as the spacial gain figure, zoomed on small accepted errors (not saved).
fig, ax = plt.subplots(figsize=(8, 4))  # Adjust the figure size (aspect ratio = 2)

sns.lineplot(
    x="Accepted error",
    y="Spacial gain",
    hue="kind",
    data=res,
    ax=ax
)

# Move the legend inside the plot and set its font sizes in the same call:
# calling ax.legend() afterwards would rebuild the legend and undo the move.
sns.move_legend(ax, "lower right", bbox_to_anchor=(1, 0), fontsize=16, title_fontsize=16)


# Manually set specific font sizes (optional)
ax.set_title("Spacial Gain vs Accepted Error", fontsize=16)
ax.set_xlabel("Accepted Error", fontsize=16)
ax.set_ylabel("Spacial Gain", fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=16)

plt.xlim(0, 10)
plt.ylim(0, 30)

## Bound analysis

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def plotBounds(df, title=None):
    """Violin-plot lower and upper bounds per iteration with paper-sized fonts.

    This redefinition replaces the plotBounds defined earlier in the notebook:
    same data preparation (running maximum of the lower bounds, every series
    padded with its last value up to the longest iteration count), drawn in a
    new 10x6 figure with larger labels and a tight layout.
    """
    horizon = df.IterationCount.max()

    def extend(values, count):
        # Repeat the final value until the series spans `horizon` iterations.
        return values + (horizon - count) * values[-1:]

    work = df.copy()
    work.LowerBounds = work.LowerBounds.apply(
        lambda values: [max(values[:k + 1]) for k in range(len(values))]
    )
    for column in ("LowerBounds", "UpperBounds"):
        work[column] = work.apply(lambda row: extend(row[column], row.IterationCount), axis=1)

    plt.figure(figsize=(10, 6))  # Set figure size
    for column in ("LowerBounds", "UpperBounds"):
        _ = plt.violinplot(pd.DataFrame(work[column].to_list(), columns=range(horizon)))
    _ = plt.xlabel("Iteration", fontsize=14)
    _ = plt.ylabel("Lower/Upper bounds ratio to optimal", fontsize=16)
    heading = "Lower and Upper bounds progress over iterations for all cases" if title is None else title
    _ = plt.title(heading, fontsize=16)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.tight_layout()  # Adjust layout to fit everything


In [None]:
df = paper_df.copy()
for kind in ALL_KIND:
    plotBounds(df[df["kind"] == kind], title = f"Lower and Upper bounds for {kind}") 
    plt.savefig(f"figures/bounds_{kind}.pdf", bbox_inches='tight')

## Iteration Count 

Edge density has no impact on the iteration count.

In [None]:
df = paper_df.copy()

# Create the figure and axis
fig, ax = plt.subplots(figsize=(10, 5))  # Adjust the figure size (aspect=2)

# Plot using histplot
sns.histplot(
    data=df,
    x="IterationCount",
    hue="kind",
    multiple="dodge",
    ax=ax,
    legend=True  # Keep legend enabled
)

# Customize labels and title
ax.set_title("Distribution of Iteration Count by Kind", fontsize=16)
ax.set_xlabel("Iteration Count", fontsize=16)
ax.set_ylabel("Frequency", fontsize=16)
ax.tick_params(axis="both", which="major", labelsize=16)
    
plt.setp(ax.get_legend().get_texts(), fontsize=16) # for legend text
plt.setp(ax.get_legend().get_title(), fontsize=16) # for legend title

plt.savefig(f"figures/iterations_distribution_by_kind.pdf", bbox_inches='tight')

In [None]:

df = paper_df.copy()
# Create the figure and axis
fig, ax = plt.subplots(figsize=(10, 5))  # Adjust the figure size (aspect=2)

# Histogram of the iteration count, one dodged bar series per edge-density class
sns.histplot(
    data=df,
    x="IterationCount",
    hue="edge_density",
    multiple="dodge",
    ax=ax,
    legend=True  # Keep legend enabled
)

# Customize labels and title (the bars are grouped by edge density, not by kind)
ax.set_title("Distribution of Iteration Count by Edge Density", fontsize=16)
ax.set_xlabel("Iteration Count", fontsize=16)
ax.set_ylabel("Frequency", fontsize=16)
ax.tick_params(axis="both", which="major", labelsize=16)
    
plt.setp(ax.get_legend().get_texts(), fontsize=16) # for legend text
plt.setp(ax.get_legend().get_title(), fontsize=16) # for legend title

plt.savefig("figures/iterations_distribution_by_density.pdf", bbox_inches='tight')



## Iterations saved

In [None]:
df = paper_df_withGain.copy()


# Create the figure and axis
fig, ax = plt.subplots(figsize=(10, 5))  # Adjust the figure size

# Plot using histplot
sns.histplot(
    x="Iterations saved",
    data=df,
    log_scale=False,  # No log scale applied
    kde=False,  # Disable KDE
    shrink=0.8,  # Shrink the bars
    hue="kind",  # Group by 'kind'
    multiple="dodge",  # Dodge overlapping bars
    ax=ax  # Use the defined axis
)


# Customize labels and title
ax.set_title("Iterations Saved by Kind", fontsize=16)
ax.set_xlabel("Iterations Saved", fontsize=16)
ax.set_ylabel("Frequency", fontsize=16)
ax.tick_params(axis="both", which="major", labelsize=16)
# Adjust legend font size

plt.setp(ax.get_legend().get_texts(), fontsize=16) # for legend text
plt.setp(ax.get_legend().get_title(), fontsize=16) # for legend title

# Save the figure
plt.savefig(f"figures/saved_iterations.pdf", bbox_inches='tight')