# Analysis of output files
## Prepare environment, functions etc.

In [1]:
import os
import sys

# Make the modules stored in the local ``src`` directory importable from
# this notebook by registering its absolute path once on ``sys.path``.
module_path = os.path.abspath('src')
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas
import matplotlib
import numpy
from helpers import get_analysis_files, getFiles, FilePair
from myDataClasses import AnalysisFile

# Root folder of the analysis results; the cells below read each strategy's
# files from a sub-folder of it (e.g. ``{path}/DP``, ``{path}/FPTAS``).
path = 'analysisOutput/results'

In [3]:
#pandas.set_option('display.max_rows', None)
#pandas.read_csv?

In [4]:
# Important functions

def remove_bag_results(table):
    """Return only the columns of ``table`` that precede the ``"|"`` marker.

    The marker is looked up in the first row of ``table``; the column that
    holds it and every column to its right are left out of the result.

    Raises:
        ValueError: If the first row does not contain ``"|"``.
    """
    first_row = list(table.iloc[0])
    marker_position = first_row.index("|")
    return table.iloc[:, :marker_position]

def get_cols_list(path: str):
    """Read a list of column names from the description file at ``path``.

    The file is parsed as a space-delimited table without a header row.
    Everything from the ``"|"`` marker column onwards is discarded and the
    remaining values of the first row are returned as a plain list.
    """
    description = pandas.read_csv(path, sep=" ", header=None, index_col=None)
    kept_columns = remove_bag_results(description)
    return list(kept_columns.iloc[0])

def load_analysis_files(folder_path: str, column_list):
    """Load every analysis file found for ``folder_path`` into one table.

    Each file is read as a space-delimited table without a header row, cut
    off at the ``"|"`` marker column and labelled with ``column_list``. Two
    extra columns, ``dataset`` and ``strategy``, are filled from the
    attributes of the file object returned by ``get_analysis_files``.

    Args:
        folder_path: Folder handed to ``get_analysis_files``.
        column_list: Column names for the part of each file left of the
            ``"|"`` marker. Must include ``id`` and ``item_count`` because
            both become index levels of the result.

    Returns:
        A DataFrame indexed by ``(strategy, dataset, id, item_count)`` that
        holds the rows of all files.

    Raises:
        ValueError: If ``get_analysis_files`` yields no files.
    """
    tables = []

    for file in get_analysis_files(folder_path):
        curr_table = pandas.read_csv(file.full_path, index_col=None, delimiter=" ", header=None)
        # Copy the slice so the column assignments below act on an
        # independent frame instead of a view of the raw table.
        curr_table = remove_bag_results(curr_table).copy()
        curr_table.columns = column_list
        curr_table["dataset"] = file.dataset
        curr_table["strategy"] = file.strategy
        tables.append(curr_table)

    if not tables:
        raise ValueError(f"No analysis files found in {folder_path!r}")

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated table on every iteration.
    output_table = pandas.concat(tables, ignore_index=True)
    return output_table.set_index(['strategy', 'dataset', 'id', "item_count"])

def construct_table_from(filePair: FilePair):
    """Build a per-instance info table from a solution/data file pair.

    Both files are read as space-delimited tables without a header row.

    * Solution file: rows are de-duplicated on column 0 (first occurrence
      wins); column 0 becomes ``id`` and column 2 becomes ``best_value``.
    * Data file: the value in row 0, column 1 is used as ``item_count`` for
      every row, and ``max_cost`` is the row-wise maximum over columns
      4, 6, 8, ... (presumably the item costs - confirm against the data
      file format).

    Returns:
        A DataFrame with the columns ``id``, ``best_value``, ``max_cost``
        and ``item_count``.
    """
    read_options = dict(header=None, index_col=None, delimiter=" ")
    solutions = pandas.read_csv(filePair.solutionFile, **read_options)
    instances = pandas.read_csv(filePair.dataFile, **read_options)

    # Keep one solution row per id and renumber the rows from zero so they
    # line up positionally with the rows of the data file.
    solutions = solutions.drop_duplicates(subset=[0], keep='first').reset_index(drop=True)

    # Every second column starting at column 4, reduced to its row maximum.
    highest_cost = instances.iloc[:, 4::2].max(axis=1)

    info_table = pandas.concat(
        [solutions.iloc[:, 0], solutions.iloc[:, 2], highest_cost],
        axis=1,
    )
    info_table.columns = ["id", "best_value", "max_cost"]
    info_table["item_count"] = instances.iloc[0, 1]
    return info_table

def get_info_from_datafiles(path: str):
    """Collect the info tables of all file pairs found under ``path``.

    The last component of ``path`` is used as the dataset name. One info
    table is built per file pair returned by ``getFiles`` and all of them are
    stacked into a single table.

    Args:
        path: ``/``-separated folder path; a trailing ``/`` is ignored when
            the dataset name is derived.

    Returns:
        A DataFrame indexed by ``(dataset, item_count, id)``, or ``None``
        when ``getFiles`` yields no file pairs.
    """
    # Strip a trailing slash first, otherwise the dataset name would be "".
    dataset = path.rstrip("/").split("/")[-1]

    tables = []
    for filePair in getFiles(path):
        curr_table = construct_table_from(filePair)
        curr_table["dataset"] = dataset
        tables.append(curr_table.set_index(["dataset", "item_count", "id"]))

    if not tables:
        return None

    # DataFrame.append was removed in pandas 2.0; concat is its replacement.
    return pandas.concat(tables)

## Put data from all analysis files into tables

In [5]:
# Read the column names of every strategy from its description file,
# located at <path>/<strategy>/column_description_<strategy>.dat
dp_cols, dpweight_cols, greedy_cols, greedyone_cols, fptas_cols = [
    get_cols_list(f'{path}/{strategy}/column_description_{strategy}.dat')
    for strategy in ('DP', 'DPWeight', 'Greedy', 'GreedyOne', 'FPTAS')
]

In [6]:
# Load the result table of every strategy from its own sub-folder, labelled
# with the column names read for that strategy
dp_table, dpweight_table, greedy_table, greedyone_table, fptas_table = [
    load_analysis_files(f'{path}/{strategy}', columns)
    for strategy, columns in (
        ('DP', dp_cols),
        ('DPWeight', dpweight_cols),
        ('Greedy', greedy_cols),
        ('GreedyOne', greedyone_cols),
        ('FPTAS', fptas_cols),
    )
]

## Get average time values for all strategies

In [7]:
# Stack the rows of all strategies into one table. The ``relative_error``
# column is dropped from the FPTAS table before stacking.
# (pandas.concat replaces DataFrame.append, which was removed in pandas 2.0.)
avg_times = pandas.concat([
    fptas_table.drop(["relative_error"], axis=1),
    dp_table,
    dpweight_table,
    greedy_table,
    greedyone_table,
])

# Average the time per (strategy, item_count), round to 4 decimals and move
# the strategies into separate columns (one row per item_count).
avg_times = (
    avg_times.groupby(["strategy", "item_count"])["time"]
    .mean()
    .round(4)
    .unstack("strategy")
)

# Save the dataframe to csv
avg_times.to_csv('analysisOutput/avg_times.csv', header=True)

avg_times

strategy,DP,DPWeight,FPTAS,Greedy,GreedyOne
item_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,93.6982,1.4304,0.8519,0.0105,0.0079
10,456.8244,7.7131,10.227,0.016,0.0099
15,1106.8275,21.3662,32.6015,0.0204,0.0119
20,1950.3325,37.899,83.623,0.0269,0.0133
22,1876.8124,51.397,116.6782,0.0317,0.0171
25,3221.8843,61.1677,175.6165,0.0305,0.0163
27,2785.1095,72.0347,222.2353,0.0372,0.0168
30,3349.9526,85.181,292.5677,0.037,0.0186
32,4461.4599,103.9543,331.5438,0.0387,0.0179
35,7109.8726,119.1089,400.9741,0.044,0.0194


## FPTAS error/item_count analysis

In [8]:
# Create info_table which contains best values and highest cost for all
# instances of the NK, ZKC and ZKW datasets
nk_info = get_info_from_datafiles("data/NK")
zkc_info = get_info_from_datafiles("data/ZKC")
zkw_info = get_info_from_datafiles("data/ZKW")

# pandas.concat replaces DataFrame.append, which was removed in pandas 2.0
info_table = pandas.concat([nk_info, zkc_info, zkw_info])

In [9]:
# Build the FPTAS analysis table: drop the strategy label, re-index the FPTAS
# results by (dataset, item_count, relative_error, id), attach the matching
# rows of info_table and rename the value found by FPTAS.
fptas_analysis_table = (
    fptas_table.reset_index()
    .drop(columns="strategy")
    .set_index(["dataset", "item_count", "relative_error", "id"])
    .join(info_table)
    .rename(columns={'maximum_sum': 'best_found_value'})
)

# Keep only the columns needed for the analysis, in this order
fptas_analysis_table = fptas_analysis_table[["max_cost", "best_value", "best_found_value", "time"]]

In [10]:
# Values shared by both formulas below: two index levels and the cost column
_relative_error = fptas_analysis_table.index.get_level_values("relative_error")
_item_count = fptas_analysis_table.index.get_level_values("item_count")
_max_cost = fptas_analysis_table["max_cost"]

# ommited_bits = log2(relative_error * max_cost / item_count)
fptas_analysis_table["ommited_bits"] = numpy.log2((_relative_error * _max_cost) / _item_count)

# measured_relative_error = item_count * 2**floor(ommited_bits) / max_cost
fptas_analysis_table["measured_relative_error"] = (
    _item_count * numpy.power(2, numpy.floor(fptas_analysis_table["ommited_bits"]))
) / _max_cost

In [11]:
# Lift the row limit so the filtered table below is displayed in full
pandas.options.display.max_rows = None

# Show the rows for 4-item instances with a requested relative error of 0.1
fptas_analysis_table.query("item_count == 4 and relative_error == 0.1")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,max_cost,best_value,best_found_value,time,ommited_bits,measured_relative_error
dataset,item_count,id,relative_error,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
NK,4,1,0.1,2084,1129,1129,0.871974,5.703211,0.06142
NK,4,2,0.1,1503,2966,2966,1.191389,5.231701,0.085163
NK,4,3,0.1,2253,0,0,1.205255,5.815704,0.056813
NK,4,4,0.1,1826,70,70,1.166392,5.512543,0.070099
NK,4,5,0.1,1755,2758,2758,1.255445,5.455327,0.072934
NK,4,6,0.1,1725,1641,1641,1.403681,5.430453,0.074203
NK,4,7,0.1,1676,4607,4607,1.6726,5.388878,0.076372
NK,4,8,0.1,2341,4422,4422,1.460405,5.870981,0.054677
NK,4,9,0.1,1635,3451,3451,1.241336,5.353147,0.078287
NK,4,10,0.1,2388,744,744,1.422481,5.899659,0.053601


In [39]:
# Group the analysis table by requested setting (item_count, relative_error)
_by_setting = fptas_analysis_table.groupby(["item_count", "relative_error"])
error_group = _by_setting["measured_relative_error"]
time_group = _by_setting["time"]

# Maximum and mean of measured_relative_error per group, each as a
# one-column table with a descriptive column name
error_max = error_group.max().to_frame('measured_relative_error_max')
error_avg = error_group.mean().to_frame('measured_relative_error_avg')

# Total time per group, scaled down by a factor of 1000
# (presumably a unit conversion - confirm the unit of the "time" column)
time_sum = (time_group.sum() / 1000).to_frame()

error_max.join(error_avg).join(time_sum)

Unnamed: 0_level_0,Unnamed: 1_level_0,measured_relative_error_max,measured_relative_error_avg,time
item_count,relative_error,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,0.1,0.1,0.06701,2.977725
4,0.2,0.2,0.13402,2.030097
4,0.3,0.299941,0.222117,1.405725
4,0.4,0.4,0.26804,1.090785
4,0.5,0.5,0.378338,0.818478
10,0.1,0.099984,0.070702,25.947069
10,0.2,0.199969,0.141404,15.736153
10,0.3,0.299766,0.244475,11.234011
10,0.4,0.399938,0.282808,7.73513
10,0.5,0.5,0.29869,5.618324


In [33]:
# Relative deviation of the value found by FPTAS from the reference value:
# |best_value - best_found_value| / best_value.
# When both values are 0 the division yields NaN, which is counted as zero
# error. Only this column is filled, so missing values in any other column
# are no longer silently replaced by 0.
fptas_analysis_table["test"] = (
    numpy.abs(fptas_analysis_table["best_value"] - fptas_analysis_table["best_found_value"])
    / fptas_analysis_table["best_value"]
).fillna(0.0)

# Row counts per error category
one = fptas_analysis_table.query('test != 1.0 and test != 0.0').shape[0]    # 0 < e < 1
two = fptas_analysis_table.query('test == 0.0').shape[0]                    # e == 0
three = fptas_analysis_table.query('test == 1.0').shape[0]                  # e == 1
# Rows whose deviation exceeds the requested relative error (e == 1 excluded)
four = fptas_analysis_table.query('relative_error < test and test != 1').shape[0]

# Heading (Czech): "Frequencies of the measured relative errors"
print("Četnosti naměřených relativních chyb:\n")
print(f'(e == 0): {two}\n(e == 1): {three}\n(0 < e < 1): {one}')
print(f'(e_wanted < e_measured and e_measured != 1): {four}')


Četnosti naměřených relativních chyb:

(e == 0): 47820
(e == 1): 21
(0 < e < 1): 21339
(e_wanted < e_measured and e_measured != 1): 0
