# Analysis of output files
## Prepare environment, functions etc.

In [1]:
import os
import sys

# Make the local ``src`` directory importable so the notebook can load the
# helper modules that live there. The directory is registered only once.
module_path = os.path.abspath('src')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import pandas
import matplotlib
import numpy
from helpers import get_analysis_files, getFiles, FilePair
from myDataClasses import AnalysisFile

# Enable inline display of plots (the percent sign introduces an IPython "magic" command):
%matplotlib inline

# Root folder of the analysis results; the cells below read one sub-folder per strategy from it.
path = 'analysisOutput/results'

In [3]:
#pandas.set_option('display.max_rows', None)
#pandas.read_csv?

In [4]:
# Important functions

def remove_bag_results(table):
    """Return only the columns of *table* that precede the ``"|"`` marker.

    The marker is searched for in the first row of the table; the column
    holding it and every column after it are cut off.

    Raises:
        ValueError: If the first row contains no ``"|"`` cell.
    """
    first_row = table.iloc[0]
    marker_position = list(first_row).index("|")
    return table.iloc[:, :marker_position]

def get_cols_list(path: str):
    """Read a space-delimited column description file and return its column names.

    The names are taken from the first row of the file, limited to the cells
    that come before the ``"|"`` marker (see ``remove_bag_results``).
    """
    description = pandas.read_csv(path, sep=" ", header=None, index_col=None)
    return list(remove_bag_results(description).iloc[0])

def load_analysis_files(folder_path: str, column_list):
    """Load every analysis file of one strategy folder into a single table.

    Each file returned by ``get_analysis_files(folder_path)`` is read as a
    space-delimited table without a header, trimmed to the columns before the
    ``"|"`` marker, labelled with ``column_list`` and tagged with the file's
    ``dataset`` and ``strategy`` attributes.

    Args:
        folder_path: Folder handed to ``get_analysis_files``.
        column_list: Column names for the trimmed table; its length must match
            the number of columns left after trimming.

    Returns:
        A DataFrame indexed by ``(strategy, dataset, id, item_count)``.

    Raises:
        ValueError: If no analysis file is found in ``folder_path``.
    """
    tables = []

    for file in get_analysis_files(folder_path):
        raw_table = pandas.read_csv(file.full_path, index_col=None, delimiter=" ", header=None)
        # Copy the slice so that relabelling and adding columns never writes
        # into a view of the raw table.
        curr_table = remove_bag_results(raw_table).copy()
        curr_table.columns = column_list
        curr_table["dataset"] = file.dataset
        curr_table["strategy"] = file.strategy
        tables.append(curr_table)

    if not tables:
        raise ValueError(f"No analysis files found in {folder_path!r}")

    # One concat instead of repeated DataFrame.append: append was removed in
    # pandas 2.0 and re-copied the accumulated table on every iteration.
    output_table = pandas.concat(tables, ignore_index=True)
    return output_table.set_index(['strategy', 'dataset', 'id', "item_count"])

def construct_table_from(filePair: FilePair):
    """Build a per-instance info table from a solution file and its data file.

    Both files are read as space-delimited tables without a header. The result
    has the columns ``id``, ``best_value``, ``max_cost`` and ``item_count``:

    * ``id`` and ``best_value`` are the first and third column of the solution
      file, keeping only the first row for every distinct value of the first
      column.
    * ``max_cost`` is the row-wise maximum over every second column of the
      data file, starting at the fifth column (presumably the item costs --
      verify against the data format).
    * ``item_count`` is the second cell of the first data row, applied to all
      rows.
    """
    read_options = dict(header=None, index_col=None, delimiter=" ")
    solutions = pandas.read_csv(filePair.solutionFile, **read_options)
    instances = pandas.read_csv(filePair.dataFile, **read_options)

    item_count = instances.iloc[0, 1]

    # Renumber the surviving solution rows so they line up positionally with
    # the rows of the data file when the pieces are concatenated below.
    solutions = solutions.drop_duplicates(subset=[0], keep='first').reset_index(drop=True)

    every_second_column = instances.iloc[:, 4::2]

    pieces = [solutions.iloc[:, 0], solutions.iloc[:, 2], every_second_column.max(axis=1)]
    info_table = pandas.concat(pieces, axis=1)
    info_table.columns = ["id", "best_value", "max_cost"]
    info_table["item_count"] = item_count
    return info_table

def get_info_from_datafiles(path: str):
    """Collect the info tables of every file pair found under *path*.

    The dataset name is the last ``/``-separated component of *path*; trailing
    slashes are ignored so that ``"data/NK/"`` and ``"data/NK"`` both yield
    ``"NK"``.

    Args:
        path: Folder handed to ``getFiles``.

    Returns:
        A DataFrame indexed by ``(dataset, item_count, id)`` that stacks the
        tables produced by ``construct_table_from`` for every file pair, or
        ``None`` when ``getFiles`` yields nothing.
    """
    dataset = path.rstrip("/").split("/")[-1]
    tables = []

    for filePair in getFiles(path):
        curr_table = construct_table_from(filePair)
        curr_table["dataset"] = dataset
        tables.append(curr_table.set_index(["dataset", "item_count", "id"]))

    if not tables:
        return None

    # pandas.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pandas.concat(tables)

## Put data from all analysis files into tables

In [5]:
# Read the column names of every strategy from its column description file.
# The files follow the pattern <path>/<strategy>/column_description_<strategy>.dat.
dp_cols, dpweight_cols, greedy_cols, greedyone_cols, fptas_cols = (
    get_cols_list(f'{path}/{strategy}/column_description_{strategy}.dat')
    for strategy in ('DP', 'DPWeight', 'Greedy', 'GreedyOne', 'FPTAS')
)

In [6]:
# Load the result table of every strategy from its own sub-folder of `path`,
# labelling the columns with the matching column list.
dp_table, dpweight_table, greedy_table, greedyone_table, fptas_table = (
    load_analysis_files(f'{path}/{strategy}', strategy_cols)
    for strategy, strategy_cols in (
        ('DP', dp_cols),
        ('DPWeight', dpweight_cols),
        ('Greedy', greedy_cols),
        ('GreedyOne', greedyone_cols),
        ('FPTAS', fptas_cols),
    )
)

## Get average time values for all strategies

In [7]:
# Stack the rows of all strategy tables into one table. The relative_error
# column of the FPTAS table is dropped before stacking.
# pandas.concat replaces the chained DataFrame.append calls (removed in pandas 2.0).
avg_times = pandas.concat([
    fptas_table.drop(["relative_error"], axis=1),
    dp_table,
    dpweight_table,
    greedy_table,
    greedyone_table,
])

# Create a table of average times according to strategy and item_count columns
avg_times = avg_times.groupby(["strategy", "item_count"])["time"].mean().reset_index().set_index(["strategy", "item_count"])
avg_times = avg_times.round(4)

# Move all values of strategy column into separate columns
avg_times = avg_times.unstack("strategy")
# Drop the outer "time" level so the columns are just the strategy names
avg_times.columns = avg_times.columns.droplevel()

# Save the dataframe to an Excel workbook
avg_times.to_excel('analysisOutput/avg_times.xlsx', header=True)

avg_times

strategy,DP,DPWeight,FPTAS,Greedy,GreedyOne
item_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,93.6982,1.4304,0.8519,0.0105,0.0079
10,456.8244,7.7131,10.227,0.016,0.0099
15,1106.8275,21.3662,32.6015,0.0204,0.0119
20,1950.3325,37.899,83.623,0.0269,0.0133
22,1876.8124,51.397,116.6782,0.0317,0.0171
25,3221.8843,61.1677,175.6165,0.0305,0.0163
27,2785.1095,72.0347,222.2353,0.0372,0.0168
30,3349.9526,85.181,292.5677,0.037,0.0186
32,4461.4599,103.9543,331.5438,0.0387,0.0179
35,7109.8726,119.1089,400.9741,0.044,0.0194


## Relative error/item_count analysis

In [8]:
# Create info_table which contains best values and highest cost for all instances
nk_info = get_info_from_datafiles("data/NK")
zkc_info = get_info_from_datafiles("data/ZKC")
zkw_info = get_info_from_datafiles("data/ZKW")

# pandas.concat replaces the chained DataFrame.append calls (removed in pandas 2.0).
info_table = pandas.concat([nk_info, zkc_info, zkw_info])

### Greedy and GreedyOne

In [9]:
# Re-index both greedy tables like info_table so they can be joined with it
greedy_analysis_table = greedy_table.reset_index().set_index(["dataset", "item_count", "id"])
greedyone_analysis_table = greedyone_table.reset_index().set_index(["dataset", "item_count", "id"])

# Stack both strategies and attach the reference values from info_table.
# pandas.concat replaces DataFrame.append, which was removed in pandas 2.0.
error_analysis = pandas.concat([greedy_analysis_table, greedyone_analysis_table]).join(info_table)
error_analysis.rename(columns={'maximum_sum':'best_found_value'}, inplace=True)
error_analysis.drop(columns="max_cost", inplace=True)

error_analysis = error_analysis.reset_index().set_index(["strategy", "dataset", "item_count", "id"])
error_analysis.sort_values(by=["strategy", "dataset", "item_count", "id"], inplace=True)
# Measured relative error = |best_value - best_found_value| / best_value
error_analysis["measured_relative_error"] = numpy.abs(error_analysis["best_value"] - error_analysis["best_found_value"])/error_analysis["best_value"]
# Missing values (e.g. the 0/0 case when both values are 0) are reported as 0.0
error_analysis.fillna(0.0, inplace=True)
error_analysis

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,best_found_value,time,best_value,measured_relative_error
strategy,dataset,item_count,id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Greedy,NK,4,1,1129,0.012579,1129,0.000000
Greedy,NK,4,2,2966,0.011403,2966,0.000000
Greedy,NK,4,3,0,0.010307,0,0.000000
Greedy,NK,4,4,70,0.013882,70,0.000000
Greedy,NK,4,5,2528,0.011245,2758,0.083394
...,...,...,...,...,...,...,...
GreedyOne,ZKW,40,1729,4725,0.015815,5048,0.063986
GreedyOne,ZKW,40,1846,7956,0.015716,7956,0.000000
GreedyOne,ZKW,40,1935,7609,0.015919,7609,0.000000
GreedyOne,ZKW,40,1987,8712,0.015642,8712,0.000000


In [53]:
# Compare Greedy and GreedyOne: maximum and average measured relative error
# for every (strategy, item_count) pair.
error_group = error_analysis.groupby(["strategy", "item_count"])["measured_relative_error"]

error_max = error_group.max().to_frame('max_measured_relative_error')
error_avg = error_group.mean().to_frame('avg_measured_relative_error')

# Combine both statistics, round them and spread the strategies into columns
greedies_comparison = error_max.join(error_avg)
greedies_comparison = greedies_comparison.round(6)
greedies_comparison = greedies_comparison.unstack("strategy")
# Flatten the (statistic, strategy) column pairs into plain names; this relies
# on exactly two strategies being present, ordered Greedy then GreedyOne.
greedies_comparison.columns = ["Greedy_max_error", "GreedyOne_max_error", "Greedy_avg_error", "GreedyOne_avg_error"]

greedies_comparison.to_excel("analysisOutput/greedies_comparison.xlsx")

greedies_comparison


Unnamed: 0_level_0,Greedy_max_error,GreedyOne_max_error,Greedy_avg_error,GreedyOne_avg_error
item_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,0.99422,0.663519,0.201841,0.17552
10,0.980897,0.8689,0.094145,0.542395
15,0.9119,0.906724,0.057076,0.692689
20,0.870702,0.925672,0.04121,0.789942
22,0.986863,0.934436,0.04063,0.797961
25,0.882412,0.945159,0.033858,0.828923
27,0.869567,0.945211,0.031298,0.842392
30,0.68206,0.943471,0.025137,0.856279
32,0.916667,0.952653,0.024302,0.870382
35,0.839994,0.955039,0.02082,0.881199


### FPTAS

In [36]:
# Build the FPTAS analysis table and enrich it with the data from info_table
fptas_analysis_table = (
    fptas_table.reset_index()
    .drop(columns="strategy")
    .set_index(["dataset", "item_count", "relative_error", "id"])
    .join(info_table)
    .rename(columns={'maximum_sum': 'best_found_value'})
)

# Keep only the columns of interest, in display order
fptas_analysis_table = fptas_analysis_table[["max_cost", "best_value", "best_found_value", "time"]]

# Measured relative error = |best_value - best_found_value| / best_value;
# missing results are reported as 0.0
fptas_analysis_table["measured_relative_error"] = (
    numpy.abs(fptas_analysis_table["best_value"] - fptas_analysis_table["best_found_value"])
    / fptas_analysis_table["best_value"]
)
fptas_analysis_table = fptas_analysis_table.fillna(0.0)

# Rows with a measured error of exactly 100 % are treated as anomalies:
# count them, report the count and leave them out of the analysis.
anomalies = int((fptas_analysis_table["measured_relative_error"] == 1.0).sum())
print(f'anomalies/all_values: {anomalies}/{len(fptas_analysis_table)}')
fptas_analysis_table = fptas_analysis_table[fptas_analysis_table["measured_relative_error"] != 1.0]

fptas_analysis_table

anomalies/all_values: 21/69180


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,max_cost,best_value,best_found_value,time,measured_relative_error
dataset,item_count,id,relative_error,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NK,4,1,0.2,2084,1129,1129,0.880371,0.0
NK,4,1,0.4,2084,1129,1129,0.491997,0.0
NK,4,1,0.3,2084,1129,1129,0.832692,0.0
NK,4,1,0.1,2084,1129,1129,0.871974,0.0
NK,4,1,0.5,2084,1129,1129,0.419170,0.0
...,...,...,...,...,...,...,...,...
ZKW,40,1991,0.2,4660,4660,4660,325.573507,0.0
ZKW,40,1991,0.4,4660,4660,4660,211.777098,0.0
ZKW,40,1991,0.1,4660,4660,4660,650.592674,0.0
ZKW,40,1991,0.3,4660,4660,4660,181.874641,0.0


In [37]:
#fptas_analysis_table["ommited_bits"] = (fptas_analysis_table.index.get_level_values("relative_error") * fptas_analysis_table["max_cost"])/fptas_analysis_table.index.get_level_values("item_count")
#fptas_analysis_table["ommited_bits"] = numpy.log2(fptas_analysis_table["ommited_bits"])

#fptas_analysis_table["actual_max_relative_error"] = (fptas_analysis_table.index.get_level_values("item_count") * numpy.power(2, numpy.floor(fptas_analysis_table["ommited_bits"])))/fptas_analysis_table["max_cost"]

In [38]:
# Create the aggregated table: maximum and average measured relative error and
# average time for every (item_count, relative_error) pair.

# Group once and reuse the grouping for both the error and the time columns
fptas_groups = fptas_analysis_table.groupby(["item_count", "relative_error"])
error_group = fptas_groups["measured_relative_error"]
time_group = fptas_groups["time"]

# to_frame(name) turns each aggregated Series into a one-column table that
# keeps the (item_count, relative_error) index.
error_max = error_group.max().to_frame('max_measured_relative_error')
error_avg = error_group.mean().to_frame('avg_measured_relative_error')
# The time values are averaged as they are; only the column label changes.
time_avg = time_group.mean().to_frame('avg_time[ms]')

fptas_error_time = error_max.join(error_avg).join(time_avg)
fptas_error_time = fptas_error_time.round(6)

fptas_error_time.to_excel("analysisOutput/fptas_error_time.xlsx")

fptas_error_time

Unnamed: 0_level_0,Unnamed: 1_level_0,max_measured_relative_error,avg_measured_relative_error,avg_time[ms]
item_count,relative_error,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,0.1,0.021425,0.000288,1.523912
4,0.2,0.197895,0.001096,1.039187
4,0.3,0.197895,0.001224,0.7192
4,0.4,0.197895,0.003718,0.558511
4,0.5,0.258182,0.006181,0.418535
10,0.1,0.01622,0.000148,20.020886
10,0.2,0.078431,0.00058,12.142093
10,0.3,0.078431,0.001035,8.668218
10,0.4,0.078431,0.001736,5.968464
10,0.5,0.102442,0.002845,4.335126
