# Analysis of output files
## Prepare environment, functions etc.

In [1]:
import os
import sys

# Add local src directory to the path. Then we are able to import our files.
# module_path = os.path.abspath(os.path.join('algorithm_tester'))
# if module_path not in sys.path:
#    sys.path.append(module_path)

In [2]:
import pandas
# import matplotlib
import numpy
from algorithm_tester.helpers import get_analysis_files, getFiles, FilePair
from algorithm_tester.mydataclasses import AnalysisFile

# Enable inline plot display (the percent sign introduces an IPython "magic" shortcut):
# %matplotlib inline

# Root folder of the tester output; the cells below read the column description
# and every experiment sub-folder relative to it.
path = 'tester_results'

In [3]:
#pandas.set_option('display.max_rows', None)
#pandas.read_csv?

In [4]:
# Important functions

def remove_bag_results(table):
    """Keep only the columns that precede the ``"|"`` marker of the first row.

    The position of the first ``"|"`` value found in row 0 decides where the
    table is cut; that column and everything to its right are dropped.

    Args:
        table: DataFrame whose first row contains a ``"|"`` cell.

    Returns:
        The positional column slice ``[0, marker)`` of ``table``.

    Raises:
        ValueError: If the first row holds no ``"|"`` value.
    """
    first_row = list(table.iloc[0])
    marker_position = first_row.index("|")
    return table.iloc[:, :marker_position]

def get_cols_list(path: str):
    """Return the column names stored in the first row of a description file.

    The file is parsed as space-delimited text without a header row, trimmed
    with ``remove_bag_results`` (everything from the ``"|"`` marker on is
    dropped) and the first remaining row is returned.

    Args:
        path: Location of the column description file.

    Returns:
        The values of the first row as a plain list.
    """
    description = pandas.read_csv(path, index_col=None, delimiter=" ", header=None)
    kept_part = remove_bag_results(description)
    first_row = kept_part.iloc[0]
    return list(first_row)

def load_analysis_files(folder_path: str, column_list):
    """Load every analysis file of a folder into one indexed, sorted table.

    Each file reported by ``get_analysis_files`` is read as a space-delimited,
    header-less table, cut at the ``"|"`` marker by ``remove_bag_results``,
    labelled with ``column_list`` and extended with the ``instance_info`` and
    ``dataset`` attributes of its file descriptor.

    Args:
        folder_path: Folder handed to ``get_analysis_files``.
        column_list: Column names assigned to every loaded table. It has to
            contain ``strategy``, ``id`` and ``item_count`` because those
            columns become part of the index.

    Returns:
        A DataFrame indexed by ``(strategy, dataset, id, item_count)`` and
        sorted by strategy, dataset, item_count and id.

    Raises:
        ValueError: If ``get_analysis_files`` yields no file for the folder.
    """
    tables = []

    for file in get_analysis_files(folder_path):
        raw_table = pandas.read_csv(file.full_path, index_col=None, delimiter=" ", header=None)
        # Copy the slice so the assignments below modify an independent frame
        # rather than a view of ``raw_table`` (avoids SettingWithCopyWarning).
        curr_table = remove_bag_results(raw_table).copy()
        curr_table.columns = column_list
        curr_table["instance_info"] = file.instance_info
        curr_table["dataset"] = file.dataset
        tables.append(curr_table)

    if not tables:
        raise ValueError(f"No analysis files found in '{folder_path}'")

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated table once per file.
    output_table = pandas.concat(tables, ignore_index=True)
    output_table = output_table.set_index(['strategy', 'dataset', 'id', "item_count"])
    return output_table.sort_values(by=["strategy", "dataset", "item_count", "id"])

def construct_table_from(filePair: FilePair):
    """Build a per-instance summary table from one solution/data file pair.

    Both files are read as space-delimited text without a header row.

    Args:
        filePair: Object exposing ``solutionFile`` and ``dataFile``; both are
            handed to ``pandas.read_csv``.

    Returns:
        A DataFrame with the columns ``id``, ``best_value``, ``max_cost`` and
        ``item_count``.
    """
    csv_options = {"header": None, "index_col": None, "delimiter": " "}
    solutions = pandas.read_csv(filePair.solutionFile, **csv_options)
    instances = pandas.read_csv(filePair.dataFile, **csv_options)

    # The value in row 0, column 1 of the data file is applied to every row.
    # NOTE(review): presumably all instances of one file share the item count - confirm.
    item_count = instances.iloc[0, 1]

    # Keep the first solution per value of column 0. reset_index() inserts the
    # old index as a new leading column, so the original columns 0 and 2 sit
    # at positions 1 and 3 afterwards.
    unique_solutions = solutions.drop_duplicates(subset=[0], keep='first').reset_index()
    ids = unique_solutions.iloc[:, 1]
    best_values = unique_solutions.iloc[:, 3]

    # Row-wise maximum over every second column from position 4 on
    # (presumably the item costs - verify against the data file format).
    every_second_column = instances.iloc[:, 4:].iloc[:, ::2]
    max_costs = every_second_column.max(axis=1)

    info_table = pandas.concat([ids, best_values, max_costs], axis=1)
    info_table.columns = ["id", "best_value", "max_cost"]
    info_table["item_count"] = item_count
    return info_table

def get_info_from_datafiles(path: str):
    """Collect the summary tables of all file pairs found under ``path``.

    Every pair returned by ``getFiles`` is turned into a table by
    ``construct_table_from``, tagged with the dataset name and indexed by
    ``(dataset, item_count, id)``.

    Args:
        path: Dataset folder; its last path component is used as the dataset
            name.

    Returns:
        The concatenation of all per-pair tables, or ``None`` when
        ``getFiles`` yields no pair.
    """
    # basename of the normalised path still yields the folder name when the
    # path ends with a separator (a plain split("/")[-1] would give "").
    dataset = os.path.basename(os.path.normpath(path))

    tables = []
    for filePair in getFiles(path):
        curr_table = construct_table_from(filePair)
        curr_table["dataset"] = dataset
        tables.append(curr_table.set_index(["dataset", "item_count", "id"]))

    if not tables:
        return None

    # DataFrame.append was removed in pandas 2.0; concat keeps the row order
    # and the per-table index.
    return pandas.concat(tables)

def create_avg_time(table, name: str, column: str = "item_count"):
    """Average ``time[#configs]`` per strategy and ``column``, exported to Excel.

    Args:
        table: Result table in which ``strategy`` and ``column`` are available
            as columns or index levels and which has a ``time[#configs]``
            column.
        name: Prefix of the output file and name of the Excel sheet.
        column: Grouping key that becomes the row index of the result.

    Returns:
        A DataFrame with one row per ``column`` value and one column per
        strategy, holding the mean of ``time[#configs]`` rounded to two
        decimals. The same table is written to
        ``excel/{name}_avg_times.xlsx``.
    """
    # Mean time per (strategy, column) pair
    mean_times = table.groupby(["strategy", column])['time[#configs]'].mean().round(2)

    # Move all values of the strategy level into separate columns
    avg_times = mean_times.unstack("strategy")
    avg_times.name = f"Avg #configs per {column}"

    # Save the table as an Excel workbook. The target folder is created first
    # because to_excel does not create missing directories.
    os.makedirs("excel", exist_ok=True)
    avg_times.to_excel(f'excel/{name}_avg_times.xlsx', sheet_name=name)

    return avg_times

def create_avg_error(table, name: str, column: str, exact_strategy_name: str = "DP",
                     approx_strategy_name: str = "Greedy"):
    """Maximum and average relative error of one strategy against an exact one.

    Rows of ``approx_strategy_name`` are matched with rows of
    ``exact_strategy_name`` on ``id`` and ``item_count`` (plus ``column`` when
    it is a regular column rather than an index level). The relative error is
    ``|maximum_sum_exact - maximum_sum_approx| / maximum_sum_exact``.

    Args:
        table: Result table with a ``strategy`` index level, ``id`` and
            ``item_count`` available as index levels or columns, and the
            columns ``maximum_sum`` and ``time[#configs]``.
        name: Prefix of the output file and name of the Excel sheet.
        column: Grouping key that becomes the row index of the result.
        exact_strategy_name: Strategy whose ``maximum_sum`` is the reference.
        approx_strategy_name: Strategy whose ``maximum_sum`` is evaluated.

    Returns:
        A DataFrame indexed by ``column`` with the columns
        ``max_relative_error`` and ``avg_relative_error`` rounded to six
        decimals. The same table is written to
        ``excel/{name}_avg_error.xlsx``.
    """
    strategies = table.index.get_level_values('strategy')
    found_table = table.loc[strategies == approx_strategy_name] \
        .rename(columns={'maximum_sum': 'found_sum'})
    exact_table = table.loc[strategies == exact_strategy_name] \
        .drop(columns="time[#configs]")

    # A grouping key that is a plain column has to be carried along and used
    # as an extra merge key; an index level is already available after the merge.
    merge_keys = ['id', 'item_count']
    kept_columns = ["found_sum"]
    if column not in found_table.index.names:
        kept_columns.append(column)
        merge_keys.append(column)

    merged = pandas.merge(found_table.loc[:, kept_columns], exact_table, how="left", on=merge_keys)
    merged["relative_error"] = \
        numpy.abs(merged["maximum_sum"] - merged["found_sum"]) / merged["maximum_sum"]

    # Create a table with max and average relative_error.
    error_group = merged.groupby([column])["relative_error"]
    avg_error = pandas.DataFrame({
        "max_relative_error": error_group.max(),
        "avg_relative_error": error_group.mean(),
    }).round(6)
    avg_error.name = f"Avg & max relative error per {column}"

    # to_excel does not create missing directories, so make sure the target exists.
    os.makedirs("excel", exist_ok=True)
    avg_error.to_excel(f"excel/{name}_avg_error.xlsx", sheet_name=name)

    return avg_error

## Put data from all analysis files into tables

In [5]:
# Read the column names used for every analysis table from the description file

cols = get_cols_list(f'{path}/column_description.dat')

In [6]:
# Load one result table per experiment folder. Where the per-file
# instance_info value is needed later, the column is renamed to the name that
# the matching analysis cell groups by.
balance_table = load_analysis_files(f'{path}/Balance', cols) \
    .rename(columns={'instance_info':'balance'})
correlation_table = load_analysis_files(f'{path}/Correlation', cols) \
    .rename(columns={'instance_info':'correlation'})
granularity_heavy_table = load_analysis_files(f'{path}/GranularityHeavy', cols) \
    .rename(columns={'instance_info':'constant'})
granularity_light_table = load_analysis_files(f'{path}/GranularityLight', cols) \
    .rename(columns={'instance_info':'constant'})
maxcost_table = load_analysis_files(f'{path}/MaxCost', cols) \
    .rename(columns={'instance_info':'maxcost'})
maxweight_table = load_analysis_files(f'{path}/MaxWeight', cols) \
    .rename(columns={'instance_info':'maxweight'})
# For the Robust and Things experiments the instance_info column is dropped
# instead of renamed.
robust_table = load_analysis_files(f'{path}/Robust', cols) \
    .drop(columns="instance_info")
things_table = load_analysis_files(f'{path}/Things', cols) \
    .drop(columns="instance_info")
# NOTE(review): the folder is spelled 'WeightCapRation' - confirm it matches the directory on disk.
weight_cap_ratio_table = load_analysis_files(f'{path}/WeightCapRation', cols) \
    .rename(columns={'instance_info':'ratio'})

## Things analysis

In [7]:
# Export the average time[#configs] per strategy and the max/average relative
# error of Greedy against DP, both grouped by item_count.
create_avg_time(things_table, "things")
create_avg_error(things_table, column = "item_count", name = "things")

Unnamed: 0_level_0,max_relative_error,avg_relative_error
item_count,Unnamed: 1_level_1,Unnamed: 2_level_1
5,0.197938,0.009992
10,0.058071,0.007162
15,0.053123,0.002479
20,0.029691,0.00215
25,0.013593,0.001485
30,0.01412,0.001421


## Robust analysis

In [8]:
# Keep one row per distinct (strategy, time[#configs]) pair, then export the
# 'BB' rows and all remaining rows to separate workbooks.
# NOTE(review): the file names suggest BB is the only strategy treated as not robust - confirm.
robust_analysis = robust_table.reset_index().drop_duplicates(subset=["strategy", "time[#configs]"])
robust_analysis.query("strategy == 'BB'").to_excel("excel/not_robust.xlsx")
robust_analysis.query("strategy != 'BB'").to_excel("excel/is_robust.xlsx")

## MaxCost analysis

In [9]:
# Export the average time[#configs] per strategy and the max/average relative
# error of Greedy against DP, both grouped by the maxcost column.
create_avg_time(maxcost_table, "maxcost", column = "maxcost")
create_avg_error(maxcost_table, column = "maxcost", name = "maxcost")

Unnamed: 0_level_0,max_relative_error,avg_relative_error
maxcost,Unnamed: 1_level_1,Unnamed: 2_level_1
100,0.052326,0.002438
200,0.053246,0.0025
300,0.05358,0.002513
400,0.053708,0.002533
500,0.053248,0.002512


## MaxWeight analysis

In [10]:
# Export the average time[#configs] per strategy and the max/average relative
# error of Greedy against DP, both grouped by the maxweight column.
create_avg_time(maxweight_table, "maxweight", column = "maxweight")
create_avg_error(maxweight_table, column = "maxweight", name = "maxweight")

Unnamed: 0_level_0,max_relative_error,avg_relative_error
maxweight,Unnamed: 1_level_1,Unnamed: 2_level_1
100,0.053123,0.002334
200,0.053123,0.002102
300,0.053123,0.002166
400,0.053123,0.002095
500,0.053123,0.002166


## WeightCapRatio analysis

In [11]:
# Export the average time[#configs] per strategy and the max/average relative
# error of Greedy against DP, both grouped by the ratio column.
create_avg_time(weight_cap_ratio_table, "weight_cap_ratio", column = "ratio")
create_avg_error(weight_cap_ratio_table, column = "ratio", name = "weight_cap_ratio")

Unnamed: 0_level_0,max_relative_error,avg_relative_error
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.235931,0.030398
2,0.109572,0.018047
3,0.115385,0.016484
4,0.091994,0.016049
5,0.080266,0.012525
6,0.075496,0.008059
7,0.051835,0.005326
8,0.053123,0.002479
9,0.016807,0.000398
10,0.0,0.0


## Correlation analysis

In [12]:
# Export the average time[#configs] per strategy and the max/average relative
# error of Greedy against DP, both grouped by the correlation column.
create_avg_time(correlation_table, "correlation", column = "correlation")
create_avg_error(correlation_table, column = "correlation", name = "correlation")

Unnamed: 0_level_0,max_relative_error,avg_relative_error
correlation,Unnamed: 1_level_1,Unnamed: 2_level_1
Corr,0.060227,0.011222
Strong,0.145148,0.047499
Uni,0.053123,0.002479


## Balance analysis

In [13]:
# Export the average time[#configs] per strategy and the max/average relative
# error of Greedy against DP, both grouped by the balance column.
create_avg_time(balance_table, "balance", column = "balance")
create_avg_error(balance_table, column = "balance", name = "balance")

Unnamed: 0_level_0,max_relative_error,avg_relative_error
balance,Unnamed: 1_level_1,Unnamed: 2_level_1
Bal,0.053123,0.002479
Heavy,0.057889,0.004116
Light,0.054489,0.0023


## Granularity analysis

In [14]:
# Export the average time[#configs] per strategy and the max/average relative
# error of Greedy against DP for the GranularityHeavy results, both grouped by
# the constant column.
create_avg_time(granularity_heavy_table, "granularity_heavy", column = "constant")
create_avg_error(granularity_heavy_table, column = "constant", name = "granularity_heavy")

Unnamed: 0_level_0,max_relative_error,avg_relative_error
constant,Unnamed: 1_level_1,Unnamed: 2_level_1
5,0.082845,0.004416
10,0.077232,0.008134
15,0.121383,0.007316
20,0.084399,0.007431
25,0.093782,0.004766


In [15]:
# Export the average time[#configs] per strategy and the max/average relative
# error of Greedy against DP for the GranularityLight results, both grouped by
# the constant column.
create_avg_time(granularity_light_table, "granularity_light", column = "constant")
create_avg_error(granularity_light_table, column = "constant", name = "granularity_light")

Unnamed: 0_level_0,max_relative_error,avg_relative_error
constant,Unnamed: 1_level_1,Unnamed: 2_level_1
5,0.043785,0.004306
10,0.022936,0.001757
15,0.045276,0.002317
20,0.068942,0.001865
25,0.068942,0.002481
