# Analysis of output files
## Prepare environment, functions etc.

In [1]:
import os
import sys
from typing import Iterator

# import matplotlib
import numpy
import pandas

from algorithm_tester.helpers import FilePair

# Enable inline plot rendering (the percent sign introduces an IPython "magic" command):
# %matplotlib inline

# Root directory of the tester output; the column description file and the
# per-dataset result folders are looked up beneath it.
path = 'tester_results'

In [2]:
#pandas.set_option('display.max_rows', None)
#pandas.read_csv?

## Important functions

In [7]:
def get_file_paths_from_dir(path: str) -> Iterator[str]:
    """Yield the path of every data file found below ``path``.

    The directory tree is walked recursively. Files whose name contains
    ``"column"`` (for instance a column description file) are skipped.
    Directories and files are visited in sorted order so the sequence of
    yielded paths is reproducible regardless of the file system.

    Args:
        path: Directory to search.

    Yields:
        ``"<directory>/<file name>"`` for each file that is kept.
    """
    for root, dirs, files in os.walk(path):
        # Sorting in place makes os.walk descend into sub-directories in a
        # deterministic order.
        dirs.sort()
        for file in sorted(files):
            if "column" not in file:
                yield f'{root}/{file}'

def get_cols_list(path: str):
    """Return the values of the first row of a space-delimited file as a list.

    The file is parsed without a header row, so its first line is treated as
    data; that line's fields are what gets returned.
    """
    description = pandas.read_csv(path, sep=" ", header=None, index_col=None)
    first_row = description.iloc[0]
    return list(first_row)

def load_data_from_dir(folder_path: str, column_list):
    """Load all data files of a directory into one table.

    Every path produced by ``get_file_paths_from_dir(folder_path)`` is parsed
    as a space-delimited file without a header row, and its columns are
    labelled with ``column_list``. The per-file tables are then stacked.

    Args:
        folder_path: Directory whose files are loaded.
        column_list: Column labels assigned to every loaded file; each file
            must therefore have exactly ``len(column_list)`` columns.

    Returns:
        A DataFrame holding the rows of all files with a fresh 0..n-1 index,
        or ``None`` when the directory yields no file.
    """
    tables = []
    for filepath in get_file_paths_from_dir(folder_path):
        curr_table = pandas.read_csv(filepath, index_col=None, delimiter=" ", header=None)
        curr_table.columns = column_list
        tables.append(curr_table)

    if not tables:
        return None

    # One concat instead of repeated DataFrame.append: append was removed in
    # pandas 2.0 and copied the whole accumulated table on every iteration.
    return pandas.concat(tables, ignore_index=True)

def construct_table_from(filePair: FilePair):
    """Build a per-instance summary table from a solution/data file pair.

    Both files are read as space-delimited tables without a header row.
    The result has the columns ``id``, ``best_value``, ``max_cost`` and
    ``item_count``.
    """
    read_options = dict(header=None, index_col=None, delimiter=" ")
    solutions = pandas.read_csv(filePair.solutionFile, **read_options)
    instances = pandas.read_csv(filePair.dataFile, **read_options)

    # Taken from the second field of the first data row and applied to every
    # row of the result.
    item_count = instances.iloc[0, 1]

    # Keep only the first solution row for each value of the first column.
    solutions = solutions.drop_duplicates(subset=[0], keep='first').reset_index(drop=True)

    # Every second column starting with the fifth one.
    # NOTE(review): presumably the per-item costs -- confirm against the
    # instance file format.
    every_other_column = instances.iloc[:, 4::2]
    row_maximum = every_other_column.max(axis=1)

    # First and third solution columns, aligned by row position with the
    # per-row maximum of the data file.
    info_table = pandas.concat(
        [solutions.iloc[:, 0], solutions.iloc[:, 2], row_maximum],
        axis=1,
    )
    info_table.columns = ["id", "best_value", "max_cost"]
    info_table["item_count"] = item_count
    return info_table

def create_avg_time(table, name: str, column: str = "item_count"):
    """Compute the mean of ``time[#configs]`` per algorithm and ``column``.

    The result has one row per value of ``column`` and one column per
    algorithm, rounded to two decimals. Combinations that do not occur in
    ``table`` are NaN. The table is also written to
    ``excel/<name>_avg_times.xlsx`` (sheet ``name``); the ``excel`` directory
    is created when it does not exist yet.

    Args:
        table: DataFrame providing ``algorithm``, ``column`` and
            ``time[#configs]``.
        name: Prefix of the output file name and name of the sheet.
        column: Grouping key used next to ``algorithm``.

    Returns:
        The averaged DataFrame; a description is stored in its ``name``
        attribute.
    """
    # Average per (algorithm, column), then spread the algorithms into
    # separate columns.
    avg_times = (
        table.groupby(["algorithm", column])['time[#configs]']
        .mean()
        .round(2)
        .unstack("algorithm")
    )
    avg_times.name = f"Avg #configs per {column}"

    # Save the dataframe to an Excel workbook; make sure the target
    # directory exists so the export cannot fail on a fresh checkout.
    os.makedirs("excel", exist_ok=True)
    avg_times.to_excel(f'excel/{name}_avg_times.xlsx', sheet_name=name)

    return avg_times

def create_avg_error(table, name: str, column: str, exact_strategy_name: str = "DP"):
    """Compute the maximum and average relative error of the "Greedy" rows.

    Rows whose ``algorithm`` index level equals ``"Greedy"`` are compared
    with the rows of ``exact_strategy_name``: the tables are joined on
    ``id`` and ``item_count`` (plus ``column`` when it is not an index
    level), and the relative error
    ``|maximum_sum_exact - maximum_sum_greedy| / maximum_sum_exact`` is
    aggregated per value of ``column``. The result is also written to
    ``excel/<name>_avg_error.xlsx`` (sheet ``name``); the ``excel`` directory
    is created when it does not exist yet.

    Args:
        table: DataFrame whose index has an ``algorithm`` level and which
            provides ``id``, ``item_count``, ``maximum_sum`` and
            ``time[#configs]``.
        name: Prefix of the output file name and name of the sheet.
        column: Column or index level the errors are grouped by.
        exact_strategy_name: Algorithm whose ``maximum_sum`` is the reference.

    Returns:
        DataFrame indexed by ``column`` with the columns
        ``max_relative_error`` and ``avg_relative_error`` rounded to six
        decimals; a description is stored in its ``name`` attribute.
    """
    algorithms = table.index.get_level_values('algorithm')
    greedy_table = table.iloc[algorithms == "Greedy"] \
        .rename(columns={'maximum_sum': 'found_sum'})
    exact_table = table.iloc[algorithms == exact_strategy_name] \
        .drop(columns="time[#configs]")

    # When `column` is a regular column it has to be kept and used as an
    # additional join key; as an index level it is not part of the join.
    if column not in greedy_table.index.names:
        kept_columns = ["found_sum", column]
        merge_keys = ['id', 'item_count', column]
    else:
        kept_columns = ["found_sum"]
        merge_keys = ['id', 'item_count']
    greedy_table = pandas.merge(
        greedy_table.loc[:, kept_columns], exact_table, how="left", on=merge_keys
    )

    # NOTE(review): a reference value of 0 yields inf/NaN here, and Greedy
    # rows without an exact counterpart yield NaN (left join).
    greedy_table["relative_error"] = \
        numpy.abs(greedy_table["maximum_sum"] - greedy_table["found_sum"]) / greedy_table["maximum_sum"]

    # Create a table with max and average relative_error.
    error_group = greedy_table.groupby([column])["relative_error"]
    error_max = error_group.max().to_frame("max_relative_error")
    error_avg = error_group.mean().to_frame("avg_relative_error")

    avg_error = error_max.join(error_avg).round(6)
    avg_error.name = f"Avg & max relative error per {column}"

    # Make sure the target directory exists so the export cannot fail on a
    # fresh checkout.
    os.makedirs("excel", exist_ok=True)
    avg_error.to_excel(f"excel/{name}_avg_error.xlsx", sheet_name=name)

    return avg_error

## Put data from all analysis files into tables

In [4]:
# Read the column names that are applied to every loaded result table from
# the column description file.

cols = get_cols_list(f'{path}/column_description.dat')

In [9]:
# Load the result table of each dataset folder, discarding the "things" column.
#balance_table = load_data_from_dir(f'{path}/Balance', cols) \
#    .rename(columns={'instance_info':'balance'})
#robust_table = load_data_from_dir(f'{path}/Robust', cols) \
#    .drop(columns="instance_info")

nk_table, zkc_table, zkw_table = (
    load_data_from_dir(f'{path}/{dataset}', cols).drop(columns="things")
    for dataset in ("NK", "ZKC", "ZKW")
)
# Bare expression so the notebook renders the table as the cell output.
nk_table

Unnamed: 0,id,item_count,algorithm_name,init_temperature,cooling,min_temperature,cycles,found_value,elapsed_configs,elapsed_time
0,1,40,SA,1000.0,0.995,1.0,50,36583,68950,354.030617
1,2,40,SA,1000.0,0.995,1.0,50,28611,68950,584.387324
2,3,40,SA,1000.0,0.995,1.0,50,41155,68950,393.383550
3,4,40,SA,1000.0,0.995,1.0,50,28104,68950,577.558609
4,5,40,SA,1000.0,0.995,1.0,50,45847,68950,351.685979
...,...,...,...,...,...,...,...,...,...,...
4995,496,30,SA,1000.0,0.995,1.0,50,14950,68950,642.261385
4996,497,30,SA,1000.0,0.995,1.0,50,13240,68950,698.296095
4997,498,30,SA,1000.0,0.995,1.0,50,38957,68950,341.111273
4998,499,30,SA,1000.0,0.995,1.0,50,14416,68950,647.209041
