In [1]:
import glob, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import json
import traceback
%matplotlib inline

In [2]:
def loadTSPInstances(path):
    """Load all files matching ``path + "*.json"`` into a single DataFrame.

    Each file is parsed with ``json.load``, flattened with pandas'
    ``json_normalize`` and tagged with a ``name`` column holding the file's
    base name without its extension.  A file that cannot be read or
    normalized is skipped after its traceback has been printed, so one bad
    file does not abort the whole load.

    Parameters
    ----------
    path : str
        Prefix that is concatenated directly with ``"*.json"`` to build the
        glob pattern; a directory therefore needs its trailing separator.

    Returns
    -------
    pandas.DataFrame
        The normalized records of all loaded files with ``name`` as the
        first column and a fresh 0..n-1 index.  If no file could be loaded,
        an empty frame containing only the ``name`` column is returned.
    """
    # `pd.json_normalize` is the current public location; fall back to the
    # legacy `pd.io.json` location for pandas versions that predate it.
    normalize = getattr(pd, "json_normalize", None)
    if normalize is None:
        normalize = pd.io.json.json_normalize

    frames = []
    # Sorted so the row order does not depend on the file system's listing order.
    for jsonPath in sorted(glob.glob(path + "*.json")):
        try:
            with open(jsonPath) as file:
                jsonDf = json.load(file)
            newFrame = normalize(jsonDf)
            newFrame["name"] = os.path.splitext(os.path.basename(jsonPath))[0]
            frames.append(newFrame)
        except Exception:
            # Best effort: report the broken file and continue with the rest.
            traceback.print_exc()

    if not frames:
        return pd.DataFrame(columns=["name"])

    # One concat for all files; ignore_index yields the fresh 0..n-1 index.
    dataframe = pd.concat(frames, ignore_index=True)

    # Move "name" to the front, keep the other columns in their order.
    cols = dataframe.columns.tolist()
    cols.remove("name")
    cols.insert(0, "name")
    return dataframe[cols]

In [3]:
# Read every TSPLIB feature file into one frame.
instances = loadTSPInstances("../data/features/tsplib/")

# Remove unnecessary tours
instances = instances.drop(
    columns=[
        "heuristics.simulatedAnnealingValues",
        "heuristics.graspValues",
        "heuristics.tabuValues",
        "heuristics.antColonyValues",
        "heuristics.geneticValues",
    ]
)

# Keep every cell that differs from -1; cells equal to -1 are replaced with None.
instances = instances.where(instances != -1, None)

# pr2392 is excluded because of a bug in its creation.
instances = instances[instances["name"] != "pr2392"]

In [4]:
import re
# Column labels of the instance frame, in frame order.
columnNames = instances.columns.tolist()

# `match` anchors at the start of the label; the leading ".*" lets the
# keyword appear anywhere after that.
regexTimes = re.compile(".*Times")
timesColumnNames = [label for label in columnNames if regexTimes.match(label)]

regexValues = re.compile(".*Values")
valuesColumnNames = [label for label in columnNames if regexValues.match(label)]

In [5]:
# From https://stackoverflow.com/a/40449726
def explode(df, lst_cols, fill_value=''):
    """Expand list-valued columns so that every list element gets its own row.

    The values of all other columns are repeated once per list element.  Rows
    whose list is empty contribute no element rows; they are added at the end
    of the result instead, with their list columns missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the list-valued columns.
    lst_cols : label or list of labels
        Column(s) holding the lists.  The per-row list length is taken from
        the first of these columns only, so all of them are expected to hold
        lists of the same length within a row.
    fill_value : scalar, default ''
        Replacement for missing values.  It is applied to the whole result
        (not only the list columns), and only when at least one row has an
        empty list.

    Returns
    -------
    pandas.DataFrame
        Expanded frame with the column order of ``df`` and a 0..n-1 index.
        A frame without rows is returned as an unchanged copy.
    """
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]

    # nothing to expand; np.concatenate below cannot handle zero lists
    if len(df) == 0:
        return df.copy()

    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # calculate lengths of lists (from the first list column only)
    lens = df[lst_cols[0]].str.len()

    # repeat the plain columns per element and flatten the list columns
    exploded = pd.DataFrame({
        col: np.repeat(df[col].values, lens)
        for col in idx_cols
    }).assign(**{col: np.concatenate(df[col].values) for col in lst_cols})

    if not (lens > 0).all():
        # At least one list is empty: add those rows back.  pd.concat replaces
        # DataFrame.append, which newer pandas versions no longer provide;
        # ignore_index keeps the index unique, as in the all-non-empty case.
        exploded = pd.concat(
            [exploded, df.loc[lens == 0, idx_cols]],
            ignore_index=True,
        ).fillna(fill_value)

    return exploded.loc[:, df.columns]

In [6]:
# Copy the "Times" labels and take heldKarpTimes out, so it is not expanded.
timesColumns = timesColumnNames[:]
timesColumns.remove("heldKarpTimes")

# Expand the remaining "Times" columns together with all "Values" columns.
instances = explode(instances, [*timesColumns, *valuesColumnNames])

# Column subsets whose label contains "Times" / "Values".
operationTimes = instances.filter(regex="Times", axis=1)
operationValues = instances.filter(regex="Values", axis=1)

In [7]:
# Convert every column except "name" to numeric; values that cannot be
# converted become NaN (errors="coerce").
# The columns are collected in a list and concatenated once, instead of
# growing a DataFrame with pd.concat on every loop iteration.
# NOTE(review): the conversion is kept element-wise on purpose; a vectorised
# pd.to_numeric(column) may treat non-scalar cells differently — confirm
# before changing it.
convertedColumns = []
for column in list(instances):
    if column != "name":
        numericColumn = instances[column].apply(pd.to_numeric, errors="coerce")
    else:
        numericColumn = instances[column]
    convertedColumns.append(numericColumn)

# pd.concat rejects an empty list, so fall back to an empty frame.
newInstances = pd.concat(convertedColumns, axis=1) if convertedColumns else pd.DataFrame()
instances = newInstances

In [8]:
# Group by name and compute means
group = instances.groupby(["name"])
groupedMeans = []
skippedColumns = []
for column in list(instances):
    if column == "name":
        continue
    try:
        groupedMean = pd.DataFrame(group[column].mean())
        groupedMeans.append(groupedMean)
    except Exception:
        # Best effort: a column whose mean cannot be computed is left out of
        # the result, but it is recorded instead of being dropped silently.
        skippedColumns.append(column)

# Single concat of all per-column means; pd.concat rejects an empty list.
averagedInstances = pd.concat(groupedMeans, axis=1) if groupedMeans else pd.DataFrame()

if skippedColumns:
    print("Columns skipped because no mean could be computed:", skippedColumns)

In [9]:
# Save the averaged frame as a pickle file.
pd.to_pickle(averagedInstances, "../data/features/analysis.pickle")