In [1]:
import glob, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import json
import traceback
import pickle
import solver
%matplotlib inline

In [2]:
def loadTSPInstances(path):
    """Load every ``*.json`` feature file under ``path`` into one DataFrame.

    Each file is parsed with ``json.load`` and flattened with pandas'
    ``json_normalize``, so nested keys become dotted column names. A ``name``
    column holding the file's base name (without extension) is added and
    moved to the front of the column list.

    NOTE(review): a later cell defines ``loadTSPInstances(path, extension)``;
    once that cell has run, this one-argument version is shadowed.

    Parameters
    ----------
    path : str
        Directory prefix. It is concatenated directly with ``"*.json"``, so
        it has to end with a path separator.

    Returns
    -------
    pandas.DataFrame
        All loaded rows with a fresh ``0..n-1`` index and ``name`` as the
        first column. If no file could be loaded, an empty frame that only
        has the ``name`` column is returned.
    """
    # Newer pandas exposes json_normalize at top level; the deprecated
    # pd.io.json alias is only used as a fallback for old installations.
    normalize = getattr(pd, "json_normalize", None)
    if normalize is None:
        normalize = pd.io.json.json_normalize

    frames = []
    # Sorted so the row order does not depend on directory listing order.
    for jsonPath in sorted(glob.glob(path + "*.json")):
        try:
            with open(jsonPath) as file:
                jsonDf = json.load(file)
            newFrame = normalize(jsonDf)
            newFrame["name"] = os.path.splitext(os.path.basename(jsonPath))[0]
            frames.append(newFrame)
        except Exception:
            # Best effort: report the broken file and keep loading the rest.
            traceback.print_exc()

    if not frames:
        # Nothing matched or every file failed to load.
        return pd.DataFrame(columns=["name"])

    # Single concat instead of re-concatenating inside the loop.
    dataframe = pd.concat(frames, ignore_index=True)

    cols = dataframe.columns.tolist()
    cols.remove("name")
    cols.insert(0, "name")
    return dataframe[cols]

In [3]:
# Load the JSON feature files found in the generated-instances directory.
genInstances = loadTSPInstances("../data/features/SymHeur15/generated/")

In [4]:
# Flag every row loaded from the generated directory; the TSPLIB rows
# loaded in a later cell get generated = False.
genInstances["generated"] = True

In [5]:
# Number of rows in the generated-instances frame.
len(genInstances)

498

In [6]:
# Load the TSPLIB feature files and flag them as not generated.
instances = loadTSPInstances("../data/features/SymHeur15/tsplib/")
instances["generated"] = False

# Stack generated rows first, then TSPLIB rows, with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
instances = pd.concat([genInstances, instances], ignore_index=True)

# Remove unnecessary tours
instances = instances.drop(["heuristics.simulatedAnnealingValues", "heuristics.graspValues", "heuristics.tabuValues", "heuristics.antColonyValues", "heuristics.geneticValues"], axis=1)
# Remove unimplemented features (currently disabled)
# instances = instances.drop(["complexFeatures.entropyDegreeDistribution", "complexFeatures.vertexParticipationCoefficient"], axis=1)
# Replace all -1 values with NaN (np.nan: the np.NaN alias was removed in NumPy 2.0)
instances = instances.replace(-1, np.nan)

# Remove due to bug in creation (currently disabled)
# instances = instances.loc[instances["name"] != "pr2392"]

# Randomize instances. A dedicated, seeded generator keeps the row order --
# and with it every later step that slices rows by position -- reproducible
# across kernel restarts.
SHUFFLE_SEED = 42
shuffleRng = np.random.RandomState(SHUFFLE_SEED)
instances = instances.reindex(shuffleRng.permutation(instances.index))

In [7]:
def loadTSPInstances(path, extension):
    """Load every ``*.<extension>`` file under ``path`` via ``solver.loadTSPLib``.

    NOTE(review): this definition reuses the name of the earlier
    one-argument ``loadTSPInstances(path)`` feature loader and shadows it
    once this cell has run.

    Parameters
    ----------
    path : str
        Directory prefix; concatenated directly with ``"*." + extension``, so
        it has to end with a path separator.
    extension : str
        File extension without the leading dot (for example ``"tsp"``).

    Returns
    -------
    list
        The objects returned by ``solver.loadTSPLib``, each with its name
        set to the file's base name. Files that yield a falsy result or
        raise while loading are reported and skipped.
    """
    instances = []
    for file in glob.glob(path + "*." + extension):
        try:
            tsp = solver.loadTSPLib(file)
            name = os.path.basename(file)
            if not tsp:
                print("Invalid file at " + name)
                continue

            tsp.setName(name)
            instances.append(tsp)
        except Exception:
            # Best effort: log the failure and continue with the next file.
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
            traceback.print_exc()

    return instances

def loadGeneratedInstances(path):
    """Unpickle every ``*.pytsp`` file under ``path``.

    Parameters
    ----------
    path : str
        Directory prefix; concatenated directly with ``"*.pytsp"``, so it has
        to end with a path separator.

    Returns
    -------
    list
        The unpickled objects, each with its name set to the file's base
        name. Files that unpickle to a falsy value or raise while loading
        are reported and skipped.
    """
    instances = []
    for file in glob.glob(path + "*.pytsp"):
        try:
            # SECURITY: pickle.load can execute arbitrary code; only point
            # this at files you created yourself.
            # The context manager closes the handle on every path, including
            # the `continue` below and any exception.
            with open(file, "rb") as tspFile:
                tsp = pickle.load(tspFile)
            name = os.path.basename(file)

            if not tsp:
                print("Invalid file at " + name)
                continue

            tsp.setName(name)
            instances.append(tsp)
        except Exception:
            # Best effort: log the failure and continue with the next file.
            traceback.print_exc()

    return instances

In [8]:
# Input directories. Each ends with "/" because the loaders glue the glob
# pattern straight onto the string.
tspLibPath = "../data/tsplib/tsp/"
atspLibPath = "../data/tsplib/atsp/"
generatedPath = "../data/generated2/"

# One list per source ...
tspLibInstances = loadTSPInstances(tspLibPath, "tsp")
atspLibInstances = loadTSPInstances(atspLibPath, "atsp")
generatedInstances = loadGeneratedInstances(generatedPath)

# ... and a combined list in the order: TSPLIB tsp, TSPLIB atsp, generated.
allTSPInstances = [*tspLibInstances, *atspLibInstances, *generatedInstances]

In [9]:
# Merge cost matrices into instances
# Collect one [name, costs] row per loaded instance and build the frame in a
# single call: DataFrame.append inside a loop copies the frame on every
# iteration and is no longer available in pandas 2.x.
# The name is the instance's file name without its extension, which is the
# same form the feature frame uses in its "name" column.
costRows = [
    [os.path.splitext(os.path.basename(instance.getName()))[0], instance.costs]
    for instance in allTSPInstances
]
costInstances = pd.DataFrame(costRows, columns=["name", "costs"])
# Default (inner) merge: feature rows without a matching cost entry are dropped.
instances = pd.merge(instances, costInstances, on="name")

In [10]:
# Row count after merging the cost matrices into the feature frame.
len(instances)

562

In [16]:
import re

# Snapshot of the current column labels.
columnNames = instances.columns.tolist()

# Pattern.match anchors at the start of the string only, so ".*Times"
# selects every label that contains "Times" somewhere.
regexTimes = re.compile(".*Times")
timesColumnNames = [label for label in columnNames if regexTimes.match(label)]

# Same idea for labels containing "Values".
regexValues = re.compile(".*Values")
valuesColumnNames = [label for label in columnNames if regexValues.match(label)]

# Labels that start with "heuristics" and contain "Costs" later on.
regexCosts = re.compile("heuristics.*Costs")
heuristicCostsColumnNames = [label for label in columnNames if regexCosts.match(label)]

In [17]:
# From https://stackoverflow.com/a/40449726
def explode(df, lst_cols, fill_value=''):
    """Expand list-valued columns so that every list element gets its own row.

    The values of all other columns are repeated once per list element. Row
    counts are taken from the first column in ``lst_cols``; the remaining
    list columns are flattened independently, so they need the same
    per-row lengths for the result to line up.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    lst_cols : str or list of str
        Column(s) whose cells hold lists. A single name is wrapped in a list.
    fill_value : scalar, default ''
        Only used when at least one row has an empty list: such rows are
        appended after the expanded rows (keeping their original index
        label), and every NaN in the resulting frame is replaced by this
        value.

    Returns
    -------
    pandas.DataFrame
        Expanded frame with the same column order as ``df``. An empty ``df``
        or an empty ``lst_cols`` yields a plain copy of ``df``.
    """
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # Nothing to expand: avoid indexing lst_cols[0] / concatenating nothing.
    if not lst_cols or len(df.index) == 0:
        return df.copy()

    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # list length per row, computed once from the first list column
    lens = df[lst_cols[0]].str.len()

    # repeat the scalar columns, then attach the flattened list columns
    exploded = pd.DataFrame({
        col: np.repeat(df[col].values, lens)
        for col in idx_cols
    }).assign(**{col: np.concatenate(df[col].values) for col in lst_cols})

    if not (lens > 0).all():
        # Rows with an empty list would otherwise disappear; add them back
        # with the list columns filled by `fill_value`. pd.concat stands in
        # for DataFrame.append, which no longer exists in pandas 2.x.
        exploded = pd.concat([exploded, df.loc[lens == 0, idx_cols]]) \
                     .fillna(fill_value)

    return exploded.loc[:, df.columns]

In [18]:
# Columns whose cells hold one list per row: everything matched by the
# Times / Values / heuristic-Costs patterns ...
deterministicColumnNames = list(columnNames)
multivaluedColumnNames = timesColumnNames + valuesColumnNames + heuristicCostsColumnNames
# ... except "heldKarpTimes", which is deliberately left un-expanded
# (presumably it holds a single value per row -- TODO confirm).
multivaluedColumnNames.remove("heldKarpTimes")

# Every remaining column counts as deterministic. An explicit membership
# test replaces the former silent try/except around list.remove.
for column in multivaluedColumnNames:
    if column in deterministicColumnNames:
        deterministicColumnNames.remove(column)

# One output row per list element; the other columns are repeated.
instances = explode(instances, multivaluedColumnNames)

In [19]:
# Row count after the list-valued columns were expanded into separate rows.
len(instances)

968

In [20]:
# Convert columns to numeric
# "name" and "costs" are carried over untouched; every other column is
# converted cell by cell, and cells that cannot be parsed become NaN.
convertedColumns = []
for column in list(instances):
    if column != "name" and column != "costs":
        numericColumn = instances[column].apply(pd.to_numeric, errors="coerce")
    else:
        numericColumn = instances[column]
    convertedColumns.append(numericColumn)

# Assemble the frame with one concat instead of growing it column by column
# (each incremental concat copied everything gathered so far).
newInstances = pd.concat(convertedColumns, axis=1) if convertedColumns else pd.DataFrame()
instances = newInstances
# Replace all -1 values with NaN (np.nan: the np.NaN alias was removed in NumPy 2.0)
instances = instances.replace(-1, np.nan)

In [21]:
# Positional hold-out split: the leading 80% of the rows become the
# train/validation part, the remainder the test part.
# No grouping or averaging happens here (an earlier group-by-name mean
# aggregation is disabled), so `averagedInstances` holds plain rows despite
# its name.
# NOTE(review): after the list columns were expanded, several rows can share
# one instance name, so a positional cut may put rows of the same instance on
# both sides of the split -- confirm this is intended.
size = len(instances)
trainValidSize = int(0.8 * size)
testSize = size - trainValidSize

averagedInstances = instances.iloc[:trainValidSize]
testInstances = instances.iloc[trainValidSize:]

In [22]:
# Persist the train/validation part of the split.
# NOTE(review): `testInstances` is not written anywhere in the visible cells.
averagedInstances.to_pickle("../data/features/symHeur15analysis.pickle")