In [1]:
import glob, os, json
import solver
import pickle

import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score

import time
import math

import traceback

  return f(*args, **kwds)


In [2]:
df = pd.read_pickle("../data/features/optGenAntAsymAnalysis.pickle")
# # Drop rows with NA
# rowsBefore = df.shape[0]
# df = df.dropna()
# print("Dropped %d rows due to None values" % (rowsBefore - df.shape[0]))

In [3]:
def loadDeepWalkInstance(path):
    file = open(path, "r")
    
    i = 0
    
    nodeCount = None
    shape = None
    
    instance = None
    
    for line in file:
        if i == 0:
            split = line.split(" ")
            nodeCount = int(split[0])
            length = int(split[1])
            
            instance = np.zeros(shape=(nodeCount, length))
        else:
            split = line.split(" ")
            
            node = split[0]
            encoding = np.array(list(map(float, split[1:])))
            
            instance[i - 1] = encoding
            
        i += 1
    
    file.close()
    
    return instance

def loadDeepWalkInstances(path):
    instances = []
    names = []
    for file in glob.glob(path + "*.deep"):
        try:
            instance = loadDeepWalkInstance(file)
            name = os.path.splitext(os.path.splitext(os.path.basename(file))[0])[0]

            instances.append(instance)
            names.append(name)
        except:
            traceback.print_exc()
    
    return instances, names

In [4]:
instances, names = loadDeepWalkInstances("../data/deepwalk2/")

In [5]:
len(names)

840

In [6]:
len(df["name"].unique())

984

In [7]:
MAX_SIZE = 300

In [8]:
# Merge in DeepWalk data
dwInstances = pd.DataFrame(columns=["name", "deepWalk", "sequenceLength"])
reshapedInstances = []
for index, name in enumerate(names):
    instance = instances[index]
    instance = instance.reshape(-1)
    
    size = instance.shape[0]
    
    if name == "pr2392":
        continue
    
    if size >= MAX_SIZE * MAX_SIZE:
        print(instances[index].shape)
        print("Instance %s is too large" % (name))
    
    zeroed = np.zeros((MAX_SIZE * MAX_SIZE))
    zeroed[0: size] = instance
    
    instance = scale(zeroed.astype('float64')).reshape(MAX_SIZE, MAX_SIZE)
        
#     reshapedInstances.append(scale(zeroed.astype('float64')).reshape(MAX_SIZE, MAX_SIZE))
    
#     reshapedInstances.append(instance)
#     instance = scale(instance.astype('float64'),axis=1)
    dwInstances = dwInstances.append(pd.DataFrame([[name, instance, size]], columns=["name", "deepWalk", "sequenceLength"]))
    
dwInstances = dwInstances.reset_index().drop("index", axis=1)
df = pd.merge(df, dwInstances, on="name")

In [9]:
len(df["name"].unique())

245

In [10]:
minCostIndices = df[["heuristics.tabuCosts", "heuristics.simulatedAnnealingCosts", "heuristics.graspCosts", "heuristics.geneticCosts", "heuristics.antColonyCosts"]].idxmin(axis=1)
# minCostIndices = df[["heuristics.tabuCosts", "heuristics.simulatedAnnealingCosts", "heuristics.geneticCosts", "heuristics.antColonyCosts"]].idxmin(axis=1)

In [11]:
import collections
collections.Counter(minCostIndices.tolist())

Counter({'heuristics.antColonyCosts': 454,
         'heuristics.geneticCosts': 1,
         'heuristics.graspCosts': 629,
         'heuristics.simulatedAnnealingCosts': 9,
         'heuristics.tabuCosts': 128})

In [12]:
inputs = np.array(df["deepWalk"].tolist())
sequenceLengths = np.array(df["sequenceLength"].tolist())

In [13]:
costValues = df[["heuristics.tabuCosts", "heuristics.simulatedAnnealingCosts", "heuristics.graspCosts", "heuristics.geneticCosts", "heuristics.antColonyCosts"]].values
indexRankings = costValues.argsort()

In [14]:
indexRankings

array([[4, 1, 3, 2, 0],
       [4, 1, 3, 2, 0],
       [4, 1, 3, 2, 0],
       ..., 
       [2, 0, 1, 4, 3],
       [2, 0, 1, 4, 3],
       [0, 2, 4, 1, 3]])

In [15]:
# intLabels = LabelEncoder().fit_transform(minCostIndices).reshape(-1, 1)
# # 5 values for 5 different heuristics
# # Drop grasp from analysis
# outputs = OneHotEncoder(sparse=False, n_values=5).fit_transform(intLabels)

# inputs = df

size = df.shape[0]
# Test data is separated in cleaning stage
trainSize = int(size * 0.75)
validSize = size - trainSize

inputsTrain = inputs[0:trainSize]
lengthsTrain = sequenceLengths[0:trainSize]
outputsTrainUnnorm = indexRankings[0:trainSize]
outputsTrain = normalize(outputsTrainUnnorm)

inputsValid = inputs[trainSize:]
lengthsValid = sequenceLengths[trainSize:]
outputsValidUnnorm = indexRankings[trainSize:]
outputsValid = normalize(outputsValidUnnorm)

In [16]:
EPOCHS = 10000

N1 = trainSize
LABEL_COUNT = 5

NODES1 = 512
NODES2 = 256
NODES3 = 64

LSTM_SIZE = 100
LSTM_LAYER_COUNT = 2
LSTM_DROPOUT_PROB = 0.5

ALPHA = 0.08

BATCH_SIZE = 30

STD = 0.1

LEARNING_RATE = 0.001

In [17]:
# Define the input function for training
inputFunc = tf.estimator.inputs.numpy_input_fn(
    x={"input": inputsTrain, "length": lengthsTrain}, y=outputsTrainUnnorm.astype(float),
    batch_size=BATCH_SIZE, num_epochs=EPOCHS, shuffle=True)

In [18]:
outputsTrain[0]

array([ 0.73029674,  0.18257419,  0.54772256,  0.36514837,  0.        ])

In [19]:
# Define the neural network
def network(xDict):
    x = xDict["input"]
    
    length = xDict["length"]
    
#     input_layer = tf.reshape(x, shape=[-1, MAX_SIZE, MAX_SIZE, 1])

    initialCell = tf.contrib.rnn.BasicLSTMCell(LSTM_SIZE)
    initialCell = tf.contrib.rnn.DropoutWrapper(initialCell, input_keep_prob=LSTM_DROPOUT_PROB, variational_recurrent=True, input_size=x.shape[2], dtype=tf.float64)
    
    secondCell = tf.contrib.rnn.BasicLSTMCell(LSTM_SIZE)
    secondCell = tf.contrib.rnn.DropoutWrapper(secondCell, input_keep_prob=LSTM_DROPOUT_PROB, variational_recurrent=True, input_size=initialCell.output_size, dtype=tf.float64)
        
    stackedLstm = tf.contrib.rnn.MultiRNNCell([initialCell, secondCell])

    lstmOutput, _ = tf.nn.dynamic_rnn(stackedLstm, x, dtype=tf.float64, sequence_length=length)
    
    flatten = tf.contrib.layers.flatten(lstmOutput)
    
    regularizer = tf.contrib.layers.l2_regularizer(scale=ALPHA)
    
    # Hidden fully connected layer
    layer1 = tf.layers.dense(flatten, NODES1, kernel_regularizer=regularizer, activation=tf.nn.relu)
    layer2 = tf.layers.dense(layer1, NODES2, kernel_regularizer=regularizer, activation=tf.nn.relu)
    layer3 = tf.layers.dense(layer2, NODES3, kernel_regularizer=regularizer, activation=tf.nn.relu)
    # Output fully connected layer with a neuron for each class
    outLayer = tf.layers.dense(layer3, LABEL_COUNT)
    return outLayer

## Loss Functions

In [55]:
# Kullback-Leibler Divergence, as per https://stackoverflow.com/a/43298483
def klDivergence(p, q):
    pClipped = tf.clip_by_value(p, 1e-10, 1.0)
    qClipped = tf.clip_by_value(q, 1e-10, 1.0)
    return tf.reduce_sum(pClipped * tf.log(pClipped/qClipped))

# Loss function based off of Jensen-Shannon Divergence
def loss(label, prediction):
    mean = 0.5 * (label + prediction)
    return 0.5 * klDivergence(label, mean) + 0.5 * klDivergence(prediction, mean)

def log2(x):
    numerator = tf.log(x)
    denominator = tf.log(tf.constant(2, dtype=numerator.dtype))
    return numerator / denominator

def listNetLoss(label, prediction):
    softMaxLabel = tf.nn.softmax(label)
    softMaxPrediction = tf.nn.softmax(prediction)
    return -tf.reduce_mean(softMaxLabel * tf.log(softMaxPrediction))

def listMLE(label, prediction):
    sortedPrediction = tf.gather(prediction, tf.nn.top_k(label, k=5).indices)
    final = tf.log(tf.reduce_sum(tf.exp(sortedPrediction)))
    return tf.reduce_sum(final - sortedPrediction)

def listMLE2Loss(labels, predictions, length, length64):
    i = tf.constant(0, dtype=tf.int32)
    innerSum = tf.constant(0, dtype=tf.float64)
    
    def loop(label, prediction, i, innerSum):
        return tf.add(i, 1), tf.add(innerSum, listMLE2(label, prediction))
    
    cond = lambda i, _: tf.less(i, length)
    operation = lambda i, innerSum: loop(labels[i], predictions[i], i, innerSum)
    result = tf.while_loop(cond, operation, [i, innerSum])

    return result[1]/length64
#     return tf.constant(1.0, dtype=tf.float64) * labels + predictions

def listMLE2(label, prediction):
    # Length of vectors
    k = tf.constant(LABEL_COUNT, dtype=tf.int32)
    
    sortedPrediction = tf.gather(prediction, tf.nn.top_k(label, k=k).indices)
    
    j = tf.constant(0, dtype=tf.int32)
    innerSum = tf.constant(0, dtype=tf.float64)
    cond = lambda j, _: tf.less(j, k)
    operation = lambda j, innerSum: listMLE2Loop(sortedPrediction, j, k, innerSum)
    result = tf.while_loop(cond, operation, [j, innerSum])
    
    print(result[1].shape)
    
    return -result[1]
    
def listMLE2Loop(sortedPrediction, j, k, innerSum):
    return tf.add(j, 1), tf.add(innerSum, listMLE2Inner(sortedPrediction, j, k))

def listMLE2Inner(sortedPrediction, j, k):
    numerator = tf.exp(tf.gather(sortedPrediction, j))
    denominator = tf.reduce_sum(tf.exp(sortedPrediction[j:k]))
    
    return tf.log(numerator/denominator)

# Builds an integer ranking out of a 1-D tensor
def convertPredToRank(prediction):
    return tf.cast(tf.nn.top_k(prediction, k=5).indices, dtype=tf.float64)

In [56]:
with tf.Session() as sess:
    array1 = tf.constant([[0, 1, 2, 3, 4]], dtype=tf.float64)
    array2 = tf.constant([[0, 2, 3, 4, 1]], dtype=tf.float64)
    print(listMLE2Loss(array1, array2).eval())
    print(listMLE2(array1, array1).eval())
    print(listMLE2(array1, array2).eval())
    print(listMLE2(array2, array1).eval())

TypeError: listMLE2Loss() missing 2 required positional arguments: 'length' and 'length64'

In [57]:
with tf.Session() as sess:
    array1 = tf.constant([2, 1, 0, 3, 4], dtype=tf.float64)
#     array2 = tf.constant([0, 2, 3, 4, 1], dtype=tf.float64)
#     final = tf.log(tf.reduce_sum(tf.exp(array1)))
#     print(final.eval())
    print(array1[0:5].eval())
#     print(tf.slice(array1, [2], [4]).eval())

[ 2.  1.  0.  3.  4.]


## Accuracy Measures

In [58]:
# Accuracy metric using Normalized Discounted Cumulative Gain, as per https://github.com/shiba24/learning2rank/
def ndcg(labels, predictions, k=5):
    topK = tf.nn.top_k(labels, k=5)
    sortedValues = topK.values
    sortedIndices = topK.indices
#         print(labelSorted)
#         labelSorted = sorted(label, reverse=True)
    ideal_dcg = 0
    for i in range(k):
#             ideal_dcg += (2 ** labelSorted[:i] - 1.) / log2(tf.cast(i + 2, tf.float64))
        ideal_dcg += (sortedValues[i] + 1) / log2(tf.cast(i + 2, tf.float64))
    dcg = 0
#         argsort_indices = np.argsort(predictions)[::-1]
#         argsort_indices = tf.nn.top_k(predictions, k=5).indices
#         print(argsort_indices)
    for i in range(k):
        dcg += (tf.gather(predictions, sortedIndices[i]) + 1) / log2(tf.cast(i + 2, tf.float64))
#         dcg += (predictions[i] + 1) / log2(tf.cast(i + 2, tf.float64))
    return dcg / ideal_dcg

def spearmanCorrelation(label, prediction):
    length = tf.cast(tf.shape(prediction)[0], tf.float64)
    sumVal = tf.reduce_sum(tf.square(tf.subtract(prediction, label)))
    return 1 - 6 * sumVal / (length ** 3 - length)

# Bound Spearman coeff. between 0 and 1
def boundedSpearman(label, prediction):
    return (spearmanCorrelation(label, prediction) + 1.)/2

def top1Match(label, prediction):
    return tf.cast(tf.equal(label[0], prediction[0]), tf.float64)

In [59]:
# # Define the model function (following TF Estimator Template)
# def modelFunc(features, labels, mode):
#     # Build the neural network
#     logits = network(features)
    
# #     resizedLogits = tf.reshape(logits, shape=[-1, MAX_SIZE * MAX_SIZE, 1])
    
#     # Predictions
#     # TODO: Possibly need to change
#     pred_classes = logits
# #     pred_classes = tf.argmax(logits, axis=1)
# #     pred_probas = tf.nn.softmax(logits)
#     pred_probas = tf.nn.sigmoid(logits)
    
#     # If prediction mode, early return
#     if mode == tf.estimator.ModeKeys.PREDICT:
#         return tf.estimator.EstimatorSpec(mode, predictions=pred_classes)
    
#     print(logits.shape)
# #     print(resizedLogits.shape)
#     print(labels.shape)
#     print(pred_classes.shape)
        
#     # Define loss and optimizer
# #     loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
# #         logits=logits, labels=tf.cast(labels, dtype=tf.int32)))
#     loss_op = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
#         logits=logits, labels=labels))
#     optimizer = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE)
#     train_op = optimizer.minimize(loss_op, global_step=tf.train.get_global_step())
    
#     # Evaluate the accuracy of the model
# #     acc_op = tf.metrics.accuracy(labels=tf.argmax(labels, axis=1), predictions=pred_classes)
#     acc_op = tf.metrics.accuracy(labels=labels, predictions=pred_classes)
    
#     # TF Estimators requires to return a EstimatorSpec, that specify
#     # the different ops for training, evaluating, ...
#     estim_specs = tf.estimator.EstimatorSpec(
#       mode=mode,
#       predictions=pred_classes,
#       loss=loss_op,
#       train_op=train_op,
#       eval_metric_ops={'accuracy': acc_op})

#     return estim_specs

# Define the model function (following TF Estimator Template)
def modelFunc(features, labels, mode):
    # Build the neural network
    logits = network(features)
    
#     resizedLogits = tf.reshape(logits, shape=[-1, MAX_SIZE * MAX_SIZE, 1])
    
    # Predictions
    # TODO: Possibly need to change
#     pred_classes = logits
    pred_classes = tf.map_fn(convertPredToRank, logits)
#     pred_classes = tf.argmax(logits, axis=1)
#     pred_probas = tf.nn.softmax(logits)
    
    # If prediction mode, early return
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=pred_classes)
    
    # Define loss and optimizer
#     loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
#         logits=logits, labels=tf.cast(labels, dtype=tf.int32)))
#     loss_op = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
#         logits=logits, labels=labels))
#     loss_op = tf.reduce_mean(loss(labels, logits))
#     loss_op = tf.reduce_mean(listNetLoss(labels, logits))
#     loss_map = tf.map_fn(lambda x: listMLE2(x[0], x[1]), (labels, pred_classes), dtype=tf.float64)
#     print(labels.get_shape()[0])
    labels_length = tf.shape(labels)[0]
    loss_op = tf.reduce_mean(listMLE2Loss(labels, logits, labels_length, tf.cast(labels_length, dtype=tf.float64)))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE)
#     optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss_op, global_step=tf.train.get_global_step())
    
    # Evaluate the accuracy of the model
#     acc_op = tf.metrics.accuracy(labels=tf.argmax(labels, axis=1), predictions=pred_classes)
#     acc_op = tf.metrics.accuracy(labels=labels, predictions=pred_classes)
    ndcg_map = tf.map_fn(lambda x: ndcg(x[0], x[1]), (labels, pred_classes), dtype=tf.float64)
    ndcg_op = tf.metrics.mean(ndcg_map)
    top1_map = tf.map_fn(lambda x: top1Match(x[0], x[1]), (labels, pred_classes), dtype=tf.float64)
    top1_op = tf.metrics.mean(top1_map)
    spearman_map = tf.map_fn(lambda x: boundedSpearman(x[0], x[1]), (labels, pred_classes), dtype=tf.float64)
    acc_op = tf.metrics.mean(spearman_map)
    
    # TF Estimators requires to return a EstimatorSpec, that specify
    # the different ops for training, evaluating, ...
    estim_specs = tf.estimator.EstimatorSpec(
      mode=mode,
      predictions=pred_classes,
      loss=loss_op,
      train_op=train_op,
      eval_metric_ops={'accuracy': acc_op, 'ndcg': ndcg_op, 'top1Classification': top1_op})

    return estim_specs

In [60]:
# Build the Estimator
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.5
model = tf.estimator.Estimator(modelFunc, config=tf.contrib.learn.RunConfig(session_config=config))

INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1c74dbdc18>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': gpu_options {
  per_process_gpu_memory_fraction: 0.5
}
, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/var/folders/2v/nktg94cn4cvfw3vprys2rgtm0000gn/T/tmpywtxm1q6'}


In [71]:
# Train the Model
model.train(inputFunc, steps=200)

()


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /var/folders/2v/nktg94cn4cvfw3vprys2rgtm0000gn/T/tmpywtxm1q6/model.ckpt-1000
INFO:tensorflow:Saving checkpoints for 1001 into /var/folders/2v/nktg94cn4cvfw3vprys2rgtm0000gn/T/tmpywtxm1q6/model.ckpt.
INFO:tensorflow:loss = 4.18405637852, step = 1001
INFO:tensorflow:global_step/sec: 0.859135
INFO:tensorflow:loss = 4.14390417705, step = 1101 (116.399 sec)
INFO:tensorflow:Saving checkpoints for 1200 into /var/folders/2v/nktg94cn4cvfw3vprys2rgtm0000gn/T/tmpywtxm1q6/model.ckpt.
INFO:tensorflow:Loss for final step: 3.7793384686.


<tensorflow.python.estimator.estimator.Estimator at 0x1c74dbd588>

In [72]:
# Evaluate the Model
# Define the input function for evaluating
validFunc = tf.estimator.inputs.numpy_input_fn(
    x={"input": inputsValid, "length": lengthsValid}, y=outputsValidUnnorm.astype(float),
    batch_size=BATCH_SIZE, shuffle=False)
# Use the Estimator 'evaluate' method
model.evaluate(validFunc)

()


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Starting evaluation at 2018-03-13-16:49:15
INFO:tensorflow:Restoring parameters from /var/folders/2v/nktg94cn4cvfw3vprys2rgtm0000gn/T/tmpywtxm1q6/model.ckpt-1200
INFO:tensorflow:Finished evaluation at 2018-03-13-16:49:19
INFO:tensorflow:Saving dict for global step 1200: accuracy = 0.681373, global_step = 1200, loss = 4.08465, ndcg = 0.908805, top1Classification = 0.506536


{'accuracy': 0.68137252,
 'global_step': 1200,
 'loss': 4.08465,
 'ndcg': 0.90880531,
 'top1Classification': 0.50653595}

In [207]:
with tf.Session() as sess:
    array1 = tf.constant([4, 3, 2, 1, 0])
    print(tf.cast(tf.equal(array1[0], 4), tf.float64).eval())

1.0


In [202]:
outputsTrainUnnorm[0][0]

4

In [123]:
import collections
collections.Counter(list(map(str, outputsTrainUnnorm)))

Counter({'[0 1 2 4 3]': 1,
         '[0 2 1 4 3]': 12,
         '[0 2 4 1 3]': 83,
         '[0 2 4 3 1]': 2,
         '[1 2 4 0 3]': 1,
         '[1 4 2 0 3]': 1,
         '[1 4 2 3 0]': 1,
         '[2 0 1 3 4]': 1,
         '[2 0 1 4 3]': 37,
         '[2 0 4 1 3]': 379,
         '[2 0 4 3 1]': 5,
         '[2 1 0 4 3]': 1,
         '[2 1 4 0 3]': 5,
         '[2 1 4 3 0]': 1,
         '[2 3 1 4 0]': 1,
         '[2 4 0 1 3]': 33,
         '[2 4 0 3 1]': 3,
         '[2 4 1 0 3]': 6,
         '[2 4 1 3 0]': 1,
         '[2 4 3 1 0]': 1,
         '[3 1 4 2 0]': 1,
         '[4 1 2 3 0]': 7,
         '[4 1 3 0 2]': 38,
         '[4 1 3 2 0]': 273,
         '[4 2 0 1 3]': 3,
         '[4 2 0 3 1]': 1,
         '[4 2 1 0 3]': 4,
         '[4 3 1 2 0]': 13})