# pNML Experiments

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import collections
# Switch TensorFlow to eager execution before any model code runs.
# NOTE(review): `tf.enable_eager_execution` is the TF 1.x entry point, so this
# notebook presumably targets TF 1.x — confirm before running on TF 2.x.
tf.enable_eager_execution()

%matplotlib inline

## Create a sample 2D dataset
One half is going to be unexplored, similar to the Easy maze task. Then, we'll:
1. collect goal examples in the unexplored region
2. train the goal classifier (offline) similar to VICE
3. query test points (training an extra logistic regression layer on top) for points in and out of the region to check what the reward is

In [None]:
# Pointmass arena: the plots in this notebook span [-4, 4] on both axes.
# The policy data is drawn uniformly from the left half only, i.e. x in [-4, 0)
# and y in [-4, 4).
explored_low = [-4, -4]
explored_high = [0, 4]
data = np.random.uniform(low=explored_low, high=explored_high, size=(500, 2))

In [None]:
# Goal position (x, y). Its x coordinate is positive, so it sits in the half
# of the arena that the sampled policy data does not cover.
goal = np.array((2.5, -2.5))

In [None]:
# Scatter the policy data over the full arena and mark the goal with a star.
fig, ax = plt.subplots(figsize=(5, 5))
ax.set_xlim(-4, 4)
ax.set_ylim(-4, 4)
data_x, data_y = data[:, 0], data[:, 1]
ax.scatter(data_x, data_y, alpha=0.5)
ax.scatter(goal[0], goal[1], marker='*', s=100)
ax.set_title('Data Collected by Policy')
plt.show()

In [None]:
# Goal examples: 100 points drawn uniformly from a square of half-width 0.1
# centred on the goal.
goal_half_width = 0.1
positives = np.random.uniform(
    low=goal - goal_half_width,
    high=goal + goal_half_width,
    size=(100, 2),
)

In [None]:
# Scatter the collected goal examples over the full arena, with the goal
# itself marked by a star.
fig, ax = plt.subplots(figsize=(5, 5))
ax.set_xlim(-4, 4)
ax.set_ylim(-4, 4)
positives_x, positives_y = positives[:, 0], positives[:, 1]
ax.scatter(positives_x, positives_y, alpha=0.5)
ax.scatter(goal[0], goal[1], marker='*', s=100)
ax.set_title('Collected Positives')

## Initialize and Train Goal Classifier

In [None]:
from softlearning.models.vice_models import create_feedforward_reward_classifier_function

In [None]:
# Single 2-dimensional input.
# NOTE(review): the key is spelled 'state_obesrvation' (sic). It looks like a
# typo for 'state_observation'; confirm whether anything depends on the key
# name before renaming it.
input_shapes = collections.OrderedDict([
    ('state_obesrvation', np.array([2])),
])

# One hidden layer of 64 units with a 'linear' activation and a sigmoid output.
# NOTE(review): with a linear hidden activation the sigmoid is the only
# nonlinearity requested here, so this presumably behaves like logistic
# regression — verify against the factory implementation.
classifier_kwargs = dict(
    input_shapes=input_shapes,
    hidden_layer_sizes=(64, ),
    activation='linear',
    output_activation='sigmoid',
)
classifier = create_feedforward_reward_classifier_function(**classifier_kwargs)

In [None]:
# Print the summary of the classifier's layer at index 1.
# NOTE(review): presumably this is the nested feedforward network — confirm.
inner_layer = classifier.layers[1]
inner_layer.summary()

In [None]:
# from_logits=False: predictions passed to this loss are treated as
# probabilities rather than raw logits.
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=False)


def loss(model, x, y, training):
    """Return the binary cross-entropy of ``model`` on inputs ``x`` vs labels ``y``.

    ``training`` is forwarded to the model call; it only matters for layers
    that behave differently between training and inference (e.g. Dropout).
    """
    predictions = model(x, training=training)
    return loss_object(y_true=y, y_pred=predictions)

def grad(model, inputs, targets):
    """Compute the training loss and its gradients for one batch.

    Returns a ``(loss_value, gradients)`` pair, where ``gradients`` is taken
    with respect to ``model.trainable_variables``.
    """
    # The forward pass must happen inside the tape so it gets recorded.
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets, training=True)
    gradients = tape.gradient(loss_value, model.trainable_variables)
    return loss_value, gradients

# Adam optimizer used by the classifier training loop in this notebook.
adam_learning_rate = 3e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=adam_learning_rate)

In [None]:
# Mean loss of every epoch, kept for plotting.
train_loss_results = []

num_epochs = 101
batches_per_epoch = 10
samples_per_class = 32

for epoch in range(num_epochs):
    epoch_loss_avg = tf.keras.metrics.Mean()

    for batch_number in range(batches_per_epoch):
        # Sample (with replacement) an equal number of policy points, used as
        # negatives, and goal examples, used as positives.
        negative_rows = np.random.choice(data.shape[0], size=samples_per_class)
        positive_rows = np.random.choice(positives.shape[0], size=samples_per_class)
        negatives_batch = data[negative_rows]
        positives_batch = positives[positive_rows]

        # Negatives first, then positives; labels follow the same order.
        zero_labels = np.zeros((len(negatives_batch), 1))
        one_labels = np.ones((len(positives_batch), 1))
        x = np.vstack((negatives_batch, positives_batch))
        y = np.vstack((zero_labels, one_labels))

        loss_value, grads = grad(classifier, x, y)
        optimizer.apply_gradients(zip(grads, classifier.trainable_variables))

        # Fold this batch's loss into the running epoch mean.
        epoch_loss_avg.update_state(loss_value)

    # End of epoch: record the mean loss and report it every 50 epochs.
    epoch_loss = epoch_loss_avg.result()
    train_loss_results.append(epoch_loss)

    if epoch % 50 == 0:
        print("Epoch {:03d}: Loss: {:.6f}".format(epoch, epoch_loss))

In [None]:
# Evaluate the trained classifier on a regular n_samples x n_samples grid over
# the arena and draw its output as a filled contour plot.
n_samples = 50
xs = np.linspace(-4, 4, n_samples)
ys = np.linspace(-4, 4, n_samples)

xys = np.meshgrid(xs, ys)
grid_x, grid_y = xys[0], xys[1]
# One (x, y) row per grid cell, in row-major order of the mesh.
grid_vals = np.column_stack((grid_x.ravel(), grid_y.ravel()))

rewards = classifier.predict(grid_vals)

fig, ax = plt.subplots(figsize=(5, 5))
filled_contours = ax.contourf(
    grid_x, grid_y, rewards.reshape(grid_x.shape), levels=20)
fig.colorbar(filled_contours, ax=ax, fraction=0.046, pad=0.04)

# Mark the goal.
ax.scatter(goal[0], goal[1], color='red', marker='*', s=200)

## pNML Reward Querying

In [None]:
def do_training_iter(model, optimizer, test_point=None, positive=True, batch_size=64):
    """Apply one gradient step to ``model`` on a freshly sampled batch.

    The batch holds ``batch_size`` rows sampled (with replacement) from the
    global ``data`` labelled 0 and ``batch_size`` rows sampled from the global
    ``positives`` labelled 1. If ``test_point`` is given, it is added as one
    extra row labelled 1 when ``positive`` is true and 0 otherwise.

    Nothing is returned; ``model`` is updated in place through ``optimizer``.
    """
    negative_rows = np.random.choice(data.shape[0], size=batch_size)
    positive_rows = np.random.choice(positives.shape[0], size=batch_size)
    negatives_batch = data[negative_rows]
    positives_batch = positives[positive_rows]

    feature_blocks = [negatives_batch, positives_batch]
    n_zero_labels = len(negatives_batch)
    n_one_labels = len(positives_batch)

    if test_point is not None:
        query_row = test_point.reshape((1, 2))
        if positive:
            # Labelled 1: goes after the positives, where the ones block ends.
            feature_blocks.append(query_row)
            n_one_labels += 1
        else:
            # Labelled 0: goes right after the negatives, inside the zeros block.
            feature_blocks.insert(1, query_row)
            n_zero_labels += 1

    x = np.vstack(feature_blocks)
    y = np.vstack((
        np.zeros((n_zero_labels, 1)),
        np.ones((n_one_labels, 1)),
    ))

    _, grads = grad(model, x, y)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [None]:
def _fit_query_label(query_classifier, initial_weights, query_row, positive,
                     n_train_steps):
    """Reset the model, fine-tune it with the query point labelled, and score it.

    ``query_classifier`` is reset to ``initial_weights`` and then trained for
    ``n_train_steps`` calls of ``do_training_iter`` with ``query_row`` included
    under the label selected by ``positive``. Returns the model's prediction
    at ``query_row`` after that fine-tuning.
    """
    query_classifier.set_weights(initial_weights)
    # A fresh optimizer per fit, so no Adam state carries over between labels.
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
    for _ in range(n_train_steps):
        do_training_iter(query_classifier,
                         optimizer,
                         query_row,
                         positive=positive)
    return query_classifier.predict(query_row)


def _plot_query_contour(query_classifier, query_row, n_samples=50):
    """Plot the model's output contour over [-4, 4]^2 with query and goal marked."""
    xs = np.linspace(-4, 4, n_samples)
    ys = np.linspace(-4, 4, n_samples)
    xys = np.meshgrid(xs, ys)
    grid_vals = np.array(xys).transpose(1, 2, 0).reshape((n_samples * n_samples, 2))
    rewards = query_classifier.predict(grid_vals)

    plt.figure(figsize=(5, 5))
    plt.contour(xys[0], xys[1], rewards.reshape(xys[0].shape), levels=1)
    plt.scatter(query_row[0, 0], query_row[0, 1], color='purple', marker='*', s=200)
    plt.scatter(goal[0], goal[1], color='red', marker='*', s=200)


def get_reward(test_point, n_train_steps=10):
    """Compute the pNML reward (normalised probability of label 1) at a point.

    A copy of the global ``classifier`` is fine-tuned twice from the same
    starting weights: once with ``test_point`` labelled 0 and once labelled 1.
    Each fitted model's output contour is plotted. The classifier output is
    read as P(y=1 | x), since it is trained with binary cross-entropy on 0/1
    labels, so the probability each fitted model gives to the label it was
    trained on is:

        label 1 model: p_plus
        label 0 model: 1 - p_minus

    and the pNML reward is ``p_plus / (p_plus + (1 - p_minus))``.

    Args:
        test_point: array-like with exactly two values (x, y).
        n_train_steps: number of fine-tuning steps per hypothesised label.

    Returns:
        The reward as returned by the arithmetic on ``predict`` outputs (an
        array). ``p_minus``, ``p_plus`` and the reward are also printed.
    """
    query_classifier = tf.keras.models.clone_model(classifier)
    original_weights = classifier.get_weights()
    # Normalise once to a single (1, 2) row for both training and prediction.
    query_row = np.asarray(test_point).reshape((1, 2))

    # Hypothesis y = 0: p_minus is that model's P(y=1) at the query point.
    p_minus = _fit_query_label(query_classifier, original_weights, query_row,
                               positive=False, n_train_steps=n_train_steps)
    _plot_query_contour(query_classifier, query_row)

    # Hypothesis y = 1, restarted from the same original weights.
    p_plus = _fit_query_label(query_classifier, original_weights, query_row,
                              positive=True, n_train_steps=n_train_steps)
    _plot_query_contour(query_classifier, query_row)

    # Normalise over the probability each model assigns to its OWN label.
    # The negative-label model's own-label probability is 1 - p_minus, not
    # p_minus.
    reward = p_plus / (p_plus + (1.0 - p_minus))

    print(p_minus, p_plus, reward)
    return reward

In [None]:
# Query the reward at (2.5, -2.5), the same coordinates as the goal.
query_point = np.array([2.5, -2.5])
get_reward(query_point)