In [1]:
# Linear, nonlinear, simple state spaces, complex state spaces
# if the kde doesn't work, use a GP.

# The value is NaN at the origin because the cosine distance is undefined for the zero reward vector [0.0, 0.0].
# Hyperparameters don't have a huge effect on the heatmap. The scale looks a little better with these though.
# Looks like the result is any linear combination of the reward. So maybe modify the distance?
# A larger grid doesn't really have an effect on it. A larger state space might, though.
# Uniform grid of reward function is much better. It's sort of cheating though.

In [2]:
import json
import sys
sys.path.append('../simulated_fqi/')
import seaborn as sns
import tqdm
import matplotlib.pyplot as plt 
import numpy as np
import torch
import random
import shap
import configargparse
import torch
import torch.optim as optim
import scipy
from environments import Gridworld
from models.agents import NFQAgent
from models.networks import NFQNetwork, ContrastiveNFQNetwork
from util import get_logger, close_logger, load_models, make_reproducible, save_models
import matplotlib.pyplot as plt
import numpy as np
import itertools
from train import fqi
import tqdm
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math
from environments import CartPoleRegulatorEnv
from scipy.stats import norm, multivariate_normal
from sklearn.linear_model import LinearRegression
from irl_gridworld import find_feature_expectations, plot_reward, norm, find_valid_actions, generate_rollout, generate_policy_rollout, runLinearFQI, l2_norm
from multiprocessing import Pool
from matplotlib.colors import LogNorm, Normalize
plt.rcParams.update({'font.size': 22})

# Biased data experiments

In [None]:
# Roll out the behavior policy under 100 randomly drawn reward weight vectors
# (each component ~ Normal(mean=0.5, std=0.5)) and pool every rollout in `ox`.
ox = []
for it in range(100):
    r = np.random.normal(loc=0.5, scale=0.5, size=2)
    behavior_opt, opt_agent = runLinearFQI(
        dataset='bg',
        behavior=True,
        reward_weights_shared=r,
    )
    ox.extend(behavior_opt)

# Calculate posterior
# Regular grid of candidate reward weights: -1 to 1 along both axes, step 0.1
x = np.around(np.arange(-1, 1.1, 0.1), decimals=1)
y = np.around(np.arange(-1, 1.1, 0.1), decimals=1)

# Sparse meshgrid: pairing xx[0][i] with yy[j][0] gives the grid coordinate (i, j)
xx, yy = np.meshgrid(x, y, sparse=True)

In [None]:
# Sanity check: number of grid points along one axis.
y.shape

In [None]:
# Evaluate an unnormalized posterior over reward weights on the (xx, yy) grid:
#     post(r_j) = prior_density(r_j) * exp(alpha * sum_s r_j . s)
# where the sum runs over the states of the rollout in `behavior_opt`.

alpha = 0.001
prior_mean = [0.5, 0.5]
# NOTE(review): this covariance has variance 0.5 per axis; confirm it is meant to
# match the distribution the training reward weights were drawn from.
prior_cov = [[0.5, 0], [0, 0.5]]

# The visited states do not depend on the grid point, so extract them once
# instead of on every grid iteration.
# NOTE(review): only `behavior_opt` (the most recent rollout) is used here; the
# pooled list `ox` is not read — confirm which one is intended.
states = [list(sample[0]) for sample in behavior_opt]

# Size the heatmaps from the grid itself rather than hard-coding 21 x 21.
grid_shape = (yy.shape[0], xx.shape[1])
heatmap_posterior = np.zeros(grid_shape)
heatmap_true = np.zeros(grid_shape)
for i in range(xx.shape[1]):
    for j in range(yy.shape[0]):
        r_j = [xx[0][i], yy[j][0]]
        rewards = [np.dot(r_j, s) for s in states]
        reward = sum(rewards)

        density = multivariate_normal.pdf(r_j, mean=prior_mean, cov=prior_cov)
        post = density * np.exp(alpha*reward)

        # Rows index the second reward weight (y), columns the first (x).
        heatmap_posterior[j, i] = post
        heatmap_true[j, i] = density

#heatmap_posterior = np.divide(heatmap_posterior, np.sum(heatmap_posterior))
ax = sns.heatmap(heatmap_posterior)
ax.invert_yaxis()
# NOTE(review): this assumes the heatmap drew one tick per grid value; confirm
# seaborn's automatic tick thinning is not active for this grid size.
ax.set_xticklabels(x, rotation=90)
ax.set_yticklabels(y, rotation=360)
plt.show()
plt.close()
# The prior alone (`heatmap_true`) can be plotted the same way for comparison:
# ax = sns.heatmap(heatmap_true)
# ax.set_xticklabels(x, rotation=90)
# ax.set_yticklabels(y, rotation=360)
# ax.invert_yaxis()
# plt.show()

In [None]:
# Roll out the behavior policy 100 times under one fixed reward weight vector
# and pool every rollout in `ox`.
r = [0.1, 0.1]
ox = []
for it in range(100):
    behavior_opt, opt_agent = runLinearFQI(
        dataset='bg',
        reward_weights_shared=r,
        behavior=True,
    )
    ox.extend(behavior_opt)

# Calculate posterior
# Regular grid of candidate reward weights: -1 to 1 along both axes, step 0.1
# (values are left unrounded in this cell)
x = np.arange(-1, 1.1, 0.1)
y = np.arange(-1, 1.1, 0.1)

# Sparse meshgrid: pairing xx[0][i] with yy[j][0] gives the grid coordinate (i, j)
xx, yy = np.meshgrid(x, y, sparse=True)

In [None]:
# Evaluate an unnormalized posterior over reward weights on the (xx, yy) grid:
#     post(r_j) = prior_density(r_j) * exp(alpha * sum_s r_j . s)
# using a standard-normal prior and the states of the rollout in `behavior_opt`.

alpha = 0.001
prior_mean = [0, 0]
prior_cov = [[1, 0], [0, 1]]

# The visited states do not depend on the grid point, so extract them once
# instead of on every grid iteration.
# NOTE(review): only `behavior_opt` (the most recent rollout) is used here; the
# pooled list `ox` is not read — confirm which one is intended.
states = [list(sample[0]) for sample in behavior_opt]

# Size the heatmaps from the grid itself rather than hard-coding 21 x 21.
grid_shape = (yy.shape[0], xx.shape[1])
heatmap_posterior = np.zeros(grid_shape)
heatmap_true = np.zeros(grid_shape)
for i in range(xx.shape[1]):
    for j in range(yy.shape[0]):
        r_j = [xx[0][i], yy[j][0]]
        rewards = [np.dot(r_j, s) for s in states]
        reward = sum(rewards)

        density = multivariate_normal.pdf(r_j, mean=prior_mean, cov=prior_cov)
        post = density * np.exp(alpha*reward)

        # Rows index the second reward weight (y), columns the first (x).
        heatmap_posterior[j, i] = post
        heatmap_true[j, i] = density

# Posterior first, then the prior alone for comparison.
ax = sns.heatmap(heatmap_posterior)
ax.invert_yaxis()
plt.show()
plt.close()
ax = sns.heatmap(heatmap_true)
ax.invert_yaxis()
plt.show()

# Generating a more accurate posterior

In [3]:
# Make the training reward functions evenly distributed
def generate_observations(init_experience=5, n=4):
    """Collect behavior rollouts for an evenly spaced grid of reward weights.

    The grid uses the values -1.0, -0.7, ..., 0.8 (step 0.3) on both axes,
    i.e. 7 x 7 = 49 reward weight vectors; note that 0.0 is not on the grid.

    Args:
        init_experience: forwarded unchanged to ``runLinearFQI``.
        n: forwarded unchanged to ``runLinearFQI``.

    Returns:
        A list of ``(sample[0], sample[1], reward_weights)`` tuples, one per
        rollout sample (presumably state, action, reward weights — verify
        against ``runLinearFQI``).
    """
    axis_values = [step / 10 for step in range(-10, 11, 3)]
    reward_grid = [[w_first, w_second] for w_first in axis_values for w_second in axis_values]

    observations = []
    for weights in reward_grid:
        rollout, _agent = runLinearFQI(
            dataset='bg',
            init_experience=init_experience,
            behavior=True,
            reward_weights_shared=weights,
            n=n,
        )
        # Tag every sample with the reward weights that generated it.
        observations.extend((sample[0], sample[1], weights) for sample in rollout)
    return observations

def distance_r(r, r_prime, h_prime=0.05):
    """Gaussian kernel on the cosine distance between two reward weight vectors.

    Computes ``exp(-d**2 / (2 * h_prime))`` where ``d`` is
    ``scipy.spatial.distance.cosine(r, r_prime)``.

    Args:
        r: first reward weight vector.
        r_prime: second reward weight vector.
        h_prime: kernel bandwidth. Defaults to 0.05, the value that was
            previously hard-coded. Other values tried in this notebook:
            0.0012 (proportional to the spread of all pairwise reward
            distances) and 0.001 (proportional to their mean).

    Returns:
        Kernel value: 1.0 for vectors pointing in the same direction, smaller
        as the angle between them grows.

    Note:
        The cosine distance is undefined when either vector has zero norm, so
        the result is NaN for an all-zero reward vector.
    """
    dist = scipy.spatial.distance.cosine(r, r_prime)
    return np.exp(-(np.power(dist, 2)/(2*h_prime)))

def distance_points(p1, p2, h=0.032):
    """Gaussian kernel on the Euclidean distance between two samples' first elements.

    Computes ``exp(-d**2 / (2 * h))`` where ``d`` is the Euclidean distance
    between ``p1[0]`` and ``p2[0]``. The second element of each sample (the
    action part) is deliberately ignored.

    Args:
        p1: sample whose element 0 is a coordinate vector.
        p2: sample whose element 0 is a coordinate vector.
        h: kernel bandwidth. Defaults to 0.032, the value that was previously
            hard-coded. Other values tried in this notebook: 0.326
            (proportional to the spread of all pairwise distances) and 0.4093
            (proportional to their mean).

    Returns:
        Kernel value: 1.0 when the coordinate vectors coincide, decaying
        towards 0 as they move apart.
    """
    # Removing the action part: only index 0 enters the distance.
    dist = scipy.spatial.distance.euclidean(p1[0], p2[0]) #+ scipy.spatial.distance.euclidean(p1[1], p2[1])
    return np.exp(-np.power(dist, 2)/(2*h))

def distance_rewards(r_k, observations):
    """Sum the reward kernel between ``r_k`` and the reward of every observation.

    Args:
        r_k: candidate reward weight vector.
        observations: iterable of samples whose element 2 is a reward weight
            vector.

    Returns:
        The sum of ``distance_r(r_k, sample[2])`` over all observations
        (0 for an empty iterable). Callers use it as a normalizing constant.
    """
    kernel_total = 0
    for observed in observations:
        kernel_total = kernel_total + distance_r(r_k, observed[2])
    return kernel_total

def generate_dataset_plot(dataset, reward):
    """Plot and return the normalized visitation counts of a dataset on a 4x4 grid.

    Args:
        dataset: iterable of samples; ``sample[0][0]`` and ``sample[0][1]`` are
            used as the column (x) and row (y) index, so they must be valid
            integer indices into a 4x4 array.
        reward: accepted for interface compatibility; not used by this function.

    Returns:
        The 4x4 count array divided by its largest absolute entry.
    """
    visit_counts = np.zeros((4, 4))
    for sample in dataset:
        col, row = sample[0][0], sample[0][1]
        visit_counts[row, col] += 1

    # Scale so the most visited cell equals 1.
    visit_counts /= np.max(np.abs(visit_counts))

    x_ticks = np.arange(0, 4, 1)
    y_ticks = np.arange(0, 4, 1)

    fig = plt.figure()
    fig.set_figwidth(5)
    fig.set_figheight(5)
    ax = sns.heatmap(visit_counts)
    # Put y = 0 at the bottom of the figure.
    ax.invert_yaxis()
    ax.set_xticklabels(x_ticks, rotation=90)
    ax.set_yticklabels(y_ticks, rotation=360)
#     plt.title("Demonstration density")
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.show()
    plt.close()
    return visit_counts
    

## Calculating what the kernel bandwidths `h` and `h_prime` should be

In [None]:
# Build the training observations with the function's defaults (init_experience=5, n=4).
observations = generate_observations()

In [None]:
# Distribution of the reward kernel value over every ordered pair of observations
# (including each observation paired with itself).
# NOTE(review): these are distance_r kernel values computed with its current
# bandwidth, not raw cosine distances — confirm that is the quantity wanted here.
all_distances = [
    distance_r(s[2], s_prime[2])
    for s in observations
    for s_prime in observations
]
sns.distplot(all_distances)
kernel_std = np.std(all_distances)
print("var distance: ", kernel_std*kernel_std)

In [None]:
# Distribution of the state kernel value over every ordered pair of observations
# (including each observation paired with itself).
all_distances = [
    distance_points(o, o_prime)
    for o in observations
    for o_prime in observations
]
sns.distplot(all_distances)
kernel_std = np.std(all_distances)
print("var distance: ", kernel_std*kernel_std)

In [None]:
# Candidate reward weights: -1 to 1 on both axes in steps of 0.2 (11 values each),
# rounded to one decimal to remove floating-point noise from arange.
x = np.around(np.arange(-1, 1.1, 0.2), decimals=1)
y = np.around(np.arange(-1, 1.1, 0.2), decimals=1)
xx, yy = np.meshgrid(x, y, sparse=True)

In [None]:
def estimate_post(r_k):
    """Estimate an unnormalized posterior value for the reward weights ``r_k``.

    Reads the notebook-level ``observations`` list. For every observation
    ``s_i`` a kernel-weighted sum over all observations ``s_j`` is formed and
    multiplied by a standard-normal prior density; the result is the product
    of these sums over all ``s_i``.

    Args:
        r_k: candidate reward weight vector of length 2.

    Returns:
        The product over observations of the prior-weighted kernel sums.
        Being a product of many small factors it can underflow to 0 for large
        observation sets.
    """
    prior = multivariate_normal.pdf(r_k, mean=[0, 0], cov=[[1, 0], [0, 1]])
    # distance_rewards needs the observation set as its second argument;
    # calling it with r_k alone raises a TypeError.
    dist_rewards = distance_rewards(r_k, observations)

    # Product over observations
    post = 1
    for s_i in observations:
        # Sum over observations
        sum_si = 0
        for s_j in observations:
            # NOTE(review): the reward kernel uses s_i[2] here, whereas
            # estimate_expert_prior weights by s_j[2] — confirm which is intended.
            likelihood = distance_points(s_i, s_j) * distance_r(r_k, s_i[2]) / dist_rewards
            sum_si += likelihood * prior
        post *= sum_si
    return post

In [None]:
# Evaluate estimate_post at every point of the 11 x 11 reward weight grid.
# Rows index the second reward weight (y), columns the first (x).
heatmap_posterior = np.zeros((11, 11))
all_rks = []
for i in tqdm.tqdm(range(xx.shape[1])):
    for j in range(yy.shape[0]):
        # Evaluate the posterior at this reward parameter
        r_k = [xx[0][i], yy[j][0]]
        post = estimate_post(r_k)
        heatmap_posterior[j, i] = post

In [None]:
# Show the posterior grid with the reward weight values as tick labels.
fig = plt.figure()
fig.set_figwidth(10)
fig.set_figheight(10)
ax = sns.heatmap(heatmap_posterior)
# Put the smallest second reward weight at the bottom of the figure.
ax.invert_yaxis()
ax.set_xticklabels(x, rotation=90)
ax.set_yticklabels(y, rotation=360)
plt.show()
plt.close()

# Expert trajectories
* TODO: parallel computing

In [4]:
def conditional_dist(s_i, dataset, reward):
    """Kernel estimate of the density of sample ``s_i`` conditioned on ``reward``.

    Each dataset sample ``s_j`` contributes its state kernel with ``s_i``,
    weighted by how close its reward (``s_j[2]``) is to ``reward``; the reward
    weights are normalized to sum to one over the dataset.

    Args:
        s_i: query sample; only ``s_i[0]`` enters the state kernel.
        dataset: iterable of samples whose element 2 is a reward weight vector.
        reward: reward weight vector to condition on.

    Returns:
        The weighted kernel sum. It is not divided by the Gaussian kernel
        normalizing constants, so it is only defined up to a constant factor.
    """
    dist_rewards = distance_rewards(reward, dataset)
    density = 0
    for s_j in dataset:
        reward_weight = distance_r(reward, s_j[2]) / dist_rewards
        state_kernel = distance_points(s_i, s_j)
        density += state_kernel * reward_weight
    return density

In [5]:
def estimate_expert_prior(r_k, behavior_opt, observations):
    """Log-likelihood of the expert samples under reward weights ``r_k``.

    For each expert sample ``s_i`` the conditional density is estimated as a
    sum over the training observations ``s_j`` of
    ``distance_points(s_i, s_j) * w_j``, where ``w_j`` is the reward kernel
    between ``r_k`` and ``s_j[2]`` normalized over all observations. The
    logarithms of these densities are summed.

    Args:
        r_k: candidate reward weight vector.
        behavior_opt: iterable of expert samples.
        observations: sequence of training samples whose element 2 is the
            reward weight vector that generated them.

    Returns:
        The summed log density. A density of exactly 0 contributes
        ``log(1e-12)`` instead of ``-inf``. No prior term is added despite the
        function's name.
    """
    dist_rewards = distance_rewards(r_k, observations)
    # The reward weights depend only on r_k and the observation, not on the
    # expert sample, so compute them once instead of once per expert sample.
    weights = [distance_r(r_k, s_j[2]) / dist_rewards for s_j in observations]
    log_floor = np.log(0.000000000001)

    post = 0
    for s_i in behavior_opt:
        sum_si = 0
        for s_j, weight in zip(observations, weights):
            sum_si += distance_points(s_i, s_j) * weight
        # Guard against log(0) when no observation lies near s_i.
        if sum_si == 0:
            post += log_floor
        else:
            post += np.log(sum_si)
    return post

In [None]:
# Training observations across the reward grid, plus one expert rollout for
# reward weights [1, 1]; print both sizes as a sanity check.
observations = generate_observations(init_experience=10, n=4)
print(len(observations))
behavior_opt, opt_agent = runLinearFQI(
    dataset='bg',
    init_experience=10,
    behavior=True,
    reward_weights_shared=[1, 1],
    n=4,
)
print(len(behavior_opt))

In [None]:
# Conditional density estimation
def _reward_weight_grid():
    """Return (x, y, xx, yy) for reward weights from -1 to 1 in steps of 0.2."""
    x = np.around(np.arange(-1, 1.1, 0.2), decimals=1)
    y = np.around(np.arange(-1, 1.1, 0.2), decimals=1)
    xx, yy = np.meshgrid(x, y, sparse=True)
    return x, y, xx, yy


def _show_heatmap(values, x_labels, y_labels, x_title=None, y_title=None):
    """Draw a 5x5 inch heatmap with y increasing upwards and the given tick labels."""
    fig = plt.figure()
    fig.set_figheight(5)
    fig.set_figwidth(5)
    ax = sns.heatmap(values)
    ax.invert_yaxis()
    if x_title is not None:
        plt.xlabel(x_title)
    if y_title is not None:
        plt.ylabel(y_title)
    ax.set_xticklabels(x_labels, rotation=90)
    ax.set_yticklabels(y_labels, rotation=360)
    plt.show()
    plt.close()


def _plot_conditional_density(observations, reward):
    """Plot conditional_dist over the 4x4 state grid, summed over the four actions."""
    x = np.arange(0, 4, 1)
    y = np.arange(0, 4, 1)
    xx, yy = np.meshgrid(x, y, sparse=True)
    heatmap_conditional = np.zeros((4, 4))
    action_pos_dict = {1: [-1, 0], 2: [1, 0], 3: [0, -1], 4: [0, 1]}
    for a in [1, 2, 3, 4]:
        action = action_pos_dict[a]
        for i in tqdm.tqdm(range(xx.shape[1])):
            for j in range(yy.shape[0]):
                # Density of this (state, action) pair under the given reward
                state = ([xx[0][i], yy[j][0]], action)
                heatmap_conditional[j, i] += conditional_dist(state, observations, reward)
    _show_heatmap(heatmap_conditional, x, y)


def _expert_score_grid(behavior_opt, observations):
    """Evaluate estimate_expert_prior on the reward weight grid; return (x, y, grid)."""
    x, y, xx, yy = _reward_weight_grid()
    heatmap_posterior = np.zeros((yy.shape[0], xx.shape[1]))
    for i in tqdm.tqdm(range(xx.shape[1])):
        for j in range(yy.shape[0]):
            # Evaluate the posterior at this reward parameter
            r_k = [xx[0][i], yy[j][0]]
            heatmap_posterior[j, i] = estimate_expert_prior(r_k, behavior_opt, observations)
    return x, y, heatmap_posterior


def heatmap_dataset_conditional_density_posterior(reward, plot=True):
    """Score every grid reward weight against an expert rollout generated for ``reward``.

    Generates fresh training observations (``generate_observations``) and one
    expert rollout for ``reward`` (``runLinearFQI``), then evaluates
    ``estimate_expert_prior`` on the 11 x 11 grid of reward weights from -1 to
    1 in steps of 0.2.

    Args:
        reward: true reward weight vector used to generate the expert rollout.
        plot: when True, additionally show (in this order) the expert
            visitation heatmap, the conditional density over the 4x4 state
            grid, and the resulting score grid.

    Returns:
        An 11 x 11 array; entry ``[j, i]`` is the value of
        ``estimate_expert_prior`` for reward weights ``[x[i], y[j]]``.
        NOTE(review): the centre entry (weights [0, 0]) is presumably NaN,
        since distance_r relies on the cosine distance — confirm.
    """
    # n observations
    observations = generate_observations(init_experience=10, n=4)
    # m expert observations
    behavior_opt, opt_agent = runLinearFQI(dataset='bg', init_experience=10, behavior=True, reward_weights_shared=reward, n=4)

    if plot:
        generate_dataset_plot(behavior_opt, reward)
        _plot_conditional_density(observations, reward)

    # The grid evaluation is identical whether or not plots are requested.
    x, y, heatmap_posterior = _expert_score_grid(behavior_opt, observations)

    if plot:
        _show_heatmap(heatmap_posterior, x, y,
                      x_title="Reward parameter 1", y_title="Reward parameter 2")
    return heatmap_posterior
    
    

In [None]:
def plot_reward(n=4, title='Reward = x + y', reward_params=None):
    """Plot the linear reward ``x*reward_params[0] + y*reward_params[1]`` on an n x n grid.

    The surface is scaled by its largest absolute value before plotting.
    This definition replaces the ``plot_reward`` imported from
    ``irl_gridworld`` at the top of the notebook.

    Args:
        n: grid side length.
        title: currently unused (the title call is commented out).
        reward_params: two reward weights ``[w_x, w_y]``. ``None`` falls back
            to ``[1, 1]``, which matches the default title; previously
            ``None`` raised a TypeError.

    Returns:
        None. The heatmap is drawn on a new 5x5 inch figure.
    """
    if reward_params is None:
        reward_params = [1, 1]

    positions = list(range(n))
    reward_matrix = np.zeros((n, n))
    for x in positions:
        for y in positions:
            # Rows index y, columns index x.
            reward_matrix[y, x] = x*reward_params[0] + y*reward_params[1]

    # Skip the scaling for an all-zero surface to avoid dividing by zero.
    max_abs = np.max(np.abs(reward_matrix))
    if max_abs > 0:
        reward_matrix /= max_abs

    plt.figure(figsize=(5, 5))
    ax = sns.heatmap(reward_matrix, xticklabels=positions, yticklabels=positions)
    plt.xlabel("X")
    plt.ylabel("Y")
    ax.invert_yaxis()
    #plt.title(title)

In [None]:
# Show the scaled reward surface for weights [-1, 1], i.e. reward = -x + y.
plot_reward(reward_params=[-1, 1])

In [None]:
# Evaluate the score grid for an expert with reward weights [-1, 1]; plots are skipped.
heatmap_posterior = heatmap_dataset_conditional_density_posterior([-1, 1], plot=False)

In [None]:
# Inspect the raw grid values.
heatmap_posterior

In [None]:
# Work on a copy so that edits to hmap_posterior do not modify heatmap_posterior.
hmap_posterior = heatmap_posterior.copy()
# hmap_posterior[5][5] = -83

In [None]:
# Overwrite the centre cell of the 11 x 11 grid (reward weights [0, 0]) with a hand-picked value.
# NOTE(review): presumably this replaces a NaN caused by the cosine distance of the zero
# vector; -74 is a magic number tied to one particular run — confirm before reusing.
hmap_posterior[5][5] = -74

In [None]:
# Min-max scale the grid to the range [0, 1] (np.ptp is max - min).
hmap_posterior = (hmap_posterior - np.min(hmap_posterior))/np.ptp(hmap_posterior)


In [None]:
# Plot the min-max scaled grid against the reward weight values.
x = np.around(np.arange(-1, 1.1, 0.2), decimals=1)
y = np.around(np.arange(-1, 1.1, 0.2), decimals=1)

fig = plt.figure()
fig.set_figwidth(7)
fig.set_figheight(7)
ax = sns.heatmap(hmap_posterior)
# Put the smallest second reward weight at the bottom of the figure.
ax.invert_yaxis()
plt.xlabel("Reward parameter 1")
plt.ylabel("Reward parameter 2")
#         plt.title("Expert posterior, true reward=" + str(reward))
ax.set_xticklabels(x, rotation=90)
ax.set_yticklabels(y, rotation=360)
plt.show()
plt.close()

In [None]:
# Sample from the posterior, then use that sample to calculate reward

In [None]:
# Inspect the raw grid values again before sampling from them.
heatmap_posterior

In [None]:
# Overwrite the centre cell (reward weights [0, 0]) of the raw grid with a hand-picked value.
# NOTE(review): this mutates heatmap_posterior itself, so the raw values shown in earlier
# cells are no longer what later cells see; -74 is a magic number — confirm.
heatmap_posterior[5][5] = -74

In [None]:
# Scale the grid by its largest absolute value (entries end up in [-1, 1]), then exponentiate.
# NOTE(review): after this scaling every entry of `post` lies in [e^-1, e^1], which flattens
# the distribution compared with exp(value - max(value)) — confirm this is intended.
post = heatmap_posterior / np.max(np.abs(heatmap_posterior))
post = np.exp(post)

In [None]:
# Visualise `post`, with the y axis flipped so the first row is drawn at the bottom.
ax = sns.heatmap(post)
ax.invert_yaxis()

In [None]:
# Normalise in place so the entries sum to 1 and can serve as multinomial probabilities.
post /= np.sum(post)

In [None]:
# Draw 1,000,000 samples over the flattened grid cells; `rewards` holds the count per cell.
# No random seed is set in this cell, so the counts differ between runs.
rewards = np.random.multinomial(1000000, post.flatten())
rewards

In [None]:
# Restore the 11 x 11 grid layout of the counts.
rewards = rewards.reshape((11, 11))
rewards

In [None]:
# Heatmap of the sample counts per grid cell, first row drawn at the bottom.
ax = sns.heatmap(rewards)
ax.invert_yaxis()

In [None]:
# Same heatmap with the counts integer-divided by 1000.
ax = sns.heatmap(rewards // 1000)
ax.invert_yaxis()

In [None]:
# Number of reward vectors to evaluate per grid cell: counts in units of 1000 draws
# (integer division, so each cell's remainder is dropped).
evaluations = rewards // 1000

In [None]:
# Reverse the row order (axis 0) of the counts.
# NOTE: not idempotent — running this cell a second time flips the rows back.
evaluations = np.flip(evaluations, 0)

In [None]:
# Inspect the per-cell counts after the flip.
evaluations

In [None]:
# Expand the per-cell counts into an explicit list of reward weight vectors:
# each grid point r_k is appended `count` times.
x = np.arange(-1, 1.1, 0.2)
x = np.around(x, decimals=1)
y = np.arange(-1, 1.1, 0.2)
y = np.around(y, decimals=1)
# NOTE: rebinds `rewards` from the multinomial count array to a Python list.
rewards = []
xx, yy = np.meshgrid(x, y, sparse=True)
for kk, i in enumerate(tqdm.tqdm(range(11))):
    for ll, j in enumerate(range(11)):
        r_k = [xx[0][i], yy[j][0]]
        # kk == i and ll == j here, so this reads evaluations[j, i].
        # NOTE(review): `evaluations` was flipped along axis 0 in an earlier cell, so
        # (assuming the cells ran once, in order) row j holds the counts drawn for
        # y[10 - j] rather than y[j] — confirm this pairing is intended.
        count = evaluations[ll, kk]
        for aa in range(count):
            rewards.append(r_k)

In [None]:
# Build one n x n reward surface, reward = x*r[0] + y*r[1], per sampled reward vector.
n = 4
reward_matrices = []
for r in rewards:
    reward_matrix = np.zeros((n, n))
    # Loop variables are named col/row so they do not overwrite the notebook-level
    # `x` and `y` grid arrays.
    for col in range(n):
        for row in range(n):
            reward_matrix[row, col] = col*r[0] + row*r[1]
    reward_matrices.append(reward_matrix)

# Stack once at the end instead of calling np.append per sample, which copies the
# whole array on every iteration. The all-zero placeholder at index 0 is kept, so
# the result has the same layout as before: the real surfaces are all_hmaps[1:].
all_hmaps = np.concatenate(
    [np.zeros((1, n, n)), np.asarray(reward_matrices).reshape(-1, n, n)],
    axis=0,
)

In [None]:
# Mean reward surface over all sampled reward vectors (index 0 of all_hmaps is a
# zero placeholder and is skipped), min-max scaled to [0, 1].
mean_hmap = all_hmaps[1:].mean(axis=0)
mean_hmap = (mean_hmap - mean_hmap.min()) / np.ptp(mean_hmap)

fig = plt.figure()
fig.set_figwidth(5)
fig.set_figheight(5)
ax = sns.heatmap(mean_hmap)
# ax.invert_yaxis()
plt.xlabel("X")
plt.ylabel("Y")
# plt.title("Reward Mean Scaled")
grid_ticks = [0, 1, 2, 3]
ax.set_xticklabels(grid_ticks, rotation=90)
ax.set_yticklabels(grid_ticks, rotation=360)
plt.show()
plt.close()

In [None]:
# Per-cell standard deviation of the reward surfaces over all sampled reward
# vectors (index 0 of all_hmaps is a zero placeholder and is skipped), scaled by
# its largest absolute value.
std_hmap = all_hmaps[1:].std(axis=0)
std_hmap /= np.abs(std_hmap).max()

fig = plt.figure()
fig.set_figwidth(5)
fig.set_figheight(5)
ax = sns.heatmap(std_hmap)
# ax.invert_yaxis()
plt.xlabel("X")
plt.ylabel("Y")
# plt.title("Reward SD Scaled")
grid_ticks = [0, 1, 2, 3]
ax.set_xticklabels(grid_ticks, rotation=90)
ax.set_yticklabels(grid_ticks, rotation=360)
plt.show()
plt.close()

# Cartpole environment