In [1]:
# Show that we're converging faster and to the correct posterior
# The posterior results in comparisons between the most dense rewards and least dense rewards

In [2]:
import json
import sys
sys.path.append('../simulated_fqi/')
import seaborn as sns
import tqdm
import matplotlib.pyplot as plt 
import numpy as np
import torch
import random
import shap
import configargparse
import torch
import torch.optim as optim
import scipy
from environments import Gridworld
from models.agents import NFQAgent
from models.networks import NFQNetwork, ContrastiveNFQNetwork
from util import get_logger, close_logger, load_models, make_reproducible, save_models
import matplotlib.pyplot as plt
import numpy as np
import itertools
from train import fqi
import tqdm
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math
from environments import CartPoleRegulatorEnv
from scipy.stats import norm, multivariate_normal
from sklearn.linear_model import LinearRegression
from irl_gridworld import find_feature_expectations, plot_reward, norm, find_valid_actions, generate_rollout, generate_policy_rollout, runLinearFQI, l2_norm
from multiprocessing import Pool
from matplotlib.colors import LogNorm, Normalize
plt.rcParams.update({'font.size': 10})

# Original reward
* Parameterized by x_success_range and theta_success_range
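* The reward is a cost (0 inside the success region, 0.2 elsewhere, 1 in a forbidden state), so it has to be minimized for success
* Do we want to recover these parameters? I'm not sure that recovering them guarantees we observe the same behavior; the tie between reward parameters and behavior is less certain here than it is in Gridworld. Also, this has nothing to do with improving the Q-function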

In [3]:
def plot_cart_reward(x_success_range, theta_success_range):
    """Draw the CartPole cost as a heatmap over (cart position, pole angle).

    Every grid cell receives one of three values:
      * 1   -- forbidden state: |position| > 2.4 or |angle| > pi/2
      * 0   -- success region: |position| < x_success_range and
               |angle| < theta_success_range
      * 0.2 -- any other state (accumulating cost)

    Args:
        x_success_range: half-width of the success region along cart position.
        theta_success_range: half-width of the success region along pole angle.
    """
    x_threshold = 2.4
    theta_threshold_radians = math.pi / 2
    cart_pos = [i / 10 for i in range(-30, 30)]
    angles = [i / 10 for i in range(-20, 25, 2)]

    # Rows index the angle, columns index the cart position.
    reward_matrix = np.zeros((len(angles), len(cart_pos)))
    for col, pos in enumerate(cart_pos):
        for row, ang in enumerate(angles):
            if abs(pos) > x_threshold or abs(ang) > theta_threshold_radians:
                # In a forbidden state
                reward = 1
            elif abs(pos) < x_success_range and abs(ang) < theta_success_range:
                # In success range
                reward = 0
            else:
                # Accumulating cost
                reward = 0.2
            reward_matrix[row, col] = reward

    plt.figure(figsize=(18, 7))
    ax = sns.heatmap(reward_matrix, xticklabels=cart_pos, yticklabels=angles)
    plt.xlabel("Cart Position")
    plt.ylabel("Pole Angle (radians)")
    # Heatmaps put row 0 at the top; flip so the angle increases upwards.
    ax.invert_yaxis()
    plt.title("Cartpole Reward: x_success="+str(x_success_range) + " theta_success:" + str(theta_success_range))

In [8]:
def generate_policy_rollouts(x_success, theta_success, init_experience=200, rollout_length=100, epoch=500, verbose=False, nns=10):
    """Train NFQ agents on the CartPole regulator task and collect labelled transitions.

    For each of `nns` repetitions this builds a train and an eval
    `CartPoleRegulatorEnv` with the given success region, fits a fresh
    `ContrastiveNFQNetwork` (with `is_contrastive=False`) on transitions
    gathered from the train environment, and then appends `rollout_length`
    rollouts taken from the eval environment to the result.

    NOTE(review): the returned rollouts are generated with `agent=None`, not
    with the trained `nfq_agent` (the agent-driven call was commented out in
    the original). Presumably `agent=None` lets the environment choose its own
    actions -- confirm this is what the downstream "expert" data should be.

    Args:
        x_success: forwarded to the environments as `x_success_range`.
        theta_success: forwarded to the environments as `theta_success_range`.
        init_experience: number of rollouts collected from the train
            environment before training. The same number is also drawn from
            the eval environment and discarded.
        rollout_length: number of eval-environment rollouts appended per
            repetition.
        epoch: training runs for at most `epoch + 1` iterations and stops
            early once three consecutive evaluations succeed.
        verbose: when True, run and print five extra evaluations and a
            message when training stops early.
        nns: number of independent train/collect repetitions.

    Returns:
        (rollouts, observations): `rollouts` is the flat list of collected
        transitions; `observations` holds one
        `(r[0], r[1], [x_success, theta_success])` tuple per transition `r`.
    """
    is_contrastive = False
    evaluations = 5 if verbose else 0
    rollouts = []
    for _ in range(nns):
        train_env = CartPoleRegulatorEnv(
            group=0, masscart=1.0, mode="train",
            x_success_range=x_success, theta_success_range=theta_success,
        )
        eval_env = CartPoleRegulatorEnv(
            group=0, masscart=1.0, mode='eval',
            x_success_range=x_success, theta_success_range=theta_success,
        )
        try:
            # Return value is unused; the call is kept in case it configures
            # logging as a side effect.
            get_logger()

            # Setup agent
            nfq_net = ContrastiveNFQNetwork(
                state_dim=train_env.state_dim, is_contrastive=is_contrastive
            )
            optimizer = optim.Adam(nfq_net.parameters(), lr=1e-1)
            nfq_agent = NFQAgent(nfq_net, optimizer)

            # NFQ main loop: the set of transition samples denoted as D
            bg_rollouts = []
            for _ in range(init_experience):
                rollout_bg, _ = train_env.generate_rollout(
                    None, render=False, group=0
                )
                bg_rollouts.extend(rollout_bg)

            # The original also drew `init_experience` rollouts from the eval
            # environment and never used them. The calls are kept so that the
            # eval environment goes through the same sequence of operations.
            # TODO(review): drop if the eval environment does not need this.
            for _ in range(init_experience):
                eval_env.generate_rollout(None, render=False, group=0)

            # Sliding window over the last three evaluation outcomes.
            bg_success_queue = [0] * 3
            for _ in tqdm.tqdm(range(epoch + 1)):
                state_action_b, target_q_values, groups = nfq_agent.generate_pattern_set(
                    bg_rollouts
                )

                if not nfq_net.freeze_shared:
                    nfq_agent.train((state_action_b, target_q_values, groups))

                _, eval_success_bg, _ = nfq_agent.evaluate(eval_env, render=False)
                bg_success_queue = bg_success_queue[1:]
                bg_success_queue.append(1 if eval_success_bg else 0)

                # Three successes in a row: stop training.
                if sum(bg_success_queue) == 3 and not nfq_net.freeze_shared:
                    nfq_net.freeze_shared = True
                    if verbose:
                        print("FREEZING SHARED")
                    break

            eval_env.step_number = 0
            eval_env.max_steps = 1000
            num_steps_bg = []
            for _ in range(evaluations):
                eval_episode_length_bg, eval_success_bg, _ = nfq_agent.evaluate(eval_env, False)
                if verbose:
                    print(eval_episode_length_bg, eval_success_bg)
                num_steps_bg.append(eval_episode_length_bg)
            if verbose:
                print("BG stayed up for steps: ", num_steps_bg)

            for _ in range(rollout_length):
                rollout, _ = eval_env.generate_rollout(agent=None, render=False, group=0)
                #rollout, episode_cost = eval_env.generate_rollout(nfq_agent, render=False, group=0)
                rollouts.extend(rollout)
        finally:
            # Previously the environments were closed inside the evaluation
            # loop, i.e. before the eval environment was used for the final
            # rollouts, and not at all when verbose was False.
            train_env.close()
            eval_env.close()

    # Label every transition with the reward parameters that produced it.
    observations = [(r[0], r[1], [x_success, theta_success]) for r in rollouts]

    return rollouts, observations

In [5]:
# Count how often each (cart position, pole angle) grid cell is visited
def demonstration_density(rollouts, reward="", vmax=150):
    """Plot a heatmap of visit counts over (cart position, pole angle).

    Element [0] of every rollout entry is read as the state; state[0] is
    binned as the cart position and state[2] as the pole angle, both rounded
    to one decimal. The grid spans positions -3.0..2.9 and angles -2.0..2.4.

    States that fall outside the grid are skipped; the previous
    `list.index` lookup raised ValueError on them.

    Args:
        rollouts: iterable of transitions whose first element is the state.
        reward: label appended to the plot title.
        vmax: upper limit of the colour scale.
    """
    pos_lo, pos_hi = -30, 30      # grid bounds in tenths
    ang_lo, ang_hi = -20, 25
    cart_pos = [i / 10 for i in range(pos_lo, pos_hi)]
    angles = [i / 10 for i in range(ang_lo, ang_hi)]
    density = np.zeros((len(angles), len(cart_pos)))
    for r in rollouts:
        state = r[0]
        # Rounding to one decimal == rounding the value in tenths to an
        # integer, which gives the grid index directly.
        x_ind = int(np.rint(state[0] * 10)) - pos_lo
        theta_ind = int(np.rint(state[2] * 10)) - ang_lo
        if 0 <= x_ind < len(cart_pos) and 0 <= theta_ind < len(angles):
            density[theta_ind, x_ind] += 1
    plt.figure(figsize=(18, 7))
    ax = sns.heatmap(density, xticklabels=cart_pos, yticklabels=angles, vmax=vmax)
    plt.xlabel("Cart Position")
    plt.ylabel("Pole Angle (radians)")
    ax.invert_yaxis()
    plt.title("Demonstration Density with " + str(len(rollouts)) + " samples for reward " + str(reward))

In [None]:
# Sweep a small grid of success regions and plot the state density of the
# rollouts collected for each one.
reward_pos = [tenths / 10 for tenths in range(16, 27, 4)]
reward_ang = [tenths / 10 for tenths in range(2, 8, 3)]
for pos, ang in itertools.product(reward_pos, reward_ang):
    reward = [pos, ang]
    print("Reward: ", reward)
    rollouts, b = generate_policy_rollouts(
        pos, ang, init_experience=200, rollout_length=50, nns=10
    )
    demonstration_density(rollouts, reward, vmax=200)

# Conditional density estimates
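* Distance_r = Gaussian kernel, $\exp(-d^2 / 2h')$, where $d$ is the sum of the absolute differences between the x and theta success ranges of two rewards
* Distance_points = Gaussian kernel, $\exp(-d^2 / 2h)$, where $d$ is the Euclidean distance between two states (actions are ignored)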

In [None]:
# Find the correct set of hyperparameters
# Reward-kernel value for every ordered pair of observations (element [2] of
# an observation is its reward label).
all_distances = [
    distance_r(s[2], s_prime[2])
    for s, s_prime in itertools.product(observations, repeat=2)
]
sns.distplot(all_distances)
std_distance = np.std(all_distances)
print("var distance: ", std_distance * std_distance)

In [None]:
# State-kernel value for every ordered pair of observations.
all_distances = [
    distance_points(o, o_prime)
    for o, o_prime in itertools.product(observations, repeat=2)
]
sns.distplot(all_distances)
std_distance = np.std(all_distances)
print("var distance: ", std_distance * std_distance)

In [6]:
def distance_r(r, r_prime, h_prime=0.196):
    """Gaussian-kernel similarity between two reward parameterisations.

    Despite the name this is a similarity, not a distance: it equals 1 for
    identical rewards and decays towards 0 as they move apart.

    Args:
        r, r_prime: indexable pairs; elements [0] and [1] are compared.
        h_prime: kernel bandwidth. The default is the value previously
            hard-coded here (proportional to the standard deviation, or
            variance, of all reward-function distances).

    Returns:
        exp(-d**2 / (2 * h_prime)), where d is the sum of the absolute
        differences of the two components.
    """
    gap = np.absolute(r[0] - r_prime[0]) + np.absolute(r[1] - r_prime[1])
    return np.exp(-np.power(gap, 2) / (2 * h_prime))

def distance_points(p1, p2, h=0.147):
    """Gaussian-kernel similarity between the states of two observations.

    Only element [0] (the state) of each observation is compared; the action
    part is deliberately left out. The states must have the same length.

    Args:
        p1, p2: observations whose first element is a 1-D state vector.
        h: kernel bandwidth. The default is the value previously hard-coded
            here (proportional to the standard deviation, or variance, of
            all distances).

    Returns:
        exp(-d**2 / (2 * h)), where d is the Euclidean distance between the
        two states.
    """
    dist = scipy.spatial.distance.euclidean(p1[0], p2[0])
    return np.exp(-np.power(dist, 2) / (2 * h))

def distance_rewards(r_k, observations):
    """Sum the reward kernel of `r_k` against every observation's reward label.

    Element [2] of each observation is taken as its reward. The result is the
    normaliser used when turning per-observation kernel values into weights.
    Returns 0 for an empty `observations`.
    """
    return sum(distance_r(r_k, sample[2]) for sample in observations)

def conditional_dist(s_i, dataset, reward):
    """Kernel estimate of the density of `s_i` conditioned on `reward`.

    Every observation in `dataset` contributes its state-kernel value against
    `s_i`, weighted by how close its reward label (element [2]) is to
    `reward`; the weights are normalised to sum to one. The Gaussian
    normalising constant is not applied, so the result is unnormalised.

    Args:
        s_i: query observation (its element [0] is the state).
        dataset: iterable of observations; iterated more than once.
        reward: reward parameters to condition on.

    Returns:
        The weighted sum of kernel values, or 0 when the reward-kernel
        normaliser is zero (empty dataset, or every reward kernel underflowed),
        where the division would otherwise produce NaN.
    """
    dist_rewards = distance_rewards(reward, dataset)
    if dist_rewards == 0:
        return 0
    total = 0
    for s_j in dataset:
        weight = distance_r(reward, s_j[2]) / dist_rewards
        total += distance_points(s_i, s_j) * weight
    return total

def estimate_expert_prior(r_k, behavior_opt, observations):
    """Log-likelihood of the demonstrations `behavior_opt` under reward `r_k`.

    For every demonstration sample the conditional kernel density given `r_k`
    is estimated from `observations` (state kernel weighted by the normalised
    reward kernel), and the logs of these densities are summed. A density of
    exactly zero contributes log(1e-12) instead of -inf.

    Args:
        r_k: candidate reward parameters.
        behavior_opt: iterable of demonstration observations.
        observations: sequence of labelled observations; element [2] of each
            is its reward label.

    Returns:
        The summed log density (0 when `behavior_opt` is empty).
    """
    log_floor = np.log(0.000000000001)

    # The reward weights depend only on r_k and the observations, so compute
    # them once instead of once per demonstration sample.
    dist_rewards = distance_rewards(r_k, observations)
    if dist_rewards == 0:
        # Empty observations or total underflow: every weight is zero rather
        # than NaN from a 0/0 division.
        weights = [0.0 for _ in observations]
    else:
        weights = [distance_r(r_k, s_j[2]) / dist_rewards for s_j in observations]

    post = 0
    for s_i in behavior_opt:
        sum_si = 0
        for s_j, weight in zip(observations, weights):
            sum_si += distance_points(s_i, s_j) * weight
        post += log_floor if sum_si == 0 else np.log(sum_si)
    return post

In [7]:
def generate_data(init_experience=50, rollout_length=10, nns=2):
    """Collect labelled observations over a fixed grid of success regions.

    The grid pairs x success ranges from `range(10, 27, 10)` tenths with theta
    success ranges from `range(0, 25, 6)` tenths. For every pair,
    `generate_policy_rollouts` is called with the given keyword arguments and
    its labelled observations are pooled into a single list.

    Returns:
        The pooled list of observations.
    """
    reward_pos = [tenths / 10 for tenths in range(10, 27, 10)]
    reward_ang = [tenths / 10 for tenths in range(0, 25, 6)]
    behavior = []
    for pos, ang in itertools.product(reward_pos, reward_ang):
        _, labelled = generate_policy_rollouts(
            pos,
            ang,
            init_experience=init_experience,
            rollout_length=rollout_length,
            nns=nns,
        )
        behavior.extend(labelled)

    return behavior

In [10]:
# Training set: labelled transitions pooled over the grid of success regions
# defined inside generate_data.
observations = generate_data(init_experience=200, rollout_length=20, nns=1)
# Demonstrations for the reward to be recovered: x_success = 1.3 and
# theta_success = 12 degrees expressed in radians (12 * 2*pi / 360).
rollouts, behavior_opt = generate_policy_rollouts(1.3, 12*2 * math.pi / 360, init_experience=200, rollout_length=20, nns=1)

  2%|▏         | 8/501 [00:00<00:29, 16.93it/s]

ERROR! Session/line number was not unique in database. History logging moved to new session 584


 92%|█████████▏| 463/501 [01:24<00:06,  5.51it/s]
100%|██████████| 501/501 [00:44<00:00, 11.32it/s]
100%|██████████| 501/501 [00:40<00:00, 12.27it/s]
100%|██████████| 501/501 [00:43<00:00, 11.53it/s]
100%|██████████| 501/501 [01:42<00:00,  4.87it/s]
100%|██████████| 501/501 [01:10<00:00,  7.06it/s]
100%|██████████| 501/501 [00:34<00:00, 14.33it/s]
100%|██████████| 501/501 [01:39<00:00,  5.02it/s]
100%|██████████| 501/501 [00:31<00:00, 15.84it/s]
 80%|███████▉  | 400/501 [01:30<00:22,  4.43it/s]
100%|██████████| 501/501 [00:50<00:00,  9.88it/s]


In [None]:
# Write one "state action reward" line per training observation.
# The context manager closes the file even if serialisation fails part-way.
# Loop temporaries carry a _txt suffix so they do not overwrite the
# notebook-level `reward` / `state` variables used by other cells.
with open("training_data.txt", "w") as textfile:
    for element in observations:
        state_txt = np.array2string(element[0], precision=5, separator=',')
        action_txt = str(element[1])
        reward_txt = str(element[2])
        textfile.write(state_txt + " " + action_txt + " " + reward_txt + "\n")

In [None]:
# Write one "state action reward" line per demonstration observation.
# The context manager closes the file even if serialisation fails part-way.
# Loop temporaries carry a _txt suffix so they do not overwrite the
# notebook-level `reward` / `state` variables used by other cells.
with open("behavior_opt_13_12.txt", "w") as textfile:
    for element in behavior_opt:
        state_txt = np.array2string(element[0], precision=5, separator=',')
        action_txt = str(element[1])
        reward_txt = str(element[2])
        textfile.write(state_txt + " " + action_txt + " " + reward_txt + "\n")

In [None]:
# Conditional density of states given the target reward, evaluated on a
# (cart position, pole angle) grid.
reward = [1.3, 12*2 * math.pi / 360]
cart_pos = [i/10 for i in range(-30, 30)]
angles = [i/10 for i in range(-20, 25)]
heatmap_conditional = np.zeros((len(angles), len(cart_pos)))

# conditional_dist compares element [0] of the query with element [0] of each
# observation. Passing a bare [pos, ang] list made element [0] the scalar
# `pos`, so the angle never entered the estimate. Build an observation-shaped
# query whose state has the same length as the stored states instead.
# NOTE(review): the remaining state components are fixed at 0, i.e. this is a
# slice of the density rather than a marginal -- confirm that is the intent.
state_dim = len(observations[0][0])
for i, pos in enumerate(cart_pos):
    for j, ang in enumerate(angles):
        state = np.zeros(state_dim)
        state[0] = pos   # cart position lives at index 0
        state[2] = ang   # pole angle lives at index 2
        c_est = conditional_dist((state, None, reward), observations, reward)
        heatmap_conditional[j, i] = c_est
fig = plt.figure(figsize=(18, 7))
plt.xlabel("Cart position")
plt.ylabel("Pole angle")
ax = sns.heatmap(heatmap_conditional, xticklabels=cart_pos, yticklabels=angles)
ax.invert_yaxis()

In [None]:
# Score every candidate reward on a grid by the log-likelihood it assigns to
# the demonstrations. This evaluates estimate_expert_prior once per grid cell
# and is slow.
reward_pos = [i/10 for i in range(10, 27)]
reward_ang = [i/10 for i in range(0, 25, 2)]
reward = [1.3, 12*2 * math.pi / 360]

heatmap_posterior = np.zeros((len(reward_ang), len(reward_pos)))
for i, pos in enumerate(tqdm.tqdm(reward_pos)):
    for j, ang in enumerate(reward_ang):
        r_k = [pos, ang]
        post = estimate_expert_prior(r_k, behavior_opt, observations)
        heatmap_posterior[j, i] = post
fig = plt.figure()
fig.set_figheight(5)
fig.set_figwidth(5)
# Hand the labels to seaborn so that each one is tied to its own row/column.
# Relabelling afterwards with set_xticklabels relies on seaborn having kept a
# tick for every cell, which its automatic label thinning does not guarantee.
ax = sns.heatmap(heatmap_posterior, xticklabels=reward_pos, yticklabels=reward_ang)
ax.invert_yaxis()
plt.setp(ax.get_xticklabels(), rotation=90)
plt.setp(ax.get_yticklabels(), rotation=0)
plt.xlabel("X Success Range")
plt.ylabel("Angle Success Range")
plt.title("Expert posterior, true reward=" + str(reward))
plt.show()
plt.close()

 47%|████▋     | 8/17 [5:10:34<5:54:40, 2364.54s/it]

In [None]:
# Sanity check: show the first serialised training observation.
with open("training_data.txt") as f:
    lines = list(f)
print(lines[0])

In [None]:
# Regenerate demonstrations for x_success = 1.4 and theta_success = 12 degrees
# (in radians), with 15 repetitions. This overwrites the notebook-level
# `rollouts` and `behavior_opt`.
rollouts, behavior_opt = generate_policy_rollouts(1.4, 12*2 * math.pi / 360, nns=15)

In [None]:
# State-visit density of the current `rollouts`; colour scale capped at 200.
demonstration_density(rollouts, vmax=200)

In [None]:
# Map from str(reward) to the list of states collected for that reward.
# NOTE(review): this cell reads `reward` and `states` from whichever cell ran
# last; in the visible notebook `states` is only assigned in a later cell, so
# this fails on a fresh top-to-bottom run. Re-running it also resets the map.
rewards_to_states = {}
rewards_to_states[str(reward)] = states

In [None]:
# Persist the reward -> states map as JSON (keys are the str(reward) strings).
# `json` is already imported in the notebook's import cell.
import json
with open('rewards_to_states.json', 'w') as fp:
    json.dump(rewards_to_states, fp)

In [None]:
def dd(states, percentage='100', reward="[1.4, 0.0]", vmax=200):
    """Plot a heatmap of visit counts over (cart position, pole angle).

    Each entry of `states` is a state vector; state[0] is binned as the cart
    position and state[2] as the pole angle, both rounded to one decimal. The
    grid spans positions -3.0..2.9 and angles -2.0..2.4.

    States that fall outside the grid are skipped; the previous
    `list.index` lookup raised ValueError on them.

    Args:
        states: iterable of state vectors.
        percentage: label shown first in the plot title.
        reward: label shown second in the plot title.
        vmax: upper limit of the colour scale.
    """
    pos_lo, pos_hi = -30, 30      # grid bounds in tenths
    ang_lo, ang_hi = -20, 25
    cart_pos = [i / 10 for i in range(pos_lo, pos_hi)]
    angles = [i / 10 for i in range(ang_lo, ang_hi)]
    density = np.zeros((len(angles), len(cart_pos)))
    for state in states:
        # Rounding to one decimal == rounding the value in tenths to an
        # integer, which gives the grid index directly.
        x_ind = int(np.rint(state[0] * 10)) - pos_lo
        theta_ind = int(np.rint(state[2] * 10)) - ang_lo
        if 0 <= x_ind < len(cart_pos) and 0 <= theta_ind < len(angles):
            density[theta_ind, x_ind] += 1
    plt.figure(figsize=(18, 7))
    ax = sns.heatmap(density, xticklabels=cart_pos, yticklabels=angles, vmax=vmax)
    plt.xlabel("Cart Position")
    plt.ylabel("Pole Angle (radians)")
    ax.invert_yaxis()
    plt.title(str(percentage) + " " + str(reward))

In [None]:
# Persist the reward -> states map as JSON, overwriting any existing file.
# This repeats an earlier cell that writes the same file.
with open('rewards_to_states.json', 'w') as fp:
    json.dump(rewards_to_states, fp)

In [None]:
# Collect demonstrations for one reward, store their states, and plot the
# state density for growing fractions (10%..100%) of that data.
# Requires `rewards_to_states` to exist already.
reward = [3.6, 1.5]
rollouts, behavior_opt = generate_policy_rollouts(reward[0], reward[1], nns=15)
states = [b[0].tolist() for b in behavior_opt]
rewards_to_states[str(reward)] = states
percentages = [i/10 for i in range(1, 11)]
states = np.asarray(rewards_to_states[str(reward)])
for p in percentages:
    # Draw each subset without replacement so that a fraction p really holds
    # p * N distinct samples. np.random.randint drew with replacement, so even
    # the "100%" subset contained duplicates and missed part of the data.
    idx = np.random.choice(len(states), size=int(p*len(states)), replace=False)
    s_p = states[idx, :]
    dd(s_p, p, reward)