In [1]:
import json
import sys
sys.path.append('../simulated_fqi/')
import seaborn as sns
import tqdm
import matplotlib.pyplot as plt 
import numpy as np
import torch
import random
import shap
import configargparse
import torch
import torch.optim as optim
import torch
import torch.nn as nn

from environments import CartPoleRegulatorEnv
from models.networks import NFQNetwork, ContrastiveNFQNetwork
from models.agents import NFQAgent
from util import get_logger, close_logger, load_models, make_reproducible, save_models
import matplotlib.pyplot as plt
import numpy as np
import itertools
from train import fqi
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math
from sklearn.linear_model import LinearRegression
from irl_gridworld import find_feature_expectations, plot_reward, norm, find_valid_actions, generate_rollout, generate_policy_rollout, runLinearFQI
from train import fqi

# Modifying the state representation using a phi function
* [a, b, c, d] --> [a, b, c, d, e (indicator if cart in success range), f (indicator if pole in success range)]
* No combination of reward weights makes the success range appear as a box (?) — maybe I'm plotting something wrong?

In [2]:
# Build a training environment with explicit shared / foreground reward weights
# and pool the transitions of ten rollouts generated with no agent (None).
train_env = CartPoleRegulatorEnv(
    group=0,
    masscart=1.0,
    mode="train",
    shared_weights=[0.2, 0.1, 0.3, 0.4],
    fg_weights=[0.2, 0.2, 0.3, 0.3],
)
all_rollouts = []
for _ in range(10):
    rollout, episode_cost = train_env.generate_rollout(None, render=False, group=0)
    all_rollouts += rollout

TypeError: __init__() got an unexpected keyword argument 'shared_weights'

In [None]:
# The third entry of every transition is plotted as the reward/cost signal.
rewards = [cost for _, _, cost, *_ in all_rollouts]
sns.distplot(rewards)

# Cartpole with Linear Reward
* Just define two reward functions that differ in their allowable (success) range; the bg range is the narrower one.
* The state is the indicators (1, 1) or (pos, angle) or (-4.8, -1)

In [None]:
def cart_feature_expectations(dataset):
    """Average the four indicator features over a collection of transitions.

    Args:
        dataset: iterable of transitions. The first element of each transition
            must be a state with exactly eight entries; the last four are read,
            in order, as the indicators (pos_fg, pos_bg, ang_fg, ang_bg).

    Returns:
        list: ``[avg_pos_fg, avg_pos_bg, avg_ang_fg, avg_ang_bg]``, each computed
        with ``np.mean`` over the whole dataset.
    """
    pos_fg_vals, pos_bg_vals, ang_fg_vals, ang_bg_vals = [], [], [], []
    for transition in dataset:
        # The first four state entries are ignored; only the indicators matter here.
        _, _, _, _, pos_fg, pos_bg, ang_fg, ang_bg = transition[0]
        pos_fg_vals.append(pos_fg)
        pos_bg_vals.append(pos_bg)
        ang_fg_vals.append(ang_fg)
        ang_bg_vals.append(ang_bg)

    return [
        np.mean(column)
        for column in (pos_fg_vals, pos_bg_vals, ang_fg_vals, ang_bg_vals)
    ]

In [None]:
def runFQI(init_experience=400, verbose=True, group=0, shared_weights=[0.2, 0.1, 0.3, 0.4], fg_weights=[0.2, 0.2, 0.3, 0.3]):
    """Fit a linear cost model on collected rollouts, then roll out with that model.

    Args:
        init_experience: number of rollouts to generate both for fitting and for
            the returned policy rollouts. Must be positive.
        verbose: accepted for interface compatibility; not used.
        group: group id forwarded to ``generate_rollout`` while collecting the
            fitting data.
        shared_weights: reward weights forwarded to ``CartPoleRegulatorEnv``.
        fg_weights: foreground reward weights forwarded to ``CartPoleRegulatorEnv``
            (callers pass ``None`` for the background-only case).

    Returns:
        tuple: ``(policy_rollouts, reg)`` where ``policy_rollouts`` is the list of
        transitions generated by the eval environment driven by ``reg``, and
        ``reg`` is the fitted ``LinearRegression``.

    Raises:
        ValueError: if no transitions were collected (e.g. ``init_experience <= 0``).
    """
    # NOTE(review): both environments are built with group=0 regardless of `group`,
    # and the policy rollouts below also use group=0 — confirm this is intended.
    train_env = CartPoleRegulatorEnv(group=0, masscart=1.0, mode="train", shared_weights=shared_weights, fg_weights=fg_weights)
    eval_env = CartPoleRegulatorEnv(group=0, masscart=1.0, mode='eval', shared_weights=shared_weights, fg_weights=fg_weights)

    # Collect experience with no agent (None) driving the training environment.
    all_rollouts = []
    for _ in range(init_experience):
        rollout, _ = train_env.generate_rollout(None, render=False, group=group)
        all_rollouts.extend(rollout)
    if not all_rollouts:
        raise ValueError("runFQI collected no transitions; init_experience must be > 0")

    # Transitions are (state, action, cost, next_state, done, group); only the cost
    # and the next state are needed for the regression.
    _, _, cost_b, next_state_b, _, _ = zip(*all_rollouts)
    cost_b = torch.FloatTensor(cost_b)
    # Keep the columns from index 4 onwards (the features appended after the first four).
    next_state_b = np.asarray(next_state_b)[:, 4:]
    reg = LinearRegression().fit(next_state_b, cost_b)

    # Roll out the eval environment with the fitted regressor acting as the agent.
    policy_rollouts = []
    for _ in range(init_experience):
        rollout, _ = eval_env.generate_rollout(reg, render=False, group=0)
        policy_rollouts.extend(rollout)

    return policy_rollouts, reg

In [None]:
epochs = 30
learning_rate = 1
init_w_shared = [0.5] * 4
init_w_fg = [0.5] * 4

# Work on float arrays: with plain lists, `w += lr * grad` would extend the list
# (or fail for a scalar gradient) instead of adding element-wise.
w_shared = np.array(init_w_shared, dtype=float)
w_fg = np.array(init_w_fg, dtype=float)

# Mean absolute change of each weight vector per epoch (plotted at the end).
diff_shared = []
diff_fg = []

true_weights_bg = [0.2, 0.1, 0.3, 0.4]
true_weights_fg = [0.2, 0.2, 0.3, 0.3]
print("Generating behavior with true weights shared=" + str(true_weights_bg) + " true_weights_fg=" + str(true_weights_fg))
behavior_rollout_bg, opt_agent_bg = runFQI(group=0, shared_weights=true_weights_bg)
behavior_rollout_fg, opt_agent_fg = runFQI(group=1, shared_weights=true_weights_bg, fg_weights=true_weights_fg)

# Feature expectations of the demonstrated (behavior) rollouts.
muB_shared = cart_feature_expectations(behavior_rollout_bg)
muB_fg = cart_feature_expectations(behavior_rollout_fg)

for i in range(epochs):
    print('Epoch', i, '- Train pi with current w_shared='+str(w_shared) + " w_fg=", w_fg )
    policy_rollout_bg, agent_pi_bg = runFQI(shared_weights=w_shared, fg_weights=None,group=0)
    policy_rollout_fg, agent_pi_fg = runFQI(shared_weights=w_shared, fg_weights=w_fg,group=1)

    # Feature expectations of the rollouts produced under the current weights.
    mu_shared = cart_feature_expectations(policy_rollout_bg)
    mu_fg = cart_feature_expectations(policy_rollout_fg)
    print("Shared feature expectations: ", mu_shared)
    print("Fg feature expectations: ", mu_fg)

    # Shared weights receive the mismatch of both groups; fg weights only the fg mismatch.
    grad_shared = norm(muB_shared) - norm(mu_shared) + norm(muB_fg) - norm(mu_fg)
    grad_fg = norm(muB_fg) - norm(mu_fg)
    print("Grad shared: " + str(grad_shared) + " Grad fg: " + str(grad_fg))

    # Snapshot by copy so the recorded diff reflects the whole update
    # (an alias would follow the new values).
    w_shared_old = w_shared.copy()
    w_shared = w_shared + learning_rate * grad_shared
    w_shared = w_shared / np.sum(np.abs(w_shared))  # L1-normalise

    w_fg_old = w_fg.copy()
    w_fg = w_fg + learning_rate * grad_fg
    # NOTE(review): an earlier comment said not to normalise w_fg, yet the code does; kept as coded.
    w_fg = w_fg / np.sum(np.abs(w_fg))

    diff_shared.append(mean_absolute_error(w_shared_old, w_shared))
    diff_fg.append(mean_absolute_error(w_fg_old, w_fg))

plt.plot(diff_shared)
plt.plot(diff_fg)

In [None]:
# The background reward is the shared weight vector; the foreground reward adds
# the fg-specific weights on top of it.
reward_bg = w_shared
reward_fg = w_fg + w_shared
print("True Reward FG: ", true_weights_fg, " Learned Reward Fg: ", reward_fg, sep="")
print("True Reward BG: ", true_weights_bg, " Learned Reward Bg: ", reward_bg, sep="")

## Plot the reward over a densely sampled grid of cart position and pole angle

In [1]:
def plot_reward_cart(reward_weights_shared=None, reward_weights_fg=None, agent=None, title='Learned Shared Reward'):
    """Plot a heatmap of the reward over a grid of cart positions and pole angles.

    Each grid point is mapped to the four indicator features
    ``[pos_fg, pos_bg, ang_fg, ang_bg]`` (1 when the value lies strictly inside
    the corresponding success range, else 0) and scored by one of:

    * ``reward_weights_shared + reward_weights_fg`` when both are given,
    * ``reward_weights_shared`` alone when only it is given,
    * ``agent.predict([features])`` when no shared weights are given.

    Args:
        reward_weights_shared: length-4 weight vector, or None.
        reward_weights_fg: length-4 weight vector added to the shared one, or None.
        agent: object with a ``predict`` method, used only if
            ``reward_weights_shared`` is None.
        title: figure title.

    Raises:
        ValueError: if neither ``reward_weights_shared`` nor ``agent`` is given
            (previously this surfaced as an unbound-variable error).
    """
    if reward_weights_shared is None and agent is None:
        raise ValueError("plot_reward_cart needs either reward_weights_shared or agent")

    x_success_range_fg = 2.4
    theta_success_range_fg = 12 * 2 * math.pi / 360  # 12 degrees in radians
    x_success_range_bg = 2.0
    theta_success_range_bg = 10 * 2 * math.pi / 360  # 10 degrees in radians
    cart_pos = [i/10 for i in range(-48, 48)]
    angles = [i/100 for i in range(-40, 40, 2)]

    # Resolve the linear weights once; None means "score with the agent instead".
    if reward_weights_shared is None:
        weights = None
    elif reward_weights_fg is None:
        weights = reward_weights_shared
    else:
        weights = np.add(reward_weights_shared, reward_weights_fg)

    # Angle indicators do not depend on the cart position, so compute them once.
    angle_indicators = [
        (
            int(-theta_success_range_fg < ang < theta_success_range_fg),
            int(-theta_success_range_bg < ang < theta_success_range_bg),
        )
        for ang in angles
    ]

    reward_matrix = np.zeros((len(angles), len(cart_pos)))
    for i, pos in enumerate(cart_pos):
        indicator_pos_fg = int(-x_success_range_fg < pos < x_success_range_fg)
        indicator_pos_bg = int(-x_success_range_bg < pos < x_success_range_bg)
        for j, (indicator_ang_fg, indicator_ang_bg) in enumerate(angle_indicators):
            state = [indicator_pos_fg, indicator_pos_bg, indicator_ang_fg, indicator_ang_bg]
            if weights is not None:
                reward = np.dot(state, weights)
            else:
                # predict returns an array-like for the single sample; take its scalar.
                reward = np.ravel(agent.predict([state]))[0]
            reward_matrix[j, i] = reward

    plt.figure(figsize=(18, 7))
    ax = sns.heatmap(reward_matrix, xticklabels=cart_pos, yticklabels=angles)
    plt.xlabel("Cart Position")
    plt.ylabel("Pole Angle (radians)")
    ax.invert_yaxis()
    plt.title(title)

In [2]:
# Shared (background) reward: true vs. learned.
plot_reward_cart(agent=None, reward_weights_shared=true_weights_bg, title='Shared True Reward')
plot_reward_cart(agent=None, reward_weights_shared=w_shared, title='Learned Shared Reward')

# Foreground reward: plot_reward_cart adds reward_weights_shared and reward_weights_fg
# itself, so pass only the fg-specific weights (passing w_fg + w_shared would count
# the shared weights twice).
plot_reward_cart(agent=None, reward_weights_shared=true_weights_bg, reward_weights_fg=true_weights_fg, title='Fg True Reward')
plot_reward_cart(agent=None, reward_weights_shared=w_shared, reward_weights_fg=w_fg, title="Learned Fg Reward")

NameError: name 'true_weights_bg' is not defined

# Initialization vs. returned weights experiment

In [None]:
def cartpoleFQI(epochs=20, learning_rate=1, init_w_shared = [0.5]*4, init_w_fg = [0.5]*4, verbose=False):
    """Run the feature-matching weight update from a given initialisation.

    Behavior rollouts are generated with fixed true weights
    (shared ``[1, 2, 4, 5]``, fg ``[5, 3, 2, 1]``); then for ``epochs`` iterations
    the current weights are used to generate rollouts via ``runFQI`` and are
    moved along the difference of feature expectations, followed by an L1
    normalisation.

    Args:
        epochs: number of update iterations.
        learning_rate: step size applied to both gradients.
        init_w_shared: initial shared weights (length 4); not modified.
        init_w_fg: initial foreground weights (length 4); not modified.
        verbose: print progress for each epoch.

    Returns:
        tuple: ``(norm(w_fg + w_shared), w_shared)`` — the combined foreground
        weights passed through ``norm`` and the learned shared weights.
    """
    # Copy into float arrays. With the list defaults, `w += lr * grad` would
    # extend the list instead of adding element-wise, and any in-place update
    # would leak into the caller's arrays / the shared default arguments.
    w_shared = np.array(init_w_shared, dtype=float)
    w_fg = np.array(init_w_fg, dtype=float)

    true_weights_bg = [1, 2, 4, 5]
    true_weights_fg = [5, 3, 2, 1]
    if verbose:
        print("Generating behavior with true weights shared=" + str(true_weights_bg) + " true_weights_fg=" + str(true_weights_fg))
    behavior_rollout_bg, opt_agent_bg = runFQI(group=0, shared_weights=true_weights_bg)
    behavior_rollout_fg, opt_agent_fg = runFQI(group=1, shared_weights=true_weights_bg, fg_weights=true_weights_fg)

    # Feature expectations of the demonstrated (behavior) rollouts.
    muB_shared = cart_feature_expectations(behavior_rollout_bg)
    muB_fg = cart_feature_expectations(behavior_rollout_fg)

    for i in range(epochs):
        if verbose:
            print('Epoch', i, '- Train pi with current w_shared='+str(w_shared) + " w_fg=", w_fg )
        policy_rollout_bg, agent_pi_bg = runFQI(shared_weights=w_shared, fg_weights=None,group=0)
        policy_rollout_fg, agent_pi_fg = runFQI(shared_weights=w_shared, fg_weights=w_fg,group=1)

        # Feature expectations of the rollouts produced under the current weights.
        mu_shared = cart_feature_expectations(policy_rollout_bg)
        mu_fg = cart_feature_expectations(policy_rollout_fg)
        if verbose:
            print("Shared feature expectations: ", mu_shared)
            print("Fg feature expectations: ", mu_fg)

        # Shared weights receive the mismatch of both groups; fg weights only the fg mismatch.
        grad_shared = norm(muB_shared) - norm(mu_shared) + norm(muB_fg) - norm(mu_fg)
        grad_fg = norm(muB_fg) - norm(mu_fg)
        if verbose:
            print("Grad shared: " + str(grad_shared) + " Grad fg: " + str(grad_fg))

        w_shared = w_shared + learning_rate * grad_shared
        w_shared = w_shared / np.sum(np.abs(w_shared))  # L1-normalise

        w_fg = w_fg + learning_rate * grad_fg
        # NOTE(review): an earlier comment said not to normalise w_fg, yet the code does; kept as coded.
        w_fg = w_fg / np.sum(np.abs(w_fg))

    return norm(w_fg + w_shared), w_shared

In [None]:
# One list per learned weight component, filled across random initialisations.
fg_one, fg_two, fg_three, fg_four = [], [], [], []
shared_one, shared_two, shared_three, shared_four = [], [], [], []
fg_columns = (fg_one, fg_two, fg_three, fg_four)
shared_columns = (shared_one, shared_two, shared_three, shared_four)

for i in tqdm.tqdm(range(20)):
    # Draw random starting weights and rescale them with `norm`.
    w_shared = np.random.normal(size=4)
    w_fg = np.random.normal(size=4)
    w_shared = w_shared / norm(w_shared)
    w_fg = w_fg / norm(w_fg)

    learned_fg, learned_shared = cartpoleFQI(init_w_shared=w_shared, init_w_fg=w_fg)

    for component, column in enumerate(fg_columns):
        column.append(learned_fg[component])
    for component, column in enumerate(shared_columns):
        column.append(learned_shared[component])
    print("learned fg: " + str(learned_fg) + " learned bg: " + str(learned_shared))

In [None]:
import pandas as pd

# One column per learned weight component. 'Fg Weight 3' previously reused
# fg_three, so the fourth foreground weight was never plotted.
d = {
    'Fg Weight 0': fg_one,
    'Fg Weight 1': fg_two,
    'Fg Weight 2': fg_three,
    'Fg Weight 3': fg_four,
    'Shared Weight 0': shared_one,
    'Shared Weight 1': shared_two,
    'Shared Weight 2': shared_three,
    'Shared Weight 3': shared_four,
}
df = pd.DataFrame(data=d)
plt.figure(figsize=(12, 6))
# pd.melt turns the wide frame into (variable, value) pairs: one box per weight.
ax = sns.boxplot(x="variable", y="value", data=pd.melt(df))
plt.title("Cartpole CIRL weight variation: original fg=[5, 3, 2, 1], original shared=[1, 2, 4, 5]")
plt.show()

# Phi function that is a CFQNetwork on Cartpole

In [None]:
class ContrastiveNFQNetwork(nn.Module):
    """Network with a shared trunk and head plus a foreground-only branch.

    The input has ``state_dim + 1`` features. ``forward`` returns
    ``shared_head(trunk(x)) + fg_head(fg_branch(trunk(x))) * group``; when the
    network is not contrastive, ``group`` is forced to 1.

    Linear weights of the shared trunk, the shared head and ``layers_last`` are
    drawn uniformly from [-0.5, 0.5]; linear weights of the foreground branch
    and head start at zero. ``layers_last`` is built and initialised but is not
    used by ``forward``. In contrastive mode the foreground parameters start
    with ``requires_grad = False``.
    """

    def __init__(self, state_dim, is_contrastive: bool = True, nonlinearity=nn.Sigmoid):
        super().__init__()
        self.state_dim = state_dim
        self.is_contrastive = is_contrastive
        self.freeze_shared = False
        self.freeze_fg = False

        in_features = self.state_dim + 1
        width = in_features  # hidden layers are as wide as the input

        def two_layer_block():
            return nn.Sequential(
                nn.Linear(in_features, width),
                nonlinearity(),
                nn.Linear(width, width),
                nonlinearity(),
            )

        def scalar_head(fan_in):
            return nn.Sequential(nn.Linear(fan_in, 1), nonlinearity())

        # Construction order is kept as-is so seeded initialisation is unchanged.
        self.layers_shared = two_layer_block()
        self.layers_fg = two_layer_block()
        self.layers_last_shared = scalar_head(width)
        self.layers_last_fg = scalar_head(width)
        self.layers_last = scalar_head(width * 2)

        def uniform_init(module):
            # Weights in [-0.5, 0.5]; biases keep their default initialisation.
            if type(module) is nn.Linear:
                torch.nn.init.uniform_(module.weight, -0.5, 0.5)

        def zero_init(module):
            if type(module) is nn.Linear:
                torch.nn.init.zeros_(module.weight)

        self.layers_shared.apply(uniform_init)
        self.layers_last_shared.apply(uniform_init)
        self.layers_fg.apply(zero_init)
        self.layers_last_fg.apply(zero_init)
        self.layers_last.apply(uniform_init)

        if is_contrastive:
            self._set_trainable(self._fg_blocks(), False)

    def _fg_blocks(self):
        """Foreground branch followed by its head."""
        return (self.layers_fg, self.layers_last_fg)

    def _shared_blocks(self):
        """Shared trunk followed by its head."""
        return (self.layers_shared, self.layers_last_shared)

    def _head_blocks(self):
        """Both scalar heads, shared first."""
        return (self.layers_last_shared, self.layers_last_fg)

    @staticmethod
    def _set_trainable(blocks, flag):
        """Set ``requires_grad`` to ``flag`` on every parameter of ``blocks``."""
        for block in blocks:
            for param in block.parameters():
                param.requires_grad = flag

    @staticmethod
    def _assert_trainable(blocks, flag):
        """Assert every parameter of ``blocks`` has ``requires_grad == flag``."""
        for block in blocks:
            for param in block.parameters():
                assert param.requires_grad == flag

    def forward(self, x: torch.Tensor, group=0) -> torch.Tensor:
        hidden = self.layers_shared(x)
        # A non-contrastive network always includes the foreground term.
        fg_gate = group if self.is_contrastive else 1

        shared_out = self.layers_last_shared(hidden)
        fg_out = self.layers_last_fg(self.layers_fg(hidden))

        return shared_out + fg_out * fg_gate

    def freeze_shared_layers(self):
        """Stop gradient updates for the shared trunk and shared head."""
        self._set_trainable(self._shared_blocks(), False)

    def unfreeze_fg_layers(self):
        """Enable gradient updates for the foreground branch and head."""
        self._set_trainable(self._fg_blocks(), True)

    def freeze_last_layers(self):
        """Stop gradient updates for both scalar heads."""
        self._set_trainable(self._head_blocks(), False)

    def unfreeze_last_layers(self):
        """Enable gradient updates for both scalar heads."""
        self._set_trainable(self._head_blocks(), True)

    def assert_correct_layers_frozen(self):
        """Check that ``requires_grad`` flags match the current training phase.

        Non-contrastive: everything trainable. Contrastive with
        ``freeze_shared`` set: foreground trainable, shared frozen. Contrastive
        otherwise: foreground frozen, shared trainable.
        """
        if not self.is_contrastive:
            fg_trainable, shared_trainable = True, True
        elif self.freeze_shared:
            fg_trainable, shared_trainable = True, False
        else:
            fg_trainable, shared_trainable = False, True

        self._assert_trainable(self._fg_blocks(), fg_trainable)
        self._assert_trainable(self._shared_blocks(), shared_trainable)

In [None]:
def fqi(verbose=True, is_contrastive=False, epochs=1000, init_experience=200, evaluations=5,force_left=5):
    """Train an NFQ agent on background/foreground cartpole and roll out its policy.

    Note: this definition shadows the ``fqi`` imported from ``train`` at the top
    of the notebook.

    Args:
        verbose: print phase changes and per-evaluation results.
        is_contrastive: build a contrastive network and train only the shared
            layers until the background task succeeds three evaluations in a row.
        epochs: maximum number of training epochs.
        init_experience: number of rollouts per group collected up front, and
            number of policy rollouts per group generated at the end.
        evaluations: number of final evaluation episodes per group.
        force_left: forwarded to every ``CartPoleRegulatorEnv``.

    Returns:
        tuple: ``(nfq_agent, policy_rollouts)`` — the trained agent and the pooled
        background + foreground transitions generated with it.
    """
    # Setup environment
    bg_cart_mass = 1.0
    fg_cart_mass = 1.0
    train_env_bg = CartPoleRegulatorEnv(group=0,masscart=bg_cart_mass,mode="train",force_left=force_left,is_contrastive=is_contrastive)
    train_env_fg = CartPoleRegulatorEnv(group=1,masscart=fg_cart_mass,mode="train",force_left=force_left,is_contrastive=is_contrastive)
    eval_env_bg = CartPoleRegulatorEnv(group=0,masscart=bg_cart_mass,mode="eval",force_left=force_left,is_contrastive=is_contrastive)
    eval_env_fg = CartPoleRegulatorEnv(group=1,masscart=fg_cart_mass,mode="eval",force_left=force_left,is_contrastive=is_contrastive)
    nfq_net = ContrastiveNFQNetwork(state_dim=train_env_bg.state_dim, is_contrastive=is_contrastive)

    # Setup agent: in contrastive mode only the shared layers are optimised at first.
    if is_contrastive:
        optimizer = optim.Adam(
            itertools.chain(
                nfq_net.layers_shared.parameters(),
                nfq_net.layers_last_shared.parameters(),
            ),
            lr=1e-1,
        )
    else:
        optimizer = optim.Adam(nfq_net.parameters(), lr=1e-1)

    nfq_agent = NFQAgent(nfq_net, optimizer)

    # Initial experience from the training environments, generated with no agent (None).
    bg_rollouts = []
    fg_rollouts = []
    for _ in range(init_experience):
        rollout_bg, _ = train_env_bg.generate_rollout(None, render=False, group=0)
        rollout_fg, _ = train_env_fg.generate_rollout(None, render=False, group=1)
        bg_rollouts.extend(rollout_bg)
        fg_rollouts.extend(rollout_fg)
    all_rollouts = bg_rollouts + fg_rollouts

    # Held-out experience from the eval environments.
    bg_rollouts_test = []
    fg_rollouts_test = []
    for _ in range(init_experience):
        rollout_bg, _ = eval_env_bg.generate_rollout(None, render=False, group=0)
        rollout_fg, _ = eval_env_fg.generate_rollout(None, render=False, group=1)
        bg_rollouts_test.extend(rollout_bg)
        fg_rollouts_test.extend(rollout_fg)
    # Fixed: the held-out set used to be extended with the *training* fg rollouts.
    all_rollouts_test = bg_rollouts_test + fg_rollouts_test

    # NOTE(review): the held-out pattern set is computed but not consumed anywhere below.
    X_test, _, test_groups = nfq_agent.generate_pattern_set(all_rollouts_test)

    # Sliding windows over the last three evaluation outcomes per group.
    bg_success_queue = [0] * 3
    fg_success_queue = [0] * 3
    epochs_fg = 0  # epoch at which the shared layers were frozen
    eval_fg = 0    # number of epochs spent in the frozen-shared phase
    for epoch in tqdm.tqdm(range(epochs)):

        state_action_b, target_q_values, groups = nfq_agent.generate_pattern_set(
            all_rollouts
        )
        pattern_set = (state_action_b, target_q_values, groups)

        if not nfq_net.freeze_shared:
            nfq_agent.train(pattern_set)

        eval_episode_length_fg, eval_success_fg, eval_episode_cost_fg = 0, 0, 0
        if nfq_net.freeze_shared:
            eval_fg += 1
            # Training resumes only after 50 epochs in the frozen-shared phase.
            if eval_fg > 50:
                nfq_agent.train(pattern_set)

        if is_contrastive:
            if nfq_net.freeze_shared:
                (
                    eval_episode_length_fg,
                    eval_success_fg,
                    eval_episode_cost_fg,
                ) = nfq_agent.evaluate(eval_env_fg, render=False)
                nfq_net.assert_correct_layers_frozen()
            else:
                nfq_net.assert_correct_layers_frozen()
                (
                    eval_episode_length_bg,
                    eval_success_bg,
                    eval_episode_cost_bg,
                ) = nfq_agent.evaluate(eval_env_bg, render=False)
        else:
            (
                eval_episode_length_bg,
                eval_success_bg,
                eval_episode_cost_bg,
            ) = nfq_agent.evaluate(eval_env_bg, render=False)
            (
                eval_episode_length_fg,
                eval_success_fg,
                eval_episode_cost_fg,
            ) = nfq_agent.evaluate(eval_env_fg, render=False)

        # In the contrastive frozen phase the bg result is the one from the last bg evaluation.
        bg_success_queue = bg_success_queue[1:]
        bg_success_queue.append(1 if eval_success_bg else 0)

        fg_success_queue = fg_success_queue[1:]
        fg_success_queue.append(1 if eval_success_fg else 0)

        if sum(bg_success_queue) == 3 and not nfq_net.freeze_shared:
            if epochs_fg == 0:
                epochs_fg = epoch
            nfq_net.freeze_shared = True
            if verbose:
                print("FREEZING SHARED")
            if is_contrastive:
                # NOTE(review): the optimizer still holds only the shared parameters
                # here, so the unfrozen fg layers are not registered with it — confirm.
                nfq_net.freeze_shared_layers()
                nfq_net.unfreeze_fg_layers()
            else:
                for param in nfq_net.layers_fg.parameters():
                    param.requires_grad = False
                for param in nfq_net.layers_last_fg.parameters():
                    param.requires_grad = False

                # NOTE(review): the fg parameters were just frozen, yet the new
                # optimizer is built over exactly those parameters — confirm intent.
                optimizer = optim.Adam(
                    itertools.chain(
                        nfq_net.layers_fg.parameters(),
                        nfq_net.layers_last_fg.parameters(),
                    ),
                    lr=1e-1,
                )
                nfq_agent._optimizer = optimizer

        if sum(fg_success_queue) == 3:
            break

    # Final evaluation with a longer horizon.
    eval_env_bg.step_number = 0
    eval_env_fg.step_number = 0

    eval_env_bg.max_steps = 1000
    eval_env_fg.max_steps = 1000

    num_steps_bg = []
    num_steps_fg = []
    for it in range(evaluations):
        (
            eval_episode_length_bg,
            eval_success_bg,
            eval_episode_cost_bg,
        ) = nfq_agent.evaluate(eval_env_bg, False)
        if verbose:
            print(eval_episode_length_bg, eval_success_bg)
        num_steps_bg.append(eval_episode_length_bg)
        train_env_bg.close()
        eval_env_bg.close()

        (
            eval_episode_length_fg,
            eval_success_fg,
            eval_episode_cost_fg,
        ) = nfq_agent.evaluate(eval_env_fg, False)
        if verbose:
            print(eval_episode_length_fg, eval_success_fg)
        num_steps_fg.append(eval_episode_length_fg)
        train_env_fg.close()
        eval_env_fg.close()
    print("Fg trained after " + str(epochs_fg) + " epochs")
    print("BG stayed up for steps: ", num_steps_bg)
    print("FG stayed up for steps: ", num_steps_fg)

    # Roll out the trained agent on both groups and pool the transitions.
    policy_rollouts = []
    for _ in range(init_experience):
        rollout, _ = eval_env_bg.generate_rollout(nfq_agent, render=False, group=0)
        policy_rollouts.extend(rollout)

        rollout, _ = eval_env_fg.generate_rollout(nfq_agent, render=False, group=1)
        policy_rollouts.extend(rollout)

    # Fixed: this used to return the undefined name `reg` (NameError), followed by
    # an unreachable `return nfq_agent`.
    return nfq_agent, policy_rollouts

In [None]:
# TODO: Fix dynamics
# Runs fqi() with all default arguments and unpacks its result as (agent, rollouts).
optAgent, policy_rollout = fqi()


In [None]:
epochs = 20
learning_rate = 1
verbose = False

if verbose:
    print("Generating optimal behavior using CFQNetwork")
# NOTE(review): runFQI returns (policy_rollouts, reg), so optAgent holds that tuple;
# it is not used below, and the message above mentions a CFQNetwork that is not
# involved in this call.
optAgent = runFQI(group=0, shared_weights=true_weights_bg)

# Behavior rollouts come from an earlier cell of this notebook.
muB_shared = cart_feature_expectations(behavior_rollout_bg)
muB_fg = cart_feature_expectations(behavior_rollout_fg)

# Start from whatever weights earlier cells left behind, copied into float arrays
# so the updates below are element-wise and do not touch the originals.
w_shared = np.array(w_shared, dtype=float)
w_fg = np.array(w_fg, dtype=float)

for i in range(epochs):
    if verbose:
        print('Epoch', i, '- Train pi with current w_shared='+str(w_shared) + " w_fg=", w_fg )
    policy_rollout_bg, agent_pi_bg = runFQI(shared_weights=w_shared, fg_weights=None,group=0)
    policy_rollout_fg, agent_pi_fg = runFQI(shared_weights=w_shared, fg_weights=w_fg,group=1)

    # Feature expectations of the rollouts produced under the current weights.
    mu_shared = cart_feature_expectations(policy_rollout_bg)
    mu_fg = cart_feature_expectations(policy_rollout_fg)
    if verbose:
        print("Shared feature expectations: ", mu_shared)
        print("Fg feature expectations: ", mu_fg)

    # Shared weights receive the mismatch of both groups; fg weights only the fg mismatch.
    grad_shared = norm(muB_shared) - norm(mu_shared) + norm(muB_fg) - norm(mu_fg)
    grad_fg = norm(muB_fg) - norm(mu_fg)
    if verbose:
        print("Grad shared: " + str(grad_shared) + " Grad fg: " + str(grad_fg))

    w_shared = w_shared + learning_rate * grad_shared
    w_shared = w_shared / np.sum(np.abs(w_shared))  # L1-normalise

    w_fg = w_fg + learning_rate * grad_fg
    # NOTE(review): an earlier comment said not to normalise w_fg, yet the code does; kept as coded.
    w_fg = w_fg / np.sum(np.abs(w_fg))