In [1]:
import os
import shutil

import gym
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
import shutil


from itertools import count
from collections import namedtuple
from collections import defaultdict
from torch.autograd import Variable
from tqdm import tnrange
from random import choice

from complex_gridword import GridworldEnv
from nets import *
from draw_methods import *
from hyper_parametrs import *
from env_methods import *
from tensorboardX import SummaryWriter
from utils import command

In [2]:
# TensorBoard logging setup: create the event writer for this run and launch the
# `tensorboard` command line through the project's `command.Command` helper.
# NOTE(review): `logs_directory` and `board_port` are not defined in this notebook;
# presumably they arrive via the `hyper_parametrs` star-import — confirm.
# The commented-out rmtree below would wipe previous logs before starting.
#shutil.rmtree(logs_directory)
writer = SummaryWriter(logs_directory)
board = command.Command('tensorboard --logdir=run1:{} --port {}'.format(logs_directory, board_port))
board.run()

In [3]:
#shutil.copy("hyper_parametrs.py", logs_directory)

In [4]:
%pylab inline
%matplotlib inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [5]:
# Environment configuration: the grid map is read from
# gridworlds_data/<env_name>_<rows>x<cols>/grid_map.npy and handed to GridworldEnv.
# `env_name` and `grid_size` are reused further down to locate expert data and
# the pretrained siamese network.
grid_size = (5, 5)
env_name = "simple"
grid_map = np.load("gridworlds_data/{}_{}x{}/grid_map.npy".format(env_name, grid_size[0], grid_size[1]))
env = GridworldEnv(grid_size, grid_map)

In [6]:
# Sizes passed to the Agent constructors below: `state_dim` is twice the number
# of environment states, `action_dim` is the number of actions.
# NOTE(review): the factor 2 presumably makes room for a goal encoding next to
# the state encoding — confirm against nets.Agent.
state_dim = int(env.nS * 2)
action_dim = int(env.nA)

In [7]:
def entropy_term(probs):
    """Return the entropy ``-sum(p * log(p))`` over all entries of ``probs``.

    Args:
        probs: tensor (or Variable) of probabilities.

    Returns:
        A scalar tensor holding the summed entropy.
    """
    # log(0) is -inf and 0 * -inf is NaN, which would poison the whole loss as
    # soon as one action probability hits exactly zero.  Clamping only the
    # argument of the log keeps such entries contributing exactly 0 while
    # leaving every probability >= 1e-8 untouched.
    return -torch.sum(probs * torch.log(probs.clamp(min=1e-8)))

In [8]:
def update_params(agent, optimizer, losses_history):
    """Apply one gradient step computed from the agent's recorded episode.

    Reads ``agent.rewards`` and ``agent.saved_actions`` (entries of the form
    ``(action, probs, value)``), builds three loss terms (policy log-loss
    weighted by return-minus-value, value regression, negative entropy),
    appends each term's scalar to ``losses_history`` and steps ``optimizer``
    on their sum.  Both agent buffers are emptied before returning.
    """
    # Discounted returns, accumulated from the last step back to the first.
    discounted = []
    running_return = 0
    for step_reward in reversed(agent.rewards):
        running_return = step_reward + gamma_rl * running_return
        discounted.append(running_return)
    discounted.reverse()
    returns = torch.Tensor(discounted)

    policy_term = Variable(torch.FloatTensor([0]))
    critic_term = Variable(torch.FloatTensor([0]))
    entropy_bonus = Variable(torch.FloatTensor([0]))
    for (action, probs, value), ret in zip(agent.saved_actions, returns):
        dist = torch.distributions.Categorical(probs)
        # The value estimate is read through .data, so it acts as a constant
        # baseline in the policy term.
        advantage = ret - value.data[0, 0]
        policy_term += -(dist.log_prob(action[0]) * advantage)
        critic_term += lambda_baseline * F.mse_loss(value, Variable(torch.Tensor([ret])))
        entropy_bonus += -entropy_weights["agent"] * entropy_term(probs)

    # Record the scalar value of every term for later logging.
    for key, term in (("entropy", entropy_bonus),
                      ("value loss", critic_term),
                      ("log loss", policy_term)):
        losses_history[key].append(term.data.numpy()[0])

    total_loss = policy_term + critic_term + entropy_bonus

    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    # The episode has been consumed; reset the buffers for the next one.
    del agent.rewards[:]
    del agent.saved_actions[:]


In [9]:
def _load_expert(name, agents_subdir):
    """Build a NovelExpert from the arrays saved for one pre-computed agent.

    Args:
        name: label passed as the first NovelExpert argument.
        agents_subdir: sub-directory of
            ``gridworlds_data/<env_name>_<rows>x<cols>/`` holding
            ``action_probs.npy``, ``value_function.npy`` and ``goal_map.npy``.

    Returns:
        ``NovelExpert(name, action_probs, value_function, goal_map)``.
    """
    data_dir = "gridworlds_data/{}_{}x{}/{}".format(env_name, grid_size[0], grid_size[1], agents_subdir)
    return NovelExpert(
        name,
        np.load("{}/action_probs.npy".format(data_dir)),
        np.load("{}/value_function.npy".format(data_dir)),
        np.load("{}/goal_map.npy".format(data_dir)),
    )


# One expert per stored agent directory; the three loads were previously
# copy-pasted and differed only in the label and the directory name.
Expert_left_corner = _load_expert("left_top_corner", "agents_left_corner")
Expert_right_corner = _load_expert("right_bottom_corner", "agents_right_corner")
Expert_global_optimal = _load_expert("global_optimal", "agents_global_optimal")

# First and last state index; passed as the candidate s_star values when drawing reward curves.
possible_s_stars = [0, env.nS - 1]

In [10]:
# Pretrained siamese network for the current environment/grid size; `goal_unsim`
# calls it on a pair of state indices.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
siamese_net = torch.load("pretrained_nets/siamese_{}_{}x{}.pt".format(env_name, grid_size[0], grid_size[1]))

In [11]:
def goal_unsim(env, g1, g2):
    """Return ``1 - siamese_net([(g1, g2)])`` as a plain scalar.

    ``env`` is accepted for interface compatibility but is not used by the
    siamese-based measure.
    """
    goal_pair = np.array([(g1, g2)])
    net_output = siamese_net(goal_pair)
    dissimilarity = 1.0 - net_output
    return dissimilarity.data.numpy()[0][0]
    # Previous hand-written alternative (Manhattan distance on the grid):
    #return abs(g1 % int(np.sqrt(env.nS)) - g2 % int(np.sqrt(env.nS))) + abs(g1 // int(np.sqrt(env.nS)) - g2 // int(np.sqrt(env.nS)))

# def goal_unsim(env, g1, g2, env_type="pillar"):
#     if env_type == "simple":
#         return abs(g1 % int(np.sqrt(env.nS)) - g2 % int(np.sqrt(env.nS))) + abs(g1 // int(np.sqrt(env.nS)) - g2 // int(np.sqrt(env.nS)))
#     else:
#         if env.grid_map[g1] == 0 or env.grid_map[g2] == 0:
#             return 1000
#         r1 = np.abs(Expert_left_corner.v_function[g1] - Expert_left_corner.v_function[g2])
#         r2 = np.abs(Expert_right_corner.v_function[g1] - Expert_right_corner.v_function[g2])
#         return min(r1, r2)
    
def goal_based_training(env, tmax, agent, goal, optimizer, losses_history, agent_type):
    """Roll out one episode from the environment's current state and train on it.

    The agent acts until the environment reaches a terminal state or ``tmax``
    steps have been taken.  Every step but the last is rewarded with 0; the
    last step receives the episode's final reward.

    Args:
        env: environment exposing ``s`` (current state) and ``step(action)``.
        tmax: maximum number of steps in the episode.
        agent: object with ``select_action(state, goal)`` plus ``rewards`` and
            ``saved_actions`` lists.
        goal: goal state handed to the agent (also used in the final reward
            when ``agent_type == "goal-based"``).
        optimizer, losses_history: forwarded to ``update_params``.
        agent_type: ``"goal-based"`` adds the goal-dissimilarity penalty to the
            final reward; any other value uses the time penalty only.

    Returns:
        The final (scaled) reward of the episode.
    """
    time = 0
    while not is_terminal(env, env.s):
        time += 1
        a = agent.select_action(env.s, goal)
        env.step(a[0, 0])
        if is_terminal(env, env.s) or time >= tmax:
            break
        # Intermediate step: the only non-zero reward is given at the end.
        agent.rewards.append(0)

    if agent_type == "goal-based":
        final_reward = (-time - lambda_goals * goal_unsim(env, goal, env.s)) / scale_reward
    else:
        final_reward = (-time) / scale_reward
    agent.rewards.append(final_reward)

    # No action was recorded when the episode started in a terminal state.
    if len(agent.saved_actions) > 0:
        update_params(agent, optimizer, losses_history)

    # update_params already empties the buffers, but the reward appended above
    # must also be discarded when no update took place.
    del agent.rewards[:]
    del agent.saved_actions[:]

    return final_reward
    

In [12]:
def choose_goal(env, experts, s0, iteration, agent_type="goal-based"):
    """Pick the goal state for the next training episode.

    Candidate goals are the first and last state index plus ``get_goal(s0)``
    of every expert.  The strategy is selected by the module-level
    ``goal_sampling_strategy``:

    * ``"eps_greedy"``: with probability ``goal_eps`` return a uniformly drawn
      candidate (and decay ``goal_eps`` by ``goal_eps_decay``); otherwise
      return the goal of the expert with the highest ``v_function[s0]``.
    * ``"random"``: alternate between the first and last state index
      according to the parity of ``iteration``.

    Args:
        env: environment exposing ``nS``.
        experts: iterable of objects with ``get_goal(state)`` and ``v_function``.
        s0: start state of the episode.
        iteration: training iteration index.
        agent_type: only ``"goal-based"`` agents receive a goal.

    Returns:
        The goal as an ``int``, or ``None`` for other agent types or an
        unrecognised strategy.
    """
    # goal_eps is decayed in place below.  Without this declaration the
    # assignment makes it a local name and the comparison that reads it raises
    # UnboundLocalError.
    global goal_eps

    if agent_type != "goal-based":
        return None

    best_goal = None
    best_goal_v = -np.inf
    possible_goals = [0, env.nS - 1]
    for expert in experts:
        possible_goals.append(expert.get_goal(s0))
        if expert.v_function[s0] > best_goal_v:
            best_goal = expert.get_goal(s0)
            best_goal_v = expert.v_function[s0]

    if goal_sampling_strategy == "eps_greedy":
        if np.random.random() < goal_eps:
            goal_eps *= goal_eps_decay
            # int() keeps the return type the same as the greedy branch.
            return int(np.random.choice(possible_goals))
        return int(best_goal)

    if goal_sampling_strategy == "random":
        return int(possible_goals[iteration % 2])

    return None
    
def train_agent(env, tmax, each_goal_times, agent, experts, optimizer, losses_history, i, agent_type):
    """Train ``agent`` on a single episode started from a random state.

    The start state is randomised, a goal is chosen for it and one episode is
    played and learned from via ``goal_based_training``.
    ``each_goal_times`` is accepted but not used.

    Returns:
        ``(agent, rewards)`` where ``rewards`` is a one-element numpy array
        holding the episode's final reward.
    """
    set_random_s0(env)
    start_state = env.reset()
    episode_goal = choose_goal(env, experts, start_state, i, agent_type)
    final_reward = goal_based_training(env, tmax, agent, episode_goal, optimizer, losses_history, agent_type)
    return agent, np.array([final_reward])

In [13]:
def set_env_s0(env, s0):
    """Replace ``env.isd`` with a vector that is 1 at index ``s0`` and 0 elsewhere.

    The vector has ``env.shape[0] * env.shape[1]`` entries.
    """
    n_rows, n_cols = env.shape[0], env.shape[1]
    env.isd = np.zeros(n_rows * n_cols)
    env.isd[s0] = 1

def play_n_episodes(n, env, model, s0_list, s_star, tmax=40):
    """Play ``n`` episodes with ``model`` and count how often each cell is visited.

    Args:
        n: number of episodes; ``s0_list`` must provide at least ``n`` entries.
        env: environment exposing ``shape``, ``s``, ``reset()`` and ``step()``.
        model: callable ``model(state, s_star)`` returning
            ``(action_probs, state_value)``; an action is sampled from
            ``action_probs``.
        s0_list: start state for each episode.
        s_star: goal forwarded to ``model``.
        tmax: maximum number of steps per episode.

    Returns:
        ``(mean_visits, per_episode_visits)``: the visit counts averaged over
        the ``n`` episodes (array of shape ``env.shape``) and the list of
        per-episode count arrays.
    """
    # A flat state index maps to (row, column) through the number of COLUMNS.
    # The previous code divided by the number of rows, which only gives the
    # right cell on square grids.
    n_cols = env.shape[1]

    state_dist_list = []
    for i in range(n):
        set_env_s0(env, s0_list[i])
        env.reset()
        state_dist_episode = np.zeros(env.shape)
        for _ in range(tmax):
            s = env.s
            state_dist_episode[s // n_cols][s % n_cols] += 1
            if is_terminal(env, env.s):
                break
            probs, _state_value = model(s, s_star)
            # num_samples is passed explicitly: newer PyTorch releases require
            # it, older ones default to the same value.
            action = probs.multinomial(1).data
            env.step(int(action[0][0]))
        state_dist_list.append(state_dist_episode)

    state_dist = np.zeros(env.shape)
    for dist in state_dist_list:
        state_dist += dist

    return state_dist / n, state_dist_list

In [14]:
# Build the two agents that are compared in this notebook and one Adam
# optimizer per agent, both using the same `lr_agent` and `weight_decay`.
from torch.optim import lr_scheduler
agent_goal_based = Agent(state_dim, action_dim, "goal-based")
agent_simple = Agent(state_dim, action_dim, "simple")

# Earlier RMSprop configuration, kept for reference.
#optimizer_goal_based_agent = optim.RMSprop(agent_goal_based.parameters(), alpha=0.97, eps=1e-6, lr=lr_agent)
#optimizer_simple_agent = optim.RMSprop(agent_simple.parameters(), alpha=0.97, eps=1e-6, lr=lr_agent)

optimizer_goal_based_agent = optim.Adam(agent_goal_based.parameters(),lr=lr_agent, weight_decay=weight_decay)
optimizer_simple_agent = optim.Adam(agent_simple.parameters(),lr=lr_agent, weight_decay=weight_decay)

# Step-wise learning-rate decay (factor 0.25 every `decrease_lr_every` scheduler steps).
# NOTE(review): only the goal-based optimizer gets a scheduler in this cell; the
# simple agent's optimizer has none — confirm this asymmetry is intended.
goal_based_lr_scheduler = lr_scheduler.StepLR(optimizer_goal_based_agent, step_size=decrease_lr_every, gamma=0.25)

In [15]:
import shutil
plots_dir = "plots_compare"

# Start from an empty plots directory.  Removal is best-effort (the directory
# may not exist yet), but unlike a bare `except:` this does not swallow
# KeyboardInterrupt or programming errors.
shutil.rmtree(plots_dir, ignore_errors=True)
try:
    os.makedirs(plots_dir, exist_ok=True)
except OSError as err:
    # Still best-effort, but report the failure instead of hiding it.
    print("Could not create plots directory {!r}: {}".format(plots_dir, err))

In [None]:
# Per-agent history of the three loss terms; update_params appends one value
# per term on every call.  Each dict gets its own, independent lists.
losses_history_goal_based = {term: [] for term in ("entropy", "value loss", "log loss")}

losses_history_simple = {term: [] for term in ("entropy", "value loss", "log loss")}

In [None]:
each_goal_times = 1
start = 0

models = [agent_goal_based, agent_simple, Expert_global_optimal]
models_anotations = ["goal-based agent", "simple agent", "Expert global optimal"]
experts = [Expert_global_optimal]
previous_rewards = [[] for i in range(len(models))]

draw_every = 250
estimation_episodes_num = 500

# Tick positions shared by every grid plot below.
grid_ticks = np.arange(0, int(np.sqrt(env.nS)) + 1)


def _agent_for(s_star):
    """Return the network to query: the simple agent when there is no goal (s_star is None)."""
    return agent_simple if s_star is None else agent_goal_based


def _panel_title(agent, s_star):
    """Return the subplot title: the agent name, plus the goal when one is given."""
    if s_star is None:
        return agent.name
    return agent.name + ", s_star={}".format(str(s_star))


for i in tnrange(train_steps):
    # One training episode per agent per iteration.
    goal_based_lr_scheduler.step()
    agent_goal_based, rewards = train_agent(env, tmax, each_goal_times, agent_goal_based, experts, optimizer_goal_based_agent, losses_history_goal_based, i, agent_type="goal-based")
    agent_simple, rewards = train_agent(env, tmax, each_goal_times, agent_simple, experts, optimizer_simple_agent, losses_history_simple, i, agent_type="simple")

    # Everything below is periodic evaluation / TensorBoard logging.
    if (i + 1) % draw_every != 0:
        continue

    models = [agent_goal_based, agent_simple, Expert_global_optimal]
    s0_list = np.random.choice(np.array(np.where(env.grid_map)).flatten(), size=estimation_episodes_num)

    draw_reward_curves(writer, i, env, models, models_anotations, tmax, previous_rewards, possible_s_stars, s0_list, estimation_episodes_num)
    #plt.savefig(os.path.join(plots_dir, "rewards_curves{}".format(each_goal_times * (start + i + 1))))
    plt.cla()
    plt.clf()

    # --- Action probabilities: one panel per goal, last panel is the simple agent.
    sns.set(color_codes=True)
    fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
    s_stars = [0, env.nS - 1, None]
    colors = sns.color_palette("Set1", n_colors=3, desat=.75)
    for j, s_star in enumerate(s_stars):
        agent = _agent_for(s_star)
        ax[j].set_xticks(grid_ticks)
        ax[j].set_yticks(grid_ticks)
        for s in range(1, env.nS - 1):
            if env.grid_map[s] == 0:
                continue
            probs, state_value = agent(s, s_star)
            draw_direction_probs(ax[j], env, s, probs[0].data, arrow_color=colors[j])
        ax[j].set_title(_panel_title(agent, s_star), fontsize=16)
        # invert_yaxis() toggles the axis direction, so it must run exactly
        # once per panel.  It used to run once per drawn state, which made the
        # orientation depend on the parity of the number of states.
        ax[j].invert_yaxis()

    fig.tight_layout()
    img = fig2img(fig)
    writer.add_image('Action_Probs', np.array(img)[..., :3], i)
    # Close the figure: clearing it (cla/clf) keeps it registered with pyplot,
    # so one figure would pile up on every evaluation round.
    plt.close(fig)

    writer.add_scalars('loss terms',
                       {
                            "entropy term": losses_history_goal_based["entropy"][-1],
                            "value term": losses_history_goal_based["value loss"][-1],
                            "log loss term": losses_history_goal_based["log loss"][-1],
                       },
                       i)

    # --- State values annotated over the state-visitation heat map.
    fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 5))
    for j, s_star in enumerate(s_stars):
        agent = _agent_for(s_star)
        ax[j].set_xticks(grid_ticks)
        ax[j].set_yticks(grid_ticks)
        ax[j].invert_yaxis()

        state_dist, state_dist_list = play_n_episodes(estimation_episodes_num, env, agent, s0_list, s_star)
        for s in range(1, env.nS - 1):
            if env.grid_map[s] == 0:
                continue
            value = agent(s, s_star)[1].data[0][0]
            draw_value_anotate(ax[j], env, s, value)
        ax[j].imshow(state_dist, cmap='hot', interpolation='nearest')
        ax[j].set_title(_panel_title(agent, s_star), fontsize=16)

    fig.tight_layout()
    img = fig2img(fig)
    writer.add_image('Value state distr', np.array(img)[..., :3], i)

    #plt.savefig(os.path.join(plots_dir, "value_state_distr{}".format(each_goal_times * (start + i + 1))))
    plt.close(fig)

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
  return F.softmax(action_scores), state_values


In [None]:
# Draw the action probabilities of the globally optimal expert for every
# non-wall state except the first and last one.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
agent = Expert_global_optimal

# Ticks do not depend on the state, so set them once.
ax.set_xticks(np.arange(0, int(np.sqrt(env.nS)) + 1))
ax.set_yticks(np.arange(0, int(np.sqrt(env.nS)) + 1))

for s in range(1, env.nS - 1):
    if env.grid_map[s] == 0:
        continue

    probs = agent.action_probs[s]
    # Draw on the axes object, as the training-loop plots do, instead of the
    # pyplot module.
    draw_direction_probs(ax, env, s, probs)

# invert_yaxis() toggles the direction; calling it once per state made the
# final orientation depend on how many states were drawn.
ax.invert_yaxis()

In [19]:
# Print the expert's action distribution for every state.  Iterating over the
# array itself avoids a hard-coded state count (previously 49), which goes out
# of range whenever the configured grid has fewer states.
for i, state_action_probs in enumerate(Expert_global_optimal.action_probs):
    print(i, state_action_probs)

0 [0. 0. 0. 0. 1.]
1 [0. 0. 0. 1. 0.]
2 [0. 0. 0. 1. 0.]
3 [0. 0. 0. 1. 0.]
4 [0. 0. 0. 1. 0.]
5 [0. 0. 0. 1. 0.]
6 [0. 0. 0. 1. 0.]
7 [0. 0. 0. 1. 0.]
8 [0. 0. 0. 1. 0.]
9 [1. 0. 0. 0. 0.]
10 [0.5 0.  0.  0.5 0. ]
11 [0.5 0.  0.  0.5 0. ]
12 [0.5 0.  0.  0.5 0. ]
13 [0.5 0.  0.  0.5 0. ]
14 [0.5 0.  0.  0.5 0. ]
15 [0.5 0.  0.  0.5 0. ]
16 [0.5 0.  0.  0.5 0. ]
17 [0. 0. 1. 0. 0.]
18 [1. 0. 0. 0. 0.]
19 [0.5 0.  0.  0.5 0. ]
20 [0.5 0.  0.  0.5 0. ]
21 [0.5 0.  0.  0.5 0. ]
22 [0.5 0.  0.  0.5 0. ]
23 [0.5 0.  0.  0.5 0. ]
24 [0.5 0.  0.  0.5 0. ]
25 [0.  0.5 0.5 0.  0. ]
26 [0. 0. 1. 0. 0.]
27 [1. 0. 0. 0. 0.]
28 [0.5 0.  0.  0.5 0. ]
29 [0.5 0.  0.  0.5 0. ]
30 [0.5 0.  0.  0.5 0. ]
31 [0.5 0.  0.  0.5 0. ]
32 [0.5 0.  0.  0.5 0. ]
33 [0.  0.5 0.5 0.  0. ]
34 [0.  0.5 0.5 0.  0. ]
35 [0. 0. 1. 0. 0.]
36 [1. 0. 0. 0. 0.]
37 [0.5 0.  0.  0.5 0. ]
38 [0.5 0.  0.  0.5 0. ]
39 [0.5 0.  0.  0.5 0. ]
40 [0.5 0.  0.  0.5 0. ]
41 [0.  0.5 0.5 0.  0. ]
42 [0.  0.5 0.5 0.  0. ]
43 [0.  0.5 0.5

In [22]:
# Sanity check: query the siamese network for one pair of state indices.
# `goal_unsim` uses 1 minus this output as the dissimilarity between two goals.
g1 = 0
g2 = 1
siamese_net(np.array([(g1, g2)]))

Variable containing:
1.00000e-04 *
  1.0471
[torch.FloatTensor of size 1x1]

In [None]:
# Load a siamese checkpoint straight from disk and score the same (g1, g2) pair.
# NOTE(review): this file name ("siamese_5x5.pt") does not follow the
# "siamese_<env_name>_<rows>x<cols>.pt" pattern used where `siamese_net` is
# loaded — confirm the file exists and is the intended checkpoint.
# NOTE: torch.load unpickles the file; only load trusted checkpoints.
torch.load("pretrained_nets/siamese_5x5.pt")(np.array([(g1, g2)]))

# board.wait(board_timeout)

In [22]:
# Mean of the reward estimates returned by get_policy_reward_estimation for the
# goal-based agent.
# NOTE(review): 1000 and 40 look like the episode count and step limit, and
# [0, 24] matches [0, env.nS - 1] only on the 5x5 grid (cf. `possible_s_stars`)
# — confirm against env_methods.
np.mean(get_policy_reward_estimation(env, agent_goal_based, 'agent', 1000, 40, s_star=[0, 24]))

  return F.softmax(action_scores), state_values


-11.03

In [23]:
# Inspect the goal-based agent's output (action probabilities, state value)
# for state 1 with goal state 0.
s = 1
agent_goal_based(s, 0)

  return F.softmax(action_scores), state_values


(Variable containing:
  0.2266  0.1318  0.0942  0.5474
 [torch.FloatTensor of size 1x4], Variable containing:
 -2.5737
 [torch.FloatTensor of size 1x1])

In [24]:
# Same inspection for the simple agent, which is called without a goal.
agent_simple(s)

  return F.softmax(action_scores), state_values


(Variable containing:
  0.0607  0.4443  0.4541  0.0409
 [torch.FloatTensor of size 1x4], Variable containing:
 -0.9203
 [torch.FloatTensor of size 1x1])

In [None]:
# Mean of the reward estimates for the simple agent; s_star=[None] because this
# agent is queried without a goal.
# NOTE(review): 1000 and 40 look like the episode count and step limit — confirm
# against env_methods.
np.mean(get_policy_reward_estimation(env, agent_simple, 'agent', 1000, 40, s_star=[None]))

In [None]:
# Repeat of the earlier simple-agent inspection for state `s`.
agent_simple(s)

In [None]:
# Scratch check of Categorical.log_prob on a two-way uniform distribution.
probs = Variable(torch.FloatTensor([0.5, 0.5]))
m = torch.distributions.Categorical(probs)
# torch.LongTensor(1) allocates an UNINITIALISED tensor of length 1 (the
# argument is a size, not a value); the action index 1 has to be passed as data.
m.log_prob(Variable(torch.LongTensor([1])))