In [1]:
import gym
import numpy as np

import time

In [2]:
import os
import sys

# Make the parent of the current working directory importable; presumably this
# is the repository root holding the `models`, `control` and `utils` packages
# imported further down -- TODO confirm.
project_root = os.path.abspath(os.pardir)
sys.path.append(project_root)

# Keep the notebook namespace clean: none of these names is needed afterwards.
del project_root, os, sys

In [3]:
import matplotlib.pyplot as plt

In [4]:
# LaTeX rendering in graphs
from distutils.spawn import find_executable
if find_executable('latex'):
    plt.rc('text', usetex=True)

plt.rc('font', family='serif')

# High resolution graphs
%config InlineBackend.figure_format = 'retina'

In [5]:
import torch

In [6]:
# Load (or reload) the IPython autoreload extension and use mode 2, which
# re-imports modules before each cell runs so that edits to the project code
# are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [7]:
import models.rnn as rnns
import models.mlp as mlps
import models.linear as linears
import control.agents as agents
import control.environments as env

In [8]:
from utils.notifications import Slack

In [9]:
import copy

# Setup

In [10]:
# Gym environment id for this run; it is passed to `gym.make` when the
# environment wrapper is built.
env_name = 'Taxi-v2'
# Alternative environment id, left disabled.
#env_name = 'Breakout-ram-v0'

In [11]:
# Build the Gym task first, then wrap it in the project's Environment helper.
# No agent is attached at this point (agent=None); one is assigned after the
# model has been created.
gym_environment = gym.make(env_name)

environment = env.Environment(
    environment=gym_environment,
    agent=None,
    verbose=True,
    max_steps=200,
    capacity=100,
    representation_method='one_hot_encoding',
)

  result = entry_point.load(False)


In [13]:
# Linear model sized from the environment's input dimension and action count.
model = linears.Linear(
    input_dimension=environment.get_input_dimension(),
    n_actions=environment.n_actions,
)

# Plain SGD without momentum; an Adam alternative is kept disabled below.
optimiser = torch.optim.SGD(model.parameters(), lr=10, momentum=0)
#optimiser = torch.optim.Adam(model.parameters(), lr=1)

# NOTE(review): `temperature` is presumably the Boltzmann exploration
# temperature and `end_reward` the terminal reward -- confirm in
# control.agents.
agent = agents.DQNAgent(
    model,
    optimiser,
    gamma=1.,
    temperature=10,
    algorithm='sarsa',
    n_actions=environment.n_actions,
    end_reward=20,
)

# Attach the agent to the environment, which was created with agent=None.
environment.agent = agent

In [51]:
# Sanity check around loading a saved model: print the Q-values of state 479
# before loading the weights and again afterwards.
print(environment.agent.q(environment.state_representation(479)))
# Alternative checkpoint location, left disabled.
#model.load_state_dict(torch.load('../saved/taxi/mlp/state_dict.pth'))
# NOTE(review): this path climbs out of the repository into a local Downloads
# folder, so the cell only works on the original author's machine.
model.load_state_dict(torch.load('../../../../Downloads/state_dict.pth'))

# presumably makes the agent use the freshly loaded weights for its Q
# estimates -- TODO confirm what `commit` does in control.agents.
agent.commit()
# NOTE(review): the recorded output shows identical values for both prints.
print(environment.agent.q(environment.state_representation(479)))

[-495.5822  -499.46863 -497.5078  -500.3041  -531.75214 -489.91562]
[-495.5822  -499.46863 -497.5078  -500.3041  -531.75214 -489.91562]


# Experiment

## Boltzmann

In [None]:
# Training loop: 100 rounds, each made of one agent commit, one exploration
# segment of 100 (its result is stored in `returns`) and then one batch call
# of size 100 for every full 100 items held in the replay memory.
# `q_estimation` is initialised here but nothing in this cell appends to it.
q_estimation = []
returns = []

iterator = environment.tqdm(range(100), ascii=True, ncols=100)

with iterator as progress:
    for _round in progress:

        environment.agent.commit()
        segment_result = environment.exploration_segment(100)
        returns.append(segment_result)

        # Number of whole batches of 100 currently in the replay memory.
        n_batches = len(environment.replay_memory) // 100
        for _batch in range(n_batches):
            environment.batch(100)


  3%|#8                                                             | 3/100 [00:08<04:25,  2.73s/it]

tensor([ -9.4972, -13.9983,  -7.3409, -11.1592,  -1.9683,   7.2023],
       grad_fn=<SqueezeBackward3>)
[20. 20. 20. 20. 20. 20.]
5


  5%|###1                                                           | 5/100 [00:13<04:23,  2.78s/it]

tensor([ -9.4972, -13.9983,  -7.3409, -11.1592,  -1.9683,   9.7618],
       grad_fn=<SqueezeBackward3>)
[20. 20. 20. 20. 20. 20.]
5


 12%|#######4                                                      | 12/100 [00:33<04:06,  2.80s/it]

tensor([ -9.4972, -13.9983,  -7.3409, -11.1592,  -1.9683,  11.8095],
       grad_fn=<SqueezeBackward3>)
[20. 20. 20. 20. 20. 20.]
5


 13%|########                                                      | 13/100 [00:36<04:14,  2.93s/it]

tensor([ -9.4972, -13.9983,  -7.3409, -11.1592,  -1.9683,  13.4476],
       grad_fn=<SqueezeBackward3>)
[20. 20. 20. 20. 20. 20.]
5


 18%|###########1                                                  | 18/100 [00:48<03:20,  2.44s/it]

In [36]:
# Manual roll-out of one episode for debugging: step the environment, pick the
# next action from the Boltzmann probabilities, and print the index of the
# largest entry of each state vector together with the agent's Q-values.
# presumably `reset` initialises `environment.state` and `environment.action`
# -- TODO confirm.
environment.reset()

done = False
full_return = 0.
# Weight applied to the next reward (gamma ** step).
discount = 1.

counter = 0
while not done and counter < environment.max_steps:

    s, reward, done, i = environment.step(environment.action)

    p, q = environment.boltzmann(s, return_q=True)
    a = environment.sample_action(p)

    environment.state, environment.action = s, a

    # Discounted return: sum over steps of gamma**t * reward_t.  The previous
    # recurrence `gamma * full_return + reward`, applied forward in time,
    # discounted the earliest rewards the most; both forms coincide when
    # gamma == 1.
    full_return += discount * reward
    discount *= environment.agent.gamma
    counter += 1

    print(np.argmax(s))
    print(environment.agent.q(s))


329
[-30.709248 -31.531038 -30.673918 -31.595798 -32.558308 -31.055958]
429
[-31.233753 -30.915483 -18.555916 -29.451986 -35.169575 -38.4846  ]
449
[-31.233753 -30.93541  -30.999357 -32.537136 -32.77127  -38.474052]
349
[-30.96778  -30.647896 -28.851017 -13.912269 -29.423548 -28.960613]
329
[-30.709248 -31.531038 -30.673918 -31.595798 -32.558308 -31.055958]
429
[-31.233753 -30.915483 -18.555916 -29.451986 -35.169575 -38.4846  ]
449
[-31.233753 -30.93541  -30.999357 -32.537136 -32.77127  -38.474052]
349
[-30.96778  -30.647896 -28.851017 -13.912269 -29.423548 -28.960613]
329
[-30.709248 -31.531038 -30.673918 -31.595798 -32.558308 -31.055958]
349
[-30.96778  -30.647896 -28.851017 -13.912269 -29.423548 -28.960613]
329
[-30.709248 -31.531038 -30.673918 -31.595798 -32.558308 -31.055958]
429
[-31.233753 -30.915483 -18.555916 -29.451986 -35.169575 -38.4846  ]
449
[-31.233753 -30.93541  -30.999357 -32.537136 -32.77127  -38.474052]
449
[-31.233753 -30.93541  -30.999357 -32.537136 -32.77127  -38.

In [26]:
# Trace the Q-values along an evaluation episode and plot the first column.
n_episodes = 3
# NOTE(review): this mutates the shared agent object, so the temperature stays
# at 0.1 for later cells until it is reassigned.
agent.temperature = 0.1

plt.figure()

for i in range(n_episodes):

    full_return, counter, observations = environment.evaluation_episode(render=False,return_observations=True)

    q = []

    observation_old = None
    q_old = None

    for observation in observations:

        # Encode the observation once and reuse it both for the Q-value lookup
        # and for the comparison with the previous step (it used to be
        # encoded twice per step).
        observation_new = environment.state_representation(observation)
        q_new = environment.agent.q(observation_new)

        if observation_old is not None:
            same_obs = np.array_equal(observation_old, observation_new)
            same_q = np.array_equal(q_old, q_new)
            # Debug marker: both the encoded state and its Q-values changed
            # since the previous step.
            if not same_obs and not same_q:
                print("great")

        observation_old = observation_new
        q_old = q_new

        q.append(q_new)

    q = np.asarray(q)
    # Per-column spread of the Q-values over the episode, then column 0.
    print(np.std(q, axis=0))
    print(q[:,0])
    plt.plot(q[:,0], label='0')
    # NOTE(review): unconditional break -- only the first of the `n_episodes`
    # episodes is ever processed; looks like a debugging leftover.
    break
    #plt.plot(q[:,1], label='1')
    #plt.plot(q[:,2], label='2')

plt.show()

AttributeError: 'TaxiEnv' object has no attribute '_get_obs'

<Figure size 432x288 with 0 Axes>

In [15]:
# Plot column 1 of the recorded Q estimates, one curve per episode, for at
# most `n_episodes` episodes.
# assumes each entry of `q_estimation` converts to a 2-D array -- TODO confirm.
plt.figure()
# Iterate over a slice instead of indexing by position: `q_estimation` is
# created empty by the training cell and nothing visible appends to it, so
# `q_estimation[i]` raised IndexError whenever fewer than `n_episodes`
# entries were present.
for episode_q in q_estimation[:n_episodes]:
    x = np.asarray(episode_q)
    plt.plot(x[:,1])

plt.show()

NameError: name 'q_estimation' is not defined

<Figure size 432x288 with 0 Axes>

## Testing

In [59]:
# Set the agent's temperature to 10, then run 1000 exploration episodes with
# rendering switched off.
agent.temperature = 10

n_exploration_runs = 1000
for _run in range(n_exploration_runs):
    environment.exploration_episode(render=False)

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[42m_[0m|
+---------+
  (East)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[42m_[0m|
+---------+
  (East)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (West)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+--------

In [60]:
# Five evaluation episodes without rendering; the return values are discarded.
n_evaluation_runs = 5
for _run in range(n_evaluation_runs):
    environment.evaluation_episode(render=False)