In [39]:
import gym
import numpy as np

import time

In [40]:
import sys
import os

# Append the absolute path of the parent directory to the import search path.
# NOTE(review): presumably this is what makes the `models`, `control` and
# `utils` packages imported in later cells resolvable — confirm.
sys.path.append(os.path.abspath('../'))

# Drop the two helper module names from the notebook namespace again.
del sys, os

In [41]:
import matplotlib.pyplot as plt

In [42]:
# LaTeX rendering in graphs
from distutils.spawn import find_executable
if find_executable('latex'):
    plt.rc('text', usetex=True)

plt.rc('font', family='serif')

# High resolution graphs
%config InlineBackend.figure_format = 'retina'

In [43]:
import torch

In [44]:
%reload_ext autoreload
%autoreload 2

In [45]:
import models.rnn as rnns
import models.mlp as mlps
import models.linear as linears
import control.agents as agents
import control.environments as env

In [46]:
from utils.notifications import Slack

In [47]:
import copy

# Setup

In [48]:
# Gym environment id; passed to gym.make when the environment is built below.
env_name = 'Taxi-v2'
# Alternative environment id, currently disabled.
#env_name = 'Breakout-ram-v0'

In [49]:
# Wrap the gym environment in the project's Environment helper.
# The agent is left unset here; it is attached later via
# `environment.agent = agent`, once the agent has been constructed.
# NOTE(review): `capacity` is presumably the replay-memory size and
# `max_steps` the per-episode step limit — confirm against control.environments.
environment = env.Environment(
    environment=gym.make(env_name), 
    agent=None,
    verbose=True,
    max_steps=200,
    capacity=100,
    representation_method='one_hot_encoding'
)

In [50]:
# Bias-free linear model whose input size and number of actions are both
# taken from the environment.
# NOTE(review): the recorded load_state_dict error in the following cell
# reports this layer as 6 x 501 while the checkpoint is 6 x 500, so
# get_input_dimension() returned 501 in that run.
model = linears.Linear(
    input_dimension=environment.get_input_dimension(), 
    n_actions=environment.n_actions,
    bias=False
)

# Alternative optimiser, currently disabled.
#optimiser = torch.optim.Adam(model.parameters(), lr=.5)
# Plain SGD without momentum.
# NOTE(review): lr=10. is unusually large; presumably intentional for a
# linear model on one-hot inputs — confirm.
optimiser = torch.optim.SGD(model.parameters(), lr=10., momentum=0)

# Agent built on the model and optimiser above, using the 'sarsa' algorithm
# setting with double learning switched off.
# NOTE(review): the exact meaning of `temperature` and `terminal_state` is
# defined in control.agents and is not visible here.
agent = agents.DQNAgent(
    model,
    optimiser, 
    gamma=1., 
    temperature=10, 
    algorithm='sarsa', 
    n_actions=environment.n_actions,
    terminal_state=environment.max_obs,
    use_double_learning=False
)

# Attach the agent to the environment, which was created with agent=None.
environment.agent = agent

In [13]:
# Print the agent's q output for observation 479 before loading any weights.
print(environment.agent.q(environment.state_representation(479)))
print()
# Earlier checkpoint location, currently disabled.
#model.load_state_dict(torch.load('../saved/taxi/mlp/state_dict.pth'))
# NOTE(review): this path points into a Downloads folder relative to the
# working directory, so it will not resolve on another machine or checkout.
# NOTE(review): the recorded run failed on this line with a size mismatch
# (checkpoint weight 6 x 500 vs. model weight 6 x 501); the statements below
# it did not execute in that run.
model.load_state_dict(torch.load('../../../../Downloads/state_dict.pth'))

agent.commit()
# Print the agent's q output for a few observations after loading the weights.
print(environment.agent.q(environment.state_representation(1)))
print(environment.agent.q(environment.state_representation(66)))
print(environment.agent.q(environment.state_representation(479)))
print(environment.agent.q(environment.state_representation(499)))

[-0.01093328  0.02178707  0.01731317  0.01599102  0.00051596 -0.04388257]



RuntimeError: Error(s) in loading state_dict for Linear:
	size mismatch for layer.weight: copying a param with shape torch.Size([6, 500]) from checkpoint, the shape in current model is torch.Size([6, 501]).

# Experiment

## Boltzmann

In [52]:
# NOTE(review): q_estimation is created here but nothing in this cell appends
# to it, so it stays empty.
q_estimation = []
# Collects the value returned by exploration_segment, one entry per outer iteration.
returns = []

# Progress bar over 10 outer iterations, obtained through the environment object.
iterator = environment.tqdm(range(10), ascii=True, ncols=100)

with iterator as it:
    for _ in it:

        # Per iteration: commit the agent, then run exploration_segment(100)
        # and store what it returns.
        environment.agent.commit()
        returns.append(environment.exploration_segment(100))

        # Call batch(100) once for every full 100 items in the replay memory.
        # The inner loop reuses `_`; harmless, since neither loop reads it.
        for _ in range(len(environment.replay_memory) // 100):
            environment.batch(100)


 10%|######4                                                         | 1/10 [00:03<00:30,  3.35s/it]

tensor([-0.0129,  0.0106, -0.0041, -0.0060,  0.0185,  0.0200],
       grad_fn=<SqueezeBackward3>)
[-1.00714306 -0.99986876 -1.03842686 -1.00929548 -0.99428726 -0.99244342]
3
tensor([-0.0129,  0.0106, -0.0041, -0.2062,  0.0185,  0.0200],
       grad_fn=<SqueezeBackward3>)
[-1.04215212 -1.04086911 -0.97283752 -1.03070493 -1.01908503 -1.02093334]
1
tensor([-0.0129, -0.1957, -0.0041, -0.2062,  0.0185,  0.0200],
       grad_fn=<SqueezeBackward3>)
[-0.98961515 -1.01841995 -0.9978022  -0.983663   -0.98148546 -0.98697613]
0
tensor([-0.2089, -0.1957, -0.0041, -0.2062,  0.0185,  0.0200],
       grad_fn=<SqueezeBackward3>)
[20.04189484 19.98908895 19.97577592 19.96485768 19.96498702 20.01213002]
5


 50%|################################                                | 5/10 [00:15<00:15,  3.12s/it]

tensor([-0.2089, -0.1957, -0.0041, -0.2062,  0.0185,  4.0143],
       grad_fn=<SqueezeBackward3>)
[ -6.54493332  -5.74855661  -4.79455328  -4.28697681 -13.98438644
 -10.4560051 ]
0
tensor([-1.6943, -0.1957, -0.0041, -0.2062,  0.0185,  4.0143],
       grad_fn=<SqueezeBackward3>)
[ -6.42327309  -5.74351168  -5.94529057  -5.9790287  -12.30467224
 -13.44719315]
1
tensor([-1.6943, -1.8180, -0.0041, -0.2062,  0.0185,  4.0143],
       grad_fn=<SqueezeBackward3>)
[ -6.38616562  -6.59021235  -5.61608887  -6.05467319 -14.50000954
 -14.90545845]
2
tensor([-1.6943, -1.8180, -1.8050, -0.2062,  0.0185,  4.0143],
       grad_fn=<SqueezeBackward3>)
[-15.61079168 -15.4788208  -14.46382475 -13.66581154 -20.99383831
 -20.74546909]
4
tensor([-1.6943, -1.8180, -1.8050, -0.2062, -3.3505,  4.0143],
       grad_fn=<SqueezeBackward3>)
[17.60854959 16.36240435 16.65254879 18.17489505 10.64605999  8.99376678]
5


100%|###############################################################| 10/10 [00:31<00:00,  3.01s/it]


In [None]:
# Manually step through one episode, printing the agent's q output and the
# chosen action at every step.
environment.reset()

done = False
full_return = 0.

counter = 0
# Stop when the environment reports done or after max_steps steps.
while not done and counter < environment.max_steps:
    
    # NOTE(review): environment.action is read before this cell assigns it;
    # presumably reset() initialises it — confirm.
    # The fourth return value `i` is not used.
    s, reward, done, i = environment.step(environment.action)

    # Pick the next action by sampling from `p`; the returned `q` is not used
    # afterwards.
    # NOTE(review): `p` is presumably a Boltzmann (softmax) action
    # distribution — confirm against control.environments.
    p, q = environment.boltzmann(s, return_q=True)
    a = environment.sample_action(p)
    
    # Store the new state and action on the environment for the next step.
    environment.state, environment.action = s, a

    full_return += reward
    counter += 1
    
    print(environment.agent.q(s))
    print(environment.action, np.argmax(environment.state))


In [26]:
# Run an evaluation episode, compute the agent's q output for every returned
# observation, and plot column 0 of those outputs over the episode.
n_episodes = 3
agent.temperature = 0.1

plt.figure()

# NOTE(review): the unconditional `break` at the end of the loop body means
# only the first of the n_episodes iterations ever runs.
for i in range(n_episodes):

    full_return, counter, observations = environment.evaluation_episode(render=False,return_observations=True)
    
    q = []
    
    # Previous step's representation and q output, for the comparison below.
    observation_old = None
    q_old = None
    
    for observation in observations:
        
        # NOTE(review): state_representation(observation) is evaluated twice
        # here; observation_new could presumably be reused — confirm it is pure.
        observation_new = environment.state_representation(observation)
        q_new = environment.agent.q(environment.state_representation(observation))
        
        # From the second observation on: report when both the representation
        # and the q output differ from the previous step.
        if observation_old is not None:
            same_obs = np.array_equal(observation_old, observation_new)
            same_q = np.array_equal(q_old, q_new)
            if not same_obs and not same_q:
                print("great")
        
        observation_old = observation_new
        q_old = q_new
        
        q.append(q_new)
        
    # Stack the per-step outputs into one array: rows are steps.
    q = np.asarray(q)
    print(np.std(q, axis=0))
    print(q[:,0])
    # NOTE(review): a label is set but no legend is drawn in this cell.
    plt.plot(q[:,0], label='0')
    break
    # Plots of further columns, currently disabled (and unreachable after the break).
    #plt.plot(q[:,1], label='1')
    #plt.plot(q[:,2], label='2')
    
plt.show()

AttributeError: 'TaxiEnv' object has no attribute '_get_obs'

<Figure size 432x288 with 0 Axes>

In [15]:
# Plot column 1 of each of the first n_episodes entries of q_estimation.
# Iterating over a slice instead of indexing with range(n_episodes) avoids an
# IndexError when fewer than n_episodes entries exist (the Boltzmann cell
# creates q_estimation as an empty list); in that case fewer or no lines are drawn.
# NOTE(review): q_estimation and n_episodes are defined in other cells, which
# must have been executed first; otherwise this cell raises NameError.
plt.figure()
for estimates in q_estimation[:n_episodes]:
    x = np.asarray(estimates)
    plt.plot(x[:,1])

plt.show()

NameError: name 'q_estimation' is not defined

<Figure size 432x288 with 0 Axes>

## Testing

In [59]:
# Set the agent temperature to 10 (the value it was constructed with) and run
# 1000 exploration episodes.
# NOTE(review): the recorded output still shows rendered frames although
# render=False is passed; possibly caused by the environment's verbose=True —
# confirm.
agent.temperature = 10
for _ in range(1000):
    environment.exploration_episode(render=False)

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[42m_[0m|
+---------+
  (East)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[42m_[0m|
+---------+
  (East)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (West)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+--------

In [60]:
# Run 5 evaluation episodes without rendering; the return values are discarded.
for _ in range(5):
    environment.evaluation_episode(render=False)