In [1]:
from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import pyvirtualdisplay

import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.trajectories import time_step as ts

import requests
import json


In [2]:
tf.compat.v1.enable_v2_behavior()


In [3]:
tf.version.VERSION


'2.2.0'

In [4]:
num_iterations = 20000 # @param {type:"integer"}

initial_collect_steps = 10  # @param {type:"integer"} 
collect_steps_per_iteration = 1  # @param {type:"integer"}
replay_buffer_max_length = 100000  # @param {type:"integer"}

batch_size = 16  # @param {type:"integer"}
learning_rate = 1e-3  # @param {type:"number"}
log_interval = 30  # @param {type:"integer"}

num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 1000  # @param {type:"integer"}


In [5]:
resp = requests.get("http://192.168.178.87/r_off")

In [6]:
resp = requests.get("http://192.168.178.87/r_on")

In [7]:
def observe_kettle():
    # TODO remove the offset!!
    resp = requests.get("http://192.168.178.87/")
    a = json.loads(resp.text)
    temperatures = [x[1]-50.0 for x in json.loads(resp.text)["temp"]]
    on_off = [float(x[1]) for x in json.loads(resp.text)["state"]]
    return temperatures+on_off

def turn_kettle_on():    
    resp = requests.get("http://192.168.178.87/r_on")
    
def turn_kettle_off():    
    resp = requests.get("http://192.168.178.87/r_off")

    
observation = observe_kettle()
print(observation)

[-4.625, -4.5, -4.4375, -4.3125, -4.25, -4.25, -4.125, -4.0625, -4.0, -3.9375, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [8]:
import time

class KettleEnv(py_environment.PyEnvironment):
  def __init__(self):
    self._action_spec = array_spec.BoundedArraySpec(shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
    self._observation_spec = array_spec.BoundedArraySpec(shape=(20,), dtype=np.float32, minimum=0, name='observation')
    self._state = 0
    self._episode_ended = False

  def action_spec(self):
    return self._action_spec

  def observation_spec(self):
    return self._observation_spec

  def _reset(self):
    # TODO print something for me to clean out the kettle...
    self._state = observe_kettle()
    self._episode_ended = False
    return ts.restart(np.array(self._state, dtype=np.float32))

  def _step(self, action):
    if self._episode_ended:
      # The last action ended the episode. Ignore the current action and start a new episode.
      return self.reset()

    # Make sure episodes don't go on forever.
    if action == 1:
        turn_kettle_on()
    elif action == 0:
        turn_kettle_off()
    else:
      raise ValueError('`action` should be 0 or 1.')

    # TODO do I have to sleep here or somewhere else...
    time.sleep(1.0)
    
    self._state = observe_kettle()

    current_temp = self._state[9]
    print("Current temp", current_temp)
    given_reward = 0.0
    if current_temp < 0:
        given_reward = 1.0-0.0*abs(current_temp)
    else:
        given_reward = 1.0-0.03*abs(current_temp)
    
    return ts.transition(np.array(self._state, dtype=np.float32), reward=given_reward, discount=1.0)

env = KettleEnv()

In [9]:
env.reset()


TimeStep(step_type=array(0, dtype=int32), reward=array(0., dtype=float32), discount=array(1., dtype=float32), observation=array([-4.5   , -4.4375, -4.3125, -4.25  , -4.25  , -4.125 , -4.0625,
       -4.    , -3.9375, -3.875 ,  1.    ,  0.    ,  1.    ,  1.    ,
        1.    ,  1.    ,  1.    ,  1.    ,  1.    ,  1.    ],
      dtype=float32))

In [10]:
print('Observation Spec:')
print(env.time_step_spec().observation)


Observation Spec:
BoundedArraySpec(shape=(20,), dtype=dtype('float32'), name='observation', minimum=0.0, maximum=3.4028234663852886e+38)


In [11]:
print('Reward Spec:')
print(env.time_step_spec().reward)


Reward Spec:
ArraySpec(shape=(), dtype=dtype('float32'), name='reward')


In [12]:
print('Action Spec:')
print(env.action_spec())


Action Spec:
BoundedArraySpec(shape=(), dtype=dtype('int32'), name='action', minimum=0, maximum=1)


In [13]:
train_py_env = KettleEnv()
eval_py_env = KettleEnv()

In [14]:
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)


In [15]:
fc_layer_params = (30,10,5)

q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)


In [16]:
q_net

<tf_agents.networks.q_network.QNetwork at 0x7f839adbb6a0>

In [17]:
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()


In [18]:
eval_policy = agent.policy
collect_policy = agent.collect_policy


In [19]:
example_environment = tf_py_environment.TFPyEnvironment(KettleEnv())

In [20]:
time_step = example_environment.reset()

In [21]:
# def compute_avg_return(environment, policy, num_episodes=10):

def compute_avg_return(environment, policy, num_steps = 100):

  total_return = 0.0
  for _ in range(num_episodes):

    time_step = environment.reset()
    episode_return = 0.0

    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return

  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]



In [22]:
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)


In [23]:

def collect_step(environment, policy, buffer):
  time_step = environment.current_time_step()
  action_step = policy.action(time_step)
  next_time_step = environment.step(action_step.action)
  traj = trajectory.from_transition(time_step, action_step, next_time_step)

  # Add trajectory to the replay buffer
  buffer.add_batch(traj)

def collect_data(env, policy, buffer, steps):
  for _ in range(steps):
    collect_step(env, policy, buffer)

    
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(), train_env.action_spec())
collect_data(train_env, random_policy, replay_buffer, steps=100)

Current temp -3.75
Current temp -3.6875
Current temp -3.5
Current temp -3.4375
Current temp -3.3125
Current temp -3.25
Current temp -3.0625
Current temp -3.0
Current temp -2.9375
Current temp -2.875
Current temp -2.8125
Current temp -2.75
Current temp -2.75
Current temp -2.625
Current temp -2.625
Current temp -2.5
Current temp -2.5
Current temp -2.375
Current temp -2.3125
Current temp -2.1875
Current temp -2.125
Current temp -2.0625
Current temp -2.0
Current temp -1.9375
Current temp -1.875
Current temp -1.8125
Current temp -1.8125
Current temp -1.75
Current temp -1.6875
Current temp -1.625
Current temp -1.625
Current temp -1.5625
Current temp -1.5
Current temp -1.4375
Current temp -1.375
Current temp -1.3125
Current temp -1.3125
Current temp -1.1875
Current temp -1.125
Current temp -1.0
Current temp -0.9375
Current temp -0.9375
Current temp -0.8125
Current temp -0.75
Current temp -0.6875
Current temp -0.5625
Current temp -0.5
Current temp -0.4375
Current temp -0.25
Current temp -0.187

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))

In [None]:
# Dataset generates trajectories with shape [Bx2x...]
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, 
    sample_batch_size=batch_size, 
    num_steps=2).prefetch(3)


dataset


In [None]:
iterator = iter(dataset)


In [None]:
# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

print("Reset the train step")
agent.train_step_counter.assign(0)

print("Evaluate the agent's policy once before training.")
# avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
# returns = [avg_return]

for _ in range(num_iterations):
    try:
    #   print("Collect a few steps using collect_policy and save to the replay buffer.")
      for _ in range(collect_steps_per_iteration):
        collect_step(train_env, agent.collect_policy, replay_buffer)

    #   print("Sample a batch of data from the buffer and update the agent's network.")
      experience, unused_info = next(iterator)
      train_loss = agent.train(experience).loss
      print(train_loss)
      step = agent.train_step_counter.numpy()
    #   print("Step is ", step)


      if step % log_interval == 0:
        print('step = {0}: loss = {1}, learning from {2}'.format(step, train_loss, replay_buffer.num_frames().numpy()))

    #   if step % eval_interval == 0:
    #     avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    #     print('step = {0}: Average Return = {1}'.format(step, avg_return))
    #     returns.append(avg_return)
    except Exception as e:
        print("Damn you kettle")
        time.sleep(3.0)

In [None]:
dir(replay_buffer)

In [None]:
blaat = next(iterator)

In [None]:
blaat

In [None]:
a = KettleEnv()