In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import random
import tensorflow as tf
from itertools import count
from collections import deque
from tensorflow.python.framework import ops

In [46]:
# Learning parameters
max_experience = 50000             # capacity of the replay buffer
minibatch_size = 1000              # number of transitions sampled per training step
discount_rate = 0.9                # weight of the bootstrapped future value in the training target
target_network_update_rate = 0.01  # scale applied to (Layer - TLayer) in the target update
episode_max = 1                    # simulated time (accumulated in steps of dt) between training steps

# Replay buffer of transitions. `maxlen` makes the deque drop its oldest entry
# by itself once it holds max_experience items, so it can never grow unbounded.
experience = deque(maxlen=max_experience)

In [4]:
# Model parameters (double pendulum)
# Units are not stated in the notebook; G = 9.81 suggests SI -- TODO confirm.
L1, L2 = 1, 1   # rod lengths: scale the sin/cos terms that give the bob positions
M1, M2 = 1, 1   # masses used in the equations of motion
G = 9.81        # gravitational acceleration
damping = 0.1   # coefficient of the velocity-proportional term subtracted from each angular acceleration

In [5]:
# Simulation parameters
# Integration step: each loop iteration advances the simulated time by dt and
# the state by dt * dydx (explicit Euler step).
dt = 0.01

In [6]:
# Clear the default TensorFlow graph before the cells below build the network,
# so that re-running the notebook starts from an empty graph.
ops.reset_default_graph()

In [7]:
# TensorFlow configuration
session = tf.InteractiveSession()
# NOTE(review): learning_rate=1.0 together with decay=0.1 is an unusual choice
# for RMSProp (TensorFlow's default decay is 0.9) -- confirm it is intentional.
optimizer = tf.train.RMSPropOptimizer(learning_rate=1.0, decay=0.1)

# Network
observation_size = 4   # the simulated pendulum state fed as observation has 4 entries
output_size = 100      # width of the action mask / number of discrete actions
hidden1_size = 100
hidden2_size = 100
# Batch of observations; the leading None lets the batch size vary.
observation = tf.placeholder(tf.float32, (None, observation_size), name="observation")

# Q-network weights (weight matrices only, no bias terms are created).
Layer1 = tf.get_variable('Layer1', (observation_size, hidden1_size))
Layer2 = tf.get_variable('Layer2', (hidden1_size, hidden2_size))
Layer3 = tf.get_variable('Layer3', (hidden2_size, output_size))
q_network = [Layer1, Layer2, Layer3]

# Target-network weights, with the same shapes as the Q-network.
TLayer1 = tf.get_variable('TLayer1', (observation_size, hidden1_size))
TLayer2 = tf.get_variable('TLayer2', (hidden1_size, hidden2_size))
TLayer3 = tf.get_variable('TLayer3', (hidden2_size, output_size))
target_q_network = [TLayer1, TLayer2, TLayer3]

# Network calculations
# NOTE(review): only the first two weight matrices take part in the forward
# pass; Layer3 / TLayer3 are created but never multiplied in. The score width
# is therefore hidden2_size, which matches output_size only because both are
# 100. The scores are also softmax outputs (each row sums to 1) -- confirm
# that this is the intended architecture for action-value estimates.
y1 = tf.nn.softmax(tf.matmul(observation, Layer1))
y2 = tf.nn.softmax(tf.matmul(y1, Layer2))
action_scores = tf.identity(y2, name="action_scores")
# The axis is passed positionally so the call does not depend on the keyword
# name (`dimension` in old TensorFlow releases, `axis` in later ones).
predicted_actions = tf.argmax(action_scores, 1, name="predicted_actions")

# Same forward pass through the target-network weights. The ops get their own
# names so they are not silently renamed to avoid clashing with the Q-network ops.
Ty1 = tf.nn.softmax(tf.matmul(observation, TLayer1))
Ty2 = tf.nn.softmax(tf.matmul(Ty1, TLayer2))
Taction_scores = tf.identity(Ty2, name="target_action_scores")
Tpredicted_actions = tf.argmax(Taction_scores, 1, name="target_predicted_actions")

In [8]:
# Training: Placeholders
# Per-sample inputs of one training minibatch; the leading None dimension
# leaves the batch size open.

# Row per sample; the training loop sets a single 1 at the index of the stored action.
action_mask = tf.placeholder(tf.float32, shape=(None, output_size), name="action_mask")
# Reward recorded with each sampled transition.
rewards = tf.placeholder(tf.float32, shape=(None,), name="rewards")
# Observation that followed each sampled transition.
next_observation = tf.placeholder(tf.float32, shape=(None, observation_size), name="next_observation")
# 1 where a following observation exists, 0 where it does not.
next_observation_mask = tf.placeholder(tf.float32, shape=(None,), name="next_observation_mask")

In [9]:
# Training: Current
# Score of the action stored with each sample: the training loop feeds a mask
# with a single 1 per row, so the product zeroes every other column and the
# row-wise sum leaves one value per sample.
masked_action_scores = tf.reduce_sum(action_scores * action_mask, reduction_indices=[1,])

In [10]:
# Training: Future
# Bootstrapped target: reward + discount_rate * max over actions of the
# target-network score for the *next* observation.
#
# The target network is evaluated on `next_observation` here. This repeats the
# forward pass that produces `Taction_scores` (softmax layers over TLayer1 and
# TLayer2); keep the two in sync if the architecture changes.
next_hidden = tf.nn.softmax(tf.matmul(next_observation, TLayer1))
# stop_gradient makes the optimiser treat the target as a constant: no gradient
# flows back into the target-network weights through this branch.
next_action_scores = tf.stop_gradient(tf.nn.softmax(tf.matmul(next_hidden, TLayer2)))
# Row-wise maximum gives one value per sample (score values, not action
# indices). The mask removes the bootstrap term where no next observation exists.
target_values = tf.reduce_max(next_action_scores, reduction_indices=[1]) * next_observation_mask
future_rewards = rewards + discount_rate * target_values

In [11]:
# Training: Optimisation
# Loss: mean squared difference between the score of the stored action and
# the bootstrapped target.
temp_diff = masked_action_scores - future_rewards
prediction_error = tf.reduce_mean(tf.square(temp_diff))

# Clip every available gradient to a norm of at most 5; pairs whose gradient
# is None (variables the loss does not depend on) are passed through unchanged.
gradients = [
    (grad, var) if grad is None else (tf.clip_by_norm(grad, 5), var)
    for grad, var in optimizer.compute_gradients(prediction_error)
]
train_op = optimizer.apply_gradients(gradients)

In [12]:
# Training: Target update
# Soft update of the target network: each target variable moves a fraction
# `target_network_update_rate` of the way towards its Q-network counterpart,
#     T <- T + rate * (Q - T)
# assign_add writes the result back into the variable (a bare arithmetic
# expression would only compute the difference and change nothing). The
# per-layer ops are grouped so that one session.run(target_network_update)
# updates every layer.
target_network_update = tf.group(*[
    target_layer.assign_add(target_network_update_rate * (layer - target_layer))
    for layer, target_layer in zip(q_network, target_q_network)
])

In [66]:
# Initialisation
# Give every variable in the graph (Q-network and target-network weights) its
# initial value.
session.run(tf.initialize_all_variables())
# Run `target_network_update` once before the simulation loop starts.
session.run(target_network_update)
1

1

In [67]:
# More initialisation
new_observation = None   # observation produced by the most recent simulation step
new_action      = None   # action selected for that observation
control = 0              # term added to dydx[1]; set to the selected action index each step
time = 0                 # simulated time since the last training step
episodes = 0             # number of training steps performed so far
# State layout: [angle 1, angular velocity 1, angle 2, angular velocity 2]
state = np.array([0.0, 0.0, 0.0, 0.0])

# Simulation / training loop: advance the pendulum one Euler step, pick the
# next action with the Q-network, store the transition, and -- once enough
# simulated time and experience have accumulated -- train on a random
# minibatch. The loop stops after 20 training steps.
for time_ox in count():

    time = time + dt

    # Saving previous step
    last_action = new_action
    last_observation = new_observation

    ## ----------------------------------------------------------------
    ## INTERCHANGEABLE MODEL
    ## ----------------------------------------------------------------
    # Model & Simulation
    dydx = np.zeros_like(state)
    del_ = state[2]-state[0]
    den1 = (M1+M2)*L1 - M2*L1*np.cos(del_)*np.cos(del_)
    # float() keeps the length ratio exact when L1 and L2 are integers
    # (plain `/` truncates under Python 2).
    den2 = (float(L2)/L1)*den1

    dydx[0] = state[1]

    dydx[1] = (M2*L1*state[1]*state[1]*np.sin(del_)*np.cos(del_)
                + M2*G*np.sin(state[2])*np.cos(del_)
                + M2*L2*state[3]*state[3]*np.sin(del_)
                - (M1+M2)*G*np.sin(state[0]))/den1 - damping*state[1] + control

    dydx[2] = state[3]

    dydx[3] = (-M2*L2*state[3]*state[3]*np.sin(del_)*np.cos(del_)
                + (M1+M2)*G*np.sin(state[0])*np.cos(del_)
                - (M1+M2)*L1*state[1]*state[1]*np.sin(del_)
                - (M1+M2)*G*np.sin(state[2]))/den2 - damping * state[3]

    # Bob positions, computed from the state *before* the Euler update below.
    # Note: these assignments rebind the module-level names y1 and y2, which
    # the network cell also uses for its layer outputs; the already-built graph
    # is unaffected.
    x1 = L1  * np.sin(state[0])
    y1 = L1 * np.cos(state[0])
    x2 = L2  * np.sin(state[2]) + x1
    y2 = L2 * np.cos(state[2]) + y1

    # Explicit Euler step; the addition creates a fresh array, so observations
    # stored earlier are not modified. Angles are wrapped into [0, 2*pi).
    state = state + dt * dydx
    state[0] = state[0] % (2*np.pi)
    state[2] = state[2] % (2*np.pi)

    # Collecting reward
    # NOTE(review): with y = +L*cos(angle) the reward is largest when both
    # angles are 0, which is also the initial state -- confirm the sign
    # convention matches the intended goal.
    reward = y2 + L2 + L1

    ## ----------------------------------------------------------------

    # Collecting together data for an observation
    new_observation = state

    # Selecting control value: index of the highest-scoring action.
    new_actions = session.run(
        predicted_actions,
        {observation: [new_observation]}
    )
    new_action = new_actions[0]
    # Parenthesised form prints the same list under Python 2 and Python 3.
    print([new_observation, new_actions])
    control = new_action

    # Collecting experience
    # A transition pairs the previous observation with the action that was
    # selected for it (last_action), the reward, and the resulting observation.
    # The very first step has no previous observation, so nothing is stored.
    if last_observation is not None:
        experience.append((last_observation, last_action, reward, new_observation))
        if len(experience) > max_experience:
            experience.popleft()

    # Training
    if episodes == 20:
        break
    # Train only once episode_max of simulated time has passed since the last
    # training step ...
    if time < episode_max:
        continue

    ## Sampling
    # ... and only once the buffer holds at least one full minibatch.
    if len(experience) <  minibatch_size:
        continue

    sample_indices = random.sample(range(len(experience)), minibatch_size)
    samples = [experience[i] for i in sample_indices]

    ## Assemble the minibatch arrays fed to the training placeholders.
    Xobservation = np.empty((len(samples), observation_size))
    Xnewobservation = np.empty((len(samples), observation_size))
    Xaction_mask = np.zeros((len(samples), output_size))
    Xnewobservation_mask = np.empty((len(samples),))
    Xrewards = np.empty((len(samples),))

    for i, (x_observation, x_action, x_reward, x_newobservation) in enumerate(samples):
        Xobservation[i] = x_observation
        # Rows start as zeros, so setting one entry yields the one-hot mask.
        Xaction_mask[i][x_action] = 1
        Xrewards[i] = x_reward
        if x_newobservation is not None:
            Xnewobservation[i] = x_newobservation
            Xnewobservation_mask[i] = 1
        else:
            # No successor: feed zeros and mask out the bootstrap term.
            Xnewobservation[i] = 0
            Xnewobservation_mask[i] = 0

    cost, _ = session.run([ # Fetches
            prediction_error,
            train_op,
        ], {
            observation:            Xobservation,
            next_observation:       Xnewobservation,
            next_observation_mask:  Xnewobservation_mask,
            action_mask:            Xaction_mask,
            rewards:                Xrewards,
        })

    session.run(target_network_update)
    # Restart the simulated-time counter and count this training step.
    time = 0
    episodes = episodes + 1

[array([ 0.,  0.,  0.,  0.]), array([83])]
[array([ 0.  ,  0.83,  0.  ,  0.  ]), array([38])]
[array([ 0.0083 ,  1.20917,  0.     ,  0.     ]), array([38])]
[array([ 0.0203917 ,  1.58621116,  0.        ,  0.00187096]), array([38])]
[array([  3.62538116e-02,   1.96011332e+00,   1.87096082e-05,
         6.89281052e-03]), array([38])]
[array([  5.58549449e-02,   2.32966378e+00,   8.76377134e-05,
         1.67598053e-02]), array([38])]
[array([  7.91515827e-02,   2.69341235e+00,   2.55235766e-04,
         3.36596847e-02]), array([38])]
[array([  1.06085706e-01,   3.04966069e+00,   5.91832613e-04,
         6.03113009e-02]), array([38])]
[array([  1.36582313e-01,   3.39648521e+00,   1.19494562e-03,
         9.99444640e-02]), array([38])]
[array([  1.70547165e-01,   3.73180108e+00,   2.19439026e-03,
         1.56207890e-01]), array([38])]
[array([  2.07865176e-01,   4.05346931e+00,   3.75646916e-03,
         2.32999244e-01]), array([38])]
[array([ 0.24839987,  4.35943895,  0.00608646,  0.3342