# Imitation Learning

In [1]:
import pickle
import math
import tensorflow as tf
import numpy as np
import tf_util
import gym
import load_policy
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
ENV_NAME = 'Reacher-v1'
NUM_ROLLOUTS = 300
LOAD = True

In [3]:
print('loading and building expert policy')
policy_fn = load_policy.load_policy('experts/' + ENV_NAME + '.pkl')
print('loaded and built')

loading and building expert policy
('obs', (1, 11), (1, 11))
loaded and built


In [4]:
if LOAD:
    expert_data = pickle.load(open('expert_data_' + ENV_NAME + '.pkl', 'rb'))
else:
    with tf.Session():
        tf_util.initialize()

        env = gym.make(ENV_NAME)
        max_steps = env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(NUM_ROLLOUTS):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None,:])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                #env.render()
                if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}
        pickle.dump(expert_data, open('expert_data_' + ENV_NAME + '.pkl', 'wb'))

In [90]:
observations = expert_data['observations']
actions = expert_data['actions'][:,0]
print('observations shape: {}'.format(observations.shape))
print('actions shape: {}'.format(actions.shape))

observations shape: (300000, 11)
actions shape: (300000, 2)


In [17]:
num_inputs = observations.shape[-1]
num_outputs = actions.shape[-1]

n_examples = observations.shape[0]
batch_size = 1000

X, Y = observations, actions

In [18]:
# Standardize
mean = X.mean(axis=0)
X -= mean
std = X.std(axis=0)
std[std==0.0] = 1.0 # replace 0 by 1 for the scaling factor (some state variables might remain constant)
X /= std

In [19]:
# Model definition
graph = tf.Graph()
with graph.as_default():
    x = tf.placeholder(tf.float32, [None, num_inputs])
    y = tf.placeholder(tf.float32, [None, num_outputs])

    def multilayer_perceptron(x, weights, biases):
        layer_1 = tf.matmul(x, weights['h1']) + biases['b1']
        layer_1 = tf.nn.relu(layer_1)

        out_layer = tf.matmul(layer_1, weights['out']) + biases['out']
        return out_layer

    weights = {'h1': tf.Variable(tf.truncated_normal([num_inputs, 128], stddev=math.sqrt(1.0/num_inputs))),
        'out': tf.Variable(tf.truncated_normal([128, num_outputs], stddev=math.sqrt(1.0/128)))
    }
    biases = {'b1': tf.Variable(tf.random_normal([128])),
        'out': tf.Variable(tf.random_normal([num_outputs]))
    }

    # Construct model
    pred = multilayer_perceptron(x, weights, biases)

    # Define loss and optimizer
    cost = tf.losses.mean_squared_error(y, pred) + 0.002*(tf.nn.l2_loss(weights['h1']) + tf.nn.l2_loss(weights['out']))
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    
    saver = tf.train.Saver()

In [9]:
# Training against expert data
with tf.Session(graph=graph) as sess:
    tf.global_variables_initializer().run()

    # Training cycle
    for epoch in range(100):
        avg_cost = 0.0
        total_batch = int(n_examples/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_x, batch_y = ( X[i*batch_size:(i+1)*batch_size],
                                 Y[i*batch_size:(i+1)*batch_size] )
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_x,
                                                          y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % 10 == 0:
            print("Epoch {}:\t{:.9f}".format(epoch+1, avg_cost))
    print("Training Completed")
    save_path = saver.save(sess, "tmp/model.ckpt")

Epoch 1:	0.431990650
Epoch 11:	0.057077488
Epoch 21:	0.053175648
Epoch 31:	0.051997270
Epoch 41:	0.051435711
Epoch 51:	0.051239512
Epoch 61:	0.051128175
Epoch 71:	0.051002328
Epoch 81:	0.050965433
Epoch 91:	0.050943407
Training Completed


In [10]:
# Testing learned policy
with tf.Session(graph=graph) as sess:  
    saver.restore(sess, "tmp/model.ckpt")
    
    def learned_policy_fn(obs):
            # Make prediction using standardized data
            obs -= mean
            obs /= std
            action = sess.run(pred, feed_dict={x: obs[None,:], y:batch_y})
            return action.T

    env = gym.make(ENV_NAME)
    max_steps = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')

    returns = []
    observations = []
    actions = []
    for i in range(3):
        print('iter', i)
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            action = learned_policy_fn(obs)
            observations.append(obs)
            actions.append(action)
            obs, r, done, _ = env.step(action)
            totalr += r
            steps += 1
            env.render()
            if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
            if steps >= max_steps:
                break
        returns.append(totalr)

    print('returns', returns)
    print('mean return', np.mean(returns))
    print('std of return', np.std(returns))

[2017-02-19 22:05:29,981] Making new env: Walker2d-v1


('iter', 0)
100/1000
200/1000
300/1000
('iter', 1)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
('iter', 2)
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
1000/1000
('returns', [1575.2115742863768, 5487.6759614679067, 5384.9060031639738])
('mean return', 4149.2645129727525)
('std of return', 1820.6137819340711)


# DAgger

In [5]:
observations = expert_data['observations']
actions = expert_data['actions'][:,0]
print('observations shape: {}'.format(observations.shape))
print('actions shape: {}'.format(actions.shape))

num_inputs = observations.shape[-1]
num_outputs = actions.shape[-1]

observations shape: (300000, 11)
actions shape: (300000, 2)


In [6]:
# Model definition
x = tf.placeholder(tf.float32, [None, num_inputs])
y = tf.placeholder(tf.float32, [None, num_outputs])

def multilayer_perceptron(x, weights, biases):
    layer_1 = tf.matmul(x, weights['h1']) + biases['b1']
    layer_1 = tf.nn.relu(layer_1)

    out_layer = tf.matmul(layer_1, weights['out']) + biases['out']
    return out_layer

weights = {'h1': tf.Variable(tf.truncated_normal([num_inputs, 128], stddev=math.sqrt(1.0/num_inputs))),
    'out': tf.Variable(tf.truncated_normal([128, num_outputs], stddev=math.sqrt(1.0/128)))
}
biases = {'b1': tf.Variable(tf.random_normal([128])),
    'out': tf.Variable(tf.random_normal([num_outputs]))
}

pred = multilayer_perceptron(x, weights, biases)

cost = tf.losses.mean_squared_error(y, pred) + 0.002*(tf.nn.l2_loss(weights['h1']) + tf.nn.l2_loss(weights['out']))
optimizer = tf.train.AdamOptimizer(0.0001).minimize(cost)

saver = tf.train.Saver()

In [10]:
with tf.Session() as sess_ini:
    tf.global_variables_initializer().run()
    saver.save(sess_ini, "tmp/model.ckpt")

In [11]:
mean = observations.mean(axis=0)
std = observations.std(axis=0) + 1e-5

sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

for iter in range(50):
    print('iter {}'.format(iter+1))
    n_examples = observations.shape[0]
    print('n_examples: {}'.format(n_examples))
    batch_size = 1000
    total_batch = int(n_examples/batch_size)
    print(total_batch)

    X, Y = observations, actions

    # Standardize
    X -= mean
    X /= std
    
    saver.restore(sess, "tmp/model.ckpt")
    
    # Training cycle
    for epoch in range(50):
        X, Y = shuffle(X, Y)
        avg_cost = 0.0
        for i in range(total_batch):
            batch_x, batch_y = ( X[i*batch_size:(i+1)*batch_size],
                                 Y[i*batch_size:(i+1)*batch_size] )
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_x,
                                                          y: batch_y})
            avg_cost += c / total_batch
        if epoch % 10 == 0:
            print("Epoch {}:\t{:.9f}".format(epoch+1, avg_cost))
    print("Training Completed")
    saver.save(sess, "tmp/model.ckpt")

    def learned_policy_fn(obs):
        # Make prediction using standardized data
        obs -= mean
        obs /= std
        action = sess.run(pred, feed_dict={x: obs[None,:]})
        return action.T

    _env = gym.make(ENV_NAME)
    _max_steps = _env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')

    _returns = []
    _observations = []
    _actions = []
    for i in range(50):
        #print('iter', i)
        obs = _env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            action = learned_policy_fn(obs)
            _observations.append(obs)
            _actions.append(action)
            obs, r, done, _ = _env.step(action)
            totalr += r
            steps += 1
            #_env.render()
            #if steps % 100 == 0: print("%i/%i"%(steps, _max_steps))
            if steps >= _max_steps:
                break
        _returns.append(totalr)

    #print('returns', _returns)
    print('mean return', np.mean(_returns))
    print('std of return', np.std(_returns))

    tf_util.initialize()
    
    expert_actions = []
    for obs in _observations:
        action = policy_fn(obs[None,:])
        expert_actions.append(action)

    #3: Merge old observations and actions into one dataset
    actions = np.concatenate((actions, np.array(expert_actions)[:, 0]), axis=0)
    observations = np.concatenate((observations, _observations), axis=0)

sess.close()

iter 1
n_examples: 320000
320
Epoch 1:	0.320277358
Epoch 11:	0.055332253
Epoch 21:	0.021047579
Epoch 31:	0.012908232
Epoch 41:	0.011704342
Training Completed


[2017-02-20 18:05:48,978] Making new env: Reacher-v1


('mean return', -13.192523649680176)
('std of return', 4.7954466448860034)
Instructions for updating:
Please use tf.global_variables instead.


[2017-02-20 18:05:50,357] From tf_util.py:91: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.


Instructions for updating:
Use `tf.variables_initializer` instead.


[2017-02-20 18:05:50,358] From tf_util.py:92: initialize_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.variables_initializer` instead.


iter 2
n_examples: 322500
322
Epoch 1:	0.015704805
Epoch 11:	0.014602113
Epoch 21:	0.014519583
Epoch 31:	0.014477999
Epoch 41:	0.014450025
Training Completed


[2017-02-20 18:06:20,996] Making new env: Reacher-v1


('mean return', -12.426986037761434)
('std of return', 2.9346539159958298)
Instructions for updating:
Please use tf.global_variables instead.


[2017-02-20 18:06:22,475] From tf_util.py:91: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.


Instructions for updating:
Use `tf.variables_initializer` instead.


[2017-02-20 18:06:22,476] From tf_util.py:92: initialize_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.variables_initializer` instead.


iter 3
n_examples: 325000
325
Epoch 1:	0.016005466
Epoch 11:	0.014807669
Epoch 21:	0.014756441
Epoch 31:	0.014667120
Epoch 41:	0.014653034
Training Completed


[2017-02-20 18:06:51,193] Making new env: Reacher-v1


('mean return', -11.208264614605223)
('std of return', 4.162401201179315)
Instructions for updating:
Please use tf.global_variables instead.


[2017-02-20 18:06:52,592] From tf_util.py:91: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.


Instructions for updating:
Use `tf.variables_initializer` instead.


[2017-02-20 18:06:52,593] From tf_util.py:92: initialize_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.variables_initializer` instead.


iter 4
n_examples: 327500
327
Epoch 1:	0.031509620
Epoch 11:	0.015246653
Epoch 21:	0.015652599
Epoch 31:	0.015237352
Epoch 41:	0.015236335
Training Completed


[2017-02-20 18:07:21,429] Making new env: Reacher-v1


('mean return', -11.630598120666507)
('std of return', 1.9811282452876811)
Instructions for updating:
Please use tf.global_variables instead.


[2017-02-20 18:07:22,836] From tf_util.py:91: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.


Instructions for updating:
Use `tf.variables_initializer` instead.


[2017-02-20 18:07:22,837] From tf_util.py:92: initialize_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.variables_initializer` instead.


iter 5
n_examples: 330000
330
Epoch 1:	0.581866940
Epoch 11:	0.018308944
Epoch 21:	0.088821967
Epoch 31:	0.044117543
Epoch 41:	0.021657736
Training Completed


[2017-02-20 18:07:51,711] Making new env: Reacher-v1


('mean return', -12.797071903476128)
('std of return', 3.6540924920086537)
Instructions for updating:
Please use tf.global_variables instead.


[2017-02-20 18:07:53,103] From tf_util.py:91: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.


Instructions for updating:
Use `tf.variables_initializer` instead.


[2017-02-20 18:07:53,104] From tf_util.py:92: initialize_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.variables_initializer` instead.


iter 6
n_examples: 332500
332
Epoch 1:	19.758266548


KeyboardInterrupt: 

In [9]:
with tf.Session() as sess:
    saver.restore(sess, "tmp/model.ckpt")
    
    def learned_policy_fn(obs):
            # Make prediction using standardized data
            obs -= mean
            obs /= std
            action = sess.run(pred, feed_dict={x: obs[None,:]})
            return action.T

    _env = gym.make(ENV_NAME)
    _max_steps = _env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')

    _returns = []
    _observations = []
    _actions = []
    for i in range(50):
        print('iter', i)
        obs = _env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            action = learned_policy_fn(obs)
            _observations.append(obs)
            _actions.append(action)
            obs, r, done, _ = _env.step(action)
            totalr += r
            steps += 1
            _env.render()
            if steps % 100 == 0: print("%i/%i"%(steps, _max_steps))
            if steps >= _max_steps:
                break
        _returns.append(totalr)

    print('returns', _returns)
    print('mean return', np.mean(_returns))
    print('std of return', np.std(_returns))

[2017-02-20 18:04:43,888] Making new env: Reacher-v1


('iter', 0)
('iter', 1)
('iter', 2)
('iter', 3)
('iter', 4)
('iter', 5)
('iter', 6)
('iter', 7)
('iter', 8)
('iter', 9)
('iter', 10)
('iter', 11)
('iter', 12)
('iter', 13)
('iter', 14)
('iter', 15)
('iter', 16)
('iter', 17)
('iter', 18)
('iter', 19)
('iter', 20)
('iter', 21)
('iter', 22)
('iter', 23)
('iter', 24)
('iter', 25)
('iter', 26)
('iter', 27)
('iter', 28)
('iter', 29)
('iter', 30)
('iter', 31)
('iter', 32)
('iter', 33)
('iter', 34)
('iter', 35)
('iter', 36)
('iter', 37)
('iter', 38)
('iter', 39)
('iter', 40)
('iter', 41)
('iter', 42)


KeyboardInterrupt: 