# Batu Aytemiz Assignment 1

In this assignment we:
0. Create a dataset from given expert policies.
1. Use the dataset to train and test a policy. (Behavior Cloning)
2. Implement DAGGER (Imitation Learning)

In [89]:
import os
import pickle
import tensorflow as tf
import numpy as np
import tf_util
import gym
import load_policy

## Section 1 - Behavior Cloning
### I have created the behivor data by running the given expert policies and saving the (state - action distribution) pairs. In this part I am creating a simple neural network to predict the action distribution given a state.

In [106]:
names = ["HalfCheetah-v2", "Ant-v2", "Humanoid-v2", "Walker2d-v2"]
ENV_NAME = names[-1]

def load_data():
    with open(f"bc_data/bc_data_{ENV_NAME}", "rb") as data_file:
        raw_data = pickle.load(data_file)

    x_train = raw_data[0]
    y_train_raw = raw_data[1]
    y_train_raw = y_train_raw.squeeze()
    return x_train, y_train_raw

x_train, y_train = load_data()
state_space = x_train.shape[1]
action_space = y_train.shape[1]


print(f"There are {len(x_train)} data points gathered from running the given expert policy.")
print(f"The state space is of size: {state_space}.")
print(f"The action space is of size: {action_space}.")

There are 10000 data points gathered from running the given expert policy.
The state space is of size: 17.
The action space is of size: 6.


### I am using a very simple NN architecture as our state and action spaces are relatively simple. 

In [107]:
def create_model():
    hidden_layer_size = 64
    tf.reset_default_graph()
    input_ph = tf.placeholder(dtype=tf.float32, shape=[None, state_space])
    output_ph = tf.placeholder(dtype=tf.float32, shape=[None, action_space])
    
    w0 = tf.get_variable(name='W0', shape=[state_space, hidden_layer_size],  initializer=tf.contrib.layers.xavier_initializer())
    w1 = tf.get_variable(name='W1', shape=[hidden_layer_size, hidden_layer_size],           initializer=tf.contrib.layers.xavier_initializer())
    w2 = tf.get_variable(name='W2', shape=[hidden_layer_size, action_space], initializer=tf.contrib.layers.xavier_initializer())
    
    b0 = tf.get_variable(name="b0", shape=[hidden_layer_size], initializer=tf.constant_initializer(0.))
    b1 = tf.get_variable(name="b1", shape=[hidden_layer_size], initializer=tf.constant_initializer(0.))
    b2 = tf.get_variable(name="b2", shape=[action_space], initializer=tf.constant_initializer(0.))
    
    weights = [w0, w1, w2]
    biases = [b0, b1, b2]
    activations = [tf.nn.relu, tf.nn.relu, None]
    
    layer = input_ph
    for W, b, activation in zip(weights, biases, activations):
        layer = tf.matmul(layer, W)
        layer = layer + b
        if activation is not None:
            layer = activation(layer)
    output_pred = layer
    return input_ph, output_ph, output_pred

In [110]:
def train_model(x_train=x_train, y_train=y_train, imitation=False):
    input_ph, output_ph, output_pred = create_model()
    print(f"Training model with {len(x_train)} data points.")
    with tf.Session() as sess:
        # create saver to save model variables
        saver = tf.train.Saver()
        mse = tf.reduce_mean(0.5 * tf.square(output_pred - output_ph))

        # create optimizer
        opt = tf.train.AdamOptimizer().minimize(mse)

        
        sess.run(tf.global_variables_initializer())
        if imitation:
            saver.restore(sess, "/tmp/model.ckpt")
        
        
        batch_size = 32
        for training_step in range(10001):
            # get a random subset of the training data
            indices = np.random.randint(low=0, high=len(x_train), size=batch_size)
            input_batch = x_train[indices]
            output_batch = y_train[indices]

            # run the optimizer and get the mse
            _, mse_run = sess.run([opt, mse], feed_dict={input_ph: input_batch, output_ph: output_batch})
            # print the mse every so often
            if training_step % 1000 == 0 and not imitation:
                print('{0:04d} mse: {1:.3f}'.format(training_step, mse_run))
                saver.save(sess, '/tmp/model.ckpt')
    print(f"Final mse: {mse_run:.3f}")
print("We are training the model using the expert data.")
train_model(x_train, y_train)

We are training the model using the expert data.
Training model with 10000 data points.
0000 mse: 2.010
1000 mse: 0.033
2000 mse: 0.025
3000 mse: 0.016
4000 mse: 0.019
5000 mse: 0.017
6000 mse: 0.010
7000 mse: 0.008
8000 mse: 0.009
9000 mse: 0.010
10000 mse: 0.009
Final mse: 0.009


In [115]:
def test_model():
    input_ph, output_ph, output_pred = create_model()
    with tf.Session() as sess:
        saver = tf.train.Saver()
        saver.restore(sess, "/tmp/model.ckpt")

        env = gym.make(ENV_NAME)
        max_steps = 10000

        for i in range(1):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                actions = output_pred_run = sess.run(output_pred, feed_dict={input_ph: (obs,)})
                action = np.argmax(actions)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
                if steps >= max_steps:
                    break
        print(f"The total rewards that has been accumulated is: {totalr}")
        
test_model()

INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
iter 0
100/10000
The total rewards that has been accumulated is: 75.55992480680706


### Not surprisingly the model has done terrilby. The agent has diverged from the training data (by falling over) allmost immediately and then kept on predicting the same action which never allowed it to get up.

### We can test other enviroments but the result is the same.

In [83]:
ENV_NAME = "Humanoid-v2"
    
x_train, y_train = load_data()
state_space = x_train.shape[1]
action_space = y_train.shape[1]

print(f"There are {len(x_train)} data points gathered from running the given expert policy.")
print(f"The state space is of size: {state_space}.")
print(f"The action space is of size: {action_space}.")

print(f"\n Training of {ENV_NAME} has started.")
train_model(x_train, y_train)
print("Training started.\n")

print("Testing started.")
test_model()

There are 10000 data points gathered from running the given expert policy.
The state space is of size: 376.
The action space is of size: 17.

 Training of Humanoid-v2 has started.
Training model with 10000 data points.
0000 mse: 119.905
1000 mse: 0.320
2000 mse: 0.191
3000 mse: 0.210
4000 mse: 0.173
5000 mse: 0.143
6000 mse: 0.174
7000 mse: 0.139
8000 mse: 0.151
9000 mse: 0.108
10000 mse: 0.119
Final mse: 0.119
Training started.

Testing started.
INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
iter 0
The total rewards that has been accumulated is: 238.74245546199305


# Section 2 - Imitation Learning

In [116]:
def get_dataset_from_current_policy(rollout_count=10):
    input_ph, output_ph, output_pred = create_model()
    with tf.Session() as sess:
        saver = tf.train.Saver()
        saver.restore(sess, "/tmp/model.ckpt")

        env = gym.make(ENV_NAME)
        max_steps = 10000

        new_states = []
        for i in range(rollout_count):
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                actions = output_pred_run = sess.run(output_pred, feed_dict={input_ph: (obs,)})
                action = np.argmax(actions)
                obs, r, done, _ = env.step(action)
                new_states.append(obs)
                totalr += r
                steps += 1
                # if steps % 100 == 0: print("Steps: %i/%i with reward: %i"%(steps, max_steps,totalr))
                if steps >= max_steps:
                    break
        return new_states, totalr

def get_labels_from_expert(unlabeled_data_set):
    with tf.Session() as sess:
        # {ENV_NAME}.pkl
        expert_policy_fn = load_policy.load_policy(f"experts/HalfCheetah-v2.pkl")
        action_labels = [expert_policy_fn(unlabeled_data[None,:]) for unlabeled_data in unlabeled_data_set]
        return np.array(action_labels).squeeze()

In [None]:
ENV_NAME = "Walker2d-v2"

x_train, y_train = load_data()
state_space = x_train.shape[1]
action_space = y_train.shape[1]

def run_imitation_learning(x_train, y_train, imitation_count = 99):
    train_model()
    for i in range(imitation_count):
        new_data_set, reward = get_dataset_from_current_policy()
        new_label_set = get_labels_from_expert(new_data_set)
        x_train = np.concatenate((x_train, new_data_set), axis=0)
        y_train = np.concatenate((y_train, new_label_set), axis=0)
        train_model(x_train, y_train, imitation=True)
        print(f"The reward in iteration {i+1} is: {reward}")
run_imitation_learning(x_train, y_train)

Training model with 10000 data points.
0000 mse: 2.093
1000 mse: 0.047
2000 mse: 0.024
3000 mse: 0.027
4000 mse: 0.022
5000 mse: 0.017
6000 mse: 0.016
7000 mse: 0.011
8000 mse: 0.010
9000 mse: 0.009
10000 mse: 0.006
Final mse: 0.006
INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
obs (1, 17) (1, 17)
Training model with 12152 data points.
INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
Final mse: 0.004
The reward in iteration 1 is: 73.9825055206486
INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
obs (1, 17) (1, 17)
Training model with 14029 data points.
INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
