In [1]:
import pickle
import tensorflow as tf
import numpy as np
import tf_util
import gym
import load_policy

from sklearn.model_selection import train_test_split

import keras
from keras import backend as K

from keras.regularizers import l2, activity_l2
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dense, Lambda
from keras.optimizers import Adam

import matplotlib.pyplot as plt
from tqdm import tqdm

# bokeh stuff
from ipywidgets import interact

from bokeh.io import push_notebook, output_notebook
from bokeh.io import show
from bokeh.plotting import figure

import bokeh.plotting as bk
output_notebook()

# plotly stuff
import plotly as py
from plotly.graph_objs import *
import plotly.tools as tls
# Plotly credentials are taken from the environment so that no secret is stored
# in the notebook. Export PLOTLY_USERNAME and PLOTLY_API_KEY before starting
# the kernel. Any API key that was previously committed here should be revoked.
import os

_plotly_username = os.environ.get('PLOTLY_USERNAME')
_plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_username and _plotly_api_key:
    py.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)
else:
    print("PLOTLY_USERNAME / PLOTLY_API_KEY not set; skipping plotly credential setup")

Using TensorFlow backend.


In [2]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of the local devices whose device_type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

print(get_available_gpus())

[u'/gpu:0']


## Generate Expert Data

In [3]:
# Path of the expert policy file handed to load_policy.load_policy().
expert_policy_file = "experts/Walker2d-v1.pkl"
# Gym environment id passed to gym.make() when rolling out a policy.
envname = 'Walker2d-v1'

In [4]:
# Rollout settings, read as notebook globals by generate_expert_data().
# render: when true, env.render() is called after every environment step.
render = True
# max_timesteps: per-episode step cap; a falsy value falls back to the
# 'wrapper_config.TimeLimit.max_episode_steps' tag of the env spec.
max_timesteps = 1000
# num_rollouts: number of episodes to roll out.
num_rollouts = 500

def generate_expert_data(output_path='traindata.npz'):
    """Roll out the expert policy and save the visited (observation, action) pairs.

    Reads the notebook globals ``expert_policy_file``, ``envname``, ``render``,
    ``max_timesteps`` and ``num_rollouts``.

    Args:
        output_path: ``.npz`` file that receives the arrays ``observations``
            and ``actions``. Defaults to ``'traindata.npz'``.

    Returns:
        None. Per-episode returns and the saved array shapes are printed.
    """
    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        env = gym.make(envname)
        max_steps = max_timesteps or env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
        print("timestep_limit {}".format(max_steps))

        returns = []
        observations = []
        actions = []
        print("roll_outs --")
        try:
            for i in tqdm(range(num_rollouts)):
                # Every episode starts from a fresh reset, so no reset is
                # needed before the loop.
                obs = env.reset()
                done = False
                totalr = 0.
                steps = 0
                while not done:
                    action = policy_fn(obs[None,:])
                    # Record the observation together with the action the expert chose for it.
                    observations.append(obs)
                    actions.append(action)
                    obs, r, done, _ = env.step(action)
                    totalr += r
                    steps += 1
                    if render:
                        env.render()
                    if steps >= max_steps:
                        break
                returns.append(totalr)
        finally:
            # Release the environment (and any render window) even if a rollout fails.
            env.close()

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        # Save, then read the file back as a sanity check of what was written.
        np.savez(output_path, observations=np.array(observations), actions=np.array(actions))
        with np.load(output_path) as data:
            print(data.files)
            X = data['observations']
            y = data['actions']
        print("observations-----------------{}".format(X.shape))
        print("actions----------------------{}".format(y.shape))

generate_expert_data()

[2017-02-22 23:24:22,455] Making new env: Walker2d-v1


loading and building expert policy
('obs', (1, 17), (1, 17))
loaded and built


  0%|          | 0/500 [00:00<?, ?it/s]

timestep_limit 1000
roll_outs --


100%|██████████| 500/500 [2:18:33<00:00, 16.67s/it]  


('returns', [5574.9011693718085, 5581.2722455961193, 5497.0422288474374, 5601.1923917659542, 5509.993713908636, 5480.2229189221398, 5558.9363914611622, 5567.261694998183, 5553.1163013987234, 5459.5582531093924, 5589.7833157516543, 5598.541835688563, 5527.408396838905, 5604.5112604474598, 5576.1762753508374, 5472.6274898712172, 5448.5958998994902, 5515.2364265244105, 5533.8242553249229, 5553.5305519534331, 5520.207963394434, 5561.2703102132973, 5483.620989680011, 5496.9926382696467, 5511.7512062520818, 5519.1413193048847, 5521.3512741914874, 5510.0000170650992, 5575.0653783183016, 5547.8656230828701, 5589.7039263791276, 5574.7918877301381, 5531.0485523272437, 5534.1631231280408, 5560.835948747349, 5550.3483060650315, 5451.6375331607696, 5561.6972233451788, 5413.9022662939769, 5373.7366830101337, 5560.724669810691, 5555.8876179307663, 5532.8985046730295, 5554.9756396497251, 5567.0885710690363, 5537.1180125275359, 5531.0165872519119, 5465.8350577301007, 5457.9122093694623, 5520.3809475108

In [None]:
# Create a TensorFlow session that logs device placement and register it as
# the session used by the Keras backend.
session_config = tf.ConfigProto(log_device_placement=True)
sess = tf.Session(config=session_config)
K.set_session(sess)

## Keras with Tensorflow sample code

In [None]:
# Imports used only by this sample cell, grouped so its dependencies are visible up front.
from keras.objectives import categorical_crossentropy
from keras.metrics import categorical_accuracy as accuracy
from tensorflow.examples.tutorials.mnist import input_data

# setup layers
# this placeholder will contain our input digits, as flat vectors
img = tf.placeholder(tf.float32, shape=(None, 784))

# Keras layers can be called on TensorFlow tensors:
x = Dense(128, activation='relu')(img)  # fully-connected layer with 128 units and ReLU activation
x = Dense(128, activation='relu')(x)
preds = Dense(10, activation='softmax')(x)  # output layer with 10 units and a softmax activation

labels = tf.placeholder(tf.float32, shape=(None, 10))

# Mean cross-entropy between the one-hot labels and the softmax predictions.
loss = tf.reduce_mean(categorical_crossentropy(labels, preds))

mnist_data = input_data.read_data_sets('MNIST_data', one_hot=True)

# train
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
# NOTE: this (re)initializes every global variable of the graph in the Keras session.
K.get_session().run(tf.global_variables_initializer())
with sess.as_default():
    for i in range(1000):
        batch = mnist_data.train.next_batch(50)
        train_step.run(feed_dict={img: batch[0],
                                  labels: batch[1]})

# statistics
acc_value = accuracy(labels, preds)
with sess.as_default():
    # print() with a single argument behaves the same on Python 2 and Python 3.
    print(acc_value.eval(feed_dict={img: mnist_data.test.images,
                                    labels: mnist_data.test.labels}))

## Some Plotting APIs

In [None]:
def plot_history(history_dict):
    """Show training and validation loss per epoch with matplotlib.

    history_dict must contain the keys 'loss' and 'val_loss'; epochs are
    numbered from 1. Training loss is drawn with 'bo' markers and validation
    loss with 'b+' markers.
    """
    train_loss = history_dict['loss']
    val_loss = history_dict['val_loss']
    epoch_axis = range(1, len(train_loss) + 1)
    for series, marker in ((train_loss, 'bo'), (val_loss, 'b+')):
        plt.plot(epoch_axis, series, marker)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.show()

def plotly_history(history_dict):
    """Draw the loss curves on a new matplotlib figure and return that figure.

    history_dict must contain the keys 'loss' and 'val_loss'. The figure is
    returned without being shown so the caller can pass it on.
    """
    loss_fig = plt.figure()
    train_loss, val_loss = history_dict['loss'], history_dict['val_loss']
    epoch_axis = range(1, len(train_loss) + 1)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    for series, marker in ((train_loss, 'bo'), (val_loss, 'b+')):
        plt.plot(epoch_axis, series, marker)
    return loss_fig

# fig = plotly_history(history.history)
# py.plotly.iplot_mpl(fig, strip_style = True)

def bokeh_history(history_dict):
    """Show training (and, if present, validation) loss per epoch as a bokeh line plot.

    Args:
        history_dict: mapping with a 'loss' sequence and optionally a
            'val_loss' sequence, one value per epoch (e.g. ``history.history``
            as returned by ``model.fit``).
    """
    train_loss = history_dict['loss']
    # 'val_loss' is absent when the model was fit without validation data.
    val_loss = history_dict.get('val_loss')
    # Materialise the epoch numbers so bokeh receives a real sequence.
    epochs = list(range(1, len(train_loss) + 1))

    p = figure(tools="pan,box_zoom,reset,save",
            plot_height=400, plot_width=800,
            title="loss vs epoch",
            x_axis_label='epoch',
            y_axis_label='loss')
    # The curves are losses, so the legend says "loss" rather than "acc".
    p.line(epochs, train_loss, legend="train_loss", line_color="blue")
    if val_loss is not None:
        p.line(epochs, val_loss, legend="val_loss", line_color="orange")
    show(p)

## Behavioral Cloning

In [None]:
# Recorded rollouts used as the supervised training set.
train_data = np.load('walker2d-v1-r200-t100_traindata.npz')
# Seed passed to both train_test_split calls so the splits are reproducible.
random_sampling = 2344

observations = train_data['observations']
raw_actions = train_data['actions']

print("observations {}".format(observations.shape))
print("actions {}".format(raw_actions.shape))

# Check the array layout before slicing: observations are unpacked as 2-D and
# actions as 3-D, with one action entry per observation.
N, input_dims = observations.shape
N1, action_rows, output_dims = raw_actions.shape
assert N1 == N, "observations ({}) and actions ({}) differ in length".format(N, N1)

# Keep only index 0 of the middle axis, giving a 2-D action array.
actions = raw_actions[:, 0]

# Train-Test-Validation split
# The validation fraction is taken from what remains after the test split,
# i.e. 20% of the remaining 80%.
test_split = 0.20
validation_split = 0.20
validation_splot = validation_split  # original (misspelled) name kept as an alias
X, X_test, y, y_test = train_test_split(observations, actions, test_size=test_split, random_state=random_sampling)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=validation_split, random_state=random_sampling)

for _label, _array in (("X", X), ("y", y),
                       ("X_test", X_test), ("y_test", y_test),
                       ("X_train", X_train), ("y_train", y_train),
                       ("X_val", X_val), ("y_val", y_val)):
    print("{} {}".format(_label, _array.shape))

In [None]:
# Mean and standard deviation along axis 0 for every split.
# X_mean / X_std are the values the Lambda layers of the models normalise with.
X_mean, X_std = X.mean(axis=0), X.std(axis=0)
y_mean, y_std = y.mean(axis=0), y.std(axis=0)

X_test_mean, X_test_std = X_test.mean(axis=0), X_test.std(axis=0)
y_test_mean, y_test_std = y_test.mean(axis=0), y_test.std(axis=0)

X_train_mean, X_train_std = X_train.mean(axis=0), X_train.std(axis=0)
y_train_mean, y_train_std = y_train.mean(axis=0), y_train.std(axis=0)

X_val_mean, X_val_std = X_val.mean(axis=0), X_val.std(axis=0)
y_val_mean, y_val_std = y_val.mean(axis=0), y_val.std(axis=0)

# standardizing data
# X_test -= X_test_mean
# X_test /= X_test_std

# X_train -= X_train_mean
# X_train /= X_train_std

# X_val -= X_val_mean
# X_val /= X_val_std


## Neural Network models

In [None]:
def nn_h1_build():
    """Build and compile a network with one 128-unit sigmoid hidden layer.

    Layer sizes come from the notebook globals ``input_dims`` and
    ``output_dims``. The output layer is linear; the model is compiled with
    MSE loss, the Adam optimizer and an MAE metric.
    """
    hidden = Dense(128, input_dim=input_dims, init='glorot_normal', activation='sigmoid')
    output = Dense(output_dims, init='glorot_normal')

    net = Sequential([hidden, output])
    net.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return net

def nn_h2_build():
    """Build and compile a network with three 128-unit hidden layers.

    The hidden activations are sigmoid, tanh and relu, in that order, followed
    by a linear output layer. Layer sizes come from the notebook globals
    ``input_dims`` and ``output_dims``. Compiled with MSE loss, the Adam
    optimizer and an MAE metric.
    """
    model = Sequential()
    # Only the first layer of a Sequential model needs the input size; later
    # layers take their input shape from the layer before them.
    model.add(Dense(128, input_dim=input_dims, init='glorot_normal', activation='sigmoid'))
    model.add(Dense(128, init='glorot_normal', activation='tanh'))
    model.add(Dense(128, init='glorot_normal', activation='relu'))
    model.add(Dense(output_dims, init='glorot_normal'))

    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model

def nn_h3_reg_build():
    """Build and compile an L2-regularised network with built-in input normalisation.

    A Lambda layer standardises the input with the notebook globals ``X_mean``
    and ``X_std``. It is followed by a 128-unit sigmoid layer, two 64-unit
    tanh layers with L2 weight/bias regularisation, and a linear output layer
    with L2 weight, bias and activity regularisation. Layer sizes come from
    the globals ``input_dims`` and ``output_dims``. Compiled with MSE loss and
    the Adam optimizer.
    """
    feature_mean = X_mean
    # A feature that is constant in the data has zero std; dividing by it
    # would feed inf/NaN into the network. Such features are divided by 1.
    feature_std = np.where(X_std == 0, 1.0, X_std)

    model = Sequential()
    model.add(Lambda(lambda x: (x - feature_mean) / feature_std, batch_input_shape=(None, input_dims)))
    # The Lambda layer already fixes the input shape, so Dense needs no input_dim.
    model.add(Dense(128, init='glorot_normal', activation='sigmoid'))
    model.add(Dense(64, init='glorot_normal', activation='tanh', W_regularizer=l2(0.01), b_regularizer=l2(0.01)))
    model.add(Dense(64, init='glorot_normal', activation='tanh', W_regularizer=l2(0.01), b_regularizer=l2(0.01)))
    model.add(Dense(output_dims, init='glorot_normal',W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01),b_regularizer=l2(0.01)))

    model.compile(loss='mse', optimizer='adam')
    return model

def regularized_model():
    """Build and compile a two-hidden-layer relu network with L2 regularisation.

    Both 64-unit hidden layers and the linear output layer use L2 weight,
    bias and activity regularisation (0.01 each) and 'normal' initialisation.
    Layer sizes come from the notebook globals ``input_dims`` and
    ``output_dims``. Compiled with MSE loss and the Adam optimizer.
    """
    model = Sequential()
    model.add(Dense(64, input_dim=input_dims, init='normal', activation='relu',W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01),b_regularizer=l2(0.01)))
    # Only the first layer needs input_dim; this one takes its input shape from the layer above.
    model.add(Dense(64, init='normal', activation='relu',W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01),b_regularizer=l2(0.01)))
    model.add(Dense(output_dims, init='normal',W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01),b_regularizer=l2(0.01)))

    model.compile(loss='mse', optimizer='adam')
    return model

def awesome_model(lr=None):
    """Build and compile a two-hidden-layer tanh network with input normalisation.

    A Lambda layer standardises the input with the notebook globals ``X_mean``
    and ``X_std``, followed by two 64-unit tanh layers and a linear output
    layer. Layer sizes come from the globals ``input_dims`` and
    ``output_dims``. Compiled with MSE loss, an MSE metric and Adam.

    Args:
        lr: learning rate for Adam. When omitted, the notebook global
            ``learning_rate`` is used, which must then be defined before the
            call.
    """
    if lr is None:
        lr = learning_rate

    feature_mean = X_mean
    # A feature that is constant in the data has zero std; dividing by it
    # would feed inf/NaN into the network. Such features are divided by 1.
    feature_std = np.where(X_std == 0, 1.0, X_std)

    model = Sequential([
        Lambda(lambda x: (x - feature_mean) / feature_std, batch_input_shape=(None, input_dims)),
        Dense(64, activation='tanh'),
        Dense(64, activation='tanh'),
        Dense(output_dims)
    ])

    opt = Adam(lr=lr)
    model.compile(optimizer=opt, loss='mse', metrics=['mse'])
    return model

In [None]:
# Fit the L2-regularised network with built-in normalisation and plot its loss curves.
model = nn_h3_reg_build()
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    nb_epoch=100,
    batch_size=10000,
    verbose=False,
)
bokeh_history(history.history)

In [None]:
# Fit the three-hidden-layer network and plot its loss curves.
model = nn_h2_build()
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    nb_epoch=100,
    batch_size=10000,
    verbose=False,
)
bokeh_history(history.history)

In [None]:
# Adam learning rate read by awesome_model(); it must be set before the model is built.
learning_rate = 0.001
model = awesome_model()
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    nb_epoch=100,
    batch_size=10000,
    verbose=False,
)
bokeh_history(history.history)

In [None]:
# Fit the single-hidden-layer network (50 epochs) and plot its loss curves.
model = nn_h1_build()
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    nb_epoch=50,
    batch_size=10000,
    verbose=False,
)
bokeh_history(history.history)

In [None]:
# Fit the regularised relu network and plot its loss curves.
model = regularized_model()
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    nb_epoch=100,
    batch_size=10000,
    verbose=False,
)
bokeh_history(history.history)

## Running Behavioral Cloned Model

In [None]:
def local_policy_fn(obs):
    """Return the action predicted for ``obs`` by the notebook-global ``model``.

    The prediction gets a new axis inserted at position 1 and is then fully
    transposed. Presumably ``model.predict`` returns (batch, output_dims),
    which would make the result (output_dims, 1, batch) -- TODO confirm.

    No explicit standardisation or clipping of the action is applied here.
    """
    predicted = model.predict(obs)
    with_extra_axis = predicted[:, np.newaxis]
    return np.transpose(with_extra_axis)

In [None]:
# Rollout settings for evaluating the cloned policy held in the global `model`.
render = True
max_timesteps = 1000
num_rollouts = 10

# Deliberately no `with tf.Session(): tf_util.initialize()` wrapper here.
# Keras runs model.predict() in the current default TensorFlow session when
# one exists, so a fresh session would evaluate the network without the
# weights learned by model.fit() (those live in the Keras session that was
# used during training).
env = gym.make(envname)
max_steps = max_timesteps or env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
print("timestep_limit {}".format(max_steps))

returns = []
observations = []
actions = []
print("roll_outs --")
try:
    for i in tqdm(range(num_rollouts)):
        # Every episode starts from a fresh reset, so no reset is needed before the loop.
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            action = local_policy_fn(obs[None,:])
            observations.append(obs)
            actions.append(action)
            obs, r, done, _ = env.step(action)
            totalr += r
            steps += 1
            if render:
                env.render()
            if steps >= max_steps:
                break
        returns.append(totalr)
finally:
    # Release the environment (and any render window) even if a rollout fails.
    env.close()

print('returns', returns)
print('mean return', np.mean(returns))
print('std of return', np.std(returns))

expert_data = {'observations': np.array(observations),
               'actions': np.array(actions)}

# save and load
# NOTE(review): this writes the cloned policy's rollouts to the same
# 'traindata.npz' that generate_expert_data() writes, and rebinds the
# notebook globals X and y -- confirm both are intended.
np.savez('traindata.npz', observations=np.array(observations), actions=np.array(actions))
data = np.load('traindata.npz')
print(data.files)
X = data['observations']
y = data['actions']
print("observations-----------------{}".format(X.shape))
print("actions----------------------{}".format(y.shape))