## HW3 part A by Denis Osipychev
### Training and testing my DQN on CartPole
##### import gym and local environments

In [7]:
import gym, time
import numpy as np
import matplotlib.pyplot as plt
import random
import tensorflow as tf
import tensorflow.contrib.slim as slim

In [None]:
# Create the CartPole-v0 environment and reset it to obtain an initial state.
# env.reset() is the last expression of the cell, so the notebook echoes the
# observation it returns. No action is applied to the environment here.
env = gym.make("CartPole-v0")
env.reset()

In [None]:
# Hyperparameters for the CartPole run, grouped by what they control.

# -- network / optimiser
n_neurons = 100         # width of the hidden layer of the Q-network
alpha = 0.001           # learning rate handed to the Adam optimiser

# -- Q-learning
gamma = 0.99            # discount applied to the max next-state Q-value
epsilon = 0.7           # initial exploration probability; decayed once per episode

# -- replay memory
batch_size = 32         # transitions drawn per gradient step
buffer_size = 10000     # capacity of the replay buffer

# -- run length
episode_lenght = 100    # maximum number of env steps per episode
n_episodes = 5000       # number of training episodes

In [3]:
# nn structure for Q module
class Qnetwork():
    """One-hidden-layer fully connected Q-value network plus its training ops.

    Attributes created on the current default TF graph:
        input:         float32 placeholder, shape [None, input_size] (batch of states).
        output:        tensor of shape [None, output_size], one Q-value per action.
        new_q:         float32 placeholder, shape [None], regression target per sample.
        action_holder: float32 placeholder, shape [None, output_size]; the training
                       loop feeds a one-hot encoding of the action taken.
        Q_action:      Q-value of the selected action for each sample.
        loss:          sum over the batch of squared (new_q - Q_action).
        update_model:  Adam step minimising `loss`.
    """

    def __init__(self, input_size, layer_size, output_size, learning_rate=None):
        """Build the network.

        Args:
            input_size:    dimensionality of a state vector.
            layer_size:    number of units in the hidden layer.
            output_size:   number of discrete actions.
            learning_rate: Adam learning rate; when None the module-level
                           `alpha` is used, as before.
        """
        if learning_rate is None:
            learning_rate = alpha

        self.input = tf.placeholder(shape=[None,input_size],dtype=tf.float32)
        hidden = slim.fully_connected(self.input,layer_size,biases_initializer=None,activation_fn=tf.nn.relu)
        # The output layer must be linear: slim.fully_connected applies ReLU by
        # default, which would clip every Q-value at zero and kill the gradient
        # for any action whose pre-activation is negative.
        self.output = slim.fully_connected(hidden,output_size,biases_initializer=None,activation_fn=None)

        self.new_q = tf.placeholder(shape=[None,],dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None,output_size],dtype=tf.float32)
        # Multiplying by the action mask and summing over the action axis picks
        # out Q(s, a) for the action encoded in action_holder.
        self.Q_action = tf.reduce_sum(tf.multiply(self.output,self.action_holder), axis=1)

        self.loss = tf.reduce_sum(tf.square(self.new_q - self.Q_action))
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.update_model = optimizer.minimize(self.loss)

In [4]:
# epsilon greedy policy
def greedy_policy(Q):
    """Pick an action epsilon-greedily from the Q-values `Q`.

    With probability given by the module-level `epsilon` a uniformly random
    action index in [0, env.action_space.n) is returned; otherwise the index
    of the largest entry of `Q` (np.argmax over the flattened array).
    """
    explore = np.random.uniform() < epsilon
    if not explore:
        return np.argmax(Q)
    return np.random.randint(env.action_space.n)

In [5]:
class ReplayMemory():
    """Fixed-capacity ring buffer of arbitrary items with uniform random sampling."""

    def __init__(self, capacity):
        """Create an empty buffer that holds at most `capacity` items."""
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, arg):
        """Store `arg`, overwriting the oldest slot once the buffer is full."""
        slot = self.position
        if self.size() < self.capacity:
            # still filling up: grow the list by one placeholder slot
            self.memory.append(None)
        self.memory[slot] = arg
        # advance the write cursor, wrapping around at capacity
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return up to `batch_size` distinct stored items chosen at random.

        If fewer items are stored than requested, all of them are returned
        (in random order).
        """
        count = min(batch_size, len(self.memory))
        return random.sample(self.memory, count)

    def size(self):
        """Number of items currently stored."""
        return len(self.memory)

In [None]:
# Build a fresh graph holding the Q-network, and create the replay buffer.
tf.reset_default_graph()  # start from an empty default graph

# replay buffer and the per-episode linear decay step for epsilon
buffer = ReplayMemory(buffer_size)
epsilon_reduction = epsilon / n_episodes

# network sized from the environment: state dimension in, one Q-value per action out
myAgent = Qnetwork(input_size=env.observation_space.shape[0],
                   layer_size=n_neurons,
                   output_size=env.action_space.n)
merged = tf.summary.merge_all()

In [None]:
# Open a TensorFlow session and give every graph variable its initial value.
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)
print("Initialized Variables")

In [None]:
# Train the agent: one outer iteration per episode, one inner iteration per env step.
history = []  # one row per episode: [episode, total reward, epsilon, alpha, steps taken]
i = 0

for i in range(1, n_episodes + 1):

    s = env.reset()
    total_reward = 0
    j = 0
    d = False

    # roll out a single episode, capped at episode_lenght steps
    for j in range(1, episode_lenght + 1):

        # act epsilon-greedily on the Q-values predicted for the current state
        q = sess.run(myAgent.output, feed_dict={myAgent.input: [s]})
        action = greedy_policy(q)
        s_new, r, d, _ = env.step(action)

        # store the transition; the action is kept as a one-hot vector so it can
        # be fed straight into the network's action mask
        action_one_hot = np.zeros(env.action_space.n)
        action_one_hot[action] = 1
        buffer.push([s, action_one_hot, r, d, s_new])

        total_reward += r
        s = s_new

        # one gradient step on a random minibatch, once the buffer holds more
        # than batch_size transitions
        if buffer.size() > batch_size:
            st, at, rt, dt, snewt = zip(*buffer.sample(batch_size))
            qt = sess.run(myAgent.output, feed_dict={myAgent.input: snewt})
            # target: the reward alone for terminal transitions, otherwise the
            # reward plus the discounted best Q-value of the next state
            yt = [rt[k] if dt[k] else rt[k] + gamma*np.max(qt[k])
                  for k in range(batch_size)]

            _ = sess.run(myAgent.update_model,
                         feed_dict={myAgent.input: st,
                                    myAgent.action_holder: at,
                                    myAgent.new_q: yt})

        # the environment reported the end of the episode
        if d:
            break

    # linear exploration decay with a floor of 0.01
    epsilon = max(0.01, epsilon - epsilon_reduction)
    history.append([i, total_reward, epsilon, alpha, j])

    # progress report every 100 episodes: mean reward over the last 100 of them
    if i % 100 == 0:
        print(' Epoch:',i,',Average R:',np.mean(np.asarray(history)[-100:,1]),',Epsilon:', epsilon)

In [None]:
# Plot the training history as four stacked panels sharing the episode axis.
# Column 0 of each history row is the episode counter, so the x-axis is
# labelled 'episode' (it is not an environment step index).
h = np.asarray(history)

# (history column, y-axis label) for each panel, top to bottom
panels = [(1, 'reward'), (2, 'epsilon'), (3, 'alpha'), (4, 'length ep')]

fig, axes = plt.subplots(len(panels), 1, sharex=True)
for ax, (col, label) in zip(axes, panels):
    ax.plot(h[:, 0], h[:, col], '-')
    ax.set_ylabel(label)
axes[-1].set_xlabel('episode')  # shared x-axis: label only the bottom panel

fig.tight_layout()
plt.show()

In [None]:
# Evaluate the policy held in the live session `sess`: act greedily (no
# exploration) for one rendered episode of at most episode_lenght steps.
s = env.reset()
total_reward = 0
j = 0

while j < episode_lenght:
    j += 1
    q = sess.run(myAgent.output,feed_dict={myAgent.input:[s]})
    action = np.argmax(q)
    s_new,r,d,_ = env.step(action)
    total_reward += r
    s = s_new
    env.render()
    print("step:",j,",r:",r)
    if d:
        # The episode is over: stop, as the training loop above does. Stepping
        # an environment after it reported done is not a valid rollout and
        # would inflate the reported total reward.
        break
print("total reward:", total_reward)

## HW3 part B by Denis Osipychev
### Training my DQN on Acrobot

In [1]:
import myenv
# NOTE(review): myenv is presumably the local module shipped with the homework;
# uncomment the next line to list what it exports.
#dir(myenv)

# Replace the environment with the Acrobot one for part B and reset it.
# env.reset() is the last expression of the cell, so its return value is echoed below.
env = myenv.AcrobotEnv()
env.reset()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


array([ 4.05531235,  2.13833491, -1.29839659,  1.83169234])

In [2]:
# Hyperparameters for the Acrobot run, grouped by what they control.

# -- network / optimiser
n_neurons = 100         # width of the hidden layer of the Q-network
alpha = 0.001           # learning rate handed to the Adam optimiser

# -- Q-learning
gamma = 0.99            # discount applied to the max next-state Q-value
epsilon = 0.7           # initial exploration probability; decayed once per episode

# -- replay memory
batch_size = 32         # transitions drawn per gradient step
buffer_size = 10000     # capacity of the replay buffer

# -- run length
episode_lenght = 100    # number of env steps per episode
n_episodes = 10000      # number of training episodes

In [8]:
# Build a fresh graph holding the Q-network, and create the replay buffer.
tf.reset_default_graph()  # start from an empty default graph

# replay buffer and the per-episode linear decay step for epsilon
buffer = ReplayMemory(buffer_size)
epsilon_reduction = epsilon / n_episodes

# network sized from the environment: state dimension in, one Q-value per action out
myAgent = Qnetwork(input_size=env.observation_space.shape[0],
                   layer_size=n_neurons,
                   output_size=env.action_space.n)
merged = tf.summary.merge_all()

In [9]:
# Open a session, then either resume from the newest checkpoint under ./model
# or start from freshly initialised variables.
sess = tf.Session()

# The saver is created before the graph is written to ./logs, as in the
# original cell, so the logged graph is the same.
saver = tf.train.Saver(tf.global_variables())
writer = tf.summary.FileWriter('./logs', sess.graph)

ckpt = tf.train.get_checkpoint_state('./model')
if not (ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path)):
    # nothing to resume from: random initial weights
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")
else:
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Load Model : ", ckpt.model_checkpoint_path)

INFO:tensorflow:Restoring parameters from ./model/policy.ckpt
Load Model :  ./model/policy.ckpt


In [None]:
# Train the agent: one outer iteration per episode, one inner iteration per env step.
history = []  # one row per episode: [episode, total reward, epsilon, alpha, steps taken]
i = 0

for i in range(1, n_episodes + 1):

    s = env.reset()
    total_reward = 0
    j = 0
    d = False

    # Roll out one episode. There is no early exit on `d` in this loop: every
    # episode runs for the full episode_lenght steps.
    for j in range(1, episode_lenght + 1):

        # act epsilon-greedily on the Q-values predicted for the current state
        q = sess.run(myAgent.output, feed_dict={myAgent.input: [s]})
        action = greedy_policy(q)
        s_new, r, d, _ = env.step(action)

        # store the transition; the action is kept as a one-hot vector so it can
        # be fed straight into the network's action mask
        action_one_hot = np.zeros(env.action_space.n)
        action_one_hot[action] = 1
        buffer.push([s, action_one_hot, r, d, s_new])

        total_reward += r
        s = s_new

        # one gradient step on a random minibatch, once the buffer holds more
        # than batch_size transitions
        if buffer.size() > batch_size:
            st, at, rt, dt, snewt = zip(*buffer.sample(batch_size))
            qt = sess.run(myAgent.output, feed_dict={myAgent.input: snewt})
            # target: the reward alone for transitions flagged done, otherwise
            # the reward plus the discounted best Q-value of the next state
            yt = [rt[k] if dt[k] else rt[k] + gamma*np.max(qt[k])
                  for k in range(batch_size)]

            _ = sess.run(myAgent.update_model,
                         feed_dict={myAgent.input: st,
                                    myAgent.action_holder: at,
                                    myAgent.new_q: yt})

    # linear exploration decay with a floor of 0.01
    epsilon = max(0.01, epsilon - epsilon_reduction)
    history.append([i, total_reward, epsilon, alpha, j])

    # every 100 episodes: report the mean reward of the last 100 and checkpoint the model
    if i % 100 == 0:
        print(' Epoch:',i,',Average R:',np.mean(np.asarray(history)[-100:,1]),',Epsilon:', epsilon)
        saver.save(sess, './model/policy.ckpt')

 Epoch: 100 ,Average R: 0.32 ,Epsilon: 0.6929999999999985
 Epoch: 200 ,Average R: 0.15 ,Epsilon: 0.6859999999999971
 Epoch: 300 ,Average R: 0.36 ,Epsilon: 0.6789999999999956


In [None]:
# Plot the training history as four stacked panels sharing the episode axis.
# Column 0 of each history row is the episode counter, so the x-axis is
# labelled 'episode' (it is not an environment step index).
h = np.asarray(history)

# (history column, y-axis label) for each panel, top to bottom
panels = [(1, 'reward'), (2, 'epsilon'), (3, 'alpha'), (4, 'length ep')]

fig, axes = plt.subplots(len(panels), 1, sharex=True)
for ax, (col, label) in zip(axes, panels):
    ax.plot(h[:, 0], h[:, col], '-')
    ax.set_ylabel(label)
axes[-1].set_xlabel('episode')  # shared x-axis: label only the bottom panel

fig.tight_layout()
plt.show()

In [None]:
# Evaluate the policy held in the live session `sess`: act greedily (no
# exploration) and render every step. As in this part's training loop, the
# `d` flag is not used to stop early, so exactly episode_lenght steps are run.
s = env.reset()
total_reward = 0
j = 0

for j in range(1, episode_lenght + 1):
    # greedy action for the current state
    q = sess.run(myAgent.output, feed_dict={myAgent.input: [s]})
    action = np.argmax(q)

    s_new, r, d, _ = env.step(action)
    total_reward += r
    s = s_new

    env.render()
    print("step:",j,",r:",r)

print("total reward:", total_reward)