# Deep RL Project



In [None]:
%tensorflow_version 1.x

In [2]:

import gym
import random
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output
import tensorflow as tf
import matplotlib.pyplot as plt


# Initialising the environment


In [4]:
# Create the Gym environment shared by the agents defined below and print
# its observation and action spaces.
env_name = 'FrozenLake-v0'
env = gym.make(env_name)

print("observation space: ", env.observation_space)
print("action space: ", env.action_space)

observation space:  Discrete(16)
action space:  Discrete(4)


# Defining a random agent


In [5]:
class Agent():
  """Baseline agent that picks actions uniformly at random.

  For a discrete action space an action is an integer in
  ``range(action_size)``. For any other space the agent reads ``low``,
  ``high`` and ``shape`` from the space and samples uniformly between the
  bounds.
  """

  def __init__(self, env):
    """Read the action-space description from ``env`` and print it.

    Args:
      env: environment exposing an ``action_space`` attribute.
    """
    # isinstance (instead of an exact type comparison) also recognises
    # subclasses of Discrete.
    self.is_discrete = isinstance(env.action_space, gym.spaces.Discrete)

    if self.is_discrete:
      self.action_size = env.action_space.n
      print("Action size: ", self.action_size)
    else:
      self.action_low = env.action_space.low
      self.action_high = env.action_space.high
      self.action_shape = env.action_space.shape
      print("Action range: ", self.action_low, self.action_high)

  def get_action(self, state):
    """Return a uniformly random action; ``state`` is ignored.

    Returns:
      An int in ``range(action_size)`` for discrete spaces, otherwise a
      NumPy array of shape ``action_shape`` drawn between the low and high
      bounds.
    """
    if self.is_discrete:
      return random.randrange(self.action_size)
    return np.random.uniform(self.action_low, self.action_high, self.action_shape)

    

# Defining a Q-learning agent

In [None]:
class QlAgent(Agent):
  """Tabular Q-learning agent with epsilon-greedy exploration.

  The Q-table has one row per state and one column per action. Epsilon
  starts at 1.0 and is multiplied by 0.99 each time an episode ends.
  """

  def __init__(self, env, discount_rate=0.9, learning_rate=0.01):
    """Set up hyper-parameters and the Q-table for ``env``.

    Args:
      env: environment with a discrete observation space (``.n`` is read).
      discount_rate: weight applied to the best next-state value.
      learning_rate: step size of each Q-table update.
    """
    super().__init__(env)
    self.state_size = env.observation_space.n
    print("State size: ", self.state_size)

    self.learning_rate = learning_rate
    self.discount_rate = discount_rate
    self.eps = 1.0
    self.build_model()

  def build_model(self):
    """Create the Q-table filled with small random values in [0, 1e-4)."""
    table_shape = (self.state_size, self.action_size)
    self.q_table = np.random.random(table_shape) * 1e-4

  def get_action(self, state):
    """Return a random action with probability ``eps``, else the greedy one."""
    best_action = np.argmax(self.q_table[state])
    # The random action is always drawn, even when the greedy one is used.
    exploratory_action = super().get_action(state)
    if random.random() < self.eps:
      return exploratory_action
    return best_action

  def train(self, experience):
    """Apply one Q-learning update.

    Args:
      experience: tuple ``(state, action, next_state, reward, done)``.
    """
    state, action, next_state, reward, done = experience

    next_values = self.q_table[next_state]
    # A terminal transition has no future value to bootstrap from.
    best_next = 0.0 if done else np.max(next_values)
    td_target = reward + self.discount_rate * best_next

    td_error = td_target - self.q_table[state, action]
    self.q_table[state, action] += self.learning_rate * td_error

    if done:
      self.eps *= 0.99


agent = QlAgent(env)

## Training

In [None]:
# Train `agent` for 1000 episodes, printing and rendering after every step.
# total_reward is never reset inside the loop, so it (and every entry
# appended to rewards_his) is the running total over all episodes so far,
# not a per-episode return.
rewards_his=[]
total_reward = 0
for episode in range(1000):
  state = env.reset()
  done = False
  while not done:
    action = agent.get_action(state)
    next_state, reward, done, info = env.step(action)
    # Tuple order must match the unpacking in the agent's train():
    # (state, action, next_state, reward, done).
    agent.train((state, action, next_state, reward, done))
    state = next_state
    total_reward += reward

    print('s: ', state, 'a: ', action)
    print('episode: {}, total reward:{}, epsilon: {}'.format(episode, total_reward,agent.eps))

    env.render()
    

    print(agent.q_table)
    # Optional delay to slow the display down (disabled).
    #time.sleep(0.05)
    # wait=True defers clearing until new output is available.
    clear_output(wait=True)
  rewards_his.append(total_reward)


In [None]:
# Plot the reward history recorded by the training loop, then print its mean.
# NOTE(review): rewards_his stores the running total across episodes
# (total_reward is never reset), so the printed value is the mean of a
# cumulative curve, not an average per-episode return - confirm intent.
plt.figure(figsize=(15, 10))
plt.plot(rewards_his)
plt.ylabel('total rewards')
plt.show()

print(np.mean(rewards_his))

# Q-learning with a neural network


In [None]:
class QlNNAgent(Agent):
  """Q-learning agent whose Q-function is a single dense layer.

  States are fed as one-hot vectors into one dense layer named 'q_table'
  with ``action_size`` outputs. Epsilon starts at 1.0 and is multiplied by
  ``eps_decay`` at the end of every episode, never dropping below 0.0001.
  """

  def __init__(self, env, discount_rate=0.99, learning_rate=0.01, eps_decay=0.99):
    """Build the network and open a TensorFlow session for it.

    Args:
      env: environment with a discrete observation space (``.n`` is read).
      discount_rate: weight applied to the best next-state value.
      learning_rate: learning rate passed to the Adam optimizer.
      eps_decay: factor applied to epsilon when an episode ends. This was
        previously read from the global loop variable ``ed``.
    """
    super().__init__(env)
    self.state_size = env.observation_space.n
    print("State size: ", self.state_size)

    self.eps = 1.0
    self.eps_decay = eps_decay
    self.discount_rate = discount_rate
    self.learning_rate = learning_rate
    self.build_model()

    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())

  def build_model(self):
    """Reset the default graph and define the Q-network, loss and train op."""
    tf.reset_default_graph()

    # Every placeholder has shape [1]: a single transition per call.
    self.state_in = tf.placeholder(tf.int32, shape=[1])
    self.action_in = tf.placeholder(tf.int32, shape=[1])
    self.target_in = tf.placeholder(tf.float32, shape=[1])

    self.state = tf.one_hot(self.state_in, depth=self.state_size)
    self.action = tf.one_hot(self.action_in, depth=self.action_size)

    self.q_state = tf.layers.dense(self.state, units=self.action_size, name='q_table')
    # The one-hot action masks out every Q-value except the chosen one.
    self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)

    self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
    self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

  def get_action(self, state):
    """Return a random action with probability ``eps``, else the greedy one."""
    q_state = self.session.run(self.q_state, feed_dict={self.state_in: [state]})
    greedy_action = np.argmax(q_state)
    random_action = super().get_action(state)
    return random_action if random.random() < self.eps else greedy_action

  def train(self, experience):
    """Run one optimizer step towards the Q-learning target.

    Args:
      experience: tuple ``(state, action, next_state, reward, done)``.
    """
    state, action, next_state, reward, done = experience

    q_next = self.session.run(self.q_state, feed_dict={self.state_in: [next_state]})
    # A terminal transition has no future value to bootstrap from.
    best_next = 0.0 if done else np.max(q_next)
    q_target = reward + self.discount_rate * best_next

    feed = {
        self.state_in: [state],
        self.action_in: [action],
        self.target_in: [q_target],
    }
    self.session.run(self.optimizer, feed_dict=feed)

    if done:
      self.eps = max(0.0001, self.eps * self.eps_decay)

  def close(self):
    """Close the TensorFlow session if one was opened."""
    # __init__ may have failed before self.session was assigned.
    session = getattr(self, 'session', None)
    if session is not None:
      session.close()

  def __del__(self):
    self.close()


# Train one fresh agent per epsilon-decay value. r collects one reward
# history per entry of eds, in the same order.
eds = [0.97, 0.9, 0.8, 0.99]
r = []
agent = None
for ed in eds:
  # build_model() resets the default graph. TensorFlow documents doing so
  # while a session is active as undefined behavior, so the previous
  # agent's session is closed first.
  if agent is not None:
    agent.close()
  agent = QlNNAgent(env, eps_decay=ed)

  # The dense layer's weights live in scope 'q_table'. Look the variable up
  # once per agent instead of on every environment step.
  with tf.variable_scope('q_table', reuse=True):
    kernel = tf.get_variable("kernel")

  # total_reward is not reset per episode, so each rewards_his entry is the
  # running total over the episodes played so far with this agent.
  total_reward = 0
  rewards_his = []
  for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
      action = agent.get_action(state)
      next_state, reward, done, info = env.step(action)
      agent.train((state, action, next_state, reward, done))
      state = next_state
      total_reward += reward

      print('s: ', state, 'a: ', action)
      print('episode: {}, total reward:{}, epsilon: {}'.format(episode, total_reward,agent.eps))

      env.render()
      weights = agent.session.run(kernel)
      print(weights)

      clear_output(wait=True)
    rewards_his.append(total_reward)
  r.append(rewards_his)
  

In [None]:
import matplotlib.pyplot as plt

# Overlay one reward curve per epsilon-decay value; entries of `r` are
# matched to `eds` by position.
plt.figure(figsize=(15, 10))
for idx, history in enumerate(r):
    episodes = range(len(history))
    plt.plot(episodes, history, label=eds[idx])

plt.xlabel("Episode")
plt.ylabel("rewards")
plt.legend()