[View in Colaboratory](https://colab.research.google.com/github/ZER-0-NE/Reinforcement-Learning_problems/blob/master/cartpole_using_DQN_py.ipynb)

In [0]:
# This script makes use of OpenAI gym to train on the cartpole game.
# Description of Game:

# A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. 
# The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, 
# and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that 
# the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or 
# the cart moves more than 2.4 units from the center.
# source : https://gym.openai.com/envs/CartPole-v1/


# Parameters:

# episodes - a number of games we want the agent to play.
# gamma - aka decay or discount rate, to calculate the future discounted reward.
# epsilon - aka exploration rate, this is the rate in which an agent randomly decides its action rather than prediction.
# epsilon_decay - we want to decrease the number of explorations as it gets good at playing games.
# epsilon_min - we want the agent to explore at least this amount.
# learning_rate - Determines how much neural net learns in each iteration.


In [2]:
!pip install gym
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Collecting gym
[?25l  Downloading https://files.pythonhosted.org/packages/9b/50/ed4a03d2be47ffd043be2ee514f329ce45d98a30fe2d1b9c61dea5a9d861/gym-0.10.5.tar.gz (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 8.0MB/s 
Collecting pyglet>=1.2.0 (from gym)
[?25l  Downloading https://files.pythonhosted.org/packages/1c/fc/dad5eaaab68f0c21e2f906a94ddb98175662cc5a654eee404d59554ce0fa/pyglet-1.3.2-py2.py3-none-any.whl (1.0MB)
[K    100% |████████████████████████████████| 1.0MB 10.2MB/s 
Building wheels for collected packages: gym
  Running setup.py bdist_wheel for gym ... [?25l- \ | / done
[?25h  Stored in directory: /root/.cache/pip/wheels/cb/14/71/f4ab006b1e6ff75c2b54985c2f98d0644fffe9c1dddc670925
Successfully built gym
Installing collected packages: pyglet, gym
Successfully installed gym-0.10.5 pyglet-1.3.2


Using TensorFlow backend.


In [0]:
### Inspired from the post of keon.io/deep-q-learning/

Episodes = 1000

'''
By defining memory, we make sure that the state,action.reward and next_state
are remembered, as the neural network in DQN tends to forget them after each 
iteration.
'''

class DQNAgent:
  def __init__(self, state_size, action_size):
    self.state_size = state_size
    self.action_size = action_size
    self.memory = deque(maxlen=2000)
    self.gamma = 0.95 # discount rate
    self.epsilon = 1.0 #exploration rate
    self.epsilon_min = 0.01
    self.epsilon_decay = 0.995
    self.learning_rate = 0.001 
    self.model = self._build_model()
    
    
  def _build_model(self):
    '''
    Neural Network for DQN
    '''
    model = Sequential()
    model.add(Dense(24, input_dim = self.state_size, activation = 'relu'))
    model.add(Dense(24, activation = 'relu'))
    model.add(Dense(self.action_size, activation='linear'))
    
    model.compile(loss = 'mse', optimizer = Adam(lr = self.learning_rate))
    
    return model
  
  def remember(self, state, action, reward, next_state, done):
    '''
    Keep appending the memory
    '''
    self.memory.append((state, action, reward, next_state, done))
    
  def act(self, state):
    '''
    The agent will select at first it's action at random because 
    it is better for the agent to try all kinds of things before 
    it starts to see the patterns. 
    '''
    if np.random.rand() <= self.epsilon:
      return random.randrange(self.action_size)
    act_values = self.model.predict(state)
    return np.argmax(act_values[0]) # argmax picks the highest value among
    # the two values in act_values eg [0.67, 0.04]
  
  def replay(self, batch_size):
    '''
    Trains the neural net with experience in the memory.
    We need to maximise the rewards in the long run, so we define gamma/discount
    rate through which the agent will learn to maximise the discounted future 
    award in the long run.
    '''
    minibatch = random.sample(self.memory, batch_size)
    for state, action, reward, next_state, done in minibatch:
      target = reward # if done
      
      if not done:
        target = (reward + self.gamma * np.amax(
            self.model.predict(next_state)[0]))
        target_f = self.model.predict(state)
        target_f[0][action] = target
        self.model.fit(state, target_f, epochs = 1, verbose = 0)
      if self.epsilon > self.epsilon_min:
        self.epsilon *= self.epsilon_decay
        
  def load(self, name):
    self.model.load_weghts(name)
    
  def save(self, name):
    self.model.save_weights(name)
    
  

In [10]:
if __name__ == "__main__":
  env = gym.make('CartPole-v1')
  state_size = env.observation_space.shape[0]
  action_size = env.action_space.n
  agent = DQNAgent(state_size, action_size)
  
  done = False
  batch_size = 32
  
  for a in range(Episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(500):
      
      action = agent.act(state)
      next_state, reward, done, _ = env.step(action)
      reward = reward if not done else -10
      next_state = np.reshape(next_state, [1,state_size])
      agent.remember(state, action, reward, next_state, done)
      state = next_state
      if done:
        print("Episode: {}/{}, score: {}, a : {:.2}"
             .format(a, Episodes, time, agent.epsilon))
        break
      if len(agent.memory) > batch_size:
        agent.replay(batch_size)
      

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode: 0/1000, score: 15, a : 1.0
Episode: 1/1000, score: 17, a : 0.85
Episode: 2/1000, score: 11, a : 0.15
Episode: 3/1000, score: 8, a : 0.04
Episode: 4/1000, score: 9, a : 0.01
Episode: 5/1000, score: 8, a : 0.01
Episode: 6/1000, score: 9, a : 0.01
Episode: 7/1000, score: 9, a : 0.01
Episode: 8/1000, score: 8, a : 0.01
Episode: 9/1000, score: 8, a : 0.01
Episode: 10/1000, score: 9, a : 0.01
Episode: 11/1000, score: 8, a : 0.01
Episode: 12/1000, score: 7, a : 0.01
Episode: 13/1000, score: 8, a : 0.01
Episode: 14/1000, score: 8, a : 0.01
Episode: 15/1000, score: 7, a : 0.01
Episode: 16/1000, score: 9, a : 0.01
Episode: 17/1000, score: 8, a : 0.01
Episode: 18/1000, score: 8, a : 0.01
Episode: 19/1000, score: 8, a : 0.01
Episode: 20/1000, score: 8, a : 0.01
Episode: 21/1000, score: 9, a : 0.01
Episode: 22/1000, score: 8, a : 0.01
Episode: 23/1000, score: 9, a : 0.01
Episode: 24/

KeyboardInterrupt: ignored

In [11]:
# -*- coding: utf-8 -*-
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

EPISODES = 1000

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
        # if e % 10 == 0:
        #     agent.save("./save/cartpole-dqn.h5")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
episode: 0/1000, score: 17, e: 1.0
episode: 1/1000, score: 20, e: 0.97
episode: 2/1000, score: 35, e: 0.81
episode: 3/1000, score: 23, e: 0.73
episode: 4/1000, score: 16, e: 0.67
episode: 5/1000, score: 13, e: 0.63
episode: 6/1000, score: 11, e: 0.59
episode: 7/1000, score: 14, e: 0.55
episode: 8/1000, score: 13, e: 0.52
episode: 9/1000, score: 9, e: 0.5
episode: 10/1000, score: 15, e: 0.46
episode: 11/1000, score: 22, e: 0.41
episode: 12/1000, score: 18, e: 0.38
episode: 13/1000, score: 13, e: 0.35
episode: 14/1000, score: 7, e: 0.34
episode: 15/1000, score: 11, e: 0.32
episode: 16/1000, score: 9, e: 0.31
episode: 17/1000, score: 11, e: 0.29
episode: 18/1000, score: 8, e: 0.28
episode: 19/1000, score: 10, e: 0.27
episode: 20/1000, score: 17, e: 0.24
episode: 21/1000, score: 9, e: 0.23
episode: 22/1000, score: 9, e: 0.22
episode: 23/1000, score: 7, e: 0.22
episode: 24/1000, score

KeyboardInterrupt: ignored