In [1]:
import gym
import numpy as np
from keras.layers import Dense,Activation
from keras.models import Sequential

import random

Using TensorFlow backend.


In [2]:
# remember -> saving the bot experience
# replay - > tranining on minibatches
# get action -> get the next action
# model -> neural network for computing the q value

# parameters -> obsrvation_size, action_size

class Game:
    
    def __init__(self,observation_space,action_space):
        
        self.observation_space = observation_space
        self.action_space = action_space
        self.gamma = 0.6
        self.learning_rate = 0.01
        self.memory = []
        self.model = self.dqn_model()
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        
    
    def dqn_model(self):
        
        model = Sequential()
        model.add(Dense(24,input_dim=self.observation_space,activation='relu'))
        model.add(Dense(24,activation='relu'))
        model.add(Dense(self.action_space,activation='linear'))
        model.compile(loss='mse',optimizer='adam')
        
        return model
    
    def action(self,state):
        # epsilon greedy approach
        if(np.random.rand() < self.epsilon):
            return random.randrange(self.action_space)
        
        act = self.model.predict(state)
        return np.argmax(act[0])
    
    
    def remember(self,state,action,reward,next_state,done):
        self.memory.append([state,action,reward,next_state,done])
        
    
    def replay(self,batch_size):
        minibatch = random.sample(self.memory,batch_size)
        for state,action,reward,next_state,done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma*(np.max(self.model.predict(next_state)[0]))
                
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay
        

In [3]:
env = gym.make('CartPole-v0')
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
agent = Game(observation_space,action_space)

for episodes in range(100):
    
    state = env.reset()
    state = np.reshape(state,[1,4])
    # env.render()
    
    for steps in range(200):
        
        env.render()
        action = agent.action(state)
        next_state,reward,done,info = env.step(action)
        next_state = np.reshape(next_state,[1,4])
        
        agent.remember(state,action,reward,next_state,done)
        
        state = next_state
        
        if done:
            print('steps : {}'.format(steps))
            break
    
    # train the agent based on the experience
    agent.replay(10)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
steps : 13
steps : 19
steps : 14
steps : 10
steps : 68
steps : 22
steps : 66
steps : 102
steps : 46
steps : 49
steps : 52
steps : 70
steps : 31
steps : 116
steps : 45
steps : 51
steps : 30
steps : 70
steps : 52
steps : 47
steps : 87
steps : 146
steps : 101
steps : 54
steps : 69
steps : 35
steps : 86
steps : 47
steps : 25
steps : 10
steps : 9
steps : 13
steps : 9
steps : 24
steps : 10
steps : 8
steps : 7
steps : 12
steps : 8
steps : 7
steps : 14
steps : 8
steps : 73
steps : 67
steps : 10
steps : 8
steps : 8
steps : 9
steps : 8
steps : 9
steps : 9
steps : 8
steps : 10
steps : 8
steps : 8
steps : 9
steps : 8
steps : 10
steps : 8
steps : 8
steps : 9
steps : 11
steps : 8
steps : 8
steps : 9
steps : 9
steps : 7
steps : 8
steps : 9
steps : 9
steps : 9
steps : 8
steps : 8
steps : 9
steps : 10
steps : 9
steps : 10
steps : 8
steps : 7
steps : 9
steps : 9
steps : 8
steps : 8
steps : 8
steps

In [4]:
env.close()

In [7]:
model = Sequential()
model.add(Dense(24,input_dim=4,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(2,activation='linear'))
model.summary()
# model.compile(loss='mse',optimizer='adam')

# x=np.array([[1,2,3,4],[2,3,4,5]])
# y=np.array([[0,1],[1,0]])
# model.fit(x,y,epochs=1)
# model.predict(np.array([[2,4,5,1]]))
# model

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_8 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_9 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


In [None]:
epsilon-greedy

