In [1]:
import gym

In [2]:
#create an environment
env = gym.make('CartPole-v0') # name of game
    

In [3]:
env.reset() # gives all the variables associated with the environment 
            # for e.g. in cartpole game this could be location of cart, velocity of cart, angular velocity of pole, linear velocity
    # hence this array contains 4 values as 4 attributes define our environment

array([ 0.00963219, -0.01569452,  0.00182675,  0.03424951])

In [4]:
env.action_space # dicrete action space showing we can have only two action 
                 # one is move left and other one is move right

Discrete(2)

In [5]:
env.action_space.n #gives total number of possible actions

2

In [6]:
env.observation_space

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)

In [7]:
env.observation_space.shape[0]

4

In [23]:
env.reset()
for t in range(1000):
    random_Action = env.action_space.sample()
    env.step(random_Action) #action space contains all the possible actions,.sample picks one of those actions randomly
    env.render()
env.close()   
#env.step() executes a particular action



In [24]:
for e in range(20): #we will play 20 episodes
    observation = env.reset()
    for t in range(50):
        env.render()
        action = env.action_space.sample()
        observation,reward,done,other_info = env.step(action)
        
        if done: #this means game episode is over
            print("game episode :{}/{} high score:{}".format(e,20,t))
            break
        
env.close()
print("all 20 episodes are over")

game episode :0/20 high score:14
game episode :1/20 high score:18
game episode :2/20 high score:18
game episode :3/20 high score:25
game episode :4/20 high score:12
game episode :5/20 high score:20
game episode :6/20 high score:12
game episode :7/20 high score:10
game episode :8/20 high score:10
game episode :9/20 high score:12
game episode :10/20 high score:12
game episode :11/20 high score:21
game episode :12/20 high score:16
game episode :13/20 high score:23
game episode :14/20 high score:18
game episode :15/20 high score:11
game episode :16/20 high score:12
game episode :17/20 high score:30
game episode :18/20 high score:15
game episode :19/20 high score:19
all 20 episodes are over


In [7]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random
%matplotlib inline


        

In [8]:
class Agent:
    def __init__(self,state_size,action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95 #Discount factor
        #exploration vs exploitation tradeoff
        self.epsilon = 1.0 #100% random exploration in the beginning no knowlegde of the environment(Epsilon Greedy Method)
        self.epsion_decay = 0.995
        self.epsion_min = 0.01 #with 1% of prob I am going to take a random step at the end
        self.learning_rate = 0.001
        self.model = self.create_model()
        
    def create_model(self):
        model = Sequential()
        model.add(Dense(24,input_dim=self.state_size,activation='relu'))#first layer,input dimension = 4 as we have 4 variables to define env. state
        model.add(Dense(24,activation='relu'))#second layer
        model.add(Dense(self.action_size,activation='linear'))#output layer has 2(self.sction_size) neurons as we have two movements to define, one left and one right. 
        model.compile(loss='mse',optimizer=Adam(lr=0.001))
        return model    
        
    def remember(self,state,action,reward,next_state,done):
    #remember past experience
        self.memory.append((state,action,reward,next_state,done))

    def act(self,state):
    #sampling according to epsilon greedy method
        if np.random.rand()<=self.epsilon:
        #take a random action
           return random.randrange(self.action_size)
    #ask neural network to give me the most suitable action
        return np.argmax(self.model.predict(state)[0]) # which is the best reward out of available 2 rewards hence we use argmax
    

    
    def train(self,batch_size=32):
    #training using a 'replay buffer'
        minibatch = random.sample(self.memory,batch_size)#each mini batch is of 32 batch size we are feeding each example of that 32 examples to our neural network
        for experience in minibatch:
            state,action,reward,next_state,done = experience
        #X,y : state,expected reward(using bellman equation)
            if not done:
                
            
               target = reward + self.gamma*np.amax(self.model.predict(next_state)[0]) # to predict rewards we would get in the next state we use neural network
            else: #if game is not over
                target = reward #final reward we get in last stage
        
            target_f = self.model.predict(state)
            target_f[0][action] = target
        #x= state, y = target_f
            self.model.fit(state,target_f,epochs=1,verbose = 0)
            if self.epsilon > self.epsion_min:
                self.epsilon *= self.epsion_decay  # if it is more than min then decay the epsilon    

    def load(self,name):
        self.model.load_weights(name)
    
    
    def save(self,name):
        self.model.save_weights(name)                

In [9]:
#x = np.random.rand(1,4) #we are giving 1 vector with 4 columns signifing 4 attributes randomly or a particular state
#model.predict(x) #this gives us what action we should take, left or right
#this is fed to a neural network which is not trained yet
# here 1 in x specifies batch size

In [10]:
#training deep q agent
n_episodes = 1000 # no of times we want to play the game
output_dir = "cartpole_model/"

In [11]:
agent = Agent(state_size=4,action_size=2) # define our agent class
done = False
state_size = 4
action_size = 2

In [None]:
for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state,[1,state_size])
    batch_size = 32
    
    for t in range(500): #each episode would go for 500 sec
        env.render()
        action = agent.act(state) # action would be 0 or 1 / left or right
        next_state,reward,done,other_info= env.step(action)
        reward = reward if not done else -10
        next_state = np.reshape(next_state,[1,state_size])
        agent.remember(state,action,reward,next_state,done)#experience for the agent
    
        if done:
            print("game episode:{}/{} high score:{} exploration rate:{:.2}".format(e,20,t,agent.epsilon))
            break
    if len(agent.memory)>batch_size:#no of experiences in the memory should be greater than some threshold then only it is profitable to use then for exploitation
        agent.train(batch_size)       #this threshold must always be equal to batch_size
                                              #we are creating a batch during training but in real we are feeding single example to neural network as we are using sgd
    if e%50==0:
        agent.save(output_dir+"weights_"+'{:04d}'.format(e)+".hdf5") #after each 50 epoch I would save my output
print("deep q network model trained")
env.close()


game episode:0/20 high score:17 exploration rate:1.0
game episode:1/20 high score:17 exploration rate:1.0
game episode:2/20 high score:20 exploration rate:0.85
game episode:3/20 high score:21 exploration rate:0.73
game episode:4/20 high score:18 exploration rate:0.62
game episode:5/20 high score:7 exploration rate:0.53
game episode:6/20 high score:8 exploration rate:0.45
game episode:7/20 high score:11 exploration rate:0.38
game episode:8/20 high score:10 exploration rate:0.33
game episode:9/20 high score:15 exploration rate:0.28
game episode:10/20 high score:11 exploration rate:0.24
game episode:11/20 high score:8 exploration rate:0.2
game episode:12/20 high score:8 exploration rate:0.17
game episode:13/20 high score:8 exploration rate:0.15
game episode:14/20 high score:9 exploration rate:0.12
game episode:15/20 high score:8 exploration rate:0.11
game episode:16/20 high score:7 exploration rate:0.09
game episode:17/20 high score:8 exploration rate:0.077
game episode:18/20 high score:7