In [1]:
import gym

In [2]:
# Create the CartPole-v0 environment; the cells below all operate on this shared `env` object.
env = gym.make('CartPole-v0')

In [3]:
# Smoke test: drive the environment with uniformly random actions for 1000
# rendered steps. An episode ends as soon as `done` is returned, so the
# environment is reset at that point instead of being stepped past its end.
env.reset()
try:
    for step_index in range(1000):
        env.render()
        # Classic gym API: step() returns (observation, reward, done, info).
        _obs, _reward, episode_over, _info = env.step(env.action_space.sample())
        if episode_over:
            env.reset()
finally:
    # Always release the render window, even if a step raises.
    env.close()



In [10]:
# Show the action space, the observation space and the reward range of the
# environment, one per line and in that order.
for env_property in (env.action_space, env.observation_space, env.reward_range):
    print(env_property)

Discrete(2)
Box(4,)
(-inf, inf)


## Episodes

In [35]:
# Play 100 episodes with random actions, each capped at 100 steps. When an
# episode terminates, report the step index at which it ended.
for episode in range(100):   # number of episodes
    env.reset()
    step = 0
    while step < 100:
        random_action = env.action_space.sample()
        observation, reward, done, info = env.step(random_action)
        env.render()
        if done:
            print("Episode {}/{} :: High Score - {}".format(episode, 100, step))
            break
        step += 1
env.close()

Episode 0/100 :: High Score - 36
Episode 1/100 :: High Score - 13
Episode 2/100 :: High Score - 25
Episode 3/100 :: High Score - 17
Episode 4/100 :: High Score - 14
Episode 5/100 :: High Score - 15
Episode 6/100 :: High Score - 10
Episode 7/100 :: High Score - 10
Episode 8/100 :: High Score - 35
Episode 9/100 :: High Score - 21
Episode 10/100 :: High Score - 16
Episode 11/100 :: High Score - 17
Episode 12/100 :: High Score - 11
Episode 13/100 :: High Score - 16
Episode 14/100 :: High Score - 14
Episode 15/100 :: High Score - 9
Episode 16/100 :: High Score - 20
Episode 17/100 :: High Score - 29
Episode 18/100 :: High Score - 16
Episode 19/100 :: High Score - 17
Episode 20/100 :: High Score - 14
Episode 21/100 :: High Score - 13
Episode 22/100 :: High Score - 13
Episode 23/100 :: High Score - 17
Episode 24/100 :: High Score - 36
Episode 25/100 :: High Score - 11
Episode 26/100 :: High Score - 13
Episode 27/100 :: High Score - 27
Episode 28/100 :: High Score - 18
Episode 29/100 :: High Sc

In [6]:
from collections import deque
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
import numpy as np
import random
import os

## Agent Class

In [12]:
class Agent:
    """Deep Q-learning agent with an epsilon-greedy policy and experience replay.

    The agent stores (state, action, reward, next_state, done) tuples in a
    bounded replay memory and fits a small dense network to the one-step
    Q-learning target on randomly sampled experiences.
    """

    def __init__(self,state_size,action_size):
        """Set up hyper-parameters, the replay memory and the Q-network.

        Args:
            state_size: number of features in one observation (network input width).
            action_size: number of discrete actions (network output width).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # bounded buffer: oldest experiences fall off first
        self.epsilon = 1.0                # probability of picking a random action
        self.epsilon_decay = 0.995        # multiplicative decay applied after each train() call
        self.epsilon_min = 0.1            # decay never takes epsilon below this floor
        self.gamma = 0.95                 # discount factor
        self.model = self._create_model()

    def _create_model(self):
        """Build and compile the network that approximates the Q-function.

        Returns:
            A compiled Sequential model mapping a state to one value per action.
        """
        model = Sequential()
        model.add(Dense(24,input_dim=self.state_size,activation='relu'))
        model.add(Dense(48,activation='tanh'))
        # Linear output: Q-values are unbounded regression targets.
        model.add(Dense(self.action_size,activation='linear'))
        model.compile(optimizer=Adam(lr=0.001),loss='mse')
        return model

    def remember(self,state,action,reward,next_state,done):
        """Append one transition to the replay memory."""
        self.memory.append((state,action,reward,next_state,done))

    def act(self,state):
        """Choose an action for `state` using the epsilon-greedy rule.

        With probability `epsilon` a uniformly random action is returned;
        otherwise the action with the highest predicted value is returned.
        """
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def train(self,batch_size=32):
        """Fit the network on a random sample of remembered transitions.

        Args:
            batch_size: maximum number of transitions to replay. If the memory
                holds fewer, all of them are replayed; if it is empty the call
                is a no-op (no fitting and no epsilon decay).
        """
        # random.sample raises ValueError when asked for more items than exist,
        # so never request more than the memory currently holds.
        sample_size = min(batch_size, len(self.memory))
        if sample_size == 0:
            return
        minibatch = random.sample(self.memory, sample_size)

        for state,action,reward,next_state,done in minibatch:
            # Terminal transitions have no future value to bootstrap from.
            target = reward
            if not done:
                target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])

            # Only the taken action's value is moved towards the target; the
            # other outputs keep their current predictions.
            target_f = self.model.predict(state)
            target_f[0][action] = target

            self.model.fit(state,target_f,epochs=1,verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so the final decay step cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self,name):
        """Load network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self,name):
        """Save network weights to the file `name`."""
        self.model.save_weights(name)

In [14]:
# Train the DQN agent on CartPole: play one episode at a time, store every
# transition in the agent's replay memory, replay one minibatch after each
# episode, and checkpoint the weights every 50 episodes.
n_episodes = 1000
output_dir = "cartpole_model/"
# save_weights needs the target directory to exist; create it up front so the
# very first checkpoint (episode 0) does not fail.
os.makedirs(output_dir, exist_ok=True)

# Read the sizes from the environment instead of hard-coding them
# (CartPole-v0 reports Box(4,) observations and Discrete(2) actions).
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
batch_size = 32

agent = Agent(state_size, action_size) # initialise agent
done = False
try:
    for e in range(n_episodes):
        state = env.reset()
        # The network consumes batches, so give the state a leading batch axis.
        state = np.reshape(state,[1,state_size])

        for step in range(5000):
            env.render()
            action = agent.act(state) # action is 0 or 1
            next_state,reward,done,other_info = env.step(action)
            # Replace the reward of the episode-ending step with a penalty.
            reward = reward if not done else -10
            next_state = np.reshape(next_state,[1,state_size])
            agent.remember(state,action,reward,next_state,done)
            state = next_state

            if done:
                print("Game Episode :{}/{}, High Score:{},Exploration Rate:{:.2}".format(e,n_episodes,step,agent.epsilon))
                break

        # Start replaying only once the memory holds more than one batch.
        if len(agent.memory)>batch_size:
            agent.train(batch_size)

        if e%50==0:
            agent.save(os.path.join(output_dir, "weights_"+'{:04d}'.format(e)+".hdf5"))
finally:
    # Release the render window even if training is interrupted or fails.
    env.close()

Game Episode :0/100, High Score:17,Exploration Rate:1.0
Game Episode :1/100, High Score:14,Exploration Rate:1.0
Game Episode :2/100, High Score:22,Exploration Rate:0.99
Game Episode :3/100, High Score:18,Exploration Rate:0.99
Game Episode :4/100, High Score:48,Exploration Rate:0.99
Game Episode :5/100, High Score:34,Exploration Rate:0.98
Game Episode :6/100, High Score:14,Exploration Rate:0.98
Game Episode :7/100, High Score:13,Exploration Rate:0.97
Game Episode :8/100, High Score:21,Exploration Rate:0.97
Game Episode :9/100, High Score:17,Exploration Rate:0.96
Game Episode :10/100, High Score:11,Exploration Rate:0.96
Game Episode :11/100, High Score:15,Exploration Rate:0.95
Game Episode :12/100, High Score:11,Exploration Rate:0.95
Game Episode :13/100, High Score:11,Exploration Rate:0.94
Game Episode :14/100, High Score:10,Exploration Rate:0.94
Game Episode :15/100, High Score:15,Exploration Rate:0.93
Game Episode :16/100, High Score:10,Exploration Rate:0.93
Game Episode :17/100, High