### Reinforcement Learning

#### Interacting with Gym API

In [1]:
import gym

In [2]:
# Creating Environment
env = gym.make('CartPole-v0')



**Methods & Attributes in Environment :**
- action_space
- observation_space
- reset() : returns init state and also resets the environment
- step()
- render()
- close()

In [3]:
env.reset()

array([-0.01056342,  0.01717044,  0.0404181 , -0.04790671])

In [4]:
env.action_space

Discrete(2)

In [5]:
env.observation_space

Box(4,)

In [6]:
for t in range(1000):
    random_action = env.action_space.sample()
    env.step(random_action) # Randomly move left or right
    env.render()
env.close()



#### Playing games with Random Strategy

In [7]:
for e in range(20):
    observation = env.reset()
    for t in range(50):
        env.render()
        action = env.action_space.sample()
        observation,reward,done,other_info = env.step(action)
        if done:
            print("Game Episode : {}/{} High score : {} ".format(e+1,20,t))
            break
env.close()
print('All Episodes are over')

Game Episode : 1/20 High score : 12 
Game Episode : 2/20 High score : 8 
Game Episode : 3/20 High score : 28 
Game Episode : 4/20 High score : 30 
Game Episode : 6/20 High score : 20 
Game Episode : 7/20 High score : 14 
Game Episode : 8/20 High score : 13 
Game Episode : 9/20 High score : 19 
Game Episode : 10/20 High score : 14 
Game Episode : 11/20 High score : 11 
Game Episode : 12/20 High score : 28 
Game Episode : 13/20 High score : 25 
Game Episode : 14/20 High score : 28 
Game Episode : 15/20 High score : 13 
Game Episode : 16/20 High score : 14 
Game Episode : 17/20 High score : 27 
Game Episode : 18/20 High score : 37 
Game Episode : 19/20 High score : 18 
Game Episode : 20/20 High score : 17 
All Episodes are over


#### Q-Learning
##### Designing an AI Agent

In [12]:
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random

In [9]:
class Agent:
    def __init__(self,state_size,action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95 # Discount Factor
        self.epsilon = 1.0 # Exploration Rate
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.001
        self.model = self._create_model()
    
    def _create_model(self):
        model = Sequential()
        model.add(Dense(24,input_dim=self.state_size,activation='relu'))
        model.add(Dense(24,activation='relu'))
        model.add(Dense(self.action_size,activation='linear'))
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        return model
    
    def remember(self,state,action,reward,next_state,done):
        self.memory.append((state,action,reward,next_state,done))
        
    def act(self,state):
        if np.random.rand()<=self.epsilon: # Sampling according to Epsilon Greedy Method
            return random.randrange(self.action_size) # Take random action
        act_values = self.model.predict(state)
        return np.argmax(act_values[0]) # Take action from neural network
    
    def train(self,batch_size=32): # Training using Replay Buffer
        minibatch = random.sample(self.memory,batch_size)
        for experience in minibatch:
            state,action,reward,next_state,done = experience
            if not done: # Game not over, Use Bellman Equation
                target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
            else:
                target = reward
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state,target_f,epochs=1,verbose=0)
            if self.epsilon > self.epsilon_min:           
                self.epsilon *= self.epsilon_decay
        
    def save(self,name): # To save the model
        self.model.save_weights(name)
    
    def load(self,name): # To load the saved model
        self.model.load_weights(name)

#### Training the DQN Agent (Deep Q-Learner)

In [10]:
n_episodes = 1000
output_dir = 'cartpole_model/'
agent = Agent(state_size=4,action_size=2)
done = False
state_size = 4
action_size = 2

In [11]:
for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state,[1,state_size])
    batch_size = 32
    for time in range(500):
        env.render()
        action = agent.act(state)
        next_state,reward,done,other_info = env.step(action)
        reward = reward if not done else -10
        next_state = np.reshape(next_state,[1,state_size])
        agent.remember(state,action,reward,next_state,done)
        state = next_state
        if done:
            if e%20==0:
                print("Game Episode : {}/{} High score : {} Exploration Rate : {:.2}".format(e,n_episodes,time,agent.epsilon))
            break
    if len(agent.memory)>batch_size:
        agent.train(batch_size)
    if e%50 == 0:
        agent.save(output_dir+'weights_'+'{:04d}'.format(e)+".hdf5")
env.close()

Game Episode : 0/1000 High score : 20 Exploration Rate : 1.0

Game Episode : 20/1000 High score : 27 Exploration Rate : 0.047
Game Episode : 40/1000 High score : 11 Exploration Rate : 0.01
Game Episode : 60/1000 High score : 26 Exploration Rate : 0.01
Game Episode : 80/1000 High score : 10 Exploration Rate : 0.01
Game Episode : 100/1000 High score : 9 Exploration Rate : 0.01
Game Episode : 120/1000 High score : 37 Exploration Rate : 0.01
Game Episode : 140/1000 High score : 28 Exploration Rate : 0.01
Game Episode : 160/1000 High score : 57 Exploration Rate : 0.01
Game Episode : 180/1000 High score : 38 Exploration Rate : 0.01
Game Episode : 200/1000 High score : 34 Exploration Rate : 0.01
Game Episode : 220/1000 High score : 45 Exploration Rate : 0.01
Game Episode : 240/1000 High score : 58 Exploration Rate : 0.01
Game Episode : 260/1000 High score : 149 Exploration Rate : 0.01
Game Episode : 280/1000 High score : 112 Exploration Rate : 0.01
Game Episode : 300/1000 High score : 158 Exp