
# **Training an agent in CartPole-v1 environment with Deep Reinforcement Learning Algorithm using Keras**

In [None]:
import gym
import numpy as np
from collections import deque # Used to define the agent's memory
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random

from tqdm import tqdm # to visualize progress

import warnings
warnings.filterwarnings("ignore")

In [None]:
# DQL Agent Class
class DQLAgent:
    def __init__(self,env):
        # Gets the state space size of the CartPole environment.
        # In the CartPole environment, the situation is 4-dimensional. (position of the car, speed, angle of the bar and angular velocity).
        self.state_size = env.observation_space.shape[0] 
        
        self.action_size = env.action_space.n # move left or move right
        
        self.gamma = 0.995 # effect of future rewards on current value
        # If it is close to 0, the agent prefers short-term rewards.
        # If it is closer to 1, the agent considers future rewards more.
    
        self.learning_rate = 0.001 
        
        # Epsilon = Exploration rate, probability of agent choosing random action
        self.epsilon = 1.0 # discovery rate
        self.epsilon_decay = 0.995 # The rate at which epsilon increases in each iteration is that as epsilon decreases, it learns more and discovers less.
        self.epsilon_min = 0.01
        
        # deque : It automatically deletes old data when it reaches a certain capacity.
        self.memory = deque(maxlen=2000)
        
        self.alpha = 0.6 # Prioritization parameter
        self.beta = 0.4 # Importance sampling correction parameter 
        
        self.model = self.build_model()
        
    def build_model(self):
        
        model = Sequential()
        
        model.add(Dense(64, input_dim=self.state_size, activation="relu"))
        model.add(Dense(64, activation="relu"))
        model.add(Dense(32, activation="relu"))
        model.add(Dense(self.action_size, activation="linear"))
        
        # Compiling
        model.compile(loss = "mse", optimizer=Adam(learning_rate= self.learning_rate))
        
        return model
        
    
    def remember(self, state, action, reward, next_state, done):
        state = np.array(state, dtype=np.float32).reshape(1, -1)
        next_state = np.array(next_state, dtype=np.float32).reshape(1, -1)

        # Calculating TD Error
        target = reward
        if not done:
            target += self.gamma * np.amax(self.model.predict(next_state, verbose=0)[0])
    
        predicted = self.model.predict(state, verbose=0)[0][action]
        error = abs(target - predicted)  

        self.memory.append((state, action, reward, next_state, done, error))


    
    def act(self,state, env):
        
        state = np.array(state, dtype=np.float32).reshape(1, -1)  
        if np.random.rand() <= self.epsilon:
            return random.randint(0, self.action_size - 1)  # Random act
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])  # Pick the best act
        
    # Prioritized Experience Replay has been implemented.
    # It provides a faster training process by prioritizing experiences with a high error rate.
    def replay(self,batch_size): # deep q network is trained by replaying experiences
        
        if len(self.memory) < batch_size:
            return

        # Sample Selection with Priority Probabilities
        errors = np.array([exp[5] for exp in self.memory])  # Get errors
        probabilities = errors ** self.alpha  # Apply Prioritization 
        probabilities /= probabilities.sum()  # Normalization

        
        indices = np.random.choice(len(self.memory), batch_size, p=probabilities)
        minibatch = [self.memory[i] for i in indices]

        for state, action, reward, next_state, done, _ in minibatch:
            state = np.array(state, dtype=np.float32).reshape(1, -1)
            next_state = np.array(next_state, dtype=np.float32).reshape(1, -1)
            
            target = reward
            if not done:
                target += self.gamma * np.amax(self.model.predict(next_state, verbose=0)[0])


            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target

            self.model.fit(state, target_f, epochs=1, verbose=0)


        # When beta is small, the agent learns more important examples.
        # As beta grows, the agent begins to learn all examples equally.
        self.beta = min(1.0, self.beta + 0.001)
            
    def adaptiveEGreedy(self): # decrease of epsilon over time, balance of exploration and exploitation
        
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon * self.epsilon_decay

In [3]:
# trainİNG the dql agent using the gym environment.

env = gym.make("CartPole-v1", render_mode = "human")
agent = DQLAgent(env)

batch_size = 32 
episodes = 20

for e in tqdm(range(episodes)):
    
    state, _ = env.reset() 
    state = np.array(state, dtype=np.float32).reshape(1, -1)
    
    time = 0
    
    while True:
        action = agent.act(state, env) 
        
        # the agent implements the action in the environment
        (next_state, reward, done,_,_) = env.step(action)
        next_state = np.array(next_state, dtype=np.float32).reshape(1, -1)

        
        # It records the action performed and the information received from the env as a result of the action.
        reward = reward if not done else -10 
        agent.remember(state,action,reward, next_state, done)
        
        state = next_state

        
        agent.replay(batch_size) # Starts replay from experiences
        
        agent.adaptiveEGreedy()
    
        if done:
            print(f"\nEpisode : {e}, time : {time}")
            break
        
        

  5%|▌         | 1/20 [00:02<00:46,  2.43s/it]


Episode : 0, time : 0


 10%|█         | 2/20 [01:13<12:45, 42.54s/it]


Episode : 1, time : 0


 15%|█▌        | 3/20 [01:54<11:51, 41.87s/it]


Episode : 2, time : 0


 20%|██        | 4/20 [04:18<21:59, 82.47s/it]


Episode : 3, time : 0


 25%|██▌       | 5/20 [05:40<20:34, 82.30s/it]


Episode : 4, time : 0


 30%|███       | 6/20 [06:29<16:30, 70.74s/it]


Episode : 5, time : 0


 35%|███▌      | 7/20 [07:17<13:44, 63.43s/it]


Episode : 6, time : 0


 40%|████      | 8/20 [08:10<12:00, 60.02s/it]


Episode : 7, time : 0


 45%|████▌     | 9/20 [09:02<10:32, 57.52s/it]


Episode : 8, time : 0


 50%|█████     | 10/20 [10:23<10:47, 64.74s/it]


Episode : 9, time : 0


 55%|█████▌    | 11/20 [10:59<08:25, 56.15s/it]


Episode : 10, time : 0


 60%|██████    | 12/20 [19:47<26:37, 199.74s/it]


Episode : 11, time : 0


 65%|██████▌   | 13/20 [20:51<18:29, 158.45s/it]


Episode : 12, time : 0


 70%|███████   | 14/20 [22:23<13:51, 138.53s/it]


Episode : 13, time : 0


 75%|███████▌  | 15/20 [24:50<11:44, 140.96s/it]


Episode : 14, time : 0


 80%|████████  | 16/20 [26:56<09:05, 136.43s/it]


Episode : 15, time : 0


 85%|████████▌ | 17/20 [29:07<06:44, 134.85s/it]


Episode : 16, time : 0


 90%|█████████ | 18/20 [30:07<03:44, 112.23s/it]


Episode : 17, time : 0


 95%|█████████▌| 19/20 [31:09<01:37, 97.38s/it] 


Episode : 18, time : 0


100%|██████████| 20/20 [33:17<00:00, 99.89s/it] 


Episode : 19, time : 0





In [7]:
# Test

trained_model = agent

env = gym.make("CartPole-v1", render_mode = "human")
state = env.reset()[0]
state = np.reshape(state, [1,4])

for time_t in range(500):
    env.render() # Visually render the environment
    
    action = trained_model.act(state,env)
    
    (next_state, reward, done, _,_) = env.step(action)
    next_state = np.reshape(next_state, [1,4])
    state = next_state
    
    time_t += 1
    
    print(f"Time: {time_t}")
    
    if done:
        print("Episode Done")
        break

env.close()

    

Time: 1
Time: 2
Time: 3
Time: 4
Time: 5
Time: 6
Time: 7
Time: 8
Time: 9
Time: 10
Time: 11
Time: 12
Time: 13
Time: 14
Time: 15
Time: 16
Time: 17
Time: 18
Time: 19
Time: 20
Time: 21
Time: 22
Time: 23
Time: 24
Time: 25
Time: 26
Time: 27
Time: 28
Time: 29
Time: 30
Time: 31
Time: 32
Time: 33
Time: 34
Time: 35
Time: 36
Time: 37
Time: 38
Time: 39
Time: 40
Time: 41
Time: 42
Time: 43
Time: 44
Time: 45
Time: 46
Episode Done
