<a href="https://colab.research.google.com/github/ananyashreyjain/ML/blob/master/DQLearnig.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Standard library
import random
from collections import deque

# Third-party
import gym
import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam, RMSprop

# Module-level CartPole environment; the Agent class below reads it as a global.
env = gym.make('CartPole-v0')

In [0]:
#parameters
AGENT_MEMORY = 5000            # max transitions kept in the replay deque
UPDATE_AFTER_EPISODES = 10     # target network is synced when the episode count is a multiple of this
HIDDEN_LAYER_NODES = 32        # width of the single hidden Dense layer
BATCH_SIZE = 64                # transitions sampled from memory per training step
DISCOUNT = 0.99                # discount factor applied to the bootstrapped future value
LEARNING_RATE = 0.001
LOSS_FUNCTION = 'mse'
AVG_OF_LAST = 10               # episodes per "Average Score" report
# `lr` is a deprecated alias of `learning_rate` (rejected by newer Keras releases).
OPTIMIZER = Adam(learning_rate=LEARNING_RATE)
RENDER_AFTER_EPISODES = 25     # the agent's render flag is set when the episode count is a multiple of this
TOTAL_EPISODES = 2500          # number of episodes run by the training loop
MAX_EPSILON = 1                # initial exploration probability
EPSILON_DECAY = 0.99975        # multiplicative decay applied once per agent step
MIN_EPSILON = 0.001            # floor for the exploration probability


def q_new(q_max, reward):
    """Return the one-step bootstrapped target ``reward + DISCOUNT * q_max``.

    Works on scalars or (element-wise) on NumPy arrays.
    """
    return reward + DISCOUNT * q_max

In [0]:
class Agent:
    """Deep Q-learning agent acting on the module-level gym environment ``env``.

    Two identically shaped networks are kept:

    * ``present_q_model`` - the online network; it picks greedy actions and
      is the only one that is ever fit.
    * ``future_q_model`` - the target network; it is only used for
      ``predict`` and receives a copy of the online weights every
      ``UPDATE_AFTER_EPISODES`` finished episodes.

    Transitions are stored in ``memory`` as
    ``[action, state, reward, next_state, done]`` lists.
    """

    def __init__(self):
        self.state = env.reset().tolist()
        self.done = False
        self.score = 0          # steps accumulated since the last average report
        self.render = True
        self.epsilon = MAX_EPSILON
        # The original code initialised `average_reward` but updated
        # `avg_reward`; both names are kept and always hold the same value.
        self.average_reward = 0
        self.avg_reward = 0

        self.present_q_model = self._build_q_network()
        self.future_q_model = self._build_q_network()
        self._sync_target_network()

        self.memory = deque(maxlen=AGENT_MEMORY)
        self.episodes = 0

    @staticmethod
    def _build_q_network():
        """Build and compile one observation -> per-action Q-value network."""
        model = Sequential()
        model.add(Dense(HIDDEN_LAYER_NODES,
                        input_shape=(env.observation_space.low.size,),
                        activation='relu'))
        model.add(Dense(env.action_space.n, activation='linear'))
        # No 'accuracy' metric: the outputs are regression targets, so a
        # classification accuracy would be meaningless here.
        model.compile(loss=LOSS_FUNCTION, optimizer=OPTIMIZER)
        return model

    def _sync_target_network(self):
        """Copy the online network's weights into the target network."""
        self.future_q_model.set_weights(self.present_q_model.get_weights())

    def train(self):
        """Fit the online network on one random minibatch from replay memory.

        Does nothing until at least ``BATCH_SIZE`` transitions are stored.
        For each sampled transition the target of the taken action is
        ``reward`` if the transition ended the episode, otherwise
        ``q_new(max_a Q_target(next_state, a), reward)``; the targets of the
        other actions are left at the online network's current predictions.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        batch = random.sample(self.memory, BATCH_SIZE)
        actions = np.array([slot[0] for slot in batch], dtype=np.int64)
        states = np.array([slot[1] for slot in batch], dtype=np.float32)
        rewards = np.array([slot[2] for slot in batch], dtype=np.float32)
        next_states = np.array([slot[3] for slot in batch], dtype=np.float32)
        dones = np.array([slot[4] for slot in batch], dtype=bool)

        # np.array(...) makes a writable copy of the predictions.
        targets = np.array(self.present_q_model.predict(states, verbose=0))
        future_q_max = np.max(self.future_q_model.predict(next_states, verbose=0), axis=1)

        targets[np.arange(BATCH_SIZE), actions] = np.where(
            dones, rewards, q_new(future_q_max, rewards))

        self.present_q_model.fit(states, targets, batch_size=BATCH_SIZE,
                                 shuffle=False, verbose=0)

    def next_action(self):
        """Take one epsilon-greedy step in ``env``, store it and train.

        Returns:
            The ``done`` flag returned by ``env.step`` for this step.
        """
        if np.random.random() > self.epsilon:
            # Only run the network when the greedy action is actually needed.
            observation = np.asarray([self.state], dtype=np.float32)
            q_values = self.present_q_model.predict(observation, verbose=0)
            action = int(np.argmax(q_values[0]))
        else:
            action = int(np.random.randint(0, env.action_space.n))

        new_state, reward, done, _ = env.step(action)
        self.score += 1

        if done:
            self.episodes += 1
            self.render = self.episodes % RENDER_AFTER_EPISODES == 0

            # Sync the target network once per period, at episode end.
            # Previously this check lived in train(), which runs every step,
            # so the target was overwritten on every step of each episode
            # whose index was a multiple of UPDATE_AFTER_EPISODES.
            if self.episodes % UPDATE_AFTER_EPISODES == 0:
                self._sync_target_network()

            if self.episodes % AVG_OF_LAST == 0:
                self.avg_reward = self.score / AVG_OF_LAST
                self.average_reward = self.avg_reward
                print(f"Average Score = {self.avg_reward}")
                self.score = 0

        # Rendering is intentionally disabled (the env.render() call was
        # commented out in the original); `self.render` is still maintained.

        next_state = new_state.tolist()
        self.memory.append([action, self.state, reward, next_state, done])
        self.state = env.reset().tolist() if done else next_state

        self.train()
        if self.epsilon > MIN_EPSILON:
            self.epsilon = max(MIN_EPSILON, self.epsilon * EPSILON_DECAY)

        return done
        

In [0]:
# Run the agent for TOTAL_EPISODES episodes; next_action() returns a truthy
# value on the step that ends the current episode.
agent = Agent()
for episode in range(TOTAL_EPISODES):
    print(f"On Episode {episode}")
    episode_finished = False
    while not episode_finished:
        episode_finished = agent.next_action()

On Episode 0
On Episode 1
On Episode 2
On Episode 3
On Episode 4
On Episode 5
On Episode 6
On Episode 7
On Episode 8
On Episode 9
Average Score = 25.1
On Episode 10
On Episode 11
On Episode 12
On Episode 13
On Episode 14
On Episode 15
On Episode 16
On Episode 17
On Episode 18
On Episode 19
Average Score = 16.6
On Episode 20
On Episode 21
On Episode 22
On Episode 23
On Episode 24
On Episode 25
On Episode 26
On Episode 27
On Episode 28
On Episode 29
Average Score = 16.8
On Episode 30
On Episode 31
On Episode 32
On Episode 33
On Episode 34
On Episode 35
On Episode 36
On Episode 37
On Episode 38
On Episode 39
Average Score = 19.4
On Episode 40
On Episode 41
On Episode 42
On Episode 43
On Episode 44
On Episode 45
On Episode 46
On Episode 47
On Episode 48
On Episode 49
Average Score = 26.3
On Episode 50
On Episode 51
On Episode 52
On Episode 53
On Episode 54
On Episode 55
On Episode 56
On Episode 57
On Episode 58
On Episode 59
Average Score = 29.7
On Episode 60
On Episode 61
On Episode 62
On