In [1]:
import tensorflow as tf
import numpy as np
from collections import deque
import gym
import random

In [2]:
## Load the environment: create the Gym 'MountainCar-v0' task used by the cells below

env = gym.make('MountainCar-v0')

In [3]:
class DQNN:
    """Small fully connected Q-network bundled with its optimizer and loss.

    The model maps an observation vector to one Q-value per discrete action
    through two hidden ReLU layers (24 and 48 units) and a linear output layer.
    """

    def __init__(self, env, lr):
        """Build the Keras model, the Adam optimizer and the loss.

        Args:
            env: Environment exposing ``observation_space.shape`` and
                ``action_space.n``; these fix the input and output widths.
            lr: Learning rate handed to the Adam optimizer.
        """
        self.optimizer = tf.keras.optimizers.Adam(lr)
        # Per-sample MSE summed over the batch. The string form of the
        # reduction is accepted by every tf.keras version.
        self.loss = tf.keras.losses.MeanSquaredError(reduction='sum')

        ##Define the neural network

        inputs = tf.keras.layers.Input(shape=(env.observation_space.shape[0],))
        dense1 = tf.keras.layers.Dense(24, activation='relu')(inputs)
        dense2 = tf.keras.layers.Dense(48, activation='relu')(dense1)
        outputs = tf.keras.layers.Dense(env.action_space.n, activation=None)(dense2)
        self.model = tf.keras.Model(inputs, outputs)

    def predict(self, X):
        """Return the Q-values for a batch of observations as a NumPy array.

        Args:
            X: Array-like of shape ``(batch, observation_dim)``.

        Returns:
            ``numpy.ndarray`` of shape ``(batch, num_actions)``.
        """
        # Calling the model directly avoids the per-call overhead (and the
        # progress-bar output) of Model.predict, which is tuned for large
        # datasets rather than the tiny batches used during replay.
        X = tf.convert_to_tensor(X, dtype=tf.float32)
        return self.model(X, training=False).numpy()

    def train(self, X, y):
        """Run a single gradient step that fits ``model(X)`` to ``y``.

        Args:
            X: Array-like of shape ``(batch, observation_dim)``.
            y: Array-like of target Q-values, shape ``(batch, num_actions)``.
        """
        X = tf.convert_to_tensor(X, dtype=tf.float32)
        y = tf.convert_to_tensor(y, dtype=tf.float32)

        # Explicit tape + apply_gradients rather than optimizer.minimize,
        # which is not provided by every Keras optimizer implementation.
        with tf.GradientTape() as tape:
            cost = self.loss(y, self.model(X, training=True))

        weights = self.model.trainable_weights
        gradients = tape.gradient(cost, weights)
        self.optimizer.apply_gradients(zip(gradients, weights))
        
        

In [4]:
## Define an agent with Deep Q-function, Q-learning update algorithm and experience replay 

BATCH_NUM = 16  # Number of transitions sampled for each replay update
TARGET_UPDATE_STEPS = 40  # Copy online weights into the target network every N updates


class Agent:
    """Epsilon-greedy deep Q-learning agent with experience replay.

    Two networks are kept: ``qnetwork`` is trained on every replay update,
    while ``targetnetwork`` supplies the bootstrap values and is refreshed
    from ``qnetwork`` only every ``TARGET_UPDATE_STEPS`` updates.
    """

    def __init__(self, env):
        """Create the networks and the replay memory.

        Args:
            env: Environment exposing ``action_space.n`` (and whatever
                ``DQNN`` reads from it to size the networks).
        """
        self.gamma = 0.9  ## Discount factor applied to the bootstrapped next-state value
        self.epsilon = 1.0  ## Initial exploration rate; decayed inside chooseAction
        self.epsdecay = 0.001  ## Amount subtracted from epsilon on each chooseAction call
        self.num_actions = env.action_space.n
        self.step = 0  ## Number of replay updates performed so far

        # NOTE(review): 0.3 is far above Adam's usual default of 0.001 and may
        # destabilise training -- confirm this value is intentional.
        self.lr = 0.3
        self.states = deque(maxlen=20000)  ## Collect states for experience replay
        self.qnetwork = DQNN(env, self.lr)
        self.targetnetwork = DQNN(env, self.lr)
        # Start both networks from identical weights so the first bootstrap
        # targets are not produced by an unrelated random initialisation.
        self.targetnetwork.model.set_weights(self.qnetwork.model.get_weights())

    ##Collect the relevant feedback from the environment

    def collect(self, state, action, reward, next_state, next_action, done):
        """Append one transition to the replay memory.

        ``next_action`` is stored alongside the rest of the transition but is
        not read by ``prepareDataset``.
        """
        self.states.append([state, action, reward, next_state, next_action, done])

    ##Prepare the dataset for experience replay

    def prepareDataset(self, states):
        """Turn sampled transitions into network inputs and Q-value targets.

        For each transition the target row equals the online network's current
        prediction, except at the taken action, where it is ``reward`` for
        terminal transitions and ``reward + gamma * max(target_net(next_state))``
        otherwise.

        Args:
            states: Non-empty sequence of transitions as stored by ``collect``.

        Returns:
            Tuple ``(X, y)`` with shapes ``(batch, observation_dim)`` and
            ``(batch, num_actions)``.
        """
        observations = np.array([transition[0] for transition in states])
        actions = np.array([transition[1] for transition in states], dtype=int)
        rewards = np.array([transition[2] for transition in states], dtype=float)
        next_observations = np.array([transition[3] for transition in states])
        dones = np.array([transition[5] for transition in states], dtype=bool)

        # One batched forward pass per network instead of two passes per sample.
        next_values = np.max(self.targetnetwork.predict(next_observations), axis=1)
        targets = np.where(dones, rewards, rewards + self.gamma * next_values)

        # Copy so the network's returned buffer is never modified in place.
        y = np.array(self.qnetwork.predict(observations))
        y[np.arange(len(actions)), actions] = targets

        return observations, y

    def chooseAction(self, state):
        """Pick an action epsilon-greedily and decay epsilon.

        Args:
            state: 1-D observation vector.

        Returns:
            Integer action index in ``[0, num_actions)``.
        """
        # Linear decay per call, stopping once epsilon reaches about 0.01.
        if self.epsilon > 0.01:
            self.epsilon -= self.epsdecay

        if np.random.random() > self.epsilon:
            # Add a leading batch axis: (observation_dim,) -> (1, observation_dim).
            batch = np.asarray(state)[np.newaxis, :]
            return int(np.argmax(self.qnetwork.predict(batch)))
        return int(np.random.randint(self.num_actions))

    def updateReplay(self):
        """Train on one random mini-batch and periodically sync the target network.

        Does nothing until the replay memory holds at least ``BATCH_NUM``
        transitions.
        """
        if len(self.states) < BATCH_NUM:
            return

        batch = random.sample(self.states, k=BATCH_NUM)
        X, y = self.prepareDataset(batch)

        self.qnetwork.train(X, y)
        self.step += 1

        # Sync only on multiples of TARGET_UPDATE_STEPS. The previous truthiness
        # test (`step % 40`) synced on every step *except* those multiples.
        if self.step % TARGET_UPDATE_STEPS == 0:
            self.targetnetwork.model.set_weights(self.qnetwork.model.get_weights())
    

In [5]:
##Create agent
# Agent(env) builds its Q-networks from env and starts with an empty replay memory.
agent = Agent(env)

In [6]:
UPDATE_STEP = 200  # Currently unused: a replay update runs after every environment step
NUM_EPISODES = 2000
NUM_STEPS = 1000  # Upper bound on steps per episode (the episode also ends when env reports done)
RENDER = True  # Set to False to train without drawing every frame

rewards = []  # Total reward of each finished episode, kept for later analysis

# Assumes the classic Gym API used by this notebook: reset() returns the
# observation and step() returns (observation, reward, done, info).
try:
    for episode in range(NUM_EPISODES):

        state = env.reset()
        action = agent.chooseAction(state)
        total_reward = 0

        for step in range(1, NUM_STEPS + 1):
            if RENDER:
                env.render()

            next_state, reward, done, info = env.step(action)
            next_action = agent.chooseAction(next_state)
            agent.collect(state, action, reward, next_state, next_action, done)
            total_reward += reward

            ##Run one experience-replay update after every environment step
            agent.updateReplay()

            if done:
                break

            state = next_state
            action = next_action

        ##Return some infos about the performance and append reward for later analysis
        # Done after the inner loop so episodes that hit NUM_STEPS are recorded too.
        rewards.append(total_reward)
        print("Episode ", episode, ": ", total_reward)
finally:
    # Release the environment (and its render window) even when training is
    # interrupted, e.g. by a KeyboardInterrupt.
    env.close()

Episode  0 :  -200.0


KeyboardInterrupt: 

In [8]:
# Scratch check: length of the observation vector (the value DQNN uses as its input width)
env.observation_space.shape[0]

2

In [7]:
# Scratch array for checking 2-D NumPy shape/indexing; no other visible cell reads X
X = np.array([[-0.03880799, -0.0232308,   0.01612237]])

In [8]:
X.shape

(1, 3)

In [10]:
# Element at row 0, column 2
X[0, 2]

0.01612237