In [1]:
import random
import gym
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import matplotlib as plt

In [2]:
EPISODES = 50
THRESHOLD = 201

In [7]:
class DQN():
    def __init__(self, env_string,batch_size=64):
        self.memory = deque(maxlen=100000)
        self.env = gym.make(env_string)
        input_size = self.env.observation_space.shape[0]
        self.action = np.array([-2,0,2], dtype=np.float32)
        action_size = len(self.action)
        self.batch_size = batch_size
        self.gamma = 1.0
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.1
        
        alpha=0.01
        alpha_decay=0.01
        
        # Init model
        self.model = Sequential()
        self.model.add(Dense(64, input_dim=input_size, activation='relu'))
        self.model.add(Dense(32, activation='relu'))
        # self.model.add(Dense(action_size, activation='linear'))
        self.model.add(Dense(3, activation='linear'))
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=alpha))

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def choose_action(self, state, epsilon):
        if np.random.random() <= epsilon:
            return [np.random.choice(self.action)]
        else:
            return [self.action[np.argmax(self.model.predict(state, verbose=0)[0])]]

    def preprocess_state(self, state):
        return np.reshape(state, [1, 3])

    def replay(self, batch_size):
        x_batch, y_batch = [], []
        minibatch = random.sample(self.memory, min(len(self.memory), batch_size))
        for state, action, reward, next_state, done in minibatch:
            y_target = self.model.predict(state, verbose=0)
            action_index = np.where(self.action == action[0])[0][0]
            y_target[0][action_index] = reward if done else reward + self.gamma * np.max(self.model.predict(next_state, verbose=0)[0])
            x_batch.append(state[0])
            y_batch.append(y_target[0])
        
        self.model.fit(np.array(x_batch), np.array(y_batch), batch_size=len(x_batch), verbose=0)
        self.epsilon = max(self.epsilon_min, self.epsilon_decay*self.epsilon) # decrease epsilon
       

    def train(self):
        scores = deque(maxlen=100)
        avg_scores = []

        for e in range(EPISODES):
            state = self.env.reset()
            if e % 10 == 0:
                self.env.render()
            state = self.preprocess_state(state)
            done = False
            i = 0
            while not done:
                action = self.choose_action(state,self.epsilon)
                next_state, reward, done, _ = self.env.step(action)
                if e % 10 == 0:
                    self.env.render()
                next_state = self.preprocess_state(next_state)
                self.remember(state, action, reward, next_state, done)
                state = next_state
                self.epsilon = max(self.epsilon_min, self.epsilon_decay*self.epsilon) # decrease epsilon
                i += 1
                
            if e % 10 == 0:
                self.env.close()

            scores.append(i)
            mean_score = np.mean(scores)
            avg_scores.append(mean_score)
            print(f'Epoch number {e}, mean score {mean_score}, reward {reward}')
            if mean_score >= THRESHOLD and e >= 10:
                print('Ran {} episodes. Solved after {} trials ✔'.format(e, e - 10))
                return avg_scores
            if (e + 1) % 10 == 0:
                print('[Episode {}] - Mean survival time over last 10 episodes was {} ticks.'.format(e, mean_score))

            self.replay(self.batch_size)
        
        print('Did not solve after {} episodes 😞'.format(e))
        return avg_scores

In [8]:
env_string = 'Pendulum-v0'

def plot_avg_reward(avg_scores):
    plt.figure(figsize=(10, 6))
    plt.plot(avg_scores, label='Average Reward per Episode')
    plt.xlabel('Episode')
    plt.ylabel('Average Reward')
    plt.title('Average Reward per Episode over Time')
    plt.legend()
    plt.grid(True)
    plt.show()

dqn = DQN(env_string)
avg_scores = dqn.train()




Epoch number 0, mean score 200.0, reward -5.342642604018294
Epoch number 1, mean score 200.0, reward -9.592861337153957
Epoch number 2, mean score 200.0, reward -8.601326385805423
Epoch number 3, mean score 200.0, reward -5.034676835784575
Epoch number 4, mean score 200.0, reward -7.604488991258211
Epoch number 5, mean score 200.0, reward -8.143848878274552
Epoch number 6, mean score 200.0, reward -8.50809972489631
Epoch number 7, mean score 200.0, reward -8.561474142973045
Epoch number 8, mean score 200.0, reward -9.412909600106607
Epoch number 9, mean score 200.0, reward -9.8518839581704
[Episode 9] - Mean survival time over last 10 episodes was 200.0 ticks.
Epoch number 10, mean score 200.0, reward -9.87091409769772
Epoch number 11, mean score 200.0, reward -9.701976814305565


KeyboardInterrupt: 

In [5]:

class DQN():
    def __init__(self, env_string, batch_size=64):
        self.memory = deque(maxlen=100000)
        self.env = gym.make(env_string)
        input_size = self.env.observation_space.shape[0]
        self.action = np.array([-2, 0, 2], dtype=np.float32)
        action_size = len(self.action)
        self.batch_size = batch_size
        self.gamma = 1.0
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        
        alpha = 0.01
        alpha_decay = 0.01
        
        # Init model
        self.model = Sequential()
        self.model.add(Dense(64, input_dim=input_size, activation='relu'))
        self.model.add(Dense(32, activation='relu'))
        self.model.add(Dense(action_size, activation='linear'))
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=alpha))

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def choose_action(self, state, epsilon):
        if np.random.random() <= epsilon:
            return [np.random.choice(self.action)]
        else:
            return [self.action[np.argmax(self.model.predict(state, verbose=0)[0])]]

    def preprocess_state(self, state):
        return np.reshape(state, [1, 3])

    def replay(self, batch_size):
        x_batch, y_batch = [], []
        minibatch = random.sample(self.memory, min(len(self.memory), batch_size))
        for state, action, reward, next_state, done in minibatch:
            y_target = self.model.predict(state, verbose=0)
            action_index = np.where(self.action == action[0])[0][0]
            y_target[0][action_index] = reward if done else reward + self.gamma * np.max(self.model.predict(next_state, verbose=0)[0])
            x_batch.append(state[0])
            y_batch.append(y_target[0])
        
        self.model.fit(np.array(x_batch), np.array(y_batch), batch_size=len(x_batch), verbose=0)
        self.epsilon = max(self.epsilon_min, self.epsilon_decay * self.epsilon)  # decrease epsilon

    def train(self, EPISODES, THRESHOLD):
        scores = deque(maxlen=100)
        rewards = []
        average_rewards = []

        for e in range(EPISODES):
            state = self.env.reset()
            if e % 10 == 0:
                self.env.render()
            state = self.preprocess_state(state)
            done = False
            total_reward = 0
            while not done:
                action = self.choose_action(state, self.epsilon)
                next_state, reward, done, _ = self.env.step(action)
                if e % 10 == 0:
                    self.env.render()
                next_state = self.preprocess_state(next_state)
                self.remember(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward

            if e % 10 == 0:
                self.env.close()

            rewards.append(total_reward)
            scores.append(total_reward)
            mean_score = np.mean(scores)
            average_rewards.append(mean_score)
            print(f'Epoch number {e}, mean score {mean_score}, reward {total_reward}')
            if mean_score >= THRESHOLD and e >= 10:
                print('Ran {} episodes. Solved after {} trials ✔'.format(e, e - 10))
                return rewards, average_rewards
            if (e + 1) % 10 == 0:
                print('[Episode {}] - Mean survival time over last 10 episodes was {} ticks.'.format(e, mean_score))

            self.replay(self.batch_size)
        
        print('Did not solve after {} episodes 😞'.format(e))
        return rewards, average_rewards


In [6]:
def plot_avg_reward(avg_scores):
    plt.figure(figsize=(10, 6))
    plt.plot(avg_scores, label='Average Reward per Episode')
    plt.xlabel('Episode')
    plt.ylabel('Average Reward')
    plt.title('Average Reward per Episode over Time')
    plt.legend()
    plt.grid(True)
    plt.show()
    
def plot_rewards(rewards, average_rewards):
    # plt.fig(figsize=(12, 6))
    
    plt.plot(rewards, label='Reward per Episode')
    plt.plot(average_rewards, label='Average Reward per Episode', linestyle='--')
    
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.title('Reward and Average Reward per Episode')
    plt.legend()
    plt.grid(True)
    plt.show()

env_string = 'Pendulum-v0'
dqn = DQN(env_string)
rewards, average_rewards = dqn.train(EPISODES, THRESHOLD)
plot_rewards(rewards, average_rewards)


Epoch number 0, mean score -1732.2229243318654, reward -1732.2229243318654
Epoch number 1, mean score -1726.0153517576973, reward -1719.807779183529
Epoch number 2, mean score -1511.2336432160764, reward -1081.6702261328344
Epoch number 3, mean score -1545.4887255872036, reward -1648.253972700585
Epoch number 4, mean score -1438.90476230872, reward -1012.5689091947852
Epoch number 5, mean score -1377.0684258631618, reward -1067.886743635371
Epoch number 6, mean score -1411.474122577089, reward -1617.9083028606537
Epoch number 7, mean score -1349.1442079860437, reward -912.834805848725
Epoch number 8, mean score -1305.8753571281004, reward -959.7245502645521
Epoch number 9, mean score -1250.095931575663, reward -748.081101603726
[Episode 9] - Mean survival time over last 10 episodes was -1250.095931575663 ticks.
Epoch number 10, mean score -1294.0915540943074, reward -1734.0477792807521
Epoch number 11, mean score -1275.9716997167748, reward -1076.6533015639168
Epoch number 12, mean sco

AttributeError: module 'matplotlib' has no attribute 'figure'