In [0]:
%tensorflow_version 1.x

In [10]:
!/opt/bin/nvidia-smi

Fri Apr 24 09:22:05 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
"""
Wrappers in this cell are from Open AI:
https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py
"""

import numpy as np
from collections import deque
import gym
from gym import spaces
import cv2


class NoopResetEnv(gym.Wrapper):
    """Start every episode with a random-length run of no-op actions."""

    def __init__(self, env=None, noop_max=30):
        """Wrap ``env`` so that ``reset`` plays up to ``noop_max`` no-ops.

        Action index 0 must be the environment's 'NOOP' action; this is
        asserted against the unwrapped env's action meanings.
        """
        super().__init__(env)
        self.noop_max = noop_max
        # When set to an int, reset() uses it instead of drawing a random count.
        self.override_num_noops = None
        first_action_name = env.unwrapped.get_action_meanings()[0]
        assert first_action_name == 'NOOP'

    def reset(self):
        """Reset the env, then take between 1 and ``noop_max`` no-op steps.

        Returns the observation produced by the last no-op (or by the reset
        that followed it, if that no-op ended the episode).
        """
        self.env.reset()
        noops = (
            self.override_num_noops
            if self.override_num_noops is not None
            else np.random.randint(1, self.noop_max + 1)
        )
        assert noops > 0
        frame = None
        remaining = noops
        while remaining > 0:
            frame, _reward, finished, _info = self.env.step(0)
            if finished:
                # the episode ended during the no-ops: start over and keep going
                frame = self.env.reset()
            remaining -= 1
        return frame


class FireResetEnv(gym.Wrapper):
    """Press FIRE after every reset for games that only start on FIRE."""

    def __init__(self, env=None):
        """Wrap ``env``; action 1 must be 'FIRE' and at least 3 actions must exist."""
        super(FireResetEnv, self).__init__(env)
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def reset(self):
        """Reset the env, then take action 1 followed by action 2.

        Returns:
            The observation the caller should act on next: the result of the
            second step, or the fresh reset observation if that step ended
            the episode.
        """
        self.env.reset()
        obs, _, done, _ = self.env.step(1)
        if done:
            self.env.reset()
        obs, _, done, _ = self.env.step(2)
        if done:
            # Return the observation of the *new* episode; the previous code
            # reset the env but handed back the stale terminal frame.
            obs = self.env.reset()
        return obs


class EpisodicLifeEnv(gym.Wrapper):
    """Report the loss of a life as the end of an episode."""

    def __init__(self, env=None):
        """Make end-of-life == end-of-episode, but only reset on true game over.
        Done by DeepMind for the DQN and co. since it helps value estimation.
        """
        super().__init__(env)
        self.lives = 0
        # True when the wrapped env itself reported done on the last step.
        self.was_real_done = True
        # True when the last reset() call actually reset the wrapped env.
        self.was_real_reset = False

    def step(self, action):
        """Step the wrapped env and flag ``done`` whenever a life was lost."""
        obs, reward, done, info = self.env.step(action)
        self.was_real_done = done
        remaining = self.env.unwrapped.ale.lives()
        # For Qbert we sometimes stay in the lives == 0 condition for a few
        # frames, so require remaining > 0: the final life is only terminal
        # once the environment itself advertises done.
        life_lost = 0 < remaining < self.lives
        # Store the new count unconditionally so bonus lives are tracked too.
        self.lives = remaining
        return obs, reward, (True if life_lost else done), info

    def reset(self):
        """Reset only when lives are exhausted.

        After a mere life loss the game continues: a single no-op step is
        taken instead of a real reset, so all states remain reachable and the
        learner need not know about any of this.
        """
        if not self.was_real_done:
            # no-op step to advance from terminal/lost life state
            obs, _, _, _ = self.env.step(0)
            self.was_real_reset = False
        else:
            obs = self.env.reset()
            self.was_real_reset = True
        self.lives = self.env.unwrapped.ale.lives()
        return obs


class ProcessFrame84(gym.ObservationWrapper):
    """Turn raw RGB frames into 84x84x1 uint8 grayscale observations."""

    def __init__(self, env=None):
        super(ProcessFrame84, self).__init__(env)
        # process() returns uint8 arrays, so declare the space with the same
        # dtype instead of relying on Box's default.
        self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

    def observation(self, obs):
        """Apply :meth:`process` to every observation coming from the env."""
        return ProcessFrame84.process(obs)

    @staticmethod
    def process(frame):
        """Convert one frame to grayscale, resize it and crop it to 84x84.

        Args:
            frame: array holding 210*160*3 or 250*160*3 values.

        Returns:
            ``np.uint8`` array of shape (84, 84, 1).

        Raises:
            AssertionError: if the frame has any other number of elements.
        """
        if frame.size == 210 * 160 * 3:
            img = np.reshape(frame, [210, 160, 3]).astype(np.float32)
        elif frame.size == 250 * 160 * 3:
            img = np.reshape(frame, [250, 160, 3]).astype(np.float32)
        else:
            # Explicit raise (same type and message as the former
            # `assert False`) so the check survives `python -O`.
            raise AssertionError("Unknown resolution.")
        # weighted sum of the three colour channels -> one grayscale channel
        img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114
        # cv2 takes (width, height): the result has 110 rows and 84 columns
        resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA)
        # keep rows 18..101 to obtain a square 84x84 image
        x_t = resized_screen[18:102, :]
        x_t = np.reshape(x_t, [84, 84, 1])
        return x_t.astype(np.uint8)


class ClippedRewardsWrapper(gym.RewardWrapper):
    """Reward wrapper that keeps only the sign of each reward."""

    def reward(self, reward):
        """Return ``np.sign(reward)``: 1 for positive, -1 for negative, 0 for zero."""
        clipped = np.sign(reward)
        return clipped


class LazyFrames(object):
    """Holds a list of frames and only joins them when an array is requested."""

    def __init__(self, frames):
        """Keep a reference to ``frames`` without copying them.

        Consecutive observations share most of their frames; storing the
        references means each frame is kept in memory only once, which matters
        for large replay buffers. Convert to a numpy array only right before
        feeding the model.
        """
        self._frames = frames

    def __array__(self, dtype=None):
        """Concatenate the stored frames along axis 0, optionally casting to ``dtype``."""
        stacked = np.concatenate(self._frames, axis=0)
        return stacked if dtype is None else stacked.astype(dtype)


class FrameStack(gym.Wrapper):
    """Observation = the k most recent frames, joined along the first axis."""

    def __init__(self, env, k):
        """Stack the last ``k`` frames.

        Observations are returned as ``LazyFrames`` objects, which only
        reference the underlying frames and are therefore memory efficient.
        The declared observation space has the wrapped env's first dimension
        multiplied by ``k``.
        """
        super().__init__(env)
        self.k = k
        self.frames = deque(maxlen=k)
        base_shape = env.observation_space.shape
        stacked_shape = (base_shape[0] * k, base_shape[1], base_shape[2])
        self.observation_space = spaces.Box(low=0, high=255, shape=stacked_shape)

    def reset(self):
        """Reset the env and fill the whole window with the first frame."""
        first_frame = self.env.reset()
        self.frames.extend([first_frame] * self.k)
        return self._get_ob()

    def step(self, action):
        """Step the env, push the new frame and return the stacked window."""
        frame, reward, done, info = self.env.step(action)
        self.frames.append(frame)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        """Snapshot the current window as a ``LazyFrames`` object."""
        assert len(self.frames) == self.k
        window = list(self.frames)
        return LazyFrames(window)


class ChannelsFirstImageShape(gym.ObservationWrapper):
    """
    Change image shape to CWH

    Observations are transformed with ``np.swapaxes(obs, 2, 0)``, i.e. an
    (H, W, C) array becomes (C, W, H). Pixel values are passed through
    unchanged.
    """
    def __init__(self, env):
        super(ChannelsFirstImageShape, self).__init__(env)
        old_space = self.observation_space
        # Derive the new space from the wrapped one with the very same axis
        # swap that observation() applies. The former declaration claimed
        # bounds of [0.0, 1.0] (no rescaling ever happens) and a (C, H, W)
        # shape, which only matches the swapped data for square images.
        self.observation_space = gym.spaces.Box(
            low=np.swapaxes(old_space.low, 2, 0),
            high=np.swapaxes(old_space.high, 2, 0),
            dtype=old_space.dtype,
        )

    def observation(self, observation):
        """Swap the first and last axes of ``observation``."""
        return np.swapaxes(observation, 2, 0)

In [0]:
class Environment():
    """Creates a gym environment and applies the preprocessing wrappers."""

    def __init__(self, env_name, is_video=False):
        """Build the raw env; optionally record videos to ./output/video/.

        ``action_space`` and ``observation_space`` are taken from the raw env
        created by ``gym.make``.
        """
        base_env = gym.make(env_name)
        self.action_space = base_env.action_space
        self.observation_space = base_env.observation_space
        self.env = (
            gym.wrappers.Monitor(base_env, "./output/video/", force=True)
            if is_video
            else base_env
        )

    def gen_wrapped_env(self):
        """Wrap ``self.env`` in place with the preprocessing chain.

        Order: no-op starts, FIRE on reset (only if the game has a FIRE
        action), 84x84 grayscale, channel-first layout, 4-frame stacking.
        """
        self.env = NoopResetEnv(self.env, noop_max=30)
        action_names = self.env.unwrapped.get_action_meanings()
        if 'FIRE' in action_names:
            self.env = FireResetEnv(self.env)
        for wrap in (ProcessFrame84, ChannelsFirstImageShape, lambda inner: FrameStack(inner, 4)):
            self.env = wrap(self.env)

# Wrapper
----
# Logger

In [0]:
import os
import csv
import numpy as np
import shutil
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import datetime

# Number of per-update samples (loss, accuracy) Logger collects before it
# prints and saves a summary.
TRAINING_UPDATE_FREQUENCY = 1000
# Number of per-episode samples (score, step count) Logger collects before it
# prints and saves a summary; also the interval at which the gameover counter
# is printed.
RUN_UPDATE_FREQUENCY = 10


class Logger():
    """Collects training metrics and periodically writes them to CSV and PNG.

    Every metric has its own buffer. Once a buffer holds ``update_frequency``
    samples, their mean is printed, appended to ``<metric>.csv``, the plot
    ``<metric>.png`` is regenerated from that CSV, and the buffer is emptied.
    """

    def __init__(self, header, directory_path):
        """Create (or wipe and recreate) the timestamped output directory.

        Args:
            header: title used for the generated plots.
            directory_path: parent directory; a timestamp sub-directory is
                appended to it.
        """
        self.header = header
        self.directory_path = directory_path + self.timestamp() + "/"

        self.score = []
        self.step = []
        self.loss = []
        self.accuracy = []
        self.q = []

        if os.path.exists(self.directory_path):
            shutil.rmtree(self.directory_path, ignore_errors=True)
        os.makedirs(self.directory_path)

    def timestamp(self):
        """Return the current local time as 'YYYY-MM-DD_HH-MM'."""
        return str(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M'))

    def add_gameover(self, gameover):
        """Print the episode counter every RUN_UPDATE_FREQUENCY episodes."""
        if gameover % RUN_UPDATE_FREQUENCY == 0:
            print('{{"metric": "gameover", "value": {}}}'.format(gameover))

    def add_score(self, value):
        """Record the score of one finished episode."""
        self.add_log(self.score, value, "gameover", "score", RUN_UPDATE_FREQUENCY, self.directory_path, self.header)

    def add_step(self, value):
        """Record the number of steps of one finished episode."""
        self.add_log(self.step, value, "gameover", "number of steps", RUN_UPDATE_FREQUENCY, self.directory_path, self.header)

    def add_accuracy(self, value):
        """Record the accuracy reported by one training update."""
        self.add_log(self.accuracy, value, "update", "accuracy", TRAINING_UPDATE_FREQUENCY, self.directory_path, self.header)

    def add_loss(self, value):
        """Record the loss of one training update, clipped to at most 5."""
        value = min(5, value)  # clip loss, max = 5
        self.add_log(self.loss, value, "update", "loss", TRAINING_UPDATE_FREQUENCY, self.directory_path, self.header)

    # def add_q(self, value):
    #     self.q.add_log(self.q, value, "update", "q", TRAINING_UPDATE_FREQUENCY, self.directory_path, self.header)

    def add_log(self, values, value, x_label, y_label, update_frequency, directory_path, header):
        """Append ``value`` to ``values``; flush a summary when the buffer is full.

        Args:
            values: buffer list owned by the caller; emptied in place after
                each flush.
            value: new sample.
            x_label, y_label: axis labels; ``y_label`` also names the output
                files.
            update_frequency: number of samples per summary.
            directory_path, header: accepted for interface compatibility; the
                instance's own ``directory_path`` and ``header`` are used.
        """
        values.append(value)
        if len(values) % update_frequency == 0:
            mean_value = np.mean(values)
            print(y_label + ": (min: " + str(min(values)) + ", avg: " + str(mean_value) + ", max: " + str(max(values)))
            print('{{"metric": "{}", "value": {}}}'.format(y_label, mean_value))
            self.save_data(self.directory_path + y_label + ".csv", mean_value)
            self.save_fig(input_path=self.directory_path + y_label + ".csv",
                          output_path=self.directory_path + y_label + ".png",
                          small_batch_length=update_frequency,
                          big_batch_length=update_frequency*10,
                          x_label=x_label,
                          y_label=y_label)
            # Empty the caller's list in place. Rebinding the local name
            # (`values = []`) left the buffer untouched, so it grew without
            # bound and every "last N average" was a cumulative average.
            del values[:]

    def save_fig(self, input_path, output_path, small_batch_length, big_batch_length, x_label, y_label):
        """Plot the first column of the CSV at ``input_path`` into ``output_path``.

        Row ``i`` is drawn at x = ``i * small_batch_length``.
        ``big_batch_length`` is currently unused.
        """
        with open(input_path, "r") as scores:
            data = list(csv.reader(scores))
        x = [float(i) * small_batch_length for i in range(len(data))]
        y = [float(row[0]) for row in data]

        plt.subplots()
        plt.plot(x, y, label="last " + str(small_batch_length) + " average")

        plt.title(self.header)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.legend(loc="upper left")
        plt.savefig(output_path, bbox_inches="tight")
        # close the figure so repeated calls do not accumulate open figures
        plt.close()

    def save_data(self, path, score):
        """Append ``score`` as a one-column row to the CSV file at ``path``."""
        # Mode "a" creates the file when missing; newline="" is what the csv
        # module expects from files it writes to.
        with open(path, "a", newline="") as scores_file:
            csv.writer(scores_file).writerow([score])

# Logger
-----
# Network

In [0]:
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers import Conv2D, Flatten, Dense


class CNN():
    """Three convolutional layers followed by two dense layers, built with Keras."""

    def __init__(self, input_shape, action_space, is_dueling=False):
        """Create the optimizer and build the compiled model.

        Args:
            input_shape: input shape handed to the conv layers, which use the
                channels-first data format.
            action_space: number of units of the output layer.
            is_dueling: accepted but currently unused.
        """
        self.input_shape = input_shape
        self.action_space = action_space
        self.optimizer = RMSprop(lr=0.00025, rho=0.95, epsilon=0.01)
        self.model = self.build_model()

    def build_model(self):
        """Assemble, compile (MSE loss), summarise and return the network."""
        # (filters, kernel size, strides) of the successive conv layers
        conv_specs = (
            (32, 8, (4, 4)),
            (64, 4, (2, 2)),
            (64, 3, (1, 1)),
        )
        stack = [
            Conv2D(filters, kernel,
                   strides=strides,
                   padding="valid",
                   activation="relu",
                   input_shape=self.input_shape,
                   data_format="channels_first")
            for filters, kernel, strides in conv_specs
        ]
        stack.append(Flatten())
        stack.append(Dense(512, activation="relu"))
        # linear output layer with one unit per action
        stack.append(Dense(self.action_space))

        model = Sequential()
        for layer in stack:
            model.add(layer)
        model.compile(loss="mse",
                      optimizer=self.optimizer,
                      metrics=["accuracy"])
        model.summary()
        return model

# Network
----
# Model

In [0]:
import numpy as np
import os
import random
import shutil


# Discount applied to the bootstrapped next-state value in DQNTrain.learn.
GAMMA = 0.99
# Capacity handed to ReplayMemory by DQNTrain.
MEMORY_SIZE = 900000
# Transitions per sampled batch; also the batch_size of the fit call.
BATCH_SIZE = 32
# DQNTrain.update trains when total_step is a multiple of this value.
TRAINING_FREQUENCY = 4
# The target network receives the online weights every this many steps.
TARGET_NETWORK_UPDATE_FREQUENCY = 40000
# Weights are written to disk every this many steps (if save_weights is set).
MODEL_PERSISTENCE_UPDATE_FREQUENCY = 10000
# Until the memory holds this many transitions, actions are random and
# DQNTrain.update returns without doing anything.
REPLAY_START_SIZE = 50000

# Initial exploration rate of DQNTrain.
EPSILON_MAX = 1.0
# Floor of the exploration rate during training.
EPSILON_MIN = 0.1
# Probability of a random action in DQNTest.get_action.
EPSILON_TEST = 0.02
# Number of update_epsilon calls needed to go from EPSILON_MAX to EPSILON_MIN.
EPSILON_STEPS = 850000

class ReplayMemory():
    """Fixed-capacity store of transitions with uniform random sampling.

    ``replays`` is a plain list holding at most ``memory_size`` transition
    dicts. Once full, new transitions overwrite the oldest ones in place, so
    the list is not in chronological order; sampling is uniform, so the order
    does not matter.
    """

    def __init__(self, memory_size=1000000, batch_size=32):
        """
        Args:
            memory_size: maximum number of transitions kept.
            batch_size: number of transitions returned by ``gen_batch``.
        """
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.replays = []
        # index of the oldest stored transition, i.e. the next one to overwrite
        self._oldest = 0

    def add_replay(self, state, action, reward, next_state, terminal):
        """Store one transition, evicting the oldest one when at capacity."""
        new_replay = {"state": state,
                      "action": action,
                      "reward": reward,
                      "next_state": next_state,
                      "terminal": terminal}
        if self.memory_size <= 0:
            # a memory without capacity never retains anything
            return
        if len(self.replays) < self.memory_size:
            self.replays.append(new_replay)
        else:
            # O(1) overwrite instead of list.pop(0), which shifts every
            # remaining element on each insertion once the memory is full.
            self.replays[self._oldest] = new_replay
            self._oldest = (self._oldest + 1) % self.memory_size

    def gen_batch(self):
        """Return ``batch_size`` distinct stored transitions as an object array.

        Raises:
            ValueError: if fewer than ``batch_size`` transitions are stored.
        """
        batch = np.asarray(random.sample(self.replays, self.batch_size))
        return batch


class DQNTrain():
    """Training agent: epsilon-greedy policy, replay memory, online + target net."""

    def __init__(self, game_name, input_shape, action_space, save_log=True, save_weights=True):
        """
        Args:
            game_name: used in the header of the log plots.
            input_shape: input shape of the networks.
            action_space: number of discrete actions.
            save_log: when True, loss and accuracy of each update are logged.
            save_weights: when True, weights are saved periodically.
        """
        self.action_space = action_space
        self.input_shape = input_shape
        self.epsilon = EPSILON_MAX
        self.memory = ReplayMemory(MEMORY_SIZE, BATCH_SIZE)

        # online network (trained) and target network (periodically synced)
        self.ddqn_eval = CNN(self.input_shape, self.action_space).model
        self.ddqn_target = CNN(self.input_shape, self.action_space).model
        self.update_target_weights()

        self.save_log = save_log
        self.save_weights = save_weights
        self.save_path = "./output/train"
        self.logger = Logger(game_name + "Train", self.save_path + "/logs/ddqn/")

    def get_action(self, state):
        """Pick an action epsilon-greedily.

        The action is random with probability ``epsilon`` and always random
        while the memory holds fewer than REPLAY_START_SIZE transitions.
        """
        if np.random.uniform() < self.epsilon or len(self.memory.replays) < REPLAY_START_SIZE:
            return random.randrange(self.action_space)
        else:
            q_values = self.ddqn_eval.predict(np.expand_dims(np.asarray(state).astype(np.float64), axis=0), batch_size=1)
            return np.argmax(q_values[0])

    def update_epsilon(self):
        """Decrease epsilon by one linear step, never going below EPSILON_MIN."""
        self.epsilon -= (EPSILON_MAX - EPSILON_MIN) / EPSILON_STEPS
        if self.epsilon < EPSILON_MIN:
            self.epsilon = EPSILON_MIN

    def save_model(self):
        """Save the online network's weights under a timestamped directory."""
        model_path = self.save_path + "/models/ddqn/" + self.logger.timestamp() + "/model.h5"
        if os.path.exists(os.path.dirname(model_path)):
            shutil.rmtree(os.path.dirname(model_path), ignore_errors=True)
        os.makedirs(os.path.dirname(model_path))
        self.ddqn_eval.save_weights(model_path)

    def update_target_weights(self):
        """Copy the online network's weights into the target network."""
        self.ddqn_target.set_weights(self.ddqn_eval.get_weights())

    def update(self, total_step):
        """Per-step housekeeping: train, decay epsilon, save and sync weights.

        Does nothing until the memory holds REPLAY_START_SIZE transitions.
        """
        if len(self.memory.replays) < REPLAY_START_SIZE:
            return

        if total_step % TRAINING_FREQUENCY == 0:
            # Always train; `save_log` only decides whether the metrics are
            # logged. Previously a False `save_log` disabled training itself.
            loss, accuracy = self.learn()
            if self.save_log:
                self.log_model_status(loss, accuracy)

        self.update_epsilon()

        if total_step % MODEL_PERSISTENCE_UPDATE_FREQUENCY == 0 and self.save_weights:
            self.save_model()

        if total_step % TARGET_NETWORK_UPDATE_FREQUENCY == 0:
            self.update_target_weights()

    def learn(self, mode=2015):
        """Sample one batch from memory and fit the online network on it.

        Args:
            mode: how the next-state value is bootstrapped:
                2013 - maximum of the online network's predictions;
                2015 - maximum of the target network's predictions;
                2016 - target network's prediction for the action the online
                       network rates highest.

        Returns:
            (loss, accuracy) reported by the single fit call.

        Raises:
            ValueError: if ``mode`` is not one of the values above.
        """
        if mode not in (2013, 2015, 2016):
            raise ValueError("Unknown learning mode: " + str(mode))

        batch = self.memory.gen_batch()
        rows = np.arange(len(batch))

        # np.asarray materialises the stored observations before stacking
        states = np.stack([np.asarray(record["state"]) for record in batch]).astype(np.float64)
        next_states = np.stack([np.asarray(record["next_state"]) for record in batch]).astype(np.float64)
        actions = np.array([record["action"] for record in batch], dtype=np.int64)
        rewards = np.array([record["reward"] for record in batch], dtype=np.float64)
        # 0.0 for terminal transitions, which removes the bootstrap term
        not_terminal = 1.0 - np.array([record["terminal"] for record in batch], dtype=np.float64)

        # One forward pass per network for the whole batch instead of one
        # predict() call per sample.
        q_values = np.array(self.ddqn_eval.predict(states))
        if mode == 2013:
            next_q = self.ddqn_eval.predict(next_states).max(axis=1)
        elif mode == 2015:
            next_q = self.ddqn_target.predict(next_states).max(axis=1)
        else:  # mode == 2016
            best_actions = np.argmax(self.ddqn_eval.predict(next_states), axis=1)
            next_q = self.ddqn_target.predict(next_states)[rows, best_actions]

        # Only the Q-value of the action actually taken gets a new target; the
        # other outputs keep the network's own prediction.
        q_values[rows, actions] = rewards + not_terminal * GAMMA * next_q

        fit = self.ddqn_eval.fit(states,
                                 q_values,
                                 batch_size=BATCH_SIZE,
                                 verbose=0)

        loss = fit.history["loss"][0]
        accuracy = fit.history["accuracy"][0]
        return loss, accuracy

    def log_game_status(self, score, step, gameover):
        """Log the score, step count and index of a finished episode."""
        self.logger.add_score(score)
        self.logger.add_step(step)
        self.logger.add_gameover(gameover)

    def log_model_status(self, loss, accuracy):
        """Log the loss and accuracy of one training update."""
        self.logger.add_loss(loss)
        self.logger.add_accuracy(accuracy)


class DQNTest():
    """Evaluation agent: acts almost greedily with a network loaded from disk."""

    def __init__(self, game_name, input_shape, action_space, testing_model_path):
        """
        Args:
            game_name: used in the header of the log plots.
            input_shape: input shape of the network.
            action_space: number of discrete actions.
            testing_model_path: path of the weights file to load.

        Raises:
            AssertionError: if the directory of ``testing_model_path`` does
                not exist.
        """
        self.action_space = action_space
        self.ddqn = CNN(input_shape, action_space).model
        self.logger = Logger(game_name + "Test", "./output/test/logs/ddqn/")
        assert os.path.exists(os.path.dirname(testing_model_path)), "No testing model in: " + str(testing_model_path)
        if os.path.isfile(testing_model_path):
            self.ddqn.load_weights(testing_model_path)
        else:
            # Not fatal (as before), but no longer silent: without the file
            # the agent plays with freshly initialised weights.
            print("WARNING: weights file not found, using untrained network: " + str(testing_model_path))

    def get_action(self, state):
        """Return a random action with probability EPSILON_TEST, else the greedy one."""
        if np.random.rand() < EPSILON_TEST:
            return random.randrange(self.action_space)
        q_values = self.ddqn.predict(np.expand_dims(np.asarray(state).astype(np.float64), axis=0), batch_size=1)
        return np.argmax(q_values[0])

    def log_game_status(self, score, step, gameover):
        """Log the score, step count and index of a finished episode.

        Same interface as ``DQNTrain.log_game_status``, so the episode loop
        can report results for either kind of agent.
        """
        self.logger.add_score(score)
        self.logger.add_step(step)
        self.logger.add_gameover(gameover)

    def log_model_status(self, loss, accuracy):
        """Log a loss and an accuracy value."""
        self.logger.add_loss(loss)
        self.logger.add_accuracy(accuracy)

# Model
----
# Entry

In [0]:
import gym
import argparse
import numpy as np
import atari_py

# preprocessing & generate env
game_name = "Breakout"
env_name = game_name + "Deterministic-v4"
breakout_env = Environment(env_name)
breakout_env.gen_wrapped_env()
env = breakout_env.env
action_space = env.action_space.n
img_size = 84
# The network input is the stacked observation, so take its shape from the
# wrapped env. It used to be built from `action_space`, which is only right
# for games whose number of actions happens to equal the number of stacked
# frames.
input_shape = tuple(env.observation_space.shape)
assert input_shape[1:] == (img_size, img_size), input_shape
max_train_step = 2000000

# generate model
is_train = True
# Weights file evaluated when is_train is False; point it at a checkpoint
# written by DQNTrain.save_model.
testing_model_path = "./output/train/models/ddqn/model.h5"
if is_train:
    model = DQNTrain(game_name, input_shape, action_space)
else:
    model = DQNTest(game_name, input_shape, action_space, testing_model_path)

# iteration of updating the Q table
gameover = 0
total_step = 0
budget_exhausted = False
while not budget_exhausted:
    step = 0
    score = 0
    state = env.reset()
    while True:
        if total_step >= max_train_step:
            print(f"Traning of {max_train_step} steps completed!")
            # leave both loops normally instead of calling exit(0), so that
            # control returns to the notebook with `model` still available
            budget_exhausted = True
            break
        action = model.get_action(state)
        next_state, reward, terminal, info = env.step(action)
        # clip the reward to -1 / 0 / +1 before storing and accumulating it
        reward = np.sign(reward)
        if is_train:
            model.memory.add_replay(state, action, reward, next_state, terminal)

        score += reward
        state = next_state
        total_step += 1
        if is_train:
            model.update(total_step)
        step += 1

        if terminal:
            model.log_game_status(score, step, gameover)
            gameover += 1
            break



Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 32, 20, 20)        8224      
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 64, 9, 9)          32832     
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 64, 7, 7)          36928     
_________________________________________________________________
flatten_3 (Flatten)          (None, 3136)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 512)               1606144   
_________________________________________________________________
dense_6 (Dense)              (None, 4)                 2052      
Total params: 1,686,180
Trainable params: 1,686,180
Non-trainable params: 0
____________________________________________