In [1]:
!pip install gym[atari]



In [2]:
!pip install numpy
!pip install  gym
!pip install keras
!pip install matplotlib
!pip install opencv-python
!pip install tensorflow-gpu



In [2]:
import keras
from keras import layers
from keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import time
import os
import gym
import sys
from gym import error, spaces
from gym import utils
from gym.utils import seeding
try:
    import atari_py
except ImportError as e:
    raise error.DependencyNotInstalled(
            "{}. (HINT: you can install Atari dependencies by running "
            "'pip install gym[atari]'.)".format(e))

In [3]:
def create_model():
    # keras example for breakout
    inputs = layers.Input(shape=(105, 80, 4), dtype=np.float32)
    layer1 = layers.Conv2D(32, 8, strides=4, activation="relu")(inputs)
    layer2 = layers.Conv2D(64, 4, strides=2, activation="relu")(layer1)
    layer3 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer2)
    layer4 = layers.Flatten()(layer3)
    layer5 = layers.Dense(512, activation="relu")(layer4)
    action = layers.Dense(2, activation="linear")(layer5)
    model = keras.Model(inputs=inputs, outputs=action)
    optimizer=keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)  # faster than rmsprop
    model.compile(optimizer, loss=keras.losses.Huber())  # Huber for stability
    return model

In [4]:
model = create_model()
second_model = create_model()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 105, 80, 4)]      0         
_________________________________________________________________
conv2d (Conv2D)              (None, 25, 19, 32)        8224      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 11, 8, 64)         32832     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 9, 6, 64)          36928     
_________________________________________________________________
flatten (Flatten)            (None, 3456)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               1769984   
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 1026  

In [5]:
def rgb_to_greyscale(observation):
    observation = observation[:,:,0] + observation[:,:,1] + observation[:,:,2]
    return np.where(observation > 0, 255, 0)[::2, ::2]

In [7]:
env = gym.make("BreakoutDeterministic-v4")
print("env:", env)
print("env.env:", env.env)
predicted_action = 2
previous_lives = 5

observation = env.reset()
env.step(1)
for t in range(50):
    env.render()
    action = env.action_space.sample()
    action =  2 if np.random.random(1)[0] > 0.5 else 3
    observation, reward, done, info = env.step(action)
    if done:
        break
    if info["ale.lives"] != previous_lives:
        env.step(1)
        previous_lives = info["ale.lives"]
    if reward > 0:
        print("Not negative reward", reward, info)
    time.sleep(0.1)
    if done:
        break


time.sleep(2)
env.close()
# print("help:", help(env.env))

env: <TimeLimit<AtariEnv<BreakoutDeterministic-v4>>>
env.env: <AtariEnv<BreakoutDeterministic-v4>>


In [8]:
state = rgb_to_greyscale(observation)
state.shape

(105, 80)

In [9]:
state = rgb_to_greyscale(observation)
tensor_state = np.array([state, state, state, state])
tensor_state = tensor_state.reshape(105, 80, 4)
tensor_state.shape

(105, 80, 4)

In [10]:
x = model.predict(np.array([tensor_state]), verbose=1)



In [11]:
x[0, 0]

11.771219

In [6]:
# Variable declaration
state_history = []
action_history = []
reward_history = []
next_state_history = []
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)  # faster than rmsprop # TODO: somehow set the learning rate higher at start, and decreasing it over time
loss_function = keras.losses.Huber()  # used for stability 

In [13]:
(np.array(state_history)).shape
four_states = np.array(state_history[-4:])  # takes last 4 known states
four_states = four_states.reshape(105, 80, 4)  # reshapes them into input shape 
four_states.shape
predictions = model.predict(np.array([four_states]), verbose=0)
predictions

ValueError: cannot reshape array of size 0 into shape (105,80,4)

In [None]:
action = 2 if predictions[0, 0] > predictions[0 , 1] else 3
action
env.render()
_, _, _, _ = env.step(3)


In [7]:
# Parameters
epsilon = 1.0
gamma = 0.5
previous_lives = 5
max_memory_legth = 10000
improvement_check = 100
iterations = 0
games_played = 0
update_second_model = 5000
batch_size = 32

In [8]:
env = gym.make("BreakoutDeterministic-v4")  # is better than v0
while True:  # training continues forever
    observation = env.reset()  # reseting env
    state = rgb_to_greyscale(observation)  # converts observation (210, 160, 3), to greydownscaled state, (105, 80)
    env.step(1)  # FIRE triggers ball initialization
    episode_reward = 0  # initializes reward for the new episode which follows
    while True:  # agent still has lives
#         env.render()
        iterations += 1  # increasing total iterations of game
        if epsilon > np.random.rand(1)[0]:  # a leap of fate, exploration
            action = 2 if np.random.random(1)[0] > 0.5 else 3  # random left or right | 3 is left, 2 is right
        else:  # agent must predict action, exploatation
            four_states = np.array(state_history[-4:])  # takes last 4 known states
            four_states = four_states.reshape(105, 80, 4)  # reshapes them into input shape 
            predictions = model.predict(np.array([four_states]), verbose=0)  # gets reward predictions for both actions
            action = 2 if predictions[0, 0] > predictions[0 , 1] else 3  # choses the actions with the greatest predicted reward
            
        if epsilon > 0.05:  # decay over time is applied to epsilon until it reaches critical value
            epsilon -= epsilon / 10000  # * np.random.random(1)   # decrease is done by (at least) 0.01 %, critical value is reached in (at least) 29956 steps
        
        observation, reward, done, info = env.step(action)  # action is played, returns new observation, possible reward, done flag and lives remained
        next_state = rgb_to_greyscale(observation)  # converts observation (210, 160, 3), to greydownscaled state, (105, 80)  # TODO: check if next_state is really needed, we might only use state
        
        if info["ale.lives"] != previous_lives:  # if number of lives decreased during this frame
            env.step(1)  # FIRE resummons ball
            previous_lives = info["ale.lives"]  # updates previous_lives with current lives
            reward -= 10  # updates reward with negative value because a life was lost
            
        # uncomment this later
        # if reward == 0:  # if no reward is received
        #     reward -= 0.1  # reward receives small negative value, should encourage the agent to finish the game faster
        
        # saving values
        state_history.append(state)
        action_history.append(action)
        reward_history.append(reward)
        next_state_history.append(next_state)  # next_state of state_history[3] = state_history[4]  # TODO: only use state_history
        state = next_state  # replaces old state with new one
        
        episode_reward += reward  # increases reward for this episode, for checking out improvements for games
        
        # TODO: apply backprop sometimes in the future
        
        # Start Back Prop
        
        if iterations % batch_size == 0:  # doing backprop once every 32 steps
            indices = np.random.choice(range(4, len(action_history)), size=batch_size)  # get only indices that have at least 4 previous states, and 1 next state
            
            state_sample = np.array([state_history[i-4:i] for i in indices])  # takes groups of 4 images of game board, previous and except current index
            state_sample = state_sample.reshape(batch_size, 105, 80, 4)  # reshapes group from (32, 4, 105, 80) to (32, 105, 80, 4)
            next_state_sample = np.array([state_history[i - 3: i + 1] for i in indices]) # takes gropus of 4 images of game board, previous and including current index
            next_state_sample = next_state_sample.reshape(batch_size, 105, 80, 4)  
            reward_sample = np.array([reward_history[i] for i in indices])  # has shape (32,)
            action_sample = [action_history[i] - 2 for i in indices]  # has len 32; 2 is decreased from each action to transform it into 0 or 1, to minimize one_hot_vectors size
            future_rewards = np.amax(second_model.predict(next_state_sample, verbose=0), axis=1)  # gets maximum prediction using second model of future rewards for each next state sample
            updated_q_values = reward_sample + gamma * future_rewards  # for current state, adds reward obtained to next state predicted max reward
            masks = to_categorical(action_sample)  # one hot masks are created for actions, to apply backprop only for chosen actions
        
            with tf.GradientTape() as tape:  # Copied example from keras q-learning. Applies backpropagation to model
                q_values = model(state_sample)  # same as `q_values = model.predict(state_sample, verbose=0)`, but returns tensor
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)  # same as `q_action = np.sum(q_values * masks, axis=1)`, but returns tensor
                loss = loss_function(updated_q_values, q_action)  # calculates the loss between updated_q_values, which are correct labels expected, and q_action is the output obtained
                grads = tape.gradient(loss, model.trainable_variables)  # yess, applies gradient to weights
                optimizer.apply_gradients(zip(grads, model.trainable_variables))  # yess, uses optimizer to update wigths
        
        
        # End Back Prop
        
        if iterations % update_second_model == 0:  # once every 5000 iterations
            second_model.set_weights(model.get_weights())  # updates second model
        
        if len(action_history) > max_memory_legth:  # if max memory was reached
            del state_history[:5000]  # deletes first 5000 elements from each list
            del action_history[:5000]
            del reward_history[:5000]
            del next_state_history[:5000]
            
        if done:  # end game flag
            games_played += 1  # increasing played games
            if games_played % improvement_check == 0:  # once every 100 played games
                model.save("a.h5")
                print(f"Reward: {episode_reward}, games played: {games_played}, iterations made: {iterations}")
            break  # exits current game

    
    if iterations % 10000 == 0:
        print(f"Reward: {episode_reward}, games played: {games_played}, iterations made: {iterations}")
        print(games_played)
# env.close()

Reward: -55.0, games played: 100, iterations made: 17096
Reward: -60.0, games played: 200, iterations made: 33059


ResourceExhaustedError: OOM when allocating tensor with shape[55,475,256] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu [Op:Conv2DBackpropInput]

In [None]:
from keras.models import load_model
model = load_model('a.h5')
second_model = load_model('a.h5')

In [None]:
env.step(3)
env.render()

In [16]:
indices = np.random.choice(range(4, len(action_history) - 1), size=batch_size)
indices

array([ 748, 1965, 1883,  469,  172, 1240, 1997, 2129, 2483,  845, 2754,
       1201,  418, 2433, 1885,  528, 2008,  285, 2663,   13, 2320, 2930,
        389,  570, 2221,  222, 2115, 2104, 1277,   56, 2965,  569])

In [17]:
state_sample = np.array([state_history[i-4:i] for i in indices])
state_sample = state_sample.reshape(batch_size, 105, 80, 4)
state_sample.shape

(32, 105, 80, 4)

In [18]:
next_state_sample = np.array([state_history[i - 3: i + 1] for i in indices])
next_state_sample = next_state_sample.reshape(batch_size, 105, 80, 4)
next_state_sample.shape

(32, 105, 80, 4)

In [23]:
reward_sample = np.array([reward_history[i] for i in indices])
reward_sample.shape

(32,)

In [20]:
action_sample = [action_history[i] - 2 for i in indices]
len(action_sample)

32

In [21]:
future_rewards = np.amax(model.predict(next_state_sample, verbose=0), axis=1)
future_rewards.shape

(32,)

In [24]:
updated_q_values = reward_sample + gamma * future_rewards
updated_q_values

array([  7.75147858, -91.85701561,   8.11975327,   7.20004692,
         5.73523035,   7.6509284 ,   8.96559372,   7.31376162,
         7.96670856,   8.30192795,   8.49255314,   8.34010067,
         7.88028183,   8.53532734,   9.04493752,   7.46779385,
         8.08306541,   4.59949722,   7.2463974 ,   7.91787567,
         8.04298439,   7.06307402,   8.04298439,   6.02660122,
         7.16639652,   7.55825224,   9.52005901,   6.95206108,
         8.9129776 ,   7.03784075, -91.51414108,   7.07100906])

In [25]:
masks = to_categorical(action_sample)
masks.shape

(32, 2)

In [26]:
q_values = model.predict(state_sample, verbose=0)
q_values

array([[14.878384 , -3.0652595],
       [16.285969 , -3.1187997],
       [17.679295 , -3.5014887],
       [15.0685425, -1.8668299],
       [15.468747 , -7.6869564],
       [17.142412 , -3.7448063],
       [16.702974 , -2.5793161],
       [15.241731 , -6.9109945],
       [17.850702 , -3.1788397],
       [16.880201 , -3.655157 ],
       [16.380934 , -4.2904115],
       [15.942877 , -3.1279793],
       [17.294239 , -2.8690991],
       [17.466892 , -3.4695053],
       [17.09319  , -2.7820396],
       [14.201021 , -7.633436 ],
       [16.306837 , -3.8131094],
       [ 5.4835024, -6.65471  ],
       [14.772081 , -3.7185078],
       [15.703913 , -2.1918273],
       [16.378193 , -4.135713 ],
       [15.842686 , -3.5639486],
       [16.285969 , -3.1187997],
       [14.342003 , -6.874502 ],
       [15.771332 , -3.3852835],
       [17.586872 , -5.674301 ],
       [15.065003 , -6.34674  ],
       [14.243395 , -3.1381192],
       [17.622715 ,  1.1727142],
       [16.08644  , -5.409835 ],
       [16

In [27]:
q_action = np.sum(q_values * masks, axis=1)
q_action.shape

(32,)

In [43]:
loss = model.loss(updated_q_values, q_action)
loss

<tf.Tensor: shape=(), dtype=float32, numpy=88.40752>

In [44]:

with tf.GradientTape() as tape:
    q_values = model(state_sample)
    q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
    loss = loss_function(updated_q_values, q_action)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [54]:
np.amax(model.trainable_variables[0].numpy())

0.054547507

In [53]:
np.amax(model.trainable_variables[0].numpy())

0.054547507