In [1]:
!pip install gym[atari]



In [2]:
!pip install numpy
!pip install  gym
!pip install keras
!pip install matplotlib
!pip install opencv-python
!pip install tensorflow-gpu



In [32]:
import keras
from keras import layers
from keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import time
import os
import gym
import sys
from gym import error, spaces
from gym import utils
from gym.utils import seeding
try:
    import atari_py
except ImportError as e:
    raise error.DependencyNotInstalled(
            "{}. (HINT: you can install Atari dependencies by running "
            "'pip install gym[atari]'.)".format(e))

In [33]:
def create_model(outputs=2, input_shape=(105, 80, 4)):
    """Build and compile the convolutional Q-network.

    The layer layout is the one from the Keras Breakout example: three
    convolutions, a 512-unit dense layer and a linear head that emits one
    value per action.

    Args:
        outputs: Number of units in the linear output layer, i.e. the number
            of actions the network scores. Defaults to 2 (the LEFT/RIGHT-only
            setup); pass 4 to score NOOP, FIRE, RIGHT and LEFT.
        input_shape: Shape of a single input sample. Defaults to
            (105, 80, 4).

    Returns:
        A ``keras.Model`` compiled with Adam (learning rate 0.00025,
        clipnorm 1.0) and Huber loss.
    """
    inputs = layers.Input(shape=input_shape, dtype=np.float32)
    layer1 = layers.Conv2D(32, 8, strides=4, activation="relu")(inputs)
    layer2 = layers.Conv2D(64, 4, strides=2, activation="relu")(layer1)
    layer3 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer2)
    layer4 = layers.Flatten()(layer3)
    layer5 = layers.Dense(512, activation="relu")(layer4)
    # Linear activation: the outputs are unbounded value estimates, not probabilities.
    action = layers.Dense(outputs, activation="linear")(layer5)
    model = keras.Model(inputs=inputs, outputs=action)
    optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)  # faster than rmsprop
    model.compile(optimizer, loss=keras.losses.Huber())  # Huber for stability
    return model

In [34]:
# `model` is the network being trained; `second_model` supplies the bootstrap
# targets and is refreshed from `model` periodically by the training loop.
model = create_model()
second_model = create_model()
# Start both networks from identical weights. Without this the targets come
# from an unrelated random initialisation until the first periodic refresh.
second_model.set_weights(model.get_weights())
model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 105, 80, 4)]      0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 25, 19, 32)        8224      
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 11, 8, 64)         32832     
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 9, 6, 64)          36928     
_________________________________________________________________
flatten_4 (Flatten)          (None, 3456)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 512)               1769984   
_________________________________________________________________
dense_9 (Dense)              (None, 2)                 1026

In [35]:
def rgb_to_greyscale(observation):
    """Binarise an RGB frame and halve its resolution.

    A pixel becomes 255 when the sum of its first three channels is positive
    and 0 otherwise. Only every second row and every second column is kept.

    Args:
        observation: Array of shape (height, width, channels) with at least
            three channels, e.g. a (210, 160, 3) frame.

    Returns:
        Integer array with half the rows and columns (rounded up) of the
        input, containing only the values 0 and 255.
    """
    # Subsample first: the threshold is applied per pixel, so the result is
    # identical and only a quarter of the pixels are processed.
    channels = np.asarray(observation)[::2, ::2, :3]
    # Widen integer inputs before summing. Adding uint8 channels wraps
    # modulo 256, so a colour such as (128, 128, 0) summed to 0 and was
    # wrongly classified as background.
    widen = np.int64 if np.issubdtype(channels.dtype, np.integer) else None
    brightness = channels.sum(axis=2, dtype=widen)
    return np.where(brightness > 0, 255, 0)

In [36]:
# Replay memory and optimisation objects

# Three parallel lists; the training loop appends one entry to each per step.
state_history, action_history, reward_history = [], [], []

# Exponentially decaying learning-rate schedule.
# NOTE(review): the optimizer created below is given a constant learning rate,
# so this schedule is not applied by it.
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    decay_rate=0.9,
    decay_steps=100000,
    initial_learning_rate=0.01,
)

# Huber loss, chosen for stability.
loss_function = keras.losses.Huber()

# Adam with gradient-norm clipping (faster than rmsprop).
optimizer = keras.optimizers.Adam(clipnorm=1.0, learning_rate=0.00025)

In [37]:
# Hyper-parameters and run-time counters

# --- exploration / discounting (both are adjusted step by step in the training loop) ---
epsilon = 1.0  # probability of taking a random action
gamma = 0.5  # discount applied to the predicted future reward; values tried from 0.5 to 0.99

# --- replay memory / optimisation ---
batch_size = 32  # transitions sampled per gradient step
max_memory_legth = 10000  # replay size that triggers trimming; spelling kept because the training loop uses this exact name
update_second_model = 5000  # iterations between copies of `model` into `second_model`
improvement_check = 100  # NOTE(review): not referenced by the training loop shown in this notebook

# --- counters ---
iterations = 0  # environment steps taken so far
games_played = 0  # finished games so far
previous_lives = 5  # last life count seen, compared against info["ale.lives"]

In [None]:
# Training loop: epsilon-greedy play on Breakout, experience replay from the
# *_history lists, and bootstrap targets from `second_model`.
# Runs until the cell is interrupted; the environment is closed on the way out.
env = gym.make("BreakoutNoFrameskip-v4")  # tried v0 and deterministic

# Terminal flag for every stored transition, kept aligned with the replay lists
# (padded with False when those already hold data from an earlier run of this cell).
done_history = [False] * len(action_history)

try:
    while True:  # one pass per game
        observation = env.reset()
        state = rgb_to_greyscale(observation)  # (210, 160, 3) observation -> (105, 80) binary frame

        # The network only chooses between RIGHT and LEFT, so the ball is launched manually.
        _, _, _, info = env.step(1)  # FIRE
        # Re-read the life counter for every game; a value carried over from the
        # previous game makes the first step look like a lost life.
        previous_lives = info["ale.lives"]

        episode_reward = 0

        while True:  # one pass per environment step, until the game ends
            # env.render()

            iterations += 1
            # Exploration. Also taken while fewer than 3 earlier frames are
            # stored, because a full 4-frame stack cannot be built yet.
            if epsilon > np.random.rand(1)[0] or len(state_history) < 3:
                # 2 is RIGHT, 3 is LEFT (for a 4-action model use np.random.choice(4))
                action = 2 if np.random.random(1)[0] > 0.5 else 3
            else:  # exploitation
                # Three stored frames plus the current one, with frames on the
                # last axis -> (105, 80, 4). np.stack moves the axis for real;
                # reshape would only reinterpret the (4, 105, 80) buffer and mix
                # pixels of different frames.
                four_states = np.stack(state_history[-3:] + [state], axis=-1)
                predictions = model.predict(four_states[np.newaxis], verbose=0)
                # Output 0 scores action 2, output 1 scores action 3
                # (for a 4-action model use np.argmax(predictions[0])).
                action = 2 if predictions[0, 0] > predictions[0, 1] else 3

            if epsilon > 0.05:  # multiplicative decay until the floor is reached
                epsilon -= epsilon / 1000000  # floor is reached in 2995732 steps

            # Start greedy for immediate reward and weight future reward more as training goes on.
            if gamma < 0.99:
                gamma += gamma / 1000000

            observation, reward, done, info = env.step(action)
            next_state = rgb_to_greyscale(observation)

            if info["ale.lives"] != previous_lives:  # a life was lost on this step
                previous_lives = info["ale.lives"]
                # Several penalties were tried here (a penalty per lost life, none at all);
                # currently only the end of the game is penalised.
                if done:
                    reward -= 1
                else:
                    # FIRE relaunches the ball. Skipped once the game is over:
                    # a finished environment has to be reset, not stepped.
                    env.step(1)

            # A small negative reward for every step without reward was also tried
            # (to push the agent to finish faster); it is disabled.

            # Store the transition: frame before the action, action, reward, terminal flag.
            state_history.append(state)
            action_history.append(action)
            reward_history.append(reward)
            done_history.append(done)
            state = next_state

            episode_reward += reward

            # ---- gradient step, once every 4 environment steps ----
            if iterations % 4 == 0 and len(action_history) > batch_size:
                # Index i needs frames i-3..i for the state and frame i+1 for the next state.
                indices = np.random.randint(3, len(action_history) - 1, size=batch_size)

                # (batch, 4, 105, 80) -> (batch, 105, 80, 4) by moving the frame axis.
                state_sample = np.array([state_history[i - 3:i + 1] for i in indices])
                state_sample = state_sample.transpose(0, 2, 3, 1).astype(np.float32)
                next_state_sample = np.array([state_history[i - 2:i + 2] for i in indices])
                next_state_sample = next_state_sample.transpose(0, 2, 3, 1).astype(np.float32)

                reward_sample = np.array([reward_history[i] for i in indices])
                done_sample = np.array([done_history[i] for i in indices], dtype=np.float32)
                # Actions 2/3 are mapped to output indices 0/1 (no offset when all 4 actions are used).
                action_sample = [action_history[i] - 2 for i in indices]

                # Best predicted value of each next state, from the second model.
                future_rewards = np.amax(second_model.predict(next_state_sample, verbose=0), axis=1)
                # Terminal transitions have no future: their "next" frame already
                # belongs to the following game, so the bootstrap term is masked out.
                updated_q_values = reward_sample + gamma * future_rewards * (1.0 - done_sample)
                # One-hot masks so that only the value of the action actually taken is trained.
                masks = to_categorical(action_sample, num_classes=2)

                # Manual gradient step; faster (and better) than model.fit.
                with tf.GradientTape() as tape:
                    q_values = model(state_sample)
                    q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                    loss = loss_function(updated_q_values, q_action)
                # Only the forward pass needs to be recorded; gradients are taken outside the tape.
                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

            if iterations % update_second_model == 0:
                second_model.set_weights(model.get_weights())

            if len(action_history) > max_memory_legth:
                # Drop the oldest 5000 transitions from every replay list (deleting only 1 was also tried).
                del state_history[:5000]
                del action_history[:5000]
                del reward_history[:5000]
                del done_history[:5000]

            if done:
                games_played += 1
                if games_played % 10 == 0:  # checkpoint and report every 10 games
                    model.save("a1.h5")
                    print(f"Reward: {episode_reward}, games played: {games_played}, iterations made: {iterations}")
                break

        if iterations % 10000 == 0:
            print(f"Reward: {episode_reward}, games played: {games_played}, iterations made: {iterations}")
            print(games_played)
finally:
    # Runs when the cell is interrupted or fails, so the emulator is released.
    env.close()

0.096