In [1]:
#Imports
import gym
import tensorflow as tf
import gc

# To plot pretty figures and animations
%matplotlib nbagg
import matplotlib
import matplotlib.animation as animation
import matplotlib.pyplot as plt

from collections import deque
import numpy as np

####### Space Invaders
# Active configuration. The Pitfall and River Raid alternatives further down are commented out.
load_model           = True  # True: restore weights from saver_file before training; False: run the initializer
environment_name     = "SpaceInvaders-v0"  # id passed to gym.make
discrete_actions     = 6  # length of the one-hot action vector and number of network outputs
saver_file           = "./models/space_invaders_rl"  # checkpoint path used for both restore and save
height               = 210  # frame height in pixels
width                = 160  # frame width in pixels
channels             = 1  # frames are reduced to a single greyscale channel by preprocess_observation
frames_captured      = 5  # number of consecutive frames stacked vertically into one network input
learning_rate        = .000000004  # fed to the Adam optimizer on every training step (previous value: .00000002)
n_epochs             = 1001  # number of games to play; each game is followed by one training pass
use_ai_every_x_epoch = 1  # epochs divisible by this take the argmax action; other epochs sample an action
discount_decay_rate  = 0.95  # per-step discount passed to helper_discount_rewards
frame_limit          = 2000  # the game loop stops once the per-game step counter exceeds this
max_score            = 5  # upper bound applied to each per-step reward before it is stored

####### Pitfall
# environment_name     = "Pitfall-v0"
# discrete_actions     = 18
# load_model           = False
# saver_file           = "./models/pitfall_rl"
# height               = 210
# width                = 160
# channels             = 1
# frames_captured      = 5
# learning_rate        =.00001
# n_epochs             = 11
# use_ai_every_x_epoch = 5
# discount_decay_rate  = 0.95
# frame_limit          = 1000

####### River Raid
# environment_name     = "Riverraid-v0"
# discrete_actions     = 18
# load_model           = True
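# saver_file           = "./models/pitfall_rl"  # NOTE(review): same path as the Pitfall config above; likely a copy-paste leftover, confirm before enabling with load_model = True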
# height               = 210
# width                = 160
# channels             = 1
# frames_captured      = 5
# learning_rate        =.00001
# n_epochs             = 501
# use_ai_every_x_epoch = 5
# discount_decay_rate  = 0.95
# frame_limit          = 1000


def preprocess_observation(obs):
    """Convert an observation to a scaled greyscale image.

    The last axis (axis 2) of `obs` is averaged away and the result is
    divided by 256.0, so 8-bit inputs land in the range [0, 1).
    """
    greyscale = obs.mean(axis=2)
    return greyscale / 256.0

def show_observation(image, title="Image"):
    """Draw `image` in the left half of an 11x7 inch figure, titled and with the axes hidden."""
    figure = plt.figure(figsize=(11, 7))
    axes = figure.add_subplot(121)
    axes.set_title(title)
    axes.imshow(image) #cmap="gray"
    axes.axis("off")
    plt.show()

def softmax(x):
    """Return the softmax of `x`, normalised over ALL elements.

    Args:
        x: array-like of scores (list, 1-D or N-D array).

    Returns:
        Array with the same shape as `x` whose entries are positive and sum
        to 1 across the whole array (not per row).

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but prevents np.exp from overflowing to inf
    (which previously produced inf/inf = NaN for large scores).
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; keep the historical "empty in, empty out" behaviour.
        return np.exp(x)
    ex = np.exp(x - np.max(x))
    return ex / np.sum(ex)
    
# Smoke test: create the environment, report its observation/action shapes, and
# exercise the helper functions defined above.
env = gym.make(environment_name)
observation = env.reset()
print("obs.shape: {}".format(observation.shape)) #obs.shape: (210, 160, 3)
print("env.action_space: {}".format(env.action_space)) #the captured output for SpaceInvaders-v0 shows Discrete(6)

# Advance the game by repeatedly sending action 1 so that `observation` holds a
# mid-game frame. The `done` flag is not checked here.
for step in range(102):
    observation, reward_float, done_bool, info_dict = env.step(1)
    obs_greyscale = preprocess_observation(observation)

#show_observation(observation)
#show_observation(obs_greyscale)
    
# Sanity-check softmax on a small hand-written vector.
print (softmax([1,-2,3]))

# Applying softmax to values that are already probabilities flattens them
# (compare the printed result with the inputs).
test_softmax = softmax([4.3210541e-25, 5.4929095e-33, 5.3535387e-02, 1.2303401e-42, 9.4646466e-01, 1.9473004e-27])
print ("test_softmax: {}".format(test_softmax))

# Draw a single action as a one-hot vector, using test_softmax as the probabilities.
multinomial_action_array = np.random.multinomial(1, test_softmax)
print ("multinomial_action_array: {}".format(multinomial_action_array))



obs.shape: (210, 160, 3)
env.action_space: Discrete(6)
[0.11849965 0.00589975 0.8756006 ]
test_softmax: [0.13103449 0.13103449 0.13824064 0.13103449 0.33762142 0.13103449]
multinomial_action_array: [0 0 1 0 0 0]


In [2]:
from random import randint

# Linear exploration schedule read by epsilon_greedy: epsilon starts at eps_max
# and decreases to eps_min over eps_decay_steps steps.
eps_min = 0.00
eps_max = 0.5
eps_decay_steps = 25000

def helper_discount_rewards(rewards, discount_rate, begin_index, end_index):
    '''
    Compute discounted returns for an episode and standardise them.

    Each step's return is its own reward plus `discount_rate` times the return
    of the following step. The returns are then shifted and scaled using the
    mean and standard deviation of the slice [begin_index:end_index].

    Args:
        rewards: sequence of per-step rewards.
        discount_rate: multiplicative discount applied per step.
        begin_index, end_index: slice bounds of the window used to compute the
            normalisation statistics (all steps are still returned).

    Returns:
        A list with one normalised discounted return per entry of `rewards`.
        An empty `rewards` yields an empty list.
    '''
    if len(rewards) == 0:
        return []

    discounted_rewards = np.zeros(len(rewards))
    cumulative_rewards = 0
    # Walk backwards so each step can reuse the return of the step after it.
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
        discounted_rewards[step] = cumulative_rewards

    relevant_discounted_rewards = discounted_rewards[begin_index:end_index]

    reward_mean = relevant_discounted_rewards.mean()
    reward_std = relevant_discounted_rewards.std()
    if reward_std == 0:
        # A constant window (e.g. one step, or all rewards equal) has no spread.
        # Only centre the values instead of dividing by zero and emitting NaN/inf.
        reward_std = 1.0

    return list((discounted_rewards - reward_mean) / reward_std)

def epsilon_greedy(optimal_action, number_outputs, step):
    """Return `optimal_action`, or with probability epsilon a uniformly random action.

    Epsilon decreases linearly from eps_max to eps_min as `step` goes from 0 to
    eps_decay_steps, and stays at eps_min afterwards.

    Args:
        optimal_action: action to return when not exploring.
        number_outputs: number of available actions; random actions are drawn
            from 0 .. number_outputs - 1.
        step: current step count that drives the decay.

    Returns:
        The chosen action index.
    """
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)

    if step % 1000 == 0:
        print("step: " + str(step) + ", epsilon: " + str(epsilon))

    if np.random.rand() < epsilon:
        # The upper bound of np.random.randint is exclusive, so pass number_outputs
        # itself; number_outputs - 1 would never explore the last action.
        return np.random.randint(number_outputs) # random action
    return optimal_action # optimal action

# print("epsilon_greedy(4): " + str(epsilon_greedy(4,9,50))) 
# print("epsilon_greedy(4): " + str(epsilon_greedy(4,9,5000)))
# print("epsilon_greedy(4): " + str(epsilon_greedy(4,9,500000)))
    
def action_to_one_hot(action, possible_action_count):
    """Encode `action` as a one-hot float vector of length `possible_action_count`.

    `action` is truncated to an int first, so float action ids such as 3.0 are accepted.
    """
    identity = np.eye(possible_action_count)
    return identity[int(action)]

def get_average_logits (logits_list, discounted_rewards):
    """Per-action average of reward-weighted winning logits.

    For every (logits, reward) pair the arg-max action is located, and its
    logit multiplied by the reward is accumulated into that action's slot.
    Each slot is finally divided by (number of times the action won + 1);
    the counters start at one, so an action that never won yields 0.
    """
    n_actions = len(logits_list[0][0])
    weighted_totals = np.zeros(n_actions)
    win_counts = np.ones(n_actions)

    for logit_row, reward in zip(logits_list, discounted_rewards):
        winner = np.argmax(logit_row)

        contribution = np.zeros(n_actions)
        contribution[winner] = logit_row[0][winner]
        weighted_totals += contribution * reward

        win_counts[winner] += 1

    return weighted_totals / win_counts


# print("action_to_one_hot(3, 9): " + str(action_to_one_hot(3.0, 9)))
# print("action_to_one_hot(9, 9): " + str(action_to_one_hot(8, 9)))

#print(get_average_logits(all_logits, discounted_rewards))


#print(all_logits)

In [None]:
# NOTE(review): the conv1_* / conv2_* constants below are not referenced by the
# conv layers built in this cell; those layers read the convs / kerns / strides /
# pads values defined further down instead.
conv1_fmaps = 32
conv1_ksize = 3
conv1_stride = 2
conv1_pad = "SAME"

conv2_fmaps = 64
conv2_ksize = 3
conv2_stride = 2
conv2_pad = "SAME"

# NOTE(review): n_hidden_in is not referenced anywhere in the visible code, and the
# original remark below mentions a conv3 layer that this cell does not define.
n_hidden_in = 64 * 263 * 40  # conv3 has 64 maps of 525x80 each

n_inputs  = discrete_actions  # width of the tf_input_value placeholder (the training target)
n_outputs = discrete_actions  # number of units in the output (logits) layer
# Unit counts of the eight fully connected hidden layers, in order.
n_hidden1 = 512
n_hidden2 = 256
n_hidden3 = 256
n_hidden4 = 256
n_hidden5 = 256
n_hidden6 = 256
n_hidden7 = 128
n_hidden8 = 64

# Fed to tf_dropout_keep_prob during training; the dropout layers themselves are
# commented out in the graph below.
dropout_keep_prob = 1.0

# Build the network graph from scratch so that re-running this cell does not
# accumulate duplicate ops in the default graph.
tf.reset_default_graph()

#with tf.name_scope("inputs"):
#tf_input_frame = tf.placeholder(tf.float32, shape=(None, height * width * channels))
# Stacked greyscale frames: frames_captured frames concatenated along the height axis.
tf_input_frame = tf.placeholder(tf.float32, shape=(None, height*frames_captured, width, channels))
# Regression target for the logits (one value per action).
tf_input_value = tf.placeholder(tf.float32, shape=(None, n_inputs))
# Learning rate is a placeholder so it can be chosen per sess.run call.
tf_input_learning_rate = tf.placeholder(tf.float32)
# NOTE(review): tf_dropout_keep_prob and tf_reward are not consumed by any op built in this cell.
tf_dropout_keep_prob = tf.placeholder(tf.float32)
tf_reward = tf.placeholder(tf.float32)
    
#with tf.name_scope("hidden"):
initializer = tf.contrib.layers.variance_scaling_initializer()


# Convolution settings, indexed per layer: [conv1, conv2].
convs   = [16,32]
kerns   = [8,8]
strides = [4,4]
pads    = 'valid'
activ   = tf.nn.elu

# Policy Network
conv1 = tf.layers.conv2d(
        inputs = tf_input_frame,
        filters = convs[0],
        kernel_size = kerns[0],
        strides = strides[0],
        padding = pads,
        activation = activ,
        name='conv1')

conv2 = tf.layers.conv2d(
        inputs=conv1,
        filters = convs[1],
        kernel_size = kerns[1],
        strides = strides[1],
        padding = pads,
        activation = activ,
        name='conv2')

# Flatten the conv feature maps so they can feed the dense stack.
flat = tf.layers.flatten(conv2)



# Eight fully connected ELU layers. The layer names are part of the variable
# names stored in the checkpoint, so renaming them would break saver.restore
# for existing checkpoints.
hidden1 = tf.layers.dense(flat, n_hidden1, activation=tf.nn.elu, name="hidden1", kernel_initializer=initializer)
#hidden1_drop = tf.nn.dropout(hidden1, tf_dropout_keep_prob)
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.elu, name="hidden2", kernel_initializer=initializer)
#hidden2_drop = tf.nn.dropout(hidden2, tf_dropout_keep_prob)
hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.elu, name="hidden3", kernel_initializer=initializer)

hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.elu, name="hidden4", kernel_initializer=initializer)

hidden5 = tf.layers.dense(hidden4, n_hidden5, activation=tf.nn.elu, name="hidden5", kernel_initializer=initializer)

hidden6 = tf.layers.dense(hidden5, n_hidden6, activation=tf.nn.elu, name="hidden6", kernel_initializer=initializer)

hidden7 = tf.layers.dense(hidden6, n_hidden7, activation=tf.nn.elu, name="hidden7", kernel_initializer=initializer)

hidden8 = tf.layers.dense(hidden7, n_hidden8, activation=tf.nn.elu, name="hidden8", kernel_initializer=initializer)

#with tf.name_scope("output"):
# Linear output layer: one unbounded score per action.
logits = tf.layers.dense(hidden8, n_outputs, name="output")

#with tf.name_scope("train"):
#cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=tf_input_value, logits=logits)
#loss = tf.reduce_mean(cross_entropy)
# Mean squared error between the logits and the fed target vector.
loss = tf.reduce_mean(tf.square(logits-tf_input_value))
optimizer = tf.train.AdamOptimizer(tf_input_learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Print a few tensors to confirm the layer shapes.
print ("hidden1: " + str(hidden1))
print ("hidden2: " + str(hidden2))
print ("hidden3: " + str(hidden3))
print ("logits: " + str(logits))


hidden1: Tensor("hidden1/Elu:0", shape=(?, 512), dtype=float32)
hidden2: Tensor("hidden2/Elu:0", shape=(?, 256), dtype=float32)
hidden3: Tensor("hidden3/Elu:0", shape=(?, 256), dtype=float32)
logits: Tensor("output/BiasAdd:0", shape=(?, 6), dtype=float32)


In [None]:
def stack_recent_frames(frame_buffer, last_index, n_frames):
    """Build one network input from the `n_frames` frames ending at `last_index`.

    Frames are stacked along the height axis, newest first. Indices that would
    fall before the start of the episode are clamped to frame 0, so early steps
    repeat the first frame rather than wrapping around to the end of the buffer.

    Args:
        frame_buffer: array of shape (height, width, number_of_frames).
        last_index: index (along axis 2) of the newest frame to include.
        n_frames: how many frames to stack.

    Returns:
        Array of shape (1, height * n_frames, width, 1).
    """
    frame_indices = [max(last_index - offset, 0) for offset in range(n_frames)]
    stacked = np.concatenate([frame_buffer[:, :, index] for index in frame_indices], axis=0)
    return np.reshape(stacked, (1, frame_buffer.shape[0] * n_frames, frame_buffer.shape[1], 1))


env = gym.make(environment_name)
global_step = 0

with tf.Session() as sess:
    if load_model:
        print("Loading existing model before training: {}".format(saver_file))
        saver.restore(sess, saver_file)
    else:
        print("Creating new model before training: {}".format(saver_file))
        sess.run(init)
    
    for epoch in range(n_epochs):  #play n_epochs games
        observation = env.reset()
        temp_lives = 3
        score = 0.0
        # Per-game buffers. frames[:, :, k] is the observation seen BEFORE step k;
        # the action chosen at step k is stored at actions[k + 1] (actions[0] is
        # a placeholder for the initial frame) and its reward at rewards[k].
        frames = np.empty([height, width, 0])
        actions = np.empty([0])
        rewards = np.empty([0])
        all_logits = []
        punish_frames = []
        
        #get the first frame
        input_value = 0  #set an initial input value
        obs_greyscale = preprocess_observation(observation)
        obs_greyscale_reshape = np.reshape(obs_greyscale, (height,width,1))
        frames = np.append(frames, obs_greyscale_reshape, axis=2)
        actions = np.append(actions, input_value)
        last_action_step = 0
        decision_step_counter = 0
        
        game_step_counter = 0
        action_from_ai_epsilon_greedy = 0
        
        #play the game, recording the frames along the way
        gc.disable()
        try:
            while True:
                num_frames = np.ma.size(frames, axis=2)
                concatenated_frames_reshaped = stack_recent_frames(frames, num_frames-1, frames_captured)
                
                # tf_input_value is only needed by the loss; feed the previous action
                # one-hot so the placeholder has a value during inference.
                temp_input = np.zeros(discrete_actions) #[0, 0, 0, ...]
                temp_input[action_from_ai_epsilon_greedy] = 1
                temp_input_reshaped = np.reshape(temp_input, (1, len(temp_input)))
                
                feed_dict = {tf_input_frame : concatenated_frames_reshaped, 
                             tf_input_value : temp_input_reshaped,
                             tf_input_learning_rate: 0.0,
                             tf_dropout_keep_prob: 1.0,
                             tf_reward: 0.0}
                logits_out = sess.run([logits], feed_dict=feed_dict)
                
                all_logits.append(logits_out[0])
                
                action_from_ai_logits_argmax = np.argmax(logits_out[0])
                
                # Shift and scale the logits before the softmax to get sampling probabilities.
                positive_logits = logits_out[0] + abs(np.amin(logits_out[0]))
                softmax_logits = softmax(positive_logits / np.amax(positive_logits))
                
                try:
                    multinomial_action_array = np.random.multinomial(1, softmax_logits[0])
                    action_from_multinomial_action = np.argmax(multinomial_action_array)
                except ValueError:
                    # np.random.multinomial rejects invalid probability vectors (for
                    # example NaN from a 0/0 in the scaling above). Fall back to the
                    # arg-max and keep going: skipping the rest of the iteration here
                    # would leave all_logits one entry ahead of rewards and could spin
                    # forever on the same frame.
                    action_from_multinomial_action = np.argmax(softmax_logits)
                    print ("multinomial error, using action {}".format(action_from_multinomial_action))
                
                # decide which action to use
                if epoch % use_ai_every_x_epoch == 0 and epoch >= 0:
                    if game_step_counter == 0:
                        print("Using strict AI actions")
                    action_from_ai_epsilon_greedy = action_from_ai_logits_argmax  #do what the AI says
                else:
                    action_from_ai_epsilon_greedy = action_from_multinomial_action  #use probability-based action
                
                #run the next step given the input from the logits
                observation, reward_float, done_bool, info_dict = env.step(action_from_ai_epsilon_greedy)
                
                #add this frame to our frame buffer
                obs_greyscale = preprocess_observation(observation)
                obs_greyscale_reshape = np.reshape(obs_greyscale, (height,width,1))
                frames = np.append(frames, obs_greyscale_reshape, axis=2)
                actions = np.append(actions, action_from_ai_epsilon_greedy)
                
                score = score + reward_float
                
                # Clip large rewards so a single event cannot dominate the returns.
                rewards = np.append(rewards, min(reward_float, max_score))
                
                lives = info_dict['ale.lives']
                
                if done_bool:
                    # Punish the step 40 frames before the death, clamped to the first
                    # step: a negative index would wrap to the END of the reward array.
                    punish_frames.append(max(len(rewards) - 40, 0))
                    print("Death at frame {}".format(len(rewards)))
                    break
                
                if game_step_counter > frame_limit:
                    break
                
                if lives != temp_lives:  #we lost a life
                    temp_lives = lives
                    if len(rewards) > 5:
                        punish_frames.append(max(len(rewards) - 40, 0))
                        print("Death at frame {}".format(len(rewards)))
                
                if epoch % 1 == 0:
                    env.render()  #display the current frame.
                
                decision_step_counter += 1
                game_step_counter += 1
                global_step += 1
        finally:
            # Re-enable the collector even if the game loop raises.
            gc.enable()
        
        if score == 0:
            continue #causes an error and breaks the model.  just continue.
        
        if epoch % 10 == 0 and epoch > 0:
            print("Saving model at epoch {}: {}".format(epoch, saver_file))
            saver.save(sess, saver_file)
        
        #punish death
        for this_frame in punish_frames:
            rewards[this_frame] = -10
        
        num_frames = np.ma.size(frames, axis=2)
        frames_to_skip_begin = 0
        frames_to_skip_end  = 0  #number of frames to leave out at the end of the game
        
        discounted_rewards = helper_discount_rewards(rewards, discount_decay_rate, frames_to_skip_begin, num_frames)
        discounted_rewards_median = np.median(discounted_rewards)
        discounted_rewards_mean = np.mean(discounted_rewards)
        average_logits = get_average_logits(all_logits, discounted_rewards)
        
        display_actions = np.zeros(discrete_actions)
        ai_actions = np.zeros(discrete_actions)
        loss_out_sum = 0.0
        frames_taught = 0.000000001  # tiny non-zero start avoids dividing by zero in the average below
        
        reward_frame_counter = 0
        punish_frame_counter = 0
        
        # Train on every recorded step once, in random order. The final frame has
        # no action/reward, hence num_frames-1.
        frames_to_train = np.arange(num_frames-1-frames_to_skip_end)
        np.random.shuffle(frames_to_train)
        
        for i in range(frames_to_skip_begin, num_frames-1-frames_to_skip_end):   #skip the first frames
            this_random_frame_index = frames_to_train[i]
            
            concatenated_frames_reshaped = stack_recent_frames(frames, this_random_frame_index, frames_captured)
            
            # The action taken while observing frames[k] is stored at actions[k + 1]
            # (see the buffer layout above), so offset by one to pair each state
            # with its own action and reward.
            action_taken_one_hot = action_to_one_hot(actions[this_random_frame_index + 1], n_outputs)
            display_actions = np.add(display_actions, action_taken_one_hot)
            
            reward_for_frame = discounted_rewards[this_random_frame_index]
            
            # Saturate clearly good / bad returns to +1 / -1; values in [0, 0.85] are kept as is.
            if reward_for_frame > 0.85:
                reward_for_frame = 1.0
            elif reward_for_frame < 0.0:
                reward_for_frame = -1.0
            
            # Regression target: the signed reward in the slot of the action taken, zero elsewhere.
            action_taken_one_hot = action_taken_one_hot * reward_for_frame
            action_taken_one_hot_reshaped = np.reshape(action_taken_one_hot, (1, len(action_taken_one_hot)))
            
            feed_dict = {tf_input_frame : concatenated_frames_reshaped, 
                         tf_input_value : action_taken_one_hot_reshaped,
                         tf_input_learning_rate: learning_rate,
                         tf_dropout_keep_prob: dropout_keep_prob,
                         tf_reward: reward_for_frame}
            loss_out, _, logits_out = sess.run([loss, training_op, logits], feed_dict=feed_dict)
            
            frames_taught = frames_taught + 1
            loss_out_sum += loss_out
            
            # Track which action the network prefers for this frame, for the summary below.
            action_from_ai = np.argmax(logits_out[0])
            action_from_ai_one_hot = action_to_one_hot(action_from_ai, n_outputs)
            ai_actions = np.add(ai_actions, action_from_ai_one_hot)
        
        print("Epoch: " + str(epoch) + ", frames: " + str(num_frames) + ", score: " + str(score) + ", average loss: " + str(loss_out_sum/frames_taught))
        print("actions trained:           {}".format(display_actions))
        print("actions out while training:{}".format(ai_actions))
        #TODO - should learning rate decrease over time?
        print("")


Loading existing model before training: ./models/space_invaders_rl
INFO:tensorflow:Restoring parameters from ./models/space_invaders_rl
Using strict AI actions
Death at frame 173
Death at frame 501
Death at frame 615
Epoch: 0, frames: 616, score: 210.0, average loss: 0.11620613068751116
actions trained:           [ 76. 222.  73.  92. 117.  35.]
actions out while training:[ 77. 224.  86.  80. 101.  47.]

Using strict AI actions
Death at frame 247
Death at frame 459
Death at frame 606
Epoch: 1, frames: 607, score: 210.0, average loss: 0.11986857366005292
actions trained:           [158. 267.  90.  44.  15.  32.]
actions out while training:[136. 267.  94.  54.  20.  35.]

Using strict AI actions
Death at frame 138
Death at frame 311
Death at frame 459
Epoch: 2, frames: 460, score: 155.0, average loss: 0.1392513610755508
actions trained:           [ 61. 196. 161.  16.  16.   9.]
actions out while training:[ 59. 194. 161.  18.  16.  11.]

Using strict AI actions
Death at frame 189
Death at 

Epoch: 29, frames: 1031, score: 340.0, average loss: 0.13932423998524066
actions trained:           [126. 474.  79.  97. 138. 116.]
actions out while training:[134. 461.  76.  97. 146. 116.]

Using strict AI actions
Death at frame 169
Death at frame 297
Death at frame 620
Saving model at epoch 30: ./models/space_invaders_rl
Epoch: 30, frames: 621, score: 155.0, average loss: 0.12397126111826456
actions trained:           [ 65. 249.  88.  98.  60.  60.]
actions out while training:[ 56. 255.  87.  88.  61.  73.]

Using strict AI actions
Death at frame 263
Death at frame 766
Death at frame 1174
Epoch: 31, frames: 1175, score: 440.0, average loss: 0.14375702955413544
actions trained:           [124. 570.  83. 121. 190.  86.]
actions out while training:[138. 527.  81. 138. 197.  93.]

Using strict AI actions
Death at frame 215
Death at frame 301
Death at frame 403
Epoch: 32, frames: 404, score: 125.0, average loss: 0.11367942419128534
actions trained:           [ 40. 185.  29.  23.  20. 106

Death at frame 237
Death at frame 317
Death at frame 496
Epoch: 59, frames: 497, score: 65.0, average loss: 0.09133874955248297
actions trained:           [179. 209.  61.  11.  21.  15.]
actions out while training:[130. 218.  75.  20.  36.  17.]

Using strict AI actions
Death at frame 408
Death at frame 804
Death at frame 953
Saving model at epoch 60: ./models/space_invaders_rl
Epoch: 60, frames: 954, score: 185.0, average loss: 0.13455074130166383
actions trained:           [ 29. 554.  76.  28. 193.  73.]
actions out while training:[ 57. 512.  76.  39. 188.  81.]

Using strict AI actions
Death at frame 265
Death at frame 601
Death at frame 769
Epoch: 61, frames: 770, score: 135.0, average loss: 0.13578620646441603
actions trained:           [146. 384.  64.  43. 104.  28.]
actions out while training:[126. 382.  63.  62. 108.  28.]

Using strict AI actions
Death at frame 219
Death at frame 436
Death at frame 543
Epoch: 62, frames: 544, score: 155.0, average loss: 0.12349084550689317
act

Death at frame 378
Death at frame 728
Death at frame 1029
Epoch: 89, frames: 1030, score: 265.0, average loss: 0.1354535523417426
actions trained:           [ 72. 539. 142. 100. 119.  57.]
actions out while training:[ 88. 518. 139. 110. 110.  64.]

Using strict AI actions
Death at frame 739
Death at frame 819
Death at frame 1086
Saving model at epoch 90: ./models/space_invaders_rl
Epoch: 90, frames: 1087, score: 390.0, average loss: 0.13470495034007104
actions trained:           [118. 474.  97. 100.  99. 198.]
actions out while training:[117. 478.  98. 110. 109. 174.]

Using strict AI actions
Death at frame 694
Death at frame 787
Death at frame 942
Epoch: 91, frames: 943, score: 285.0, average loss: 0.13375661125141983
actions trained:           [ 87. 412. 113. 146.  65. 119.]
actions out while training:[ 94. 405. 123. 139.  65. 116.]

Using strict AI actions
Death at frame 162
Death at frame 252
Death at frame 772
Epoch: 92, frames: 773, score: 215.0, average loss: 0.1321591038420747


Death at frame 267
Death at frame 466
Death at frame 811
Epoch: 119, frames: 812, score: 290.0, average loss: 0.133941110823817
actions trained:           [111. 345. 140.  56.  69.  90.]
actions out while training:[104. 334. 154.  65.  70.  84.]

Using strict AI actions
Death at frame 174
Death at frame 317
Death at frame 747
Saving model at epoch 120: ./models/space_invaders_rl
Epoch: 120, frames: 748, score: 210.0, average loss: 0.12290250064522282
actions trained:           [ 85. 433.  47.  29.  62.  91.]
actions out while training:[ 65. 428.  63.  42.  75.  74.]

Using strict AI actions
Death at frame 386
Death at frame 1053
Death at frame 1135
Epoch: 121, frames: 1136, score: 315.0, average loss: 0.1241256126873117
actions trained:           [110. 371. 154. 227. 115. 158.]
actions out while training:[101. 393. 161. 205. 112. 163.]

Using strict AI actions
Death at frame 321
Death at frame 679
Death at frame 990
Epoch: 122, frames: 991, score: 305.0, average loss: 0.138115470427506

Death at frame 170
Death at frame 327
Death at frame 813
Epoch: 149, frames: 814, score: 210.0, average loss: 0.12947479230124076
actions trained:           [ 78. 270.  65. 125. 183.  92.]
actions out while training:[ 77. 279.  65. 118. 173. 101.]

Using strict AI actions
Death at frame 324
Death at frame 461
Death at frame 635
Saving model at epoch 150: ./models/space_invaders_rl
Epoch: 150, frames: 636, score: 150.0, average loss: 0.12327079642868913
actions trained:           [ 34. 397.  57.  71.  41.  35.]
actions out while training:[ 35. 372.  66.  74.  51.  37.]

Using strict AI actions
Death at frame 463
Death at frame 613
Death at frame 804
Epoch: 151, frames: 805, score: 320.0, average loss: 0.13865674182807372
actions trained:           [ 78. 283. 129.  50. 135. 129.]
actions out while training:[ 95. 282. 127.  50. 137. 113.]

Using strict AI actions
Death at frame 167
Death at frame 324
Death at frame 526
Epoch: 152, frames: 527, score: 105.0, average loss: 0.118675703638099

Death at frame 426
Death at frame 704
Death at frame 1098
Epoch: 179, frames: 1099, score: 310.0, average loss: 0.12809966058666558
actions trained:           [232. 272. 157. 148. 166. 123.]
actions out while training:[188. 311. 156. 135. 177. 131.]

Using strict AI actions
Death at frame 178
Death at frame 293
Death at frame 478
Saving model at epoch 180: ./models/space_invaders_rl
Epoch: 180, frames: 479, score: 210.0, average loss: 0.11867430373985885
actions trained:           [ 47. 186.  74.  48.  75.  48.]
actions out while training:[ 46. 190.  74.  51.  73.  44.]

Using strict AI actions
Death at frame 182
Death at frame 463
Death at frame 688
Epoch: 181, frames: 689, score: 215.0, average loss: 0.13325432102483833
actions trained:           [148. 183. 105.  69. 126.  57.]
actions out while training:[121. 191. 115.  77. 126.  58.]

Using strict AI actions
Death at frame 142
Death at frame 256
Death at frame 743
Epoch: 182, frames: 744, score: 285.0, average loss: 0.1437866196556

Death at frame 166
Death at frame 386
Death at frame 485
Epoch: 209, frames: 486, score: 135.0, average loss: 0.1396102371655128
actions trained:           [124. 207.  65.  33.  44.  12.]
actions out while training:[117. 196.  69.  43.  50.  10.]

Using strict AI actions
Death at frame 247
Death at frame 484
Death at frame 1027
Saving model at epoch 210: ./models/space_invaders_rl
Epoch: 210, frames: 1028, score: 375.0, average loss: 0.12123021351271897
actions trained:           [209. 261. 222.  69. 117. 149.]
actions out while training:[151. 276. 214.  84. 145. 157.]

Using strict AI actions
Death at frame 228
Death at frame 309
Death at frame 535
Epoch: 211, frames: 536, score: 135.0, average loss: 0.11901768466714821
actions trained:           [ 30. 330.  31.  91.  33.  20.]
actions out while training:[ 34. 310.  40.  79.  40.  32.]

Using strict AI actions
Death at frame 219
Death at frame 735
Death at frame 942
Epoch: 212, frames: 943, score: 485.0, average loss: 0.15293215159043

Death at frame 280
Death at frame 417
Death at frame 513
Epoch: 239, frames: 514, score: 120.0, average loss: 0.11860726835155742
actions trained:           [ 24. 216.  96.  28. 104.  45.]
actions out while training:[ 27. 219.  89.  47.  90.  41.]

Using strict AI actions
Death at frame 179
Death at frame 510
Death at frame 874
Saving model at epoch 240: ./models/space_invaders_rl
Epoch: 240, frames: 875, score: 265.0, average loss: 0.13343416524087892
actions trained:           [ 15. 570. 109.  57. 101.  22.]
actions out while training:[ 20. 536. 114.  71. 107.  26.]

Using strict AI actions
Death at frame 150
Death at frame 625
Death at frame 845
Epoch: 241, frames: 846, score: 150.0, average loss: 0.14435689442625171
actions trained:           [182. 332.  84. 111. 101.  35.]
actions out while training:[171. 309.  92. 122. 109.  42.]

Using strict AI actions
Death at frame 186
Death at frame 648
Death at frame 847
Epoch: 242, frames: 848, score: 260.0, average loss: 0.126216206888257

Death at frame 569
Death at frame 769
Death at frame 872
Epoch: 269, frames: 873, score: 315.0, average loss: 0.12196653134613385
actions trained:           [ 39. 288.  87. 193. 189.  76.]
actions out while training:[ 49. 299.  98. 174. 157.  95.]

Using strict AI actions
Death at frame 170
Death at frame 537
Death at frame 727
Saving model at epoch 270: ./models/space_invaders_rl
Epoch: 270, frames: 728, score: 180.0, average loss: 0.1201167636034349
actions trained:           [104. 413.  68.  36.  75.  31.]
actions out while training:[ 85. 396.  65.  52.  91.  38.]

Using strict AI actions
Death at frame 351
Death at frame 480
Death at frame 640
Epoch: 271, frames: 641, score: 210.0, average loss: 0.12245687573562429
actions trained:           [ 33. 318.  81.  22.  84. 102.]
actions out while training:[ 42. 306.  83.  38.  80.  91.]

Using strict AI actions
Death at frame 160
Death at frame 660
Death at frame 817
Epoch: 272, frames: 818, score: 210.0, average loss: 0.1280317930878042

In [14]:
# Set the epoch count above to 1 before running this cell.
# Builds a stacked image from the frame being inspected plus the frames
# visited by the loop below, then lists every reward next to its
# discounted counterpart.

frame_to_view = 222

# Start from the inspected frame; frames are indexed on the third axis.
concatenated_frames = frames[:, :, frame_to_view]

# The range currently covers only the frame immediately before frame_to_view.
for j in range(frame_to_view - 1, frame_to_view):
    print("discounted rewared at frame {}: {}".format(j, discounted_rewards[j]))
    this_frame = frames[:, :, j]
    # Stack this frame below the image built so far (along axis 0).
    concatenated_frames = np.concatenate((concatenated_frames, this_frame), axis=0)

# Uncomment to look at the stacked image:
#show_observation(concatenated_frames)

# One line per step: running index, raw reward, discounted reward.
reward_line = "{}: reward: {}, discounted reward: {}"
counter = 0
for this_reward, this_discounted_reward in zip(rewards, discounted_rewards):
    print(reward_line.format(counter, this_reward, this_discounted_reward))
    counter += 1


#print(actions)
#print(all_logits)

# logit_sums = np.zeros(6)

# for this_logit in all_logits:
#     logit_sums = logit_sums + this_logit
# #    print(this_logit)

# print(logit_sums/len(all_logits))
# print(get_average_logits(all_logits, discounted_rewards))

# print(actions[5])
# print(average_logits[int(actions[5])])


discounted rewared at frame 221: 0.7386609940220507
0: reward: 0.0, discounted reward: -0.28489180016272203
1: reward: 0.0, discounted reward: -0.2845605741472154
2: reward: 0.0, discounted reward: -0.28421191518352423
3: reward: 0.0, discounted reward: -0.2838449057480599
4: reward: 0.0, discounted reward: -0.2834585800265184
5: reward: 0.0, discounted reward: -0.28305192137226426
6: reward: 0.0, discounted reward: -0.2826238596309441
7: reward: 0.0, discounted reward: -0.2821732683242913
8: reward: 0.0, discounted reward: -0.2816989616857094
9: reward: 0.0, discounted reward: -0.28119969153983376
10: reward: 0.0, discounted reward: -0.28067414401785934
11: reward: 0.0, discounted reward: -0.2801209360999915
12: reward: 0.0, discounted reward: -0.27953861197592017
13: reward: 0.0, discounted reward: -0.2789256392137398
14: reward: 0.0, discounted reward: -0.27828040472723414
15: reward: 0.0, discounted reward: -0.27760121053091236
16: reward: 0.0, discounted reward: -0.276886269271626

In [18]:
# Report how many frames are currently held in num_frames.
print("num_frames:", num_frames)


#show_observation(frames[:,:,125])
#show_observation(concatenated_frames)


# print("rewards: " + str(rewards))
# print("discounted_rewards: " + str(discounted_rewards))




#print("action_taken_one_hot_reshaped: " + str(action_taken_one_hot_reshaped))
#print("all_logits: " + str(all_logits))



# num_frames = np.ma.size(frames, axis=2)

# X_input = np.empty([n_steps, 210 * 160])
# input_index = 0

# for i in range(num_frames, num_frames-n_steps, -1):
#     this_frame = frames[:,:,i-1]
#     X_input[input_index] = this_frame.flatten()
#     input_index = input_index + 1
    
#obs_greyscale_reshape.shape



#show_observation(obs_greyscale_reshape[:,:,0])

num_frames: 1086
0: reward: 0.0, discounted reward: 0.0766933783174092
1: reward: 0.0, discounted reward: 0.07899669881315724
2: reward: 0.0, discounted reward: 0.08137125602526862
3: reward: 0.0, discounted reward: 0.08381925315115664
4: reward: 0.0, discounted reward: 0.0863429615283608
5: reward: 0.0, discounted reward: 0.08894472274197332
6: reward: 0.0, discounted reward: 0.09162695079724396
7: reward: 0.0, discounted reward: 0.09439213435937864
8: reward: 0.0, discounted reward: 0.09724283906261028
9: reward: 0.0, discounted reward: 0.10018170989068412
10: reward: 0.0, discounted reward: 0.10321147363096643
11: reward: 0.0, discounted reward: 0.10633494140445336
12: reward: 0.0, discounted reward: 0.1095550112740275
13: reward: 0.0, discounted reward: 0.1128746709333823
14: reward: 0.0, discounted reward: 0.1162970004791089
15: reward: 0.0, discounted reward: 0.11982517526851774
16: reward: 0.0, discounted reward: 0.12346246886584647
17: reward: 0.0, discounted reward: 0.12721225

In [None]:
# Zero-filled buffer with one row of 10 values per step.
# np.zeros initialises every row whatever n_steps is, so the fill does not
# depend on a hard-coded row count.
y_input = np.zeros([n_steps, 10])

# Last expression: let the notebook display the buffer.
y_input

In [28]:
#len(rewards)
#len(actions)

In [8]:
# Echo all_logits as the cell's output for inspection.
all_logits

[array([ 180.31735229, -245.90328979,  -55.46500015,  -39.02265549,
        -171.08708191,   39.90904999, -253.30400085,   10.52585697,
           9.43663216], dtype=float32)]

In [6]:
# Dimensions of all_gradients as reported by NumPy
# (alternative quick check: len(all_gradients)).
np.shape(all_gradients)

(6, 10)

In [6]:
import sys
# Shallow size in bytes of the all_variables object itself;
# sys.getsizeof does not follow references to the objects it contains.
print(sys.getsizeof(all_variables))
#np.shape(all_gradients[1])
#all_gradients
#all_variables

128


In [8]:

import sys

def get_size(obj, seen=None):
    """Recursively estimate the memory footprint of ``obj`` in bytes.

    The result is ``sys.getsizeof(obj)`` plus the sizes of the objects
    reachable from it: dict keys and values, the instance ``__dict__``, and
    the items of any other iterable except ``str``/``bytes``/``bytearray``.

    NumPy arrays are measured as a unit instead of being iterated:
    iterating an array creates short-lived scalar/view objects that are not
    stored in the array (and whose recycled ids confuse ``seen``), and a 0-d
    array cannot be iterated at all. ``sys.getsizeof`` already includes the
    data buffer of an array that owns it; a view contributes only its own
    header. Arrays of dtype ``object`` additionally recurse into the Python
    objects they reference.

    Args:
        obj: Object to measure.
        seen: Set of ``id()`` values that were already counted. It is shared
            across the recursion so each object is counted once and
            reference cycles terminate. Callers normally leave it as ``None``.

    Returns:
        int: Estimated size in bytes, or 0 when ``obj`` is already in ``seen``.

    Note:
        Generic iterables are walked with a plain loop, so passing a one-shot
        iterator or generator consumes it.
    """
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    # Mark as seen *before* recursing so self-referential objects terminate.
    seen.add(obj_id)

    size = sys.getsizeof(obj)
    if isinstance(obj, np.ndarray):
        if obj.dtype.kind == 'O':
            # Object arrays hold references; count the referenced objects too.
            size += sum(get_size(item, seen) for item in obj.flat)
        if hasattr(obj, '__dict__'):
            # ndarray subclasses may carry extra per-instance attributes.
            size += get_size(obj.__dict__, seen)
    elif isinstance(obj, dict):
        size += sum(get_size(v, seen) for v in obj.values())
        size += sum(get_size(k, seen) for k in obj.keys())
    elif hasattr(obj, '__dict__'):
        size += get_size(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum(get_size(i, seen) for i in obj)
    return size

# Recursive size estimate in bytes for all_variables, using get_size above.
get_size(all_variables)

8392