In [1]:
#Imports
import gym
import tensorflow as tf
import gc

# To plot pretty figures and animations
%matplotlib nbagg
import matplotlib
import matplotlib.animation as animation
import matplotlib.pyplot as plt

from collections import deque
import numpy as np

####### Space Invaders
load_model           = True
environment_name     = "SpaceInvaders-v0"
discrete_actions     = 6
saver_file           = "./models/space_invaders_few_layers_rl"
height               = 210
width                = 160
channels             = 1
frames_captured      = 5
learning_rate        = .00000002  #.00000002
n_epochs             = 1001
use_ai_every_x_epoch = 1
discount_decay_rate  = 0.95
frame_limit          = 2000
max_score            = 5

####### Pitfall
# environment_name     = "Pitfall-v0"
# discrete_actions     = 18
# load_model           = False
# saver_file           = "./models/pitfall_rl"
# height               = 210
# width                = 160
# channels             = 1
# frames_captured      = 5
# learning_rate        =.00001
# n_epochs             = 11
# use_ai_every_x_epoch = 5
# discount_decay_rate  = 0.95
# frame_limit          = 1000

####### River Raid
# environment_name     = "Riverraid-v0"
# discrete_actions     = 18
# load_model           = True
# saver_file           = "./models/pitfall_rl"
# height               = 210
# width                = 160
# channels             = 1
# frames_captured      = 5
# learning_rate        =.00001
# n_epochs             = 501
# use_ai_every_x_epoch = 5
# discount_decay_rate  = 0.95
# frame_limit          = 1000


def preprocess_observation(obs):
    img = obs
    img = img.mean(axis=2) # to greyscale
    #img = (img - 128) / 128 - 1 # normalize from -1. to 1.
    img = img / 256.0  # normalize from 0 to 1.
    return img

def show_observation(image, title="Image"):
    plt.figure(figsize=(11, 7))
    plt.subplot(121)
    plt.title(title)
    plt.imshow(image) #cmap="gray"
    plt.axis("off")
    plt.show()

def softmax(x):
    ex = np.exp(x)
    sum_ex = np.sum( np.exp(x))
    return ex/sum_ex
    
env = gym.make(environment_name)
observation = env.reset()
print("obs.shape: {}".format(observation.shape)) #obs.shape: (210, 160, 3)
print("env.action_space: {}".format(env.action_space)) #env.action_space: Discrete(9)

for step in range(102):
    observation, reward_float, done_bool, info_dict = env.step(1)
    obs_greyscale = preprocess_observation(observation)

#show_observation(observation)
#show_observation(obs_greyscale)
    
print (softmax([1,-2,3]))

test_softmax = softmax([4.3210541e-25, 5.4929095e-33, 5.3535387e-02, 1.2303401e-42, 9.4646466e-01, 1.9473004e-27])
print ("test_softmax: {}".format(test_softmax))

multinomial_action_array = np.random.multinomial(1, test_softmax)
print ("multinomial_action_array: {}".format(multinomial_action_array))



obs.shape: (210, 160, 3)
env.action_space: Discrete(6)
[0.11849965 0.00589975 0.8756006 ]
test_softmax: [0.13103449 0.13103449 0.13824064 0.13103449 0.33762142 0.13103449]
multinomial_action_array: [0 0 1 0 0 0]


In [2]:
from random import randint

eps_min = 0.00
eps_max = 0.5
eps_decay_steps = 25000

def helper_discount_rewards(rewards, discount_rate, begin_index, end_index):
    '''
    Takes in rewards and applies discount rate
    '''
    discounted_rewards = np.zeros(len(rewards))
    cumulative_rewards = 0
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
        discounted_rewards[step] = cumulative_rewards
        
    relevant_discounted_rewards = discounted_rewards[begin_index:end_index]
    
    reward_mean = relevant_discounted_rewards.mean()
    reward_std = relevant_discounted_rewards.std()
        
    #return discounted_rewards
    return [(discounted_reward - reward_mean)/reward_std for discounted_reward in discounted_rewards]

def epsilon_greedy(optimal_action, number_outputs, step):
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)
    
    if step % 1000 == 0:
        print("step: " + str(step) + ", epsilon: " + str(epsilon))
        
    if np.random.rand() < epsilon:
        return np.random.randint(number_outputs-1) # random action
    else:
        return optimal_action # optimal action

# print("epsilon_greedy(4): " + str(epsilon_greedy(4,9,50))) 
# print("epsilon_greedy(4): " + str(epsilon_greedy(4,9,5000)))
# print("epsilon_greedy(4): " + str(epsilon_greedy(4,9,500000)))
    
def action_to_one_hot(action, possible_action_count):
    #9 possible positions of the joystick 
    #(0=center, 1=up, 2=right, 3=left, 4=down, 5=upper-right, 6=upper-left, 7=lower-right, 8=lower-left)
    
    return_array = np.zeros(possible_action_count)
    action_int = int(action)
    
    return_array[action_int] = 1.0

    return return_array

def get_average_logits (logits_list, discounted_rewards):

    logit_sums = np.zeros(len(logits_list[0][0]))
    logit_sums_counter = np.ones(len(logits_list[0][0]))

    for this_logit, this_reward in zip(logits_list, discounted_rewards):
        temp_array = np.zeros(len(logits_list[0][0]))
        temp_counter_array = np.zeros(len(logits_list[0][0]))
        
        action = np.argmax(this_logit)
        temp_array[action] = this_logit[0][action]
        logit_sums = logit_sums + temp_array*this_reward
        temp_counter_array[action] = 1
        logit_sums_counter = logit_sums_counter + temp_counter_array
        
    return (logit_sums/logit_sums_counter)


# print("action_to_one_hot(3, 9): " + str(action_to_one_hot(3.0, 9)))
# print("action_to_one_hot(9, 9): " + str(action_to_one_hot(8, 9)))

#print(get_average_logits(all_logits, discounted_rewards))


#print(all_logits)

In [None]:
conv1_fmaps = 32
conv1_ksize = 3
conv1_stride = 2
conv1_pad = "SAME"

conv2_fmaps = 64
conv2_ksize = 3
conv2_stride = 2
conv2_pad = "SAME"

n_hidden_in = 64 * 263 * 40  # conv3 has 64 maps of 525x80 each

n_inputs  = discrete_actions
n_outputs = discrete_actions
n_hidden1 = 512
n_hidden2 = 256
n_hidden3 = 256
n_hidden4 = 256
n_hidden5 = 256
n_hidden6 = 256
n_hidden7 = 128
n_hidden8 = 64

dropout_keep_prob = 1.0

tf.reset_default_graph()

#with tf.name_scope("inputs"):
#tf_input_frame = tf.placeholder(tf.float32, shape=(None, height * width * channels))
tf_input_frame = tf.placeholder(tf.float32, shape=(None, height*frames_captured, width, channels))
tf_input_value = tf.placeholder(tf.float32, shape=(None, n_inputs))
tf_input_learning_rate = tf.placeholder(tf.float32)
tf_dropout_keep_prob = tf.placeholder(tf.float32)
tf_reward = tf.placeholder(tf.float32)
    
#with tf.name_scope("hidden"):
initializer = tf.contrib.layers.variance_scaling_initializer()


convs   = [32,64,64]
kerns   = [8,4,3]
strides = [4,2,2]
pads    = 'valid'
activ   = tf.nn.elu

# Policy Network
conv1 = tf.layers.conv2d(
        inputs = tf_input_frame,
        filters = convs[0],
        kernel_size = kerns[0],
        strides = strides[0],
        padding = pads,
        activation = activ,
        name='conv1')

conv2 = tf.layers.conv2d(
        inputs=conv1,
        filters = convs[1],
        kernel_size = kerns[1],
        strides = strides[1],
        padding = pads,
        activation = activ,
        name='conv2')

conv3 = tf.layers.conv2d(
        inputs=conv2,
        filters = convs[2],
        kernel_size = kerns[2],
        strides = strides[2],
        padding = pads,
        activation = activ,
        name='conv3')

flat = tf.layers.flatten(conv3)



hidden1 = tf.layers.dense(flat, n_hidden1, activation=tf.nn.elu, name="hidden1", kernel_initializer=initializer)
#hidden1_drop = tf.nn.dropout(hidden1, tf_dropout_keep_prob)


#with tf.name_scope("output"):
logits = tf.layers.dense(hidden1, n_outputs, name="output", activation=None)

loss = tf.reduce_mean(tf.square(logits-tf_input_value))
optimizer = tf.train.AdamOptimizer(tf_input_learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()


In [None]:
env = gym.make(environment_name)
global_step = 0

with tf.Session() as sess:
    if load_model:
        print("Loading existing model before training: {}".format(saver_file))
        saver.restore(sess, saver_file)
    else:
        print("Creating new model before training: {}".format(saver_file))
        sess.run(init)
    
    for epoch in range(n_epochs):  #play n_epochs games
        observation = env.reset()
        temp_lives = 3
        score = 0.0
        frames = np.empty([height, width, 0])
        actions = np.empty([0])
        rewards = np.empty([0])
        all_logits = []
        punish_frames = []
        
        #get the first frame
        input_value = 0  #set an initial input value
        #observation, reward_float, done_bool, info_dict = env.step(input_value)
        obs_greyscale = preprocess_observation(observation)
        obs_greyscale_reshape = np.reshape(obs_greyscale, (height,width,1))
        frames = np.append(frames, obs_greyscale_reshape, axis=2)
        actions = np.append(actions, input_value)
        last_action_step = 0
        decision_step_counter = 0
    
        
        game_step_counter = 0
        action_from_ai_epsilon_greedy = 0
        
        #play the game until the first death, recording the frames along the way
        gc.disable()
        while True:
            #this_frame = frames[:,:,np.ma.size(frames, axis=2)-1]
            #this_flattened_frame = this_frame.flatten()
            #this_flattened_frame_reshaped = np.reshape(this_flattened_frame, (1, this_flattened_frame.size))
            
            num_frames = np.ma.size(frames, axis=2)
                       
            concatenated_frames = frames[:,:,num_frames-1]
            
            for j in range(1,frames_captured,1):
                if num_frames-j <= 1:
                    this_frame = frames[:,:,0]
                elif j >= num_frames:
                    this_frame = frames[:,:,num_frames-1]
                else:
                    this_frame = frames[:,:,num_frames-1-j]
                concatenated_frames = np.append(concatenated_frames, this_frame, axis=0)
            
            #concatenated_frames_reshaped = np.reshape(concatenated_frames, (1, 210*5*160))
            concatenated_frames_reshaped = np.reshape(concatenated_frames, (1, height*frames_captured, width, 1))
            
            #todo - write a function to build the concatenated_frames
            
            #use the input value from the AI
            #temp_input = [0,0,0,0,0,0]
            temp_input = np.zeros(discrete_actions) #[0, 0, 0, ...]
            temp_input[action_from_ai_epsilon_greedy] = 1
            temp_input_reshaped = np.reshape(temp_input, (1, len(temp_input)))
            
            #tf_dropout_keep_prob: 1.0,
            feed_dict = {tf_input_frame : concatenated_frames_reshaped, 
                         tf_input_value : temp_input_reshaped,
                         tf_input_learning_rate: 0.0,
                         tf_dropout_keep_prob: 1.0,
                         tf_reward: 0.0}
            logits_out = sess.run([logits], feed_dict=feed_dict)
            #all_gradients.append(gradients_out)
            #all_variables.append(variables_out)

            all_logits.append(logits_out[0])
            
            #if global_step % 5 == 0:  #only allow a change of direction every 5 steps.  
            action_from_ai_logits_argmax = np.argmax(logits_out[0])
            
#             print(str(logits_out[0]) + ": " + str(np.argmax(logits_out[0])))
            
#             print("output_probability_out: " + str(output_probability_out))
#             print("action_from_ai_logits_argmax: " + str(action_from_ai_logits_argmax) + ", action_from_ai: " + str(action_from_ai), ", numpy_choice: " + str(numpy_choice))
#             print("var: " + str(var_out))
             
    
#            print ("logits_out[0]: {}".format(logits_out[0]))
    
            positive_logits = logits_out[0] + abs(np.amin(logits_out[0]))
            softmax_logits = softmax(positive_logits / np.amax(positive_logits))
#            print ("softmax_logits[0]: {}".format(softmax_logits[0]))

            try:
                multinomial_action_array = np.random.multinomial(1, softmax_logits[0])
                action_from_multinomial_action = np.argmax(multinomial_action_array)
#                print ("multinomial_action_array: {}".format(multinomial_action_array))
            except ValueError:
                #I have no idea why this occassionally errors out.
                action_from_multinomial_action = np.argmax(softmax_logits)
                print ("multinomial error, using action {}".format(action_from_multinomial_action))
                continue
                
#            print ("action_from_multinomial_action: {}".format(action_from_multinomial_action))
        
            # decide which action to use
            if epoch % use_ai_every_x_epoch == 0 and epoch >= 0:
                if game_step_counter == 0:
                    print("Using strict AI actions")
                action_from_ai_epsilon_greedy = action_from_ai_logits_argmax  #do what the AI says
            else:
#                print("Using probability-based actions")
                action_from_ai_epsilon_greedy = action_from_multinomial_action  #use probability-based action
            #action_from_ai_epsilon_greedy = action_from_ai  #do what the AI says with probabilities using tf.multinomial probabilities
            #action_from_ai_epsilon_greedy = numpy_choice #do what the AI says with probabilities using numpy  probabilities
                
            #run the next step given the input from the logits
            observation, reward_float, done_bool, info_dict = env.step(action_from_ai_epsilon_greedy)
            
            #add this frame to our frame buffer
            obs_greyscale = preprocess_observation(observation)
            obs_greyscale_reshape = np.reshape(obs_greyscale, (height,width,1))
            frames = np.append(frames, obs_greyscale_reshape, axis=2)
            actions = np.append(actions, action_from_ai_epsilon_greedy)
            
            score = score + reward_float
            
            if reward_float > max_score:
                rewards = np.append(rewards, max_score)
            else:
                rewards = np.append(rewards, reward_float)
            
            lives = info_dict['ale.lives']

            if done_bool:
                punish_frames.append(len(rewards) - 40)
                print("Death at frame {}".format(len(rewards)))
                break
                
            if game_step_counter > frame_limit:
                break
                
            if lives != temp_lives:  #we lost a life.  consider this game over.
                #print("Lost a life.  Current lives: {}".format(lives))
                temp_lives = lives
                if len(rewards) > 5:
                    punish_frames.append(len(rewards) - 40)
                    print("Death at frame {}".format(len(rewards)))

            if epoch % 1 == 0:
                env.render()  #display the current frame.
                
            decision_step_counter += 1
            game_step_counter += 1
            global_step += 1
        gc.enable()
        if score == 0:
            continue #causes an error and breaks the model.  just continue.
        
        if epoch % 10 == 0 and epoch > 0:
            print("Saving model at epoch {}: {}".format(epoch, saver_file))
            saver.save(sess, saver_file)
        
        #punish death
        for this_frame in punish_frames:
            #rewards[this_frame] = (-1.0 * score / len(punish_frames)) / 2.0
            rewards[this_frame] = -10
        
        num_frames = np.ma.size(frames, axis=2)
        frames_to_skip_begin = 0
        frames_to_skip_end  = 0  #number of frames between pacman being eaten and game reset
    
        discounted_rewards = helper_discount_rewards(rewards, discount_decay_rate, frames_to_skip_begin, num_frames) #-1-frames_to_skip_end
        discounted_rewards_median = np.median(discounted_rewards)
        discounted_rewards_mean = np.mean(discounted_rewards)
        average_logits = get_average_logits(all_logits, discounted_rewards)
        
        display_actions = np.zeros(discrete_actions)
        ai_actions = np.zeros(discrete_actions)
        loss_out_sum = 0.0
        frames_taught = 0.000000001
        
        skipped_frames = 0
        temp_action = 0
        
        reward_frame_counter = 0
        punish_frame_counter = 0
            
        frames_to_train = np.arange(num_frames-1-frames_to_skip_end)
        np.random.shuffle(frames_to_train)
        
        for i in range(frames_to_skip_begin, num_frames-1-frames_to_skip_end):   #skip the first frames

            this_random_frame_index = frames_to_train[i]

            if i >= num_frames-1-frames_to_skip_end: 
                continue  #only train if frame is not during pacman's death throws 
            
            if True: 
                concatenated_frames = frames[:,:,this_random_frame_index]
                for j in range(-1,-1*frames_captured,-1):
                    this_frame = frames[:,:,this_random_frame_index+j]
                    this_flattened_frame = this_frame.flatten()
                    concatenated_frames = np.append(concatenated_frames, this_frame, axis=0)
                    
#                concatenated_frames_reshaped = np.reshape(concatenated_frames, (1, 210*5*160))
                concatenated_frames_reshaped = np.reshape(concatenated_frames, (1, height*frames_captured, width, 1))
            
#                action_taken_one_hot = action_to_one_hot(actions[i], n_outputs, rewards[i])
                action_taken_one_hot = action_to_one_hot(actions[this_random_frame_index], n_outputs)
                display_actions = np.add(display_actions, action_taken_one_hot)
                
                #reward_for_frame = discounted_rewards[i] + (-1.0 * abs(discounted_rewards_median))  # hopefully, adding the median will keep the AI from getting stuck taking one action
                #reward_for_frame = discounted_rewards[i] - .05
                reward_for_frame = discounted_rewards[this_random_frame_index]
                #print("reward for frame {} is {}.  action: {}".format(i, reward_for_frame, action_taken_one_hot_reshaped))
               
                if reward_for_frame > 0.85:
                    reward_for_frame = 1.0
                elif reward_for_frame < 0.0:
                    reward_for_frame = -1.0
                    
                action_taken_one_hot = action_taken_one_hot * reward_for_frame
                
#                 if reward_for_frame != 1.0:
#                     action_taken_one_hot = action_taken_one_hot - average_logits[int(actions[this_random_frame_index])]
    
                action_taken_one_hot_reshaped = np.reshape(action_taken_one_hot, (1, len(action_taken_one_hot)))
    
#                print(action_taken_one_hot_reshaped)
                
                feed_dict = {tf_input_frame : concatenated_frames_reshaped, 
                             tf_input_value : action_taken_one_hot_reshaped,
                             tf_input_learning_rate: learning_rate,
                             tf_dropout_keep_prob: dropout_keep_prob,
                             tf_reward: reward_for_frame}
                loss_out, _, logits_out = sess.run([loss, training_op, logits], feed_dict=feed_dict)

                frames_taught = frames_taught + 1
                loss_out_sum += loss_out

                action_from_ai = np.argmax(logits_out[0])
                action_from_ai_one_hot = action_to_one_hot(action_from_ai, n_outputs)
                ai_actions = np.add(ai_actions, action_from_ai_one_hot)
            else:
                skipped_frames = skipped_frames + 1
            
            temp_action = actions[i]
#             if epoch % 10 == 0:
#                 print("ai action[" + str(i) + "]: " + str(actions[i]) + ", action_taken_one_hot: " + 
#                       str(action_taken_one_hot_reshaped) + ", loss_out: " + str(loss_out))
#                 print("logits: " + str(logits_out[0]))
        
        print("Epoch: " + str(epoch) + ", frames: " + str(num_frames) + ", score: " + str(score) + ", average loss: " + str(loss_out_sum/frames_taught))
        #print("actions trained: {}, rewards: {}, punishments: {}, median: {}, mean: {}".format(display_actions, reward_frame_counter, punish_frame_counter, discounted_rewards_median, discounted_rewards_mean))
        print("actions trained:           {}".format(display_actions))
        print("actions out while training:{}".format(ai_actions))
        #print("discounted_rewards: " + str(discounted_rewards))
        #TODO - should learning rate decrease over time?
        print("")


Loading existing model before training: ./models/space_invaders_few_layers_rl
INFO:tensorflow:Restoring parameters from ./models/space_invaders_few_layers_rl
Using strict AI actions
Death at frame 189
Death at frame 272
Death at frame 359
Epoch: 0, frames: 360, score: 35.0, average loss: 0.07836907350307962
actions trained:           [146. 121.  79.  13.   0.   0.]
actions out while training:[116. 124. 106.  13.   0.   0.]

Using strict AI actions
Death at frame 242
Death at frame 320
Death at frame 430
Epoch: 1, frames: 431, score: 75.0, average loss: 0.0891012638523662
actions trained:           [ 15. 162. 240.  13.   0.   0.]
actions out while training:[ 68. 191. 141.  28.   0.   2.]

Using strict AI actions
Death at frame 262
Death at frame 671
Death at frame 1151
Epoch: 2, frames: 1152, score: 185.0, average loss: 0.13627873583845557
actions trained:           [340. 156. 200. 353.  18.  84.]
actions out while training:[147. 364. 234. 190.  90. 126.]

Using strict AI actions
Death 

Death at frame 200
Death at frame 299
Death at frame 406
Epoch: 29, frames: 407, score: 85.0, average loss: 0.08484573287093126
actions trained:           [ 94.  74.  24. 176.   6.  32.]
actions out while training:[ 39.  81.  28. 196.   9.  53.]

Using strict AI actions
Death at frame 262
Death at frame 581
Death at frame 804
Saving model at epoch 30: ./models/space_invaders_few_layers_rl
Epoch: 30, frames: 805, score: 190.0, average loss: 0.13216957761492185
actions trained:           [ 14. 186.  50. 246.  79. 229.]
actions out while training:[ 60. 177. 101. 100. 129. 237.]

Using strict AI actions
Death at frame 707
Death at frame 920
Death at frame 1005
Epoch: 31, frames: 1006, score: 260.0, average loss: 0.13518166088969874
actions trained:           [ 32. 201. 161.  14. 148. 449.]
actions out while training:[ 99. 198. 109.  65. 144. 390.]

Using strict AI actions
Death at frame 258
Death at frame 631
Death at frame 727
Epoch: 32, frames: 728, score: 135.0, average loss: 0.12371931

Death at frame 450
Death at frame 774
Death at frame 919
Epoch: 59, frames: 920, score: 425.0, average loss: 0.14190296341471795
actions trained:           [424. 152.  62.   6.  46. 229.]
actions out while training:[133. 180. 122.  40. 208. 236.]

Using strict AI actions
Death at frame 206
Death at frame 301
Death at frame 387
Saving model at epoch 60: ./models/space_invaders_few_layers_rl
Epoch: 60, frames: 388, score: 35.0, average loss: 0.06766682289744683
actions trained:           [  3. 125.   7.  16. 213.  23.]
actions out while training:[  2. 102.   6.  14. 248.  15.]

Using strict AI actions
Death at frame 211
Death at frame 437
Death at frame 934
Epoch: 61, frames: 935, score: 310.0, average loss: 0.1356230746120216
actions trained:           [ 14. 148. 118.  37. 336. 281.]
actions out while training:[ 23. 163. 162.  72. 270. 244.]

Using strict AI actions
Death at frame 561
Death at frame 757
Death at frame 877
Epoch: 62, frames: 878, score: 290.0, average loss: 0.13188670363

Death at frame 202
Death at frame 283
Death at frame 382
Epoch: 89, frames: 383, score: 55.0, average loss: 0.0669559996030296
actions trained:           [ 74. 187.   6.  31.  54.  30.]
actions out while training:[ 85. 195.   6.  12.  56.  28.]

Using strict AI actions
Death at frame 231
Death at frame 331
Death at frame 709
Saving model at epoch 90: ./models/space_invaders_few_layers_rl
Epoch: 90, frames: 710, score: 180.0, average loss: 0.13046460888232914
actions trained:           [ 20. 359.  40.  54.  61. 175.]
actions out while training:[ 69. 231.  42.  86.  85. 196.]

Using strict AI actions
Death at frame 230
Death at frame 331
Death at frame 956
Epoch: 91, frames: 957, score: 280.0, average loss: 0.14091903711616657
actions trained:           [198. 142.  80.  98. 183. 255.]
actions out while training:[ 87. 226. 122.  87. 163. 271.]

Using strict AI actions
Death at frame 648
Death at frame 834
Death at frame 940
Epoch: 92, frames: 941, score: 350.0, average loss: 0.12713778169

Death at frame 205
Death at frame 485
Death at frame 689
Epoch: 119, frames: 690, score: 135.0, average loss: 0.12007130949925328
actions trained:           [ 92. 258.  74.  53.  54. 158.]
actions out while training:[127. 215.  87.  76.  49. 135.]

Using strict AI actions
Death at frame 279
Death at frame 587
Death at frame 680
Saving model at epoch 120: ./models/space_invaders_few_layers_rl
Epoch: 120, frames: 681, score: 120.0, average loss: 0.12122473956700495
actions trained:           [101. 126. 187.  85.  13. 168.]
actions out while training:[ 78. 164.  51.  48.  87. 252.]

Using strict AI actions
Death at frame 218
Death at frame 313
Death at frame 821
Epoch: 121, frames: 822, score: 210.0, average loss: 0.1380062660754828
actions trained:           [172. 156.  61.  68. 116. 248.]
actions out while training:[ 80. 149.  87. 122. 138. 245.]

Using strict AI actions
Death at frame 230
Death at frame 325
Death at frame 638
Epoch: 122, frames: 639, score: 105.0, average loss: 0.10279

Death at frame 170
Death at frame 317
Death at frame 432
Epoch: 149, frames: 433, score: 90.0, average loss: 0.12180801598001985
actions trained:           [ 13. 265.  39.  95.  20.   0.]
actions out while training:[ 25. 215.  41.  66.  85.   0.]

Using strict AI actions
Death at frame 207
Death at frame 352
Death at frame 583
Saving model at epoch 150: ./models/space_invaders_few_layers_rl
Epoch: 150, frames: 584, score: 200.0, average loss: 0.13067240698197105
actions trained:           [236. 174.  30.  28.  61.  54.]
actions out while training:[113. 223. 108.  32.  51.  56.]

Using strict AI actions
Death at frame 199
Death at frame 349
Death at frame 511
Epoch: 151, frames: 512, score: 130.0, average loss: 0.12022707091980604
actions trained:           [  1. 341. 143.  13.  10.   3.]
actions out while training:[  1. 320.  81.  39.  65.   5.]

Using strict AI actions
Death at frame 233
Death at frame 333
Death at frame 644
Epoch: 152, frames: 645, score: 155.0, average loss: 0.12362

Death at frame 222
Death at frame 311
Death at frame 696
Epoch: 179, frames: 697, score: 160.0, average loss: 0.13353295625399428
actions trained:           [ 57. 173. 197. 148.  49.  72.]
actions out while training:[ 82. 176. 140.  80. 120.  98.]

Using strict AI actions
Death at frame 256
Death at frame 421
Death at frame 567
Saving model at epoch 180: ./models/space_invaders_few_layers_rl
Epoch: 180, frames: 568, score: 130.0, average loss: 0.11845186220360905
actions trained:           [ 85. 107.  43.  12. 191. 129.]
actions out while training:[ 84. 115.  60.  11. 224.  73.]

Using strict AI actions
Death at frame 412
Death at frame 639
Death at frame 904
Epoch: 181, frames: 905, score: 480.0, average loss: 0.13082996922939147
actions trained:           [108. 205.  32.  93. 311. 155.]
actions out while training:[141. 268.  62. 103. 180. 150.]

Using strict AI actions
Death at frame 201
Death at frame 279
Death at frame 381
Epoch: 182, frames: 382, score: 105.0, average loss: 0.1117

Death at frame 255
Death at frame 444
Death at frame 645
Epoch: 209, frames: 646, score: 230.0, average loss: 0.12771121222332765
actions trained:           [ 50.  98.  28.  39. 238. 192.]
actions out while training:[ 65. 132.  83.  95. 141. 129.]

Using strict AI actions
Death at frame 195
Death at frame 277
Death at frame 366
Saving model at epoch 210: ./models/space_invaders_few_layers_rl
Epoch: 210, frames: 367, score: 90.0, average loss: 0.1257702681756004
actions trained:           [  6. 167.  56. 136.   0.   1.]
actions out while training:[ 13. 172.  74.  96.   0.  11.]

Using strict AI actions
Death at frame 214
Death at frame 878
Death at frame 1099
Epoch: 211, frames: 1100, score: 595.0, average loss: 0.13227425579807126
actions trained:           [123. 290. 158.  54. 132. 342.]
actions out while training:[106. 344. 105.  72. 164. 308.]

Using strict AI actions
Death at frame 413
Death at frame 775
Death at frame 928
Epoch: 212, frames: 929, score: 260.0, average loss: 0.1251

Death at frame 284
Death at frame 596
Death at frame 824
Epoch: 239, frames: 825, score: 155.0, average loss: 0.12027191218149923
actions trained:           [168. 194.   2. 100.  75. 285.]
actions out while training:[ 64. 220.  54.  72. 175. 239.]

Using strict AI actions
Death at frame 207
Death at frame 350
Death at frame 576
Saving model at epoch 240: ./models/space_invaders_few_layers_rl
Epoch: 240, frames: 577, score: 155.0, average loss: 0.12466885683067916
actions trained:           [  5. 163.  14. 104. 209.  81.]
actions out while training:[  8. 201.  23.  53. 219.  72.]

Using strict AI actions
Death at frame 438
Death at frame 614
Death at frame 804
Epoch: 241, frames: 805, score: 210.0, average loss: 0.13939281700783393
actions trained:           [ 24. 338.  82.  52. 112. 196.]
actions out while training:[ 92. 274.  90.  42.  82. 224.]

Using strict AI actions
Death at frame 389
Death at frame 555
Death at frame 633
Epoch: 242, frames: 634, score: 105.0, average loss: 0.0783

Death at frame 230
Death at frame 824
Death at frame 981
Epoch: 269, frames: 982, score: 460.0, average loss: 0.12747399661416284
actions trained:           [ 43. 281. 142.   6.  56. 453.]
actions out while training:[170. 321. 129.  22.  87. 252.]

Using strict AI actions
Death at frame 248
Death at frame 447
Death at frame 644
Saving model at epoch 270: ./models/space_invaders_few_layers_rl
Epoch: 270, frames: 645, score: 130.0, average loss: 0.12679434755692703
actions trained:           [ 85. 316.  12.  88.  93.  50.]
actions out while training:[ 59. 293.  83.  49.  69.  91.]

Using strict AI actions
Death at frame 341
Death at frame 432
Death at frame 696
Epoch: 271, frames: 697, score: 170.0, average loss: 0.12810867342607168
actions trained:           [ 17. 347. 107.   2. 103. 120.]
actions out while training:[ 78. 220. 109.  34.  39. 216.]

Using strict AI actions
Death at frame 336
Death at frame 753
Death at frame 838
Epoch: 272, frames: 839, score: 210.0, average loss: 0.1271

In [None]:
#set above epochs to 1.  
#create a concatenated image of the last 5 frames



frame_to_view = 222
concatenated_frames = frames[:,:,frame_to_view]


for j in range(frame_to_view-1, frame_to_view):
    print("discounted rewared at frame {}: {}".format(j, discounted_rewards[j]))
    this_frame = frames[:,:,j]
    #print("this_frame: " + str(this_frame.shape))
    concatenated_frames = np.append(concatenated_frames, this_frame, axis=0)

    
# print(concatenated_frames)
# print the image
#show_observation(concatenated_frames)

counter = 0
for this_reward, this_discounted_reward in zip(rewards, discounted_rewards):
    print("{}: reward: {}, discounted reward: {}".format(counter, this_reward, this_discounted_reward))
    counter += 1


#print(actions)
#print(all_logits)

# logit_sums = np.zeros(6)

# for this_logit in all_logits:
#     logit_sums = logit_sums + this_logit
# #    print(this_logit)

# print(logit_sums/len(all_logits))
# print(get_average_logits(all_logits, discounted_rewards))

# print(actions[5])
# print(average_logits[int(actions[5])])


In [18]:
print("num_frames: " + str(num_frames))


#show_observation(frames[:,:,125])
#show_observation(concatenated_frames)


# print("rewards: " + str(rewards))
# print("discounted_rewards: " + str(discounted_rewards))




#print("action_taken_one_hot_reshaped: " + str(action_taken_one_hot_reshaped))
#rint("all_logits: " + str(all_logits))



# num_frames = np.ma.size(frames, axis=2)

# X_input = np.empty([n_steps, 210 * 160])
# input_index = 0

# for i in range(num_frames, num_frames-n_steps, -1):
#     this_frame = frames[:,:,i-1]
#     X_input[input_index] = this_frame.flatten()
#     input_index = input_index + 1
    
#obs_greyscale_reshape.shape



#show_observation(obs_greyscale_reshape[:,:,0])

num_frames: 1086
0: reward: 0.0, discounted reward: 0.0766933783174092
1: reward: 0.0, discounted reward: 0.07899669881315724
2: reward: 0.0, discounted reward: 0.08137125602526862
3: reward: 0.0, discounted reward: 0.08381925315115664
4: reward: 0.0, discounted reward: 0.0863429615283608
5: reward: 0.0, discounted reward: 0.08894472274197332
6: reward: 0.0, discounted reward: 0.09162695079724396
7: reward: 0.0, discounted reward: 0.09439213435937864
8: reward: 0.0, discounted reward: 0.09724283906261028
9: reward: 0.0, discounted reward: 0.10018170989068412
10: reward: 0.0, discounted reward: 0.10321147363096643
11: reward: 0.0, discounted reward: 0.10633494140445336
12: reward: 0.0, discounted reward: 0.1095550112740275
13: reward: 0.0, discounted reward: 0.1128746709333823
14: reward: 0.0, discounted reward: 0.1162970004791089
15: reward: 0.0, discounted reward: 0.11982517526851774
16: reward: 0.0, discounted reward: 0.12346246886584647
17: reward: 0.0, discounted reward: 0.12721225

In [None]:
y_input = np.empty([n_steps, 10])
for i in range(0,  29):
    y_input[i] = np.zeros(10)
    
y_input  

In [28]:
#len(rewards)
#len(actions)

In [8]:
all_logits

[array([ 180.31735229, -245.90328979,  -55.46500015,  -39.02265549,
        -171.08708191,   39.90904999, -253.30400085,   10.52585697,
           9.43663216], dtype=float32)]

In [6]:
#len(all_gradients)
np.shape(all_gradients)

(6, 10)

In [6]:
import sys
print(sys.getsizeof(all_variables))
#np.shape(all_gradients[1])
#all_gradients
#all_variables

128


In [8]:

import sys

def get_size(obj, seen=None):
    """Recursively finds size of objects"""
    size = sys.getsizeof(obj)
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    # Important mark as seen *before* entering recursion to gracefully handle
    # self-referential objects
    seen.add(obj_id)
    if isinstance(obj, dict):
        size += sum([get_size(v, seen) for v in obj.values()])
        size += sum([get_size(k, seen) for k in obj.keys()])
    elif hasattr(obj, '__dict__'):
        size += get_size(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum([get_size(i, seen) for i in obj])
    return size

get_size(all_variables)

8392