In [1]:
# Based on https://blog.floydhub.com/spinning-up-with-deep-reinforcement-learning/
import gym
import random
import time
import os
import pylab as pl
import matplotlib.pyplot as plt
from IPython import display
from datetime import datetime
from keras.layers import Dense
from keras.models import Sequential

Using TensorFlow backend.


In [2]:
# Collect a game of Pong with random input.
frames = []
frames_ram = []
STEPS = 300

UP_ACTION = 2
DOWN_ACTION = 3

env = gym.make("Pong-ram-v0")

observation = env.reset()

for i in range(STEPS):
    gym.envs.registry
    action = random.randint(UP_ACTION, DOWN_ACTION)

    observation, reward, done, info = env.step(action)
    frames.append(env.unwrapped._get_image())
    frames_ram.append(env.unwrapped._get_ram())
    
    if done:
        observation = env.reset()
        frames.append(observation)

In [3]:
# Draw our frames to see what happened and print ram to find important addresses.
ram_address_position_player_y = 60
ram_address_position_ball_x = 49
ram_address_position_ball_y = 50

'''
for i in range(len(frames)):
    plt.imshow(frames[i])
    print(frames_ram[i])
    print('Player position y: ' + str(frames_ram[i][ram_address_position_player_y]))
    print('Ball position x: ' + str(frames_ram[i][ram_address_position_ball_x]))
    print('Ball position y: ' + str(frames_ram[i][ram_address_position_ball_y]))
    display.display(pl.gcf())
    time.sleep(0.5)
    display.clear_output(wait=True)
'''


"\nfor i in range(len(frames)):\n    plt.imshow(frames[i])\n    print(frames_ram[i])\n    print('Player position y: ' + str(frames_ram[i][ram_address_position_player_y]))\n    print('Ball position x: ' + str(frames_ram[i][ram_address_position_ball_x]))\n    print('Ball position y: ' + str(frames_ram[i][ram_address_position_ball_y]))\n    display.display(pl.gcf())\n    time.sleep(0.5)\n    display.clear_output(wait=True)\n"

In [8]:
# Create the neural network.
# TODO: What is a adam optimizer?
model = Sequential()
# I guess 200 units was causing the model to overfit. Lets try 9.
model.add(Dense(units=9,input_dim=3, activation='relu', kernel_initializer='glorot_uniform'))
model.add(Dense(units=1, activation='sigmoid', kernel_initializer='RandomNormal'))
# TODO: How can we interprate accuracy in this context?
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [5]:
import numpy as np
import gym

# gym initialization
env = gym.make("Pong-ram-v0")
observation = env.reset()
prev_input = None

# Macros
UP_ACTION = 2
DOWN_ACTION = 3

# Hyperparameters
gamma = 0.99

# initialization of variables used in the main loop
x_train, y_train, rewards = [],[],[]
reward_sum = 0
episode_nb = 0

In [6]:
log_dir = './log' + datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
resume = True
epochs_before_saving = 100

# load pre-trained model if exist
if (resume and os.path.isfile('my_model_weights.h5')):
    print("loading previous weights")
    model.load_weights('my_model_weights.h5')

In [None]:
def discount_rewards(r, gamma):
    """ take 1D float array of rewards and compute discounted reward """
    r = np.array(r)
    discounted_r = np.zeros_like(r)
    running_add = 0
    # we go from last reward to first one so we don't have to do exponentiations
    for t in reversed(range(0, r.size)):
        if r[t] != 0: running_add = 0 # if the game ended (in Pong), reset the reward sum
        running_add = running_add * gamma + r[t] # the point here is to use Horner's method to compute those rewards efficiently
        discounted_r[t] = running_add
    discounted_r -= np.mean(discounted_r) #normalizing the result
    discounted_r /= np.std(discounted_r) #idem
    return discounted_r

In [7]:
# main loop
while (True):
    ram = env.unwrapped._get_ram()
    position_player_y = int(ram[ram_address_position_player_y])
    position_ball_x = int(ram[ram_address_position_ball_x])
    position_ball_y = int(ram[ram_address_position_ball_y])

    # We will use as input an array of the difference between the current and previous positions.
    cur_input = np.array([position_player_y, position_ball_x, position_ball_y])
    #print('Current input: ' + str(cur_input))
    #print('Prev input: ' + str(prev_input))
    x = cur_input - prev_input if prev_input is not None else np.zeros(len(cur_input))
    print('Difference: ' + str(x))
    prev_input = cur_input
    
    # forward the policy network and sample action according to the proba distribution
    proba = model.predict(np.array([x]))
    #print('Prediction: ' + str(proba))
    # Variable proba is the probability prediction of how good UP_ACTION is for this frame.
    # Then select UP_ACTION by proba percent. Easy way to still allow the other action.
    # TODO: Mathematical reason for random number?
    action = UP_ACTION if np.random.uniform() < proba else DOWN_ACTION
    y = 1 if action == UP_ACTION else 0 # 0 and 1 are our labels

    # log the input and label to train later
    x_train.append(x)
    y_train.append(y)

    # do one step in our environment
    observation, reward, done, info = env.step(action)
    #print('Observation: ' + str(observation))
    #print('Reward: ' + str(reward))
    #print('Done: ' + str(done))
    #print('Info: ' + str(info))
    rewards.append(reward)
    reward_sum += reward
    
    # end of an episode
    if done:
        print('At the end of episode', episode_nb, 'the total reward was :', reward_sum)
        
        # increment episode number
        episode_nb += 1
        
        # training
        # TODO: Is np.vstack is really necessary?
        # TODO: Clarify sample_weight=discount_rewards(rewards, gamma)
        model.fit(x=np.vstack(x_train), y=np.vstack(y_train), verbose=1, sample_weight=discount_rewards(rewards, gamma))
                                                     
        if episode_nb % epochs_before_saving == 0:    
            model.save_weights('my_model_weights' + datetime.now().strftime("%Y%m%d-%H%M%S") + '.h5')
            
        # Reinitialization
        x_train, y_train, rewards = [],[],[]
        observation = env.reset()
        reward_sum = 0
        prev_input = None

Current input: [109   0  22]
Prev input: None
Difference: [0. 0. 0.]
Current input: [111   0  20]
Prev input: [109   0  22]
Difference: [ 2  0 -2]
Current input: [112   0  16]
Prev input: [111   0  20]
Difference: [ 1  0 -4]
Current input: [108   0  12]
Prev input: [112   0  16]
Difference: [-4  0 -4]
Current input: [100   0   8]
Prev input: [108   0  12]
Difference: [-8  0 -4]
Current input: [100   0   4]
Prev input: [100   0   8]
Difference: [ 0  0 -4]
Current input: [96  0  0]
Prev input: [100   0   4]
Difference: [-4  0 -4]
Current input: [78  0  0]
Prev input: [96  0  0]
Difference: [-18   0   0]
Current input: [57  0  0]
Prev input: [78  0  0]
Difference: [-21   0   0]
Current input: [42  0  0]
Prev input: [57  0  0]
Difference: [-15   0   0]
Current input: [40  0  0]
Prev input: [42  0  0]
Difference: [-2  0  0]
Current input: [38  0  0]
Prev input: [40  0  0]
Difference: [-2  0  0]
Current input: [42  0  0]
Prev input: [38  0  0]
Difference: [4 0 0]
Current input: [50  0  0]
Pr

Difference: [-7  0 -4]
Current input: [ 52 205 128]
Prev input: [ 61 205 132]
Difference: [-9  0 -4]
Current input: [ 41 205 122]
Prev input: [ 52 205 128]
Difference: [-11   0  -6]
Current input: [ 39 205 116]
Prev input: [ 41 205 122]
Difference: [-2  0 -6]
Current input: [ 38 205 110]
Prev input: [ 39 205 116]
Difference: [-1  0 -6]
Current input: [ 38 205 104]
Prev input: [ 38 205 110]
Difference: [ 0  0 -6]
Current input: [ 51 205  96]
Prev input: [ 38 205 104]
Difference: [13  0 -8]
Current input: [ 61 130 130]
Prev input: [ 51 205  96]
Difference: [ 10 -75  34]
Current input: [ 82 134 134]
Prev input: [ 61 130 130]
Difference: [21  4  4]
Current input: [ 94 136 136]
Prev input: [ 82 134 134]
Difference: [12  2  2]
Current input: [ 94 138 138]
Prev input: [ 94 136 136]
Difference: [0 2 2]
Current input: [ 80 141 142]
Prev input: [ 94 138 138]
Difference: [-14   3   4]
Current input: [ 70 143 144]
Prev input: [ 80 141 142]
Difference: [-10   2   2]
Current input: [ 59 145 146]
Pre

Current input: [ 43 203 204]
Prev input: [ 51 199 200]
Difference: [-8  4  4]
Current input: [ 39 205 202]
Prev input: [ 43 203 204]
Difference: [-4  2 -2]
Current input: [ 38 205 194]
Prev input: [ 39 205 202]
Difference: [-1  0 -8]
Current input: [ 38 205 192]
Prev input: [ 38 205 194]
Difference: [ 0  0 -2]
Current input: [ 38 205 186]
Prev input: [ 38 205 192]
Difference: [ 0  0 -6]
Current input: [ 38 205 182]
Prev input: [ 38 205 186]
Difference: [ 0  0 -4]
Current input: [ 38 205 176]
Prev input: [ 38 205 182]
Difference: [ 0  0 -6]
Current input: [ 38 205 172]
Prev input: [ 38 205 176]
Difference: [ 0  0 -4]
Current input: [ 38 205 166]
Prev input: [ 38 205 172]
Difference: [ 0  0 -6]
Current input: [ 38 205 162]
Prev input: [ 38 205 166]
Difference: [ 0  0 -4]
Current input: [ 43 205 156]
Prev input: [ 38 205 162]
Difference: [ 5  0 -6]
Current input: [ 51 205 152]
Prev input: [ 43 205 156]
Difference: [ 8  0 -4]
Current input: [ 61 205 148]
Prev input: [ 51 205 152]
Differenc

Current input: [113 196 196]
Prev input: [120 194 194]
Difference: [-7  2  2]
Current input: [113 199 200]
Prev input: [113 196 196]
Difference: [0 3 4]
Current input: [115 201 202]
Prev input: [113 199 200]
Difference: [2 2 2]
Current input: [116 204 204]
Prev input: [115 201 202]
Difference: [1 3 2]
Current input: [122 205 204]
Prev input: [116 204 204]
Difference: [6 1 0]
Current input: [129 205 198]
Prev input: [122 205 204]
Difference: [ 7  0 -6]
Current input: [114 205 192]
Prev input: [129 205 198]
Difference: [-15   0  -6]
Current input: [122 205 184]
Prev input: [114 205 192]
Difference: [ 8  0 -8]
Current input: [120 205 180]
Prev input: [122 205 184]
Difference: [-2  0 -4]
Current input: [123 205 174]
Prev input: [120 205 180]
Difference: [ 3  0 -6]
Current input: [120 205 170]
Prev input: [123 205 174]
Difference: [-3  0 -4]
Current input: [113 205 166]
Prev input: [120 205 170]
Difference: [-7  0 -4]
Current input: [115 205 162]
Prev input: [113 205 166]
Difference: [ 2  0

Current input: [123 178 178]
Prev input: [135 176 176]
Difference: [-12   2   2]
Current input: [129 182 182]
Prev input: [123 178 178]
Difference: [6 4 4]
Current input: [132 184 184]
Prev input: [129 182 182]
Difference: [3 2 2]
Current input: [129 187 188]
Prev input: [132 184 184]
Difference: [-3  3  4]
Current input: [140 191 192]
Prev input: [129 187 188]
Difference: [11  4  4]
Current input: [149 193 194]
Prev input: [140 191 192]
Difference: [9 2 2]
Current input: [170 197 198]
Prev input: [149 193 194]
Difference: [21  4  4]
Current input: [182 200 200]
Prev input: [170 197 198]
Difference: [12  3  2]
Current input: [198 203 204]
Prev input: [182 200 200]
Difference: [16  3  4]
Current input: [196 205 204]
Prev input: [198 203 204]
Difference: [-2  2  0]
Current input: [179 205 198]
Prev input: [196 205 204]
Difference: [-17   0  -6]
Current input: [168 205 194]
Prev input: [179 205 198]
Difference: [-11   0  -4]
Current input: [158 205 192]
Prev input: [168 205 194]
Differenc

Current input: [ 38 169 170]
Prev input: [ 38 165 166]
Difference: [0 4 4]
Current input: [ 43 172 172]
Prev input: [ 38 169 170]
Difference: [5 3 2]
Current input: [ 39 175 176]
Prev input: [ 43 172 172]
Difference: [-4  3  4]
Current input: [ 38 178 178]
Prev input: [ 39 175 176]
Difference: [-1  3  2]
Current input: [ 38 182 182]
Prev input: [ 38 178 178]
Difference: [0 4 4]
Current input: [ 38 185 186]
Prev input: [ 38 182 182]
Difference: [0 3 4]
Current input: [ 38 188 188]
Prev input: [ 38 185 186]
Difference: [0 3 2]
Current input: [ 38 191 192]
Prev input: [ 38 188 188]
Difference: [0 3 4]
Current input: [ 38 193 194]
Prev input: [ 38 191 192]
Difference: [0 2 2]
Current input: [ 38 197 198]
Prev input: [ 38 193 194]
Difference: [0 4 4]
Current input: [ 38 201 202]
Prev input: [ 38 197 198]
Difference: [0 4 4]
Current input: [ 51 205 206]
Prev input: [ 38 201 202]
Difference: [13  4  4]
Current input: [ 61 205 202]
Prev input: [ 51 205 206]
Difference: [10  0 -4]
Current input

NameError: name 'discount_rewards' is not defined