In [1]:
# Based on https://blog.floydhub.com/spinning-up-with-deep-reinforcement-learning/
import gym
import random
import time
import os
import pylab as pl
import matplotlib.pyplot as plt
import numpy as np
import gym
from IPython import display
from datetime import datetime
from keras.layers import Dense
from keras.models import Sequential
from sklearn import preprocessing



Using TensorFlow backend.


In [2]:
# Collect a game of Pong with random input.
# `frames` holds the rendered screen for every step and `frames_ram` the RAM
# snapshot taken at the same step, so index i of both lists always describes
# the same moment of the game.
frames = []
frames_ram = []
STEPS = 300

UP_ACTION = 2
DOWN_ACTION = 3

env = gym.make("Pong-ram-v0")

observation = env.reset()

for i in range(STEPS):
    # random.randint is inclusive on both ends, so this picks UP or DOWN.
    action = random.randint(UP_ACTION, DOWN_ACTION)

    observation, reward, done, info = env.step(action)
    frames.append(env.unwrapped._get_image())
    frames_ram.append(env.unwrapped._get_ram())

    if done:
        # Start a new game. The value returned by reset() is intentionally not
        # appended to `frames`: doing so would add an entry without a matching
        # `frames_ram` entry and break the index alignment of the two lists.
        observation = env.reset()

In [3]:
# Draw our frames to see what happened and print ram to find important addresses.
# The three constants below are indices into the RAM snapshot returned by
# env.unwrapped._get_ram(); the training cell reads the paddle and ball
# positions through them.
# NOTE(review): presumably these indices were found by inspecting the RAM dumps
# with the viewer below - confirm against the emulator documentation.
ram_address_position_player_y = 60
ram_address_position_ball_x = 49
ram_address_position_ball_y = 50

# The frame-by-frame viewer is disabled by wrapping it in a string literal.
# Because that bare string is the last expression of the cell, Jupyter echoes
# it as the cell output. The loop indexes `frames` and `frames_ram` with the
# same i, so both lists must have the same length when it is re-enabled.
'''
for i in range(len(frames)):
    plt.imshow(frames[i])
    print(frames_ram[i])
    print('Player position y: ' + str(frames_ram[i][ram_address_position_player_y]))
    print('Ball position x: ' + str(frames_ram[i][ram_address_position_ball_x]))
    print('Ball position y: ' + str(frames_ram[i][ram_address_position_ball_y]))
    display.display(pl.gcf())
    time.sleep(0.5)
    display.clear_output(wait=True)
'''


"\nfor i in range(len(frames)):\n    plt.imshow(frames[i])\n    print(frames_ram[i])\n    print('Player position y: ' + str(frames_ram[i][ram_address_position_player_y]))\n    print('Ball position x: ' + str(frames_ram[i][ram_address_position_ball_x]))\n    print('Ball position y: ' + str(frames_ram[i][ram_address_position_ball_y]))\n    display.display(pl.gcf())\n    time.sleep(0.5)\n    display.clear_output(wait=True)\n"

In [9]:
# Create the neural network.
# TODO: What is an Adam optimizer?
def get_model(units=16, input_dim=6):
    """Build and compile the policy network.

    The network is a single hidden ReLU layer followed by one sigmoid output
    unit, compiled with binary cross-entropy loss and the 'adam' optimizer.

    Args:
        units: Width of the hidden layer. Defaults to 16, the value that was
            previously hard-coded, so ``get_model()`` builds the same network
            as before.
        input_dim: Number of input features. Defaults to 6 (current and
            previous player y, ball x and ball y as assembled by the training
            loop).

    Returns:
        A compiled keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(units=units, input_dim=input_dim, activation='relu', kernel_initializer='glorot_uniform'))
    # Single sigmoid output: the training loop interprets it as the probability
    # of choosing UP_ACTION.
    model.add(Dense(units=1, activation='sigmoid', kernel_initializer='RandomNormal'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def discount_rewards(r, gamma):
    """Compute normalized discounted rewards for a 1D sequence of rewards.

    The rewards are walked from last to first while accumulating
    ``running_add = running_add * gamma + r[t]``. A non-zero reward resets the
    accumulator first (in Pong a non-zero reward means a point just ended), so
    discounting never leaks across points. The result is then shifted to zero
    mean and scaled to unit standard deviation.

    Args:
        r: 1D sequence of rewards (list or array; ints are accepted and
            converted to float).
        gamma: Discount factor applied per step.

    Returns:
        A 1D float numpy array with the same length as ``r``. An empty input
        yields an empty array. When every discounted value is identical
        (standard deviation 0, e.g. a single reward) the centred values, all
        zeros, are returned instead of dividing by zero and producing NaN.
    """
    # Force a float dtype: with integer rewards np.zeros_like would create an
    # integer buffer, truncating the discounted values and making the in-place
    # normalization below fail.
    rewards = np.asarray(r, dtype=float)
    discounted_r = np.zeros_like(rewards)
    running_add = 0.0
    # we go from last reward to first one so we don't have to do exponentiations
    for t in reversed(range(rewards.size)):
        if rewards[t] != 0:
            running_add = 0.0  # if the game ended (in Pong), reset the reward sum
        # Horner's method: one multiply-add per step instead of gamma ** k.
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add

    if discounted_r.size == 0:
        return discounted_r

    # Normalize the result to zero mean and (when possible) unit variance.
    discounted_r -= np.mean(discounted_r)
    std = np.std(discounted_r)
    if std > 0:
        discounted_r /= std
    return discounted_r

# gym initialization
env = gym.make("Pong-ram-v0")
observation = env.reset()
prev_input = None

# Macros
UP_ACTION = 2
DOWN_ACTION = 3

# Hyperparameters
gamma = 0.99
max_unit_count = 128
unit_step_step = 2
max_episodes = 1000

resume = True
epochs_before_saving = 1

# main loop
# Outer loop: one training run of `max_episodes` games per value of
# `current_unit_count`.
# NOTE(review): get_model() is called without arguments here, so every pass of
# the sweep trains the same architecture; `current_unit_count` currently only
# shows up in the checkpoint file names.
current_unit_count = unit_step_step
# TODO: add increasing of layers
while current_unit_count <= max_unit_count:
    x_train, y_train, rewards = [], [], []
    reward_sum = 0
    episode_nb = 0
    model = get_model()

    # load pre-trained model if exist
    # This has to happen after the model is built: loading before the loop
    # referenced `model` before it was defined, and the weights would have been
    # thrown away by the get_model() call above anyway.
    # NOTE(review): nothing in this cell writes 'my_model_weights.h5' (the
    # checkpoints below use other names) - confirm where this file comes from.
    if resume and os.path.isfile('my_model_weights.h5'):
        print("loading previous weights")
        model.load_weights('my_model_weights.h5')

    all_reward_sums = np.array([])
    prev_position_player_y = 0
    prev_position_ball_x = 0
    prev_position_ball_y = 0
    while episode_nb < max_episodes:
        # Read the current positions straight from the emulator RAM.
        ram = env.unwrapped._get_ram()
        position_player_y = float(ram[ram_address_position_player_y])
        position_ball_x = float(ram[ram_address_position_ball_x])
        position_ball_y = float(ram[ram_address_position_ball_y])

        # Feature vector: current positions followed by the positions of the
        # previous step, which lets the network infer the direction of motion.
        # TODO: Try to give direction as number between 0-1 (0-360°) instead of prev position.
        x = np.array([position_player_y, position_ball_x, position_ball_y, prev_position_player_y, prev_position_ball_x, prev_position_ball_y])
        # TODO: normalize based on real max and min values for x and y
        x -= 100.0
        x /= 200.0
        #print('X: ' + str(x))

        prev_position_player_y = position_player_y
        prev_position_ball_x = position_ball_x
        prev_position_ball_y = position_ball_y

        # TODO: Why does it get stuck at the top in the simulation? Shouldn't it regulate itself?
        # Draw current state (disabled).
        # plt.imshow(env.unwrapped._get_image())
        # print('TV:')
        # display.display(pl.gcf())
        # time.sleep(0.5)
        # display.clear_output(wait=True)

        # forward the policy network and sample action according to the proba distribution
        # predict() returns a (1, 1) array for a single sample; take the scalar
        # so the comparison below is between two plain numbers.
        proba = float(model.predict(np.array([x]))[0][0])
        # print('Prediction: ' + str(proba))
        # Variable proba is the probability prediction of how good UP_ACTION is for this frame.
        # Then select UP_ACTION by proba percent. Easy way to still allow the other action.
        # TODO: Mathematical reason for random number?
        action = UP_ACTION if np.random.uniform() < proba else DOWN_ACTION
        y = 1 if action == UP_ACTION else 0 # 0 and 1 are our labels

        # log the input and label to train later
        x_train.append(x)
        y_train.append(y)

        # do one step in our environment
        observation, reward, done, info = env.step(action)
        #print('Observation: ' + str(observation))
        #print('Reward: ' + str(reward))
        #print('Done: ' + str(done))
        #print('Info: ' + str(info))
        rewards.append(reward)
        reward_sum += reward

        # end of an episode
        if done:
            print('At the end of episode', episode_nb, 'the total reward was :', reward_sum)
            all_reward_sums = np.append(all_reward_sums, reward_sum)

            # Average over the most recent `epochs_before_saving` episodes,
            # kept as a string because it is printed and used in the file name.
            average_of_last = str(np.average(all_reward_sums[-epochs_before_saving:]))
            print('Average of current run: ' + average_of_last)

            # increment episode number
            episode_nb += 1

            # training
            # The sampled actions are used as labels and weighted by the
            # normalized discounted reward of the step they were taken in.
            # TODO: Is np.vstack really necessary?
            # TODO: Clarify the use of gamma in sample_weight=discount_rewards(rewards, gamma)
            history = model.fit(x=np.vstack(x_train), y=np.vstack(y_train), verbose=1, sample_weight=discount_rewards(rewards, gamma))
            accuracy = history.history['accuracy']

            if episode_nb % epochs_before_saving == 0:
                model.save_weights(str(current_unit_count) + ' | ' + average_of_last + ' | ' + str(accuracy) + ' | ' + str(episode_nb) + '.h5')

            # Reinitialization
            x_train, y_train, rewards = [], [], []
            observation = env.reset()
            reward_sum = 0
            prev_input = None
            # Start the next game with the same "no previous frame" inputs as
            # the first one instead of the final positions of the finished game.
            prev_position_player_y = 0
            prev_position_ball_x = 0
            prev_position_ball_y = 0
    current_unit_count += unit_step_step

At the end of episode 0 the total reward was : -21.0
Average of current run: -21.0
Epoch 1/1
At the end of episode 1 the total reward was : -21.0
Average of current run: -21.0
Epoch 1/1
At the end of episode 2 the total reward was : -20.0
Average of current run: -20.0
Epoch 1/1
At the end of episode 3 the total reward was : -20.0
Average of current run: -20.0
Epoch 1/1
At the end of episode 4 the total reward was : -21.0
Average of current run: -21.0
Epoch 1/1
At the end of episode 5 the total reward was : -21.0
Average of current run: -21.0
Epoch 1/1
At the end of episode 6 the total reward was : -20.0
Average of current run: -20.0
Epoch 1/1
At the end of episode 7 the total reward was : -21.0
Average of current run: -21.0
Epoch 1/1
At the end of episode 8 the total reward was : -21.0
Average of current run: -21.0
Epoch 1/1
At the end of episode 9 the total reward was : -21.0
Average of current run: -21.0
Epoch 1/1
At the end of episode 0 the total reward was : -20.0
Average of curren

KeyboardInterrupt: 