# Atari gym environment
Random policy for MsPacman, Pong, and Boxing


# To load model, run cells at bottom of notebook

In [8]:
import sys
import os
import math
import pdb
import time
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import gym
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from datetime import datetime
from skimage.color import rgb2gray
from skimage.transform import resize

Convert each 210x160x3 uint8 frame into a 28x28 grayscale image; four such frames are later stacked into a single agent state of size 28x28x4.

In [9]:
def preprocess(observation):
    """Downscale one frame to 28x28 and convert it to a uint8 grayscale image.

    The frame is resized first (keeping its original value range) and only
    then collapsed to a single channel.
    """
    small_rgb = resize(observation, (28, 28), preserve_range=True)
    gray = rgb2gray(small_rgb)
    return gray.astype("uint8")


Initialize the state for an environment: we need to get the first four frames and stack them together.

In [10]:
def get_init_state(env):
    """Reset ``env`` and build the initial four-frame state.

    Four randomly sampled actions are taken. The observation seen *before*
    each of those steps is preprocessed and stacked along the last axis.

    Returns:
        (env, state, observation) where ``state`` is the stack of the four
        preprocessed frames and ``observation`` is the raw frame returned by
        the final step, which is not part of ``state``.
    """
    observation = env.reset()
    frames = []
    for _ in range(4):
        frames.append(preprocess(observation))
        observation, _, _, info = env.step(env.action_space.sample())
    return env, np.stack(frames, axis=2), observation

We keep four frames at a time: each time we see a new observation, we drop the oldest frame and add the newest.

In [11]:
def update_state(state, observation):
    """Slide the frame window: append the new frame, drop the oldest one.

    ``observation`` is preprocessed and added as the last channel of
    ``state``; the first channel is then discarded, so the number of
    channels is unchanged.
    """
    newest = preprocess(observation).reshape(28, 28, 1)
    extended = np.append(state, newest, axis=2)
    return extended[:, :, 1:]

In [12]:
def clip_rewards(reward):
    """Reduce a raw reward to its sign: 1 if positive, -1 if negative, else 0."""
    if reward < 0:
        return -1
    if reward > 0:
        return 1
    return 0

Create conv layers

In [13]:
def conv_layer(x, W, b, stride=2):
    """Apply a 2-D convolution with 'SAME' padding, add the bias, then ReLU.

    Args:
        x: input tensor passed to ``tf.nn.conv2d``.
        W: convolution filter.
        b: bias added to the convolution output.
        stride: step used for both spatial dimensions.
    """
    strides = [1, stride, stride, 1]
    pre_activation = tf.nn.bias_add(
        tf.nn.conv2d(x, W, strides=strides, padding='SAME'), b)
    return tf.nn.relu(pre_activation)

Q-network to estimate the action-value function

In [14]:
def Q_net(state, action, n_act, alpha=0.001):
    """Build the Q-network graph and return its action-value output.

    Architecture: two stride-2 convolutions (6x6 -> 16 maps, 4x4 -> 32 maps),
    a 256-unit ReLU layer and a linear output with one value per action.
    The flatten size 7*7*32 implies a 28x28 spatial input (28 -> 14 -> 7 with
    'SAME' padding and stride 2).

    Args:
        state: input tensor with 4 channels.
        action: accepted for interface compatibility; not used by the body.
        n_act: number of actions, i.e. the width of the output layer.
        alpha: accepted for interface compatibility; not used by the body.

    Returns:
        Tensor of shape [batch, n_act] holding the estimated Q-values.
    """
    def make_weight(shape):
        return tf.Variable(tf.truncated_normal(shape, 0, 0.01))

    def make_bias(size):
        return tf.Variable(tf.constant(0.1, shape=[size]))

    # Creation order (all weights, then all biases) is kept deliberately:
    # it determines the auto-generated variable names in the graph.
    w_conv1 = make_weight([6, 6, 4, 16])
    w_conv2 = make_weight([4, 4, 16, 32])
    w_hidden = make_weight([7*7*32, 256])
    w_output = make_weight([256, n_act])

    b_conv1 = make_bias(16)
    b_conv2 = make_bias(32)
    b_hidden = make_bias(256)
    b_output = make_bias(n_act)

    # Convolutional stack (each layer applies ReLU internally)
    features = conv_layer(state, w_conv1, b_conv1, stride=2)
    features = conv_layer(features, w_conv2, b_conv2, stride=2)
    flat = tf.reshape(features, [-1, 7*7*32])

    # Fully connected head
    hidden = tf.nn.relu(tf.matmul(flat, w_hidden) + b_hidden)
    return tf.matmul(hidden, w_output) + b_output

In [15]:
# Training parameters
# These module-level settings are read by the graph-setup and evaluation cells below.
test_episodes = 100  # number of evaluation episodes the test loop runs
max_eps_length = 10000000  # upper bound on environment steps within one episode
gamma = 0.99  # presumably the discount factor; not referenced in the visible cells
epsilon = 0.01  # presumably an exploration rate; not referenced in the visible cells
game = 'Boxing-v3'  # gym environment id; also embedded in the results file name

In [16]:
def init_tf_vars():
    """Create the input placeholders used by the graph.

    Returns:
        (state, action, reward): a float placeholder for batches of 28x28x4
        states, an int32 placeholder of width 2 and a float placeholder of
        width 1. The batch dimension is left unspecified in all three.
    """
    state_ph = tf.placeholder("float", shape=[None, 28, 28, 4])
    action_ph = tf.placeholder(tf.int32, shape=[None, 2])
    reward_ph = tf.placeholder("float", shape=[None, 1])
    return state_ph, action_ph, reward_ph

In [22]:
# Build the environment and the Q-network graph used by the evaluation cell.
alpha = 0.001
this_env = gym.make(game)  # for evaluation
n_act = this_env.action_space.n  # size of the discrete action set
state, action, reward = init_tf_vars()
q = Q_net(state, action, n_act, alpha=alpha)
# Created after the network so that it covers every variable defined above.
init = tf.global_variables_initializer()

[2017-04-11 22:45:04,342] Making new env: Boxing-v3


In [None]:
# Evaluate the greedy policy of the Q-network for `test_episodes` episodes and
# report mean / standard deviation of the score and of the episode length.
with tf.Session() as sess:
    # Variables are taken straight from the initializer; nothing is restored here.
    sess.run(init)
    start = time.time()
    score_list = []
    frame_count_list = []

    print("\n#------ Running %d episodes for testing" % test_episodes)
    for episode in range(test_episodes):
        score = frame_count = 0
        print('Running episode:{}'.format(episode+1))
        print("#------- Total time elapsed = %s\n" % str(time.time()-start))
        this_env, s_test, test_obs = get_init_state(this_env)

        for test_t in range(max_eps_length):
            # Greedy action with respect to the current Q-value estimates.
            q_now = sess.run(q, feed_dict={state: s_test.reshape(1, 28, 28, 4)})
            a = np.argmax(q_now)
            next_test_obs, test_r, done, info = this_env.step(a)

            # Only positive rewards contribute to the reported score.
            if test_r > 0:
                score += test_r

            # NOTE(review): the frame pushed here is the observation from the
            # previous step, so the state trails the environment by one frame.
            # This matches what get_init_state hands back; confirm it is intended.
            s_prime_test = update_state(s_test, test_obs)

            if done:
                score_list.append(score)
                frame_count_list.append(test_t+1)
                break
            s_test = s_prime_test
            test_obs = next_test_obs
        else:
            # The step cap was reached without the environment signalling `done`.
            # Record the truncated episode so it is not silently left out of the
            # statistics below.
            score_list.append(score)
            frame_count_list.append(max_eps_length)

    # Print results
    results_list = (np.mean(np.array(score_list)), np.std(np.array(score_list)),
                    np.mean(np.array(frame_count_list)), np.std(np.array(frame_count_list)))
    print("\nMean score = %f, stdev score = %f, mean frame count = %f, stdev frame count = %f" % results_list)
    # Save performance and loss results
    saved_results = list(results_list)
    filename = './results/B2results_'+game+'.npy'
    #np.save(filename, saved_results)

# Load Data

In [26]:
def totuple(a):
    """Recursively convert a nested iterable (e.g. a NumPy array) to tuples.

    Anything that cannot be iterated is returned unchanged. Strings are also
    returned unchanged: a one-character string iterates to itself, so
    descending into it would recurse without end.

    Args:
        a: a scalar, a string, or an arbitrarily nested iterable.

    Returns:
        ``a`` itself for scalars and strings, otherwise a tuple whose items
        have been converted the same way.
    """
    if isinstance(a, str):
        return a
    try:
        return tuple(totuple(item) for item in a)
    except TypeError:
        # Not iterable: this is a leaf value.
        return a

In [27]:
# Print the stored evaluation summary for every game.
# A dedicated loop variable is used so the notebook-level `game` setting is not
# overwritten; otherwise re-running the environment cell after this one would
# pick up the last name in this list.
# NOTE(review): the names here use "-V3" while the `game` setting (and therefore
# the file name built by the evaluation cell) uses "-v3". That only matches on a
# case-insensitive filesystem; confirm the actual file names in ./results.
games = ['MsPacman-V3','Pong-V3','Boxing-V3']
for game_name in games:
    filename = './results/B2results_'+game_name+'.npy'
    restored_data = np.load(filename)
    print('The results for {} were:'.format(game_name))
    print("Mean score = %f, stdev score = %f, mean frame count = %f, stdev frame count = %f\n" % totuple(restored_data))

The results for MsPacman-V3 were:
Mean score = 165.400000, stdev score = 112.751231, mean frame count = 650.020000, stdev frame count = 104.148066

The results for Pong-V3 were:
Mean score = 0.300000, stdev score = 2.100000, mean frame count = 1126.480000, stdev frame count = 674.100133

The results for Boxing-V3 were:
Mean score = 14.560000, stdev score = 2.150907, mean frame count = 2376.450000, stdev frame count = 13.150190

