In [6]:
from __future__ import print_function
import numpy as np
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
import gym

In [3]:
import os
# Configure CUDA device selection through environment variables.
# CUDA_VISIBLE_DEVICES is set to the empty string, i.e. no GPU id is listed.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "",
})

In [4]:
# Number of discrete actions the agent chooses between. It is also the width of
# the one-hot action vector fed to the model's second input.
n_actions = 2

In [7]:
# Create the Pong environment and grab the first raw frame.
env = gym.make("Pong-v0")
observation = env.reset()
# Previous preprocessed frame. The training loop reads and writes `prev_s`, so
# it has to exist before the first iteration (None means "no previous frame").
prev_s = None
# Older name for the same slot; kept in case another cell still reads it.
prev_x = None

[2018-01-01 12:34:02,227] Making new env: Pong-v0


In [2]:
def prepro(I):
    """Preprocess a 210x160x3 uint8 frame into a 6400-element (80x80) 1D float vector.

    The frame is cropped to rows 35..194, downsampled by 2 in both directions,
    and reduced to its first colour channel. Pixels equal to 0, 144 or 109
    (the two background colours) become 0.0; every other pixel (paddles, ball)
    becomes 1.0.

    The input frame is NOT modified. The previous implementation wrote through
    a view of `I` and therefore overwrote pixels in the caller's observation.

    Args:
        I: array indexable as I[row, col, channel] with at least 195 rows.

    Returns:
        1D float64 array holding the flattened binary mask.
    """
    # Crop and downsample in one slice; this is still a view into I, so it is
    # only read from, never assigned to.
    frame = I[35:195:2, ::2, 0]
    # Foreground = anything that is neither already 0 nor a background colour.
    foreground = (frame != 0) & (frame != 144) & (frame != 109)
    # np.float was removed in NumPy 1.24; it was an alias of the builtin float,
    # i.e. float64.
    return foreground.astype(np.float64).ravel()

In [6]:
# Value network: takes an 80x80x1 image plus a vector of length n_actions and
# produces one scalar, trained by regression (MSE) against discounted rewards.
img = keras.layers.Input(shape=(80,80,1))
# Three stages of 3x3 convolution (16 filters) followed by 2x2 max-pooling.
conv1 = Conv2D(16, (3, 3), activation='relu')(img)
maxpool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
conv2 = Conv2D(16, (3, 3), activation='relu')(maxpool1)
maxpool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
conv3 = Conv2D(16, (3, 3), activation='relu')(maxpool2)
maxpool3 = MaxPooling2D(pool_size=(2, 2))(conv3)
flatten = Flatten()(maxpool3)
# Second input: the action vector, concatenated with the image features.
input2 = keras.layers.Input(shape=(n_actions,))
concat = keras.layers.Concatenate(axis=-1)([input2,flatten])
dense1 = Dense(64, activation='relu')(concat)
dense2 = Dense(32, activation='relu')(dense1)
# Linear output instead of relu: the regression targets are discounted rewards,
# which are negative whenever a point is lost. A relu head cannot output a
# negative value and stops receiving gradient once it saturates at 0.
out = Dense(1, activation='linear')(dense2)
model = keras.models.Model(inputs=[img, input2], outputs=out)
model.summary()

model.compile(loss='mean_squared_error',
              optimizer=keras.optimizers.Adam())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 80, 80, 1)    0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 78, 78, 16)   160         input_1[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)  (None, 39, 39, 16)   0           conv2d_1[0][0]                   
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 37, 37, 16)   2320        max_pooling2d_1[0][0]            
__________________________________________________________________________________________________
max_poolin

In [7]:
# Smoke test: push one all-zero image and one all-zero action vector through
# the model to confirm that the two inputs are wired up with the right shapes.
s = np.zeros((1, 80, 80, 1))
a = np.zeros((1, n_actions))
model.predict([s, a])

array([[ 0.]], dtype=float32)

In [None]:
def Q(s,n_a):
    """Estimate Q(s, a) for every action with the global `model`.

    The state is paired with each one-hot action vector and all pairs are
    evaluated in a single batched `model.predict` call. The previous version
    passed an unbatched action vector of shape (n_a,) and the raw state to the
    model one action at a time, which does not match the model's
    (batch, 80, 80, 1) / (batch, n_actions) inputs.

    Args:
        s: state holding 80*80 values (flat 6400 vector or already shaped
           as (1, 80, 80, 1)).
        n_a: number of actions; must equal the width of the model's action input.

    Returns:
        1D float64 numpy array of length n_a; entry i is the estimate for action i.
    """
    # Add the batch and channel axes the image input expects.
    frame = np.asarray(s, dtype=np.float32).reshape(1, 80, 80, 1)
    # One copy of the state per candidate action.
    state_batch = np.repeat(frame, n_a, axis=0)
    # Row i of the identity matrix is the one-hot encoding of action i.
    action_batch = np.eye(n_a, dtype=np.float32)
    estimates = model.predict([state_batch, action_batch])
    return np.asarray(estimates, dtype=np.float64).reshape(n_a)

In [36]:
# Scratch check: draw a random integer that is either 0 or 1.
np.random.randint(2)

1

In [None]:
def act(Q, epsilon=0.1):
    """Pick an action index from Q-value estimates (epsilon-greedy).

    The previous body ignored its argument and always returned 1.

    Args:
        Q: 1D array-like of value estimates, one entry per action.
        epsilon: probability of choosing a uniformly random action instead of
            the best-looking one. Use 0.0 for a purely greedy choice.

    Returns:
        int index of the chosen action, in range(len(Q)).

    Raises:
        ValueError: if Q contains no entries.
    """
    q_values = np.asarray(Q, dtype=np.float64).ravel()
    if q_values.size == 0:
        raise ValueError("act() needs at least one Q value")
    # Explore: occasionally ignore the estimates altogether.
    if np.random.rand() < epsilon:
        return int(np.random.randint(q_values.size))
    # Exploit: highest estimate wins (first index on ties).
    return int(np.argmax(q_values))

In [None]:
def discount_rewards(r, gamma=0.99):
    """Compute discounted rewards for a sequence of per-step rewards.

    Walks the sequence backwards, accumulating `running * gamma + r[t]`. The
    accumulator is reset whenever a non-zero reward is met, because in Pong a
    non-zero reward marks the end of a rally (game boundary).

    Args:
        r: array-like of rewards in time order. A 1D array or a column of
           shape (N, 1) both work; integer input is converted to float so the
           discounted values are not truncated.
        gamma: discount factor applied per step. Previously this was read from
           an undefined global of the same name.

    Returns:
        float64 numpy array with the same shape as `r`.
    """
    r = np.asarray(r, dtype=np.float64)
    flat_r = r.ravel()
    discounted_r = np.zeros_like(flat_r)
    running_add = 0.0
    # range instead of xrange: xrange does not exist on Python 3.
    for t in reversed(range(flat_r.size)):
        if flat_r[t] != 0: running_add = 0.0 # reset the sum, since this was a game boundary (pong specific!)
        running_add = running_add * gamma + flat_r[t]
        discounted_r[t] = running_add
    return discounted_r.reshape(r.shape)

In [10]:
# Per-step buffers filled by the training loop. The loop appends to `rewards`,
# so that list must exist here; `drs` is kept because it was defined before.
states,actions,rewards,drs = [],[],[],[]
# Count of finished episodes; the loop uses it to decide when to train.
episode_number = 0

In [None]:
# Start from a clean slate so the cell can be re-run on its own.
observation = env.reset()
prev_s = None                               # previous preprocessed frame
states, actions, rewards = [], [], []       # steps of the episode in progress
batch_s, batch_a, batch_r = [], [], []      # finished episodes awaiting training

# Runs until the kernel is interrupted.
while True:
    env.render()
    # 1 iter per frame: the state is the difference between consecutive
    # preprocessed frames (all zeros for the first frame of an episode).
    cur_s = prepro(observation)
    s = cur_s - prev_s if prev_s is not None else np.zeros_like(cur_s)
    prev_s = cur_s

    # choose action, apply, get measurements
    # (call Q instead of building a tuple, and do not rebind the name Q)
    q_values = Q(s, n_actions)
    a = act(q_values)
    # Offset by 2 to map the agent's action index onto the environment's ids.
    observation, reward, done, info = env.step(a + 2)
    states.append(s)
    actions.append(a)
    rewards.append(reward)
    if reward == 1: print("!!!!!")

    # when finished, calculate rewards, train Q against these rewards
    if done:
        episode_number += 1
        print(np.sum(rewards))
        # Shape the episode the way the model's two inputs and its output
        # expect: images (N, 80, 80, 1), one-hot actions (N, n_actions),
        # targets (N, 1).
        batch_s.append(np.vstack(states).reshape(-1, 80, 80, 1))
        batch_a.append(np.eye(n_actions)[actions])
        batch_r.append(discount_rewards(np.vstack(rewards)))
        states, actions, rewards = [], [], []
        # Train on everything collected since the last update, not just the
        # final episode, then start a new batch.
        if episode_number%32==0:
            model.fit(x=[np.concatenate(batch_s), np.concatenate(batch_a)],
                      y=np.concatenate(batch_r), epochs=8, verbose=0)
            batch_s, batch_a, batch_r = [], [], []
        observation = env.reset()
        prev_s = None

In [9]:
# Reference lookup: print the documentation of env.step (arguments and the
# returned tuple).
help(env.step)

Help on method step in module gym.core:

step(action) method of gym.wrappers.time_limit.TimeLimit instance
    Run one timestep of the environment's dynamics. When end of
    episode is reached, you are responsible for calling `reset()`
    to reset this environment's state.
    
    Accepts an action and returns a tuple (observation, reward, done, info).
    
    Args:
        action (object): an action provided by the environment
    
    Returns:
        observation (object): agent's observation of the current environment
        reward (float) : amount of reward returned after previous action
        done (boolean): whether the episode has ended, in which case further step() calls will return undefined results
        info (dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning)

