In [None]:
from keras.layers import *
from __future__ import division
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, Reshape, Dropout
from keras.optimizers import sgd
import os
import random
from os.path import isfile
from collections import deque

from gym.core import Wrapper
class RewardForPassingGates(Wrapper):
    """Skiing wrapper that rewards player +1 for passing gates only."""
    def _reset(self):
        """On game reset, remember the hash of initial score"""
        s = self.env.reset()
        self.prev_score_hash = hash(s[31:38,67:81].tobytes()) #hash of the image chunk with scoreboard
        return s
    def _step(self,action):
        """on each step, if score has changed, give +1 reward, else +0"""
        s,_,done,info = self.env.step(action)
        new_score_hash = hash(s[31:38,67:81].tobytes()) #hash of the same image chunk

        #reward = +1 if we have just crossed the gate, else 0
        r = int(new_score_hash != self.prev_score_hash)

        #remember new score
        self.prev_score_hash = new_score_hash
        return s,r,done,info

NUM_ACTIONS = 3
NUM_STATES = 3
MAX_REPLAY_STATES = 10
BATCH_SIZE = 20
NUM_GAMES_TRAIN = 5
JUMP_FPS = 5
WEIGHT_FILE = 'weights.h5'


replay = []

gamma = 0.99
epsilon = 1

env = gym.make("Skiing-v0")

env = RewardForPassingGates(env)


layers = [
    #Reshape((1, 250, 160, 3), input_shape=(250, 160, 3)),
    Convolution2D(16, 5, 5, border_mode='same', input_shape=(170, 160, 3)),
    MaxPooling2D(pool_size=(3, 3), strides=(2,2)),
    Convolution2D(32, 5, 5),
    MaxPooling2D(pool_size=(2, 2), strides=(2,2)),
    Flatten(),
    Dense(250, activation='tanh'),
    Dropout(0.2),
    Dense(output_dim=3, activation='softmax')
]


def t(st):
    return st.reshape(1, 170, 160, 3)

model = Sequential(layers)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["mse"])
model.summary

NUM_ACTIONS
prb = 0.2
gamma = 0.5
epsilon = 0.35

st = env.reset()

st = st[:-80]
st = t(st)
for number_game in range(10):
  new_state = env.reset()
  new_state = new_state[:-80]
  
  reward_game = 0
  done = False
  loss = 0
  index_train_per_game = 0
  print( '[+] Starting Game ' + str(number_game))
  while not done:
    env.render()
    index_train_per_game += 1
    if random.random() < epsilon:
      action = np.random.randint(NUM_ACTIONS)
    else:

      if new_state.shape[0] != 170:
            new_state = new_state[:-80]
      q = model.predict(t(new_state))[0]
      action = np.argmax(q)
        
        
    old_state = new_state
    
    new_state, reward, done, info = env.step(action)
    
    reward_game += reward
    replay.append([new_state, reward, action, done, old_state])
    
    
    if len(replay) > MAX_REPLAY_STATES:
        replay.pop(np.random.randint(MAX_REPLAY_STATES) + 1)
        
    if JUMP_FPS != 1 and index_train_per_game % JUMP_FPS == 0:
      continue
    
    
    len_mini_batch = min(len(replay), BATCH_SIZE)
    
    mini_batch = random.sample(replay, len_mini_batch)
    
    X_train = []
    Y_train = []
    
    
    for index_rep in range(len_mini_batch):
      new_rep_state, reward_rep, action_rep, done_rep, old_rep_state = mini_batch[index_rep]
      temp = model.predict(t(new_rep_state[:-80]))
      if index_rep % 10 == 0 and index_rep != 0:
          print(temp, "index = ", index_rep)
      
      if old_rep_state.shape[0] != 170:
            old_rep_state = old_rep_state[:-80]
    
      old_q = model.predict(t(old_rep_state))[0]
      new_q = temp[0]
      update_target = np.copy(old_q)
      if done_rep:
        update_target[action_rep] = -1
      else:
        update_target[action_rep] = reward_rep + (gamma * np.max(new_q))
      X_train.append(old_rep_state)
      Y_train.append(update_target)
        
    X_train = np.array(X_train)
    Y_train = np.array(Y_train)
   
        
    loss += np.array(model.train_on_batch(X_train, Y_train))

    if reward_game > 200:
      break
  loss_print = (loss / index_train_per_game * JUMP_FPS).tolist()
  print ("[+] End Game {} | Reward {} | Epsilon {:.4f} | TrainPerGame {} | Loss [{:.4f}, {:.4f}] ".format(number_game, reward_game, epsilon, index_train_per_game, loss_print[0], loss_print[1]))
  #if epsilon >= 0.8:
  #  epsilon -= (1 / (NUM_GAMES_TRAIN))
  if isfile(WEIGHT_FILE):
    os.remove(WEIGHT_FILE)
  model.save_weights(WEIGHT_FILE)


[2017-01-18 22:38:56,585] Making new env: Skiing-v0


[+] Starting Game 0
[+] End Game 0 | Reward 5 | Epsilon 0.3500 | TrainPerGame 1946 | Loss [3.9502, 0.0704] 
[+] Starting Game 1


In [41]:
model.predict(t(new_state))[0]

array([  7.24852782e-08,   9.91017699e-01,   9.01098147e-08], dtype=float32)

In [98]:
model.save('lunch')