In [1]:
import numpy as np
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY
from IPython.display import clear_output
from gym import wrappers
import matplotlib.pyplot as plt

import keras
from keras.models import Sequential, Model
from keras.layers import Input, Conv2D, Dense, Flatten, MaxPooling1D
from keras.layers import AveragePooling2D, MaxPooling2D, LSTM, Concatenate, Reshape, GRU, BatchNormalization
from keras.initializers import Constant
from keras.constraints import MaxNorm

# Notebook-level configuration: the numeric suffix of the 'SuperMarioBros-v<N>'
# environment id and the action list passed to make_env.
version, movement_type = 0, COMPLEX_MOVEMENT

def make_env(version, movement_type):
    """Build a Super Mario Bros environment with a discrete action space.

    Args:
        version: value appended to 'SuperMarioBros-v' to form the gym id.
        movement_type: action list handed to BinarySpaceToDiscreteSpaceEnv
            (e.g. SIMPLE_MOVEMENT, COMPLEX_MOVEMENT or RIGHT_ONLY).

    Returns:
        The gym environment wrapped in BinarySpaceToDiscreteSpaceEnv.
    """
    env_id = 'SuperMarioBros-v{}'.format(version)
    return BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id), movement_type)

# Instantiate one environment up front so the observation shape and the number
# of discrete actions can be read from it; both become default arguments of
# get_mario_model below.
env = make_env(version, movement_type)
obs_shape = env.observation_space.shape
output_dim = len(env.get_action_meanings())

# Patch size and half-patch stride; in the visible code these are only used as
# default arguments of get_mario_model.
square_shape = (16, 16)
strides = square_shape[0] // 2

def get_mario_model(obs_shape = obs_shape, rnn_dim = 20, square_shape = square_shape, strides = strides, output_dim = output_dim, hidden_size = 27):
    """Build and compile the convolutional policy network for the Mario agent.

    The network takes a single frame (batch size fixed to 1) and returns a
    tensor of shape (1, 1, output_dim) holding one softmax distribution over
    the discrete actions.

    Args:
        obs_shape: shape of one observation frame, without the batch axis.
        rnn_dim: currently unused (left over from a recurrent variant).
        square_shape: currently unused.
        strides: currently unused.
        output_dim: number of discrete actions, i.e. the width of the output.
        hidden_size: currently unused.

    Returns:
        A compiled keras Sequential model (optimizer 'adadelta', loss 'mse').
    """
    # A plain tuple instead of a numpy array; the leading 1 pins the batch size.
    batch_shape = (1,) + tuple(obs_shape)

    model = Sequential()
    model.add(Conv2D(batch_input_shape = batch_shape, filters = 3, kernel_size = (8,8), strides = 8, activation = 'relu', padding = 'same'))
    model.add(AveragePooling2D(pool_size = 2))
    model.add(BatchNormalization())
    model.add(Conv2D(filters = 3, kernel_size = (4,4), strides = 2, activation = 'relu', padding = 'same'))
    model.add(BatchNormalization())
    model.add(Conv2D(filters = 3, kernel_size = (4,4), strides = 2, activation = 'relu', padding = 'same'))
    model.add(BatchNormalization())
    model.add(Flatten())
    # Insert a length-1 middle axis. The feature size is inferred (-1) rather
    # than hard-coded to 48, so the model also builds for other obs_shape
    # values; for the default observation shape the result is still (1, 1, 48).
    model.add(Reshape((1, -1)))
    # NOTE(review): two consecutive softmax Dense layers are kept exactly as in
    # the original architecture; stacking softmax on softmax flattens the output
    # distribution -- confirm this is intended (hidden_size is unused).
    model.add(Dense(output_dim, activation = 'softmax'))
    model.add(Dense(output_dim, activation = 'softmax'))
    model.compile(optimizer = 'adadelta', loss = 'mse')
    return model

def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature rescaling.

    Args:
        preds: array-like of probabilities.
        temperature: divisor applied to the log-probabilities; values below 1
            sharpen the distribution, values above 1 flatten it.

    Returns:
        The index selected by a single multinomial draw.
    """
    probs = np.asarray(preds).astype('float64')
    # Rescale in log space, then renormalise so the weights sum to one again.
    weights = np.exp(np.log(probs) / temperature)
    probs = weights / np.sum(weights)
    # One trial, one experiment: a one-hot row whose hot position is the draw.
    one_hot = np.random.multinomial(1, probs, 1)
    return np.argmax(one_hot)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import random

import numpy as np
from numpy.random import dirichlet

def gameplay(env_args, agent, max_frames, render = False):
    """Play the game with `agent` for `max_frames` frames while fitting it online.

    A fresh environment is created from `env_args`. Every 10th frame the agent
    predicts an action distribution for the current frame and the argmax action
    is repeated until the next prediction. After each step the agent is fitted
    on the last predicted frame: towards its own prediction when the shaped
    reward is positive, otherwise towards a Dirichlet sample drawn around that
    prediction.

    Args:
        env_args: sequence `(version, movement_type)` forwarded to make_env.
        agent: keras-style model exposing reset_states, predict and fit; its
            prediction is indexed as action_vec[0][0], i.e. shape (1, 1, k).
        max_frames: total number of environment steps to run.
        render: when true, render the environment after every step.

    Returns:
        The sum of shaped rewards collected over all steps (float).
    """
    version, movement_type = env_args[0], env_args[1]
    env = make_env(version, movement_type)

    agent.reset_states()

    fitness = 0
    done = True      # forces a reset on the very first frame
    x_pos = -1       # last x position that counted as progress
    resting = 0      # consecutive frames without meaningful x movement
    score = 0        # info['score'] / 100 as of the previous step
    action = 0

    # try/finally so the emulator is closed even when the run is interrupted
    # (e.g. KeyboardInterrupt from the notebook) or a step raises.
    try:
        for step in range(max_frames):
            if done:
                state = env.reset()
                # presumably info['score'] restarts from 0 after a reset; zero the
                # baseline so the next delta is not a large spurious negative reward.
                score = 0
            if step%10 == 0:
                # Add the batch axis without hard-coding the frame size.
                np_state = state.reshape((1,) + state.shape)
                action_vec = agent.predict(np_state)
                action = np.argmax(action_vec)
            clear_output(wait = True)
            print('last mistake: ',env.get_action_meanings()[action])
            print(action_vec)
            state, reward, done, info = env.step(action)
            # Shape the reward with the change in (scaled) score.
            reward += (info['score']/100 - score)
            score = info['score']/100

            if reward > 0:
                agent.fit(x = np_state, y = action_vec, verbose = 0)
            else:
                # Perturb the prediction; reshape to the prediction's own shape
                # instead of relying on the size of the key-to-action mapping.
                target = dirichlet(action_vec[0][0], size=1).reshape(action_vec.shape)
                agent.fit(x = np_state, y = target, epochs = 10, verbose = 0)
            fitness += float(reward)
            if render:
                env.render()
            if abs(info['x_pos'] - x_pos) < 5:
                resting += 1
            else:
                x_pos = info['x_pos']
                resting = 0
            if resting > 100:
                # Stalled for more than 100 frames: restart, and keep the fresh
                # frame so the next prediction does not use a stale state.
                state = env.reset()
                score = 0
                resting = 0
    finally:
        env.close()
    return fitness

In [3]:
# Build the agent with all default arguments of get_mario_model and print its
# layer-by-layer summary.
mario = get_mario_model()
mario.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (1, 30, 32, 3)            579       
_________________________________________________________________
average_pooling2d_1 (Average (1, 15, 16, 3)            0         
_________________________________________________________________
batch_normalization_1 (Batch (1, 15, 16, 3)            12        
_________________________________________________________________
conv2d_2 (Conv2D)            (1, 8, 8, 3)              147       
_________________________________________________________________
batch_normalization_2 (Batch (1, 8, 8, 3)              12        
_________________________________________________________________
conv2d_3 (Conv2D)            (1, 4, 4, 3)              147       
_________________________________________________________________
batc

In [4]:
# Run the online-training loop with rendering. The environment version comes
# from the notebook-level `version` constant rather than a repeated literal 0,
# so the model and the gameplay loop are always built for the same environment.
gameplay(env_args = [version, movement_type], agent = mario, max_frames = 100000, render = True)

last mistake:  right B
[[[0.06151859 0.08476613 0.09580789 0.16872647 0.09092003 0.07400467
   0.08651996 0.05664944 0.05290625 0.11307614 0.06643151 0.04867288]]]


KeyboardInterrupt: 

In [None]:
# NOTE(review): scratch cell. `action_vec` is only assigned inside gameplay(),
# so on a fresh kernel this raises NameError unless it is defined elsewhere.
dirichlet(action_vec[0][0], size=1)

In [None]:
# NOTE(review): scratch cell. `action_vec` is only assigned inside gameplay(),
# so on a fresh kernel this raises NameError unless it is defined elsewhere.
np.zeros(action_vec.shape)[0][0]