In [1]:
import numpy as np
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY
from IPython.display import clear_output
from gym import wrappers
import matplotlib.pyplot as plt

import keras
from keras.models import Sequential, Model
from keras.layers import Input, Conv2D, Dense, Flatten, MaxPooling1D
from keras.layers import AveragePooling2D, MaxPooling2D, LSTM, Concatenate, Reshape, GRU, BatchNormalization
from keras.initializers import Constant
from keras.constraints import MaxNorm

# Environment configuration. `version` is appended to 'SuperMarioBros-v' to
# build the gym environment id, and `movement_type` is the action list handed
# to the discrete-action wrapper (both are consumed by make_env).
version = 0
movement_type = SIMPLE_MOVEMENT

def make_env(version, movement_type):
    """Create a Super Mario Bros environment with a discrete action space.

    Args:
        version: Value appended to ``'SuperMarioBros-v'`` to form the gym id.
        movement_type: Action list passed to ``BinarySpaceToDiscreteSpaceEnv``.

    Returns:
        The wrapped environment.
    """
    env_id = 'SuperMarioBros-v' + str(version)
    return BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id), movement_type)

# Environment instance created up front so the network can be sized from it.
env = make_env(version, movement_type)
# Shape of a single observation frame; presumably (240, 256, 3), which is the
# shape gameplay() reshapes frames to -- TODO confirm against the environment.
obs_shape = env.observation_space.shape
# square_shape / strides are only forwarded as get_mario_model defaults; the
# model body does not read them.
square_shape = (16,16)
strides = int(square_shape[0]/2)
# One network output per discrete action exposed by the wrapped environment.
output_dim = len(env.get_action_meanings())

def get_mario_model(obs_shape = obs_shape, square_shape = square_shape, strides = strides, output_dim = output_dim, hidden_size = 27):
    """Build and compile the stateful conv + GRU network used as the agent.

    The network is three strided ``Conv2D`` layers (with an average pool after
    the first and batch normalisation after each stage), flattened into a
    single time step that feeds a stateful ``GRU`` with a softmax activation.
    The batch size is fixed to 1 because the GRU is stateful.

    Args:
        obs_shape: Shape of one observation frame, without the batch axis.
        square_shape: Unused; kept so existing callers keep working.
        strides: Unused; kept so existing callers keep working.
        output_dim: Number of GRU units, i.e. the length of the action vector.
        hidden_size: Unused; kept so existing callers keep working.

    Returns:
        A compiled ``Sequential`` model (optimizer ``'adadelta'``, loss ``'mse'``).
    """
    # Plain tuple of Python ints instead of a NumPy array for the input shape.
    batch_input_shape = (1,) + tuple(int(dim) for dim in obs_shape)

    model = Sequential()
    model.add(Conv2D(batch_input_shape = batch_input_shape, filters = 3, kernel_size = (8,8), strides = 8, activation = 'relu', padding = 'same'))
    model.add(AveragePooling2D(pool_size = 2))
    model.add(BatchNormalization())
    model.add(Conv2D(filters = 3, kernel_size = (4,4), strides = 2, activation = 'relu', padding = 'same'))
    model.add(BatchNormalization())
    model.add(Conv2D(filters = 3, kernel_size = (4,4), strides = 2, activation = 'relu', padding = 'same'))
    model.add(BatchNormalization())
    model.add(Flatten())
    # A single time step holding every flattened feature. The width is inferred
    # (-1) rather than hard-coded to 48, so obs_shape values other than the
    # default environment's frame size also build.
    model.add(Reshape((1, -1)))
    model.add(GRU(output_dim, batch_size = 1, stateful = True, activation = 'softmax'))
    model.compile(optimizer = 'adadelta', loss = 'mse')
    return model

def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    The probabilities are moved to log space, divided by ``temperature``,
    re-normalised with a softmax and a single index is drawn from the result.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Non-zero scaling factor applied to the log-probabilities.

    Returns:
        The sampled index (as returned by ``np.argmax``).

    Raises:
        ValueError: If ``temperature`` is zero, or if no entry of ``preds``
            yields a finite log-probability (e.g. all entries are zero).
    """
    if temperature == 0:
        raise ValueError('temperature must be non-zero')
    preds = np.asarray(preds).astype('float64')
    # Zero probabilities legitimately become -inf logits (weight 0 after exp),
    # so the divide-by-zero warning from log(0) is just noise.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    peak = np.max(logits)
    if not np.isfinite(peak):
        raise ValueError('preds must contain at least one usable probability')
    # Shift by the largest logit before exponentiating: the softmax is
    # unchanged, but small temperatures can no longer underflow every term to
    # zero and produce 0/0 = NaN probabilities.
    exp_preds = np.exp(logits - peak)
    probs = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:
import math

# Small offset that keeps the reciprocal finite for zero entries.
eps = 10**-10

def rev_soft(vector):
    """Return the normalised reciprocal of ``vector``.

    Every entry ``v`` is replaced by ``1 / (v + eps)`` and the result is
    rescaled so that it sums to 1 along the last axis; the smallest inputs
    therefore receive the largest weights. The input is not modified.

    Args:
        vector: Array-like of non-negative weights, either 1-D or with the
            weights on the last axis (e.g. a ``(1, n)`` prediction row).

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``vector``.
    """
    # Work on a float copy: the caller's array stays untouched, every entry is
    # inverted (not just the first), and the sum is taken over the weights
    # rather than over the batch axis.
    inverted = 1.0 / (np.array(vector, dtype=float) + eps)
    return inverted / inverted.sum(axis=-1, keepdims=True)

def gameplay(env_args, agent, max_frames, render = False):
    """Play Super Mario Bros with ``agent`` while training it after every step.

    On each frame the agent predicts an action vector, the arg-max action is
    executed, and the agent is trained on that same frame: towards its own
    prediction when the shaped reward is positive, towards the negated
    prediction otherwise. The shaped reward is the environment reward plus
    the change in ``info['score'] / 100`` since the previous step.

    Args:
        env_args: Sequence ``[version, movement_type]`` forwarded to ``make_env``.
        agent: Model exposing ``reset_states``, ``predict`` and ``train_on_batch``.
        max_frames: Maximum number of environment steps to run.
        render: When true, render the environment after every step.

    Returns:
        The accumulated shaped reward over all executed steps.
    """
    version, movement_type = env_args[0], env_args[1]
    env = make_env(version, movement_type)
    fitness = 0
    # try/finally so the emulator is released even when the run is interrupted
    # (e.g. KeyboardInterrupt from the notebook) or a step raises.
    try:
        agent.reset_states()
        action_names = env.get_action_meanings()

        done = True
        x_pos = -1
        resting = 0
        score = 0
        for step in range(max_frames):
            if done:
                state = env.reset()
                # Score baseline is per episode; this assumes info['score']
                # restarts after a reset -- otherwise the first step would be
                # charged the whole previous score as negative reward.
                score = 0
            # Add the batch axis without hard-coding the frame size.
            np_state = state.reshape((1,) + state.shape)
            action_vec = agent.predict(np_state)
            clear_output(wait = True)
            action = np.argmax(action_vec)
            print('last mistake: ', action_names[action])
            print(action_vec)
            state, reward, done, info = env.step(action)
            current_score = info['score']/100
            reward += (current_score - score)
            score = current_score

            if reward > 0:
                # Reinforce the prediction that led to a positive reward.
                agent.train_on_batch(x = np_state, y = action_vec)
            else:
                # Push the network away from the prediction it just made.
                agent.train_on_batch(x = np_state, y = -(action_vec))
            fitness += float(reward)
            if render:
                env.render()
            # Count consecutive frames with (almost) no horizontal movement.
            if abs(info['x_pos'] - x_pos) < 5:
                resting += 1
            else:
                x_pos = info['x_pos']
                resting = 0
            if resting > 300:
                # Stuck for too long: restart the level and keep the fresh
                # frame, so the next prediction does not use the stale one.
                state = env.reset()
                score = 0
                resting = 0
    finally:
        env.close()
    return fitness

In [9]:
# Build the agent network with the default (module-level) shapes and print its
# layer table.
mario = get_mario_model()
mario.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (1, 30, 32, 3)            579       
_________________________________________________________________
average_pooling2d_3 (Average (1, 15, 16, 3)            0         
_________________________________________________________________
batch_normalization_7 (Batch (1, 15, 16, 3)            12        
_________________________________________________________________
conv2d_8 (Conv2D)            (1, 8, 8, 3)              147       
_________________________________________________________________
batch_normalization_8 (Batch (1, 8, 8, 3)              12        
_________________________________________________________________
conv2d_9 (Conv2D)            (1, 4, 4, 3)              147       
_________________________________________________________________
batch_normalization_9 (Batch (1, 4, 4, 3)              12        
__________

In [11]:
# Run an online-training session on SuperMarioBros-v0 for up to 100000 frames,
# rendering every step. The call returns the accumulated fitness.
gameplay(env_args = [0, movement_type], agent = mario, max_frames = 100000, render = True)

last mistake:  A
[[0.001183   0.00108814 0.00156289 0.00241907 0.00113375 0.989678
  0.00157725]]


KeyboardInterrupt: 

In [None]:
# Scratch cell: create a fresh environment (rebinds the module-level `env`).
env = make_env(0, movement_type)

In [None]:
# Scratch cell: list the attributes available on the wrapped environment.
dir(env)

In [None]:
# np.random.shuffle shuffles in place and returns None, so assigning its result
# always produced a = None. permutation(10) returns a shuffled copy of 0..9.
a = np.random.permutation(10)

In [None]:
# Small offset that keeps the reciprocal finite for zero entries.
eps = 10**-10

def rev_soft(vector):
    """Return the normalised reciprocal of ``vector``.

    Every entry ``v`` is replaced by ``1 / (v + eps)`` and the result is
    rescaled so that it sums to 1 along the last axis; the smallest inputs
    therefore receive the largest weights. The input is not modified.

    Args:
        vector: Array-like of non-negative weights (list or array, any
            numeric dtype), either 1-D or with the weights on the last axis.

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``vector``.
    """
    # Force a float copy: with integer input the reciprocals would otherwise be
    # truncated to 0 when written back into an integer array.
    inverted = 1.0 / (np.array(vector, dtype=float) + eps)
    return inverted / inverted.sum(axis=-1, keepdims=True)

In [None]:
# Quick check of rev_soft: the weights are inverted and renormalised, so the
# smallest inputs (0.1) should receive the largest share of the output.
rev_soft([0.1, 0.1, 0.3, 0.5])

In [None]:
# Uncomment to rebuild the model from scratch before inspecting it.
#mario = get_mario_model()
# Weight arrays of the last layer, i.e. the GRU added last in get_mario_model.
mario.layers[-1].get_weights()