In [9]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output

import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY
from gym import wrappers
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv


import keras
import keras.backend as K
from keras.models import Sequential, Model
from keras.layers import Input, Conv2D, Conv3D, Dense, Flatten, MaxPooling1D, SeparableConv2D, Activation, Lambda
from keras.layers import AveragePooling2D, MaxPooling2D, LSTM, Concatenate, Reshape, GRU, BatchNormalization, UpSampling2D
from keras.initializers import Constant
from keras.constraints import MaxNorm
from keras.applications.xception import Xception
from keras.applications.mobilenet_v2 import MobileNetV2


version = 0
movement_type = SIMPLE_MOVEMENT

def make_env(version, movement_type):
    env = gym_super_mario_bros.make('SuperMarioBros-v' + str(version))
    env = BinarySpaceToDiscreteSpaceEnv(env, movement_type)
    return env

env = make_env(version, movement_type)
obs_shape = env.observation_space.shape
square_shape = (16,16)
strides = int(square_shape[0]/2)
output_dim = len(env.get_action_meanings())
#output_dim = 4
buffer = 7

def get_mario_vision_model(buffer = buffer, obs_shape = obs_shape, square_shape = square_shape, strides = strides, output_dim = output_dim, hidden_size = 10):
    image = Input((240, 256, 3))
    encoded = Lambda(lambda x: K.spatial_2d_padding(x), output_shape=(256,256,3))(image)
    encoded = Lambda(function = lambda x: x/255.0)(encoded)
    encoded = Conv2D(kernel_size = (8,8), filters=3, padding = 'same', activation = 'relu')(encoded)
    encoded = MaxPooling2D((4,4))(encoded)
    encoded = Conv2D(kernel_size = (4,4), filters=6, padding = 'same', activation = 'relu')(encoded)
    encoded = MaxPooling2D((4,4))(encoded)
    encoded = Conv2D(kernel_size = (2,2), filters=12, padding = 'same', activation = 'relu')(encoded)
    encoded = MaxPooling2D((2,2))(encoded)
    encoded = Conv2D(kernel_size = (2,2), filters=24, padding = 'same', activation = 'relu')(encoded)
    encoded = MaxPooling2D((2,2))(encoded)
    
    encoder = Model(image, encoded)
 
    decoded = UpSampling2D((2,2))(encoded)
    decoded = Conv2D(kernel_size = (2,2), filters=12, activation = 'relu', padding = 'same')(decoded)
    decoded = UpSampling2D((2,2))(decoded)
    decoded = Conv2D(kernel_size = (2,2), filters=6, activation = 'relu', padding = 'same')(decoded)
    decoded = UpSampling2D((4,4))(decoded)
    decoded = Conv2D(kernel_size = (4,4), filters=3, activation = 'relu', padding = 'same')(decoded)
    decoded = UpSampling2D((4,4))(decoded)
    decoded = Conv2D(kernel_size = (8,8), filters=3, activation = 'relu', padding = 'same')(decoded)
    
    autoencoder = Model(image, decoded)
    
    encoder.compile(optimizer = 'adam', loss = 'mse')
    autoencoder.compile(optimizer = 'adam', loss = 'mse')
    return encoder, autoencoder

def get_mario_action_model(framecount, frame_dim, output_dim):
    model = Sequential()
    model.add(Reshape((framecount, frame_dim)))
    model.add(GRU(output_dim, input_shape = (framecount, frame_dim)))
    model.compile(optimizer = 'adam', loss = 'mse')
    model.build()
    return model
    

def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

print(movement_type)

def action_translator(action_vec, movement_type = movement_type, threshold = 1/3, verbose = False):
    
    actions = []
    if action_vec[0] > threshold:
        actions.append('right')
        if action_vec[3] > 0:
            actions.append('B')
    elif action_vec[0] < -threshold:
        actions.append('left')
        if action_vec[3] > 0:
            actions.append('B')
    if action_vec[2] > 0:
            actions.append('A')
    elif action_vec[0] < threshold and action_vec[0] > -threshold:
        if action_vec[1] > threshold:
            actions.append('up')
        elif action_vec[1] < -threshold:
            actions.append('down')
    
    if verbose:
        print(actions)
    for i, commands in enumerate(movement_type):
        if set(actions) == set(commands):
            return i
    return 0

encoder, autoencoder = get_mario_vision_model()

encoded_dim = 1 
for dim in encoder.layers[-1].output_shape:
    if dim != None:
        encoded_dim *= dim

action_model = get_mario_action_model(buffer, encoded_dim, output_dim)

[['NOOP'], ['right'], ['right', 'A'], ['right', 'B'], ['right', 'A', 'B'], ['A'], ['left']]


In [5]:
autoencoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 240, 256, 3)       0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 256, 256, 3)       0         
_________________________________________________________________
lambda_2 (Lambda)            (None, 256, 256, 3)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 256, 256, 3)       579       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 64, 64, 3)         0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 64, 64, 6)         294       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 16, 16, 6)         0         
__________

In [6]:
def mutation(agents, progenitor_label, mutant_label, cancer_level = 1, chance = 0.1):
    progenitor = agents[progenitor_label]
    mutant = agents[mutant_label]
    for pro_layer, mut_layer in zip(progenitor.layers, mutant.layers):
        new_weights = []
        for sublayer in pro_layer.get_weights():
            mutated =  np.random.normal(0, cancer_level, np.shape(sublayer))
            for x in np.nditer(mutated, op_flags=['readwrite']):
                if np.random.random() > chance:
                    x[...] = 0
            new_weights.append(np.clip(sublayer + mutated, -1, 1))
        mut_layer.set_weights(new_weights)
    
    return mutant

def survival_of_the_fittest(agents, fitness_vec):
    top_agents = sorted(zip(agents, fitness_vec), key  = lambda agent: -agent[1])
    sorted_agents = list(np.array(top_agents).T[0])
    sorted_fitness = list(np.array(top_agents).T[1])
    return sorted_agents, sorted_fitness

def reproduction(agents, chance, cancer_level, num_survivors):
    offspring = []
    num_offspring = len(agents)//num_survivors
    for i in range(len(agents)):
        if i < num_survivors:
            offspring.append(agents[i])
        else:
            offspring.append(mutation(agents = agents, progenitor_label = (i-num_survivors)//num_offspring, 
                                  mutant_label = i, 
                                  chance = chance, cancer_level = cancer_level))
        
            
    return offspring

def get_fitness_vec(agents, env, max_frames, num_survivors, fitness_vec, buffer, first):
    for i, agent in enumerate(agents):
        if i >= num_survivors:
            fitness_vec[i] = gameplay(agent, env, max_frames, buffer = buffer)
        elif first:
            fitness_vec[i] = gameplay(agent, env, max_frames, buffer = buffer)
    return fitness_vec


def info_reward(info):
    reward = info['x_pos']
    reward += info['score']/100
    if info['status'] == 'tall':
        reward += 10
    elif info['status'] == 'fireball':
        reward += 20
    reward += info['coins']
    reward += 15*info['life']
    reward += 1000*(info['stage']-1)
    reward += 5000*(info['world']-1)
    return reward
    
def gameplay(encoder, action_model, env, max_frames, buffer = 3, max_rest = 800):  
    
    agent.reset_states()
    reward_hist = []
    
    life = 2
    fitness = 0
    done = True
    x_pos = -1
    resting = 0
    score = 0
    action = 0
    reward = 0
    action_vec = np.ones((1,1,len(env.get_keys_to_action())))
    action = 0
    prev_reward = 0
    for step in range(max_frames):
        if done:
            state = env.reset()
        if step%buffer == buffer - 1:
            np_state.append(np.array(state))
            np_state = np.stack(np_state)
            encoded_state = encoder.predict(np_state)
            action_vec = action_model.predict(encoded_state)
            action_vec = np.rand
            action = np.argmax(action_vec)
        elif step%buffer == 0:
            np_state = []
        else:
            np_state.append(np.array(state))
        state, reward, done, info = env.step(action)
        fitness += reward
        if abs(info['x_pos'] - x_pos) < 1:
            resting += 1
        else:
            x_pos = info['x_pos']
            resting = 0
        if resting > max_rest:
            return fitness
        if life != info['life']:
            return fitness
    return fitness

import time

def evolution_step(env, generation, num_agents_per_gen, num_survivors, chance, cancer_level, max_frames, fitness_vec, buffer, first):
    
    fitness_hist = []
    current = time.time()
    fitness_vec = get_fitness_vec(generation, env, max_frames, num_survivors, fitness_vec, buffer= buffer, first = first)
    #para_step(fitness_vec)
    fit_time = time.time() - current
    survivors, fitness_vec = survival_of_the_fittest(generation, fitness_vec)
    current = time.time()
    generation = reproduction(survivors, chance = chance, cancer_level = cancer_level, num_survivors = num_survivors)
    gen_time = time.time() - current
    times = (fit_time, gen_time)
    return generation, fitness_vec, times

In [3]:
mario_count = 32
num_survivors = 4  
env = make_env(version, movement_type)
buffer = 4
version = 0
chance = 0.02
cancer_level = .2

name = str(mario_count) + '-b' + str(buffer) + '-v' + str(version) + '-p' + str(chance) + '-q' + str(cancer_level) + '.pkl'

max_frames = 5000

try:
    agents = pickle.load(open('gen' + name, 'rb'))
    fitness_hist = pickle.load(open('fithist' + name,'rb'))
    fitness_vec = list(reversed(range(len(agents))))
    epoch_count = len(fitness_hist)
except:
    agents = [get_mario_model() for _ in range(mario_count)]
    fitness_hist = []
    fitness_vec = np.zeros(mario_count)
    epoch_count = 0
    
max_frames = 5000

agents[0].summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lambda_16 (Lambda)           (1, 240, 256, 3)          0         
_________________________________________________________________
conv2d_61 (Conv2D)           (1, 30, 32, 1)            193       
_________________________________________________________________
activation_76 (Activation)   (1, 30, 32, 1)            0         
_________________________________________________________________
conv2d_62 (Conv2D)           (1, 15, 16, 2)            34        
_________________________________________________________________
activation_77 (Activation)   (1, 15, 16, 2)            0         
_________________________________________________________________
conv2d_63 (Conv2D)           (1, 8, 8, 4)              36        
_________________________________________________________________
activation_78 (Activation)   (1, 8, 8, 4)              0         
__________

In [10]:
fitness_hist = []

encoder.summary()
action_model = get_mario_action_model(buffer, encoded_dim, output_dim)
action_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 240, 256, 3)       0         
_________________________________________________________________
lambda_5 (Lambda)            (None, 256, 256, 3)       0         
_________________________________________________________________
lambda_6 (Lambda)            (None, 256, 256, 3)       0         
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 256, 256, 3)       579       
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 64, 64, 3)         0         
_________________________________________________________________
conv2d_18 (Conv2D)           (None, 64, 64, 6)         294       
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 16, 16, 6)         0         
__________

ValueError: This model has not yet been built. Build the model first by calling build() or calling fit() with some data. Or specify input_shape or batch_input_shape in the first layer for automatic build. 

In [5]:
from multiprocessing import Process, Lock, Manager, current_process
import numpy as np
import os

def paraplay(fitness_vec, env_no, nproc, max_frames, buffer, max_rest, verbose):
    for agent_no in range(len(agents)):
        if agent_no%nproc == env_no:
            fitness = gameplay(agents[agent_no], envs[env_no], max_frames, buffer, max_rest, verbose)
            fitness_vec[agent_no] = fitness
    print(os.getpid())

In [6]:
def para_step(fitness_vec):
    manager = Manager()
    jobs = []

    fitness_vec = manager.list(fitness_vec)
    
    for i in range(7):
        env_no = i
        p = Process(target=paraplay, args=(fitness_vec, env_no, 7, 10, 8, 60, False))
        jobs.append(p)
        
    for job in jobs:
        job.start()
    
    for job in jobs:
        job.join()

In [7]:
import copy

fitness_hist = []
epoch_count = 1

best_agent = agents[0]
best_score = 0 

In [38]:
max_frames = 1000
max_rest = 100

reward_hist = []

life = 2
fitness = 0
done = True
x_pos = -1
resting = 0
score = 0
action = 0
reward = 0
action_vec = np.ones((1,1,len(env.get_keys_to_action())))
action = 0
prev_reward = 0
for step in range(max_frames):
    if done:
        state = env.reset()
    if step%buffer == buffer - 1:
        np_state.append(np.array(state))
        np_state = np.stack(np_state)
        encoded_state = encoder.predict(np_state)
        action_vec = action_model.predict(encoded_state)
        action_vec = np.rand
        action = np.argmax(action_vec)
    elif step%buffer == 0:
        np_state = []
    else:
        np_state.append(np.array(state))
    state, reward, done, info = env.step(action)
    fitness += reward
    if abs(info['x_pos'] - x_pos) < 1:
        resting += 1
    else:
        x_pos = info['x_pos']
        resting = 0
    if resting > max_rest:
        print(fitness)
    if life != info['life']:
        print(fitness)
print(fitness)

ValueError: total size of new array must be unchanged