In [42]:
import numpy as np
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY
from IPython.display import clear_output
from gym import wrappers
import matplotlib.pyplot as plt

In [1]:
import keras
from keras.models import Sequential, Model
from keras.layers import Input, Conv2D, Dense, Flatten, MaxPooling1D
from keras.layers import AveragePooling2D, MaxPooling2D, LSTM, Concatenate, Reshape, GRU, BatchNormalization, SeparableConv2D
from keras.initializers import Constant
from keras.constraints import MaxNorm

version = 0
movement_type = SIMPLE_MOVEMENT

def make_env(version, movement_type):
    env = gym_super_mario_bros.make('SuperMarioBros-v' + str(version))
    env = BinarySpaceToDiscreteSpaceEnv(env, movement_type)
    return env

env = make_env(version, movement_type)
obs_shape = env.observation_space.shape
square_shape = (16,16)
strides = int(square_shape[0]/2)
#output_dim = len(env.get_action_meanings())
output_dim = 4

def get_mario_model(obs_shape = obs_shape, square_shape = square_shape, strides = strides, output_dim = output_dim, hidden_size = 27):
    
    model = Sequential()
    model.add(SeparableConv2D(batch_input_shape = np.concatenate(([1],obs_shape)), filters = 3, kernel_size = (8,8), 
                     strides = 8, activation = 'relu', padding = 'valid'))
    model.add(AveragePooling2D(pool_size = 2))
    model.add(Conv2D(filters = 3, kernel_size = (4,4), strides = 2, activation = 'relu', padding = 'same'))
    #model.add(AveragePooling2D(pool_size = 2))
    model.add(Conv2D(filters = 6, kernel_size = (4,4), strides = 2, activation = 'relu', padding = 'same'))
    model.add(Conv2D(filters = 12, kernel_size = (4,4), strides = 2, activation = 'relu', padding = 'same'))
    model.add(BatchNormalization())
    model.add(Flatten())
    #model.add(Dense(100))
    model.add(Reshape((1,48)))
    #model.add(Flatten())
    model.add(GRU(output_dim, batch_size = 1, stateful = True, activation = 'tanh'))
    #model.add(Dense(output_dim, activation = 'softmax'))
    model.compile(optimizer = 'adam', loss = 'mse')
    return model

def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


NameError: name 'SIMPLE_MOVEMENT' is not defined

In [3]:
print(COMPLEX_MOVEMENT)


def action_translator(action_vec, movement_type = movement_type, threshold = 0.2, verbose = False):
    actions = []
    if action_vec[0] > threshold:
        actions.append('right')
        if action_vec[3] > 0:
            actions.append('B')
    elif action_vec[0] < -threshold:
        actions.append('left')
        if action_vec[3] > 0:
            actions.append('B')
    if action_vec[2] > 0:
            actions.append('A')
    elif action_vec[0] < threshold and action_vec[0] > -threshold:
        if action_vec[1] > threshold:
            actions.append('up')
        elif action_vec[1] < -threshold:
            actions.append('down')
    
    if verbose:
        print(actions)
    for i, commands in enumerate(movement_type):
        if set(actions) == set(commands):
            return i
    return 0

[['NOOP'], ['right'], ['right', 'A'], ['right', 'B'], ['right', 'A', 'B'], ['A'], ['left'], ['left', 'A'], ['left', 'B'], ['left', 'A', 'B'], ['down'], ['up']]


In [4]:
def mutation(agents, progenitor_label, mutant_label, cancer_level = 1, chance = 0.1):
    import copy
    progenitor = agents[progenitor_label]
    mutant = agents[mutant_label]
    for pro_layer, mut_layer in zip(progenitor.layers, mutant.layers):
        new_weights = []
        for sublayer in pro_layer.get_weights():
            mutated =  np.random.uniform(-cancer_level, cancer_level, np.shape(sublayer))
            for x in np.nditer(mutated, op_flags=['readwrite']):
                if np.random.random() > chance:
                    x[...] = 0
            new_weights.append(sublayer + mutated)
        mut_layer.set_weights(new_weights)
    
    return mutant

def survival_of_the_fittest(agents, fitness_vec):
    top_agents = sorted(zip(agents, fitness_vec), key  = lambda agent: -agent[1])
    sorted_agents = list(np.array(top_agents).T[0])
    return sorted_agents

def reproduction(agents, chance, cancer_level):
    offspring = []
    offspring.append(agents[0])
    for i in range(1,len(agents)):
        offspring.append(mutation(agents = agents, progenitor_label = 0, 
                                  mutant_label = i, 
                                  chance = chance, cancer_level = cancer_level))
    return offspring

def get_fitness_vec(agents, env, max_frames, num_survivors, fitness_vec, buffer = 3):
    for i, agent in enumerate(agents[:num_survivors]):
        fitness_vec[i] = gameplay(agent, env, max_frames, buffer = 3)
    return fitness_vec

import sklearn.preprocessing
a = range(len(env.get_keys_to_action()))
label_binarizer = sklearn.preprocessing.LabelBinarizer()
label_binarizer.fit(range(max(a)+1))


def gameplay(agent, env, max_frames, label_binarizer = label_binarizer, buffer = 3):  
    
    agent.reset_states()
    reward_hist = []
    
    life = 2
    fitness = 0
    done = True
    x_pos = -1
    resting = 0
    score = 0
    action = 0
    reward = 0
    action_vec = np.ones((1,1,len(env.get_keys_to_action())))
    action = 0
    for step in range(max_frames):
        if done:
            state = env.reset()
        #clear_output(wait = True)
        if step%buffer == 0:
            np_state = state.reshape(1,240,256,3)
            action_vec = agent.predict(np_state)
            action = action_translator(action_vec[0], verbose = False)
        state, reward, done, info = env.step(action)
        #reward += (info['score']/100 - score)
        #score = info['score']/100
        #fitness += float(reward*(np.max(action_vec)+1))
        fitness += float(reward)
        #print(reward)
        #print(action_vec)
        
        if abs(info['x_pos'] - x_pos) < 1:
            resting += 1
        else:
            x_pos = info['x_pos']
            resting = 0
        if resting > 150:
            #env.close()
            return fitness
        if life != info['life']:
            #env.close()
            return fitness
        #reward_hist.append(float(reward))
    #env.close()
    return fitness

import time

def evolution_step(env, generation, num_agents_per_gen, num_survivors, chance, cancer_level, max_frames, fitness_vec, buffer = 3):
    
    num_offspring = num_agents_per_gen//num_survivors
    fitness_hist = []
    current = time.time()
    fitness_vec = get_fitness_vec(generation, env, max_frames, 0, fitness_vec, buffer = 3)
    fit_time = time.time() - current
    survivors = survival_of_the_fittest(generation, fitness_vec)
    current = time.time()
    generation = reproduction(survivors, chance = chance, cancer_level = cancer_level)
    gen_time = time.time() - current
    times = (fit_time, gen_time)
    return generation, max(fitness_vec), times

In [5]:
def survival_of_the_fittest(agents, fitness_vec):
    top_agents = sorted(zip(agents, fitness_vec), key  = lambda agent: -agent[1])
    sorted_agents = list(np.array(top_agents).T[0])
    return sorted_agents

In [6]:
def reproduction(agents, chance, cancer_level):
    offspring = []
    offspring.append(agents[0])
    for i in range(1,len(agents)):
        offspring.append(mutation(agents = agents, progenitor_label = 0, 
                                  mutant_label = i, 
                                  chance = chance, cancer_level = cancer_level))
    return offspring

In [44]:
import sklearn.preprocessing
a = range(len(env.get_keys_to_action()))
label_binarizer = sklearn.preprocessing.LabelBinarizer()
label_binarizer.fit(range(max(a)+1))


def gameplay(agent, env, max_frames, label_binarizer = label_binarizer, buffer = 3):  
    
    agent.reset_states()
    reward_hist = []
    
    life = 2
    fitness = 0
    done = True
    x_pos = -1
    resting = 0
    score = 0
    action = 0
    reward = 0
    action_vec = np.ones((1,1,len(env.get_keys_to_action())))
    action = 0
    for step in range(max_frames):
        if done:
            state = env.reset()
        #clear_output(wait = True)
        if step%buffer == 0:
            np_state = state.reshape(1,240,256,3)
            action_vec = agent.predict(np_state)
            action = action_translator(action_vec[0], verbose = False)
        state, reward, done, info = env.step(action)
        #reward += (info['score']/100 - score)
        #score = info['score']/100
        #fitness += float(reward*(np.max(action_vec)+1))
        fitness += float(reward)
        #print(reward)
        #print(action_vec)
        
        if abs(info['x_pos'] - x_pos) < 1:
            resting += 1
        else:
            x_pos = info['x_pos']
            resting = 0
        if resting > 150:
            #env.close()
            return fitness
        if life != info['life']:
            #env.close()
            return fitness
        #reward_hist.append(float(reward))
    #env.close()
    return fitness

In [8]:
def get_fitness_vec(agents, env, max_frames, num_survivors, fitness_vec, buffer = 3):
    for i, agent in enumerate(agents):
        if i > num_survivors:
            fitness_vec[i] = gameplay(agent, env, max_frames, buffer = 3)
    return fitness_vec

In [9]:
import time

def evolution_step(env, generation, num_agents_per_gen, num_survivors, chance, cancer_level, max_frames, fitness_vec, buffer = 3):
    
    num_offspring = num_agents_per_gen//num_survivors
    fitness_hist = []
    current = time.time()
    fitness_vec = get_fitness_vec(generation, env, max_frames, 0, fitness_vec, buffer = 3)
    fit_time = time.time() - current
    survivors = survival_of_the_fittest(generation, fitness_vec)
    current = time.time()
    generation = reproduction(survivors, chance = chance, cancer_level = cancer_level)
    gen_time = time.time() - current
    times = (fit_time, gen_time)
    return generation, max(fitness_vec), times

In [10]:
import pickle

mario_count = 2
mario = get_mario_model()
mario.summary()
final_generation = [get_mario_model() for _ in range(mario_count)]
#final_generation = pickle.load(open('best_generation.pkl','rb'))

fitness_hist = []
#fitness_hist = pickle.load(open('fitness_hist.pkl','rb'))
epoch_count = 0
fitness = 0
fitness_vec = [0]*len(final_generation)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (1, 30, 32, 3)            579       
_________________________________________________________________
average_pooling2d_1 (Average (1, 15, 16, 3)            0         
_________________________________________________________________
conv2d_2 (Conv2D)            (1, 8, 8, 3)              147       
_________________________________________________________________
conv2d_3 (Conv2D)            (1, 4, 4, 6)              294       
_________________________________________________________________
conv2d_4 (Conv2D)            (1, 2, 2, 12)             1164      
_________________________________________________________________
batch_normalization_1 (Batch (1, 2, 2, 12)             48        
_________________________________________________________________
flat

In [1]:
%%time

env = make_env(version, movement_type)

num_epochs = 1000
for epoch in range(num_epochs):
    epoch_count += 1
    max_frames = 5000
    final_generation, fitness, times = evolution_step(env = env, generation = final_generation,
                      num_agents_per_gen = mario_count, num_survivors = 1, buffer = 3,
                      chance = 0.005, cancer_level = 1, max_frames = max_frames, fitness_vec = fitness_vec)
    fitness_hist.append(fitness)
    clear_output(wait = True)
    print('epoch: ', epoch_count)
    print('fitness: ', fitness)
    print('times: ', times)
    pickle.dump(final_generation, open('weighted2_reward' + str(movement_type) + 'generation.pkl', 'wb'))
    pickle.dump(fitness_hist, open('new_fitness_hist.pkl','wb'))
plt.plot(fitness_hist)

NameError: name 'make_env' is not defined

In [2]:
agent = agents[0]

reward_hist = []

agent.reset_states()
buffer = 3

env = make_env(version, movement_type)
env = wrappers.Monitor(env, "./gym-results", force=True)

life = 2
fitness = 0
done = True
x_pos = -1
resting = 0
score = 0
state_history = []
for step in range(10000):
    if done:
        state =  env.reset()
    if step%buffer == 0:
        np_state = np.array(state.reshape(1,240,256,3))
        action_vec = agent.predict(np_state)
        action = action_translator(action_vec[0])
    state, reward, done, info = env.step(action)
    reward += info['score']/100 - score
    score = info['score']/100
    if life != info['life']:
        #env.close()
        break
    if abs(info['x_pos'] - x_pos) < 1:
        resting += 1
    else:
        x_pos = info['x_pos']
        resting = 0
    if resting > 50:
        break
    reward_hist.append(float(reward))
    clear_output(wait = True)
    plt.imshow(np_state[0])
    plt.show()
    state_history.append(np_state[0])
env.close()

import io
import base64
from IPython.display import HTML

video = io.open('./gym-results/openaigym.video.%s.video000000.mp4' % env.file_infix, 'r+b').read()

encoded = base64.b64encode(video)
print(fitness)

video = HTML(data='''
        <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
    .format(encoded.decode('ascii')))

NameError: name 'agents' is not defined

In [27]:
video

In [14]:

final_generation[1].layers[0].get_weights()

[array([[[[ 3.24058831e-02, -1.44240394e-01,  5.75020015e-02],
          [-1.75468668e-01,  8.46975148e-02, -8.17062855e-02],
          [ 1.05910540e-01,  1.35828316e-01,  1.57587361e-02]],
 
         [[ 1.02869004e-01, -1.13769859e-01,  1.22863293e-01],
          [-3.36441994e-02,  1.15431389e-02,  9.46033597e-02],
          [ 2.64967466e-03,  1.06778413e-01,  9.64252055e-02]],
 
         [[ 5.38550317e-02,  6.85678720e-02,  6.02620244e-02],
          [-9.96830761e-02, -1.20229512e-01, -2.77282596e-02],
          [ 7.34265745e-02,  2.75697410e-02, -2.71953940e-02]],
 
         [[ 2.52249241e-02,  7.85366297e-02,  2.86211073e-02],
          [-3.00897956e-02,  2.89447010e-02,  1.24070078e-01],
          [-8.09809566e-02, -5.46855154e-03,  8.60202014e-02]],
 
         [[-1.20655984e-01, -6.60990477e-02,  3.66608202e-02],
          [ 1.19713962e-01,  6.60248399e-02, -9.87692475e-02],
          [-1.81681663e-01, -8.71637762e-02, -5.30388393e-02]],
 
         [[-3.67123187e-02,  1.45450234e

In [15]:
agent.layers

[<keras.layers.convolutional.Conv2D at 0x7f2da04fd668>,
 <keras.layers.pooling.AveragePooling2D at 0x7f2da02fd1d0>,
 <keras.layers.convolutional.Conv2D at 0x7f2da02fd240>,
 <keras.layers.convolutional.Conv2D at 0x7f2da0314fd0>,
 <keras.layers.convolutional.Conv2D at 0x7f2da02bd978>,
 <keras.layers.normalization.BatchNormalization at 0x7f2da02d69b0>,
 <keras.layers.core.Flatten at 0x7f2da02d6748>,
 <keras.layers.core.Reshape at 0x7f2da027e048>,
 <keras.layers.recurrent.GRU at 0x7f2da027e9e8>]

In [16]:
eyes = Sequential()
for layer in agent.layers[:4]:
    eyes.add(layer)

In [17]:
vision_history = []
for state in state_history:
    vision = eyes.predict(state.reshape(1,240,256,3))[0]
    vision_history.append((vision*255//np.max(vision)).astype('uint8'))

In [18]:
import imageio
imageio.mimwrite('gameplay.gif', state_history , fps = 60)



imageio.mimwrite('vision.gif', vision_history , fps = 60)

ValueError: Image must be 2D (grayscale, RGB, or RGBA).

In [None]:
action_translator([0,1,1,1])