In [1]:
import gym
import numpy as np
import torch
import time

from gym.wrappers import Monitor

import torch
from torch.distributions import Categorical
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import math
import copy
import os

from itertools import count

import matplotlib.pyplot as plt
%matplotlib notebook

# NEURO-EVO

In [2]:
class CartPoleAgent(nn.Module):
    """Two-layer perceptron mapping 4 input features to probabilities over 2 actions.

    Architecture: Linear(4 -> 124) -> ReLU -> Linear(124 -> 2) -> softmax.
    """

    def __init__(self):
        super().__init__()
        # Hidden layer first, output layer second: the creation order fixes the
        # order in which the default weight initialisation consumes torch's RNG.
        self.affine1 = nn.Linear(in_features=4, out_features=124)
        self.affine2 = nn.Linear(in_features=124, out_features=2)

        # Per-episode bookkeeping lists; both start empty.
        self.saved_log_probs, self.rewards = [], []

    def forward(self, x):
        """Return action probabilities for a 2-D batch ``x`` (softmax over dim 1)."""
        hidden = F.relu(self.affine1(x))
        return F.softmax(self.affine2(hidden), dim=1)

In [3]:
def get_initialized_agents(folderName):
    """Build the initial population from every ``*.pth`` checkpoint in a folder.

    Each checkpoint is loaded as a state dict into a fresh ``CartPoleAgent``.
    Files that cannot be loaded are reported with ``print`` and skipped, so one
    corrupt checkpoint does not abort the whole run.

    Args:
        folderName: Directory containing the ``.pth`` checkpoint files.

    Returns:
        list: The successfully loaded agents (possibly empty).
    """
    agents = []
    # Sort the listing so the agent order (and therefore the indices printed
    # during evolution) does not depend on filesystem enumeration order.
    for path in sorted(os.listdir(folderName)):
        if not path.endswith('.pth'):
            continue
        try:
            model = CartPoleAgent()
            # NOTE(review): torch.load unpickles the file; only point this at
            # checkpoints produced by a trusted source.
            model.load_state_dict(torch.load(os.path.join(folderName, path)))
            agents.append(model)
        except Exception as e:
            print(e)
    return agents

In [4]:
def run_agents(agents, max_steps=250):
    """Play one CartPole-v1 episode per agent and return the total rewards.

    Actions are sampled from each agent's output probabilities using NumPy's
    global RNG. A fresh environment is created per call and is always closed
    again, even if an agent raises mid-episode.

    Args:
        agents: Iterable of modules mapping a ``(1, 4)`` float tensor to a
            ``(1, 2)`` tensor of action probabilities.
        max_steps: Maximum number of steps per episode (default 250, so an
            episode's total reward is capped by this value).

    Returns:
        list: One accumulated episode reward per agent, in input order.
    """
    game_actions = 2
    reward_agents = []
    env = gym.make("CartPole-v1")
    try:
        env.spec.reward_threshold = 500

        for agent in agents:
            agent.eval()
            observation = env.reset()

            r = 0
            for _ in range(max_steps):
                inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
                output_probabilities = agent(inp).detach().numpy()[0]
                action = np.random.choice(range(game_actions), 1, p=output_probabilities).item()
                observation, reward, done, info = env.step(action)
                r = r + reward

                if done:
                    break

            reward_agents.append(r)
    finally:
        # This function is called once per agent per evaluation run, so a
        # leaked environment would accumulate quickly.
        env.close()

    return reward_agents

In [5]:
def return_average_score(agent, runs):
    """Average the episode reward of ``agent`` over ``runs`` separate episodes.

    Each episode is played through ``run_agents`` with a single-agent list.
    """
    episode_totals = [run_agents([agent])[0] for _ in range(runs)]
    score = 0.
    for total in episode_totals:
        score += total
    return score / runs

In [6]:
def run_agents_n_times(agents, runs):
    """Return each agent's average score over ``runs`` episodes, in input order."""
    averages = []
    for candidate in agents:
        averages.append(return_average_score(candidate, runs))
    return averages

In [7]:
def mutate(agent, mutation_power=0.02):
    """Return a mutated deep copy of ``agent``; the original is left untouched.

    Every parameter of the copy receives independent Gaussian noise scaled by
    ``mutation_power``. The noise comes from NumPy's global RNG, so
    ``np.random.seed`` controls it.

    Args:
        agent: The ``nn.Module`` to copy and perturb.
        mutation_power: Standard deviation of the added noise. The default of
            0.02 is taken from https://arxiv.org/pdf/1712.06567.pdf.

    Returns:
        nn.Module: The perturbed copy.
    """
    child_agent = copy.deepcopy(agent)
    # In-place edits of leaf parameters that require grad raise a RuntimeError
    # while autograd is recording, so do the update under no_grad instead of
    # relying on the caller having disabled gradients globally.
    with torch.no_grad():
        for param in child_agent.parameters():
            # One vectorised draw per parameter tensor, whatever its rank.
            noise = np.random.randn(*param.shape)
            param.add_(
                torch.as_tensor(noise, dtype=param.dtype, device=param.device)
                * mutation_power
            )

    return child_agent

In [8]:
def return_children(agents, sorted_parent_indexes, elite_index):
    """Produce the next generation: mutated offspring plus one unmutated elite.

    ``len(agents) - 1`` children are created by mutating parents drawn
    uniformly at random from ``sorted_parent_indexes``. The elite chosen by
    ``add_elite`` is appended last.

    Returns:
        tuple: ``(children, elite_position)`` where ``elite_position`` is the
        index of the elite inside ``children`` (always the final slot).
    """
    parent_count = len(sorted_parent_indexes)
    offspring = [
        mutate(agents[sorted_parent_indexes[np.random.randint(parent_count)]])
        for _ in range(len(agents) - 1)
    ]

    # The elite is evaluated only after all mutations have been drawn.
    offspring.append(add_elite(agents, sorted_parent_indexes, elite_index))

    return offspring, len(offspring) - 1

In [9]:
def add_elite(agents, sorted_parent_indexes, elite_index=None, only_consider_top_n=10):
    """Re-evaluate the leading candidates and return a copy of the best one.

    Candidates are the first ``only_consider_top_n`` entries of
    ``sorted_parent_indexes`` plus, when given, ``elite_index``. Each is scored
    as its average over 5 episodes; the first candidate with the strictly
    highest score wins.

    Returns:
        A deep copy of the winning agent.
    """
    contenders = sorted_parent_indexes[:only_consider_top_n]
    if elite_index is not None:
        contenders = np.append(contenders,[elite_index])

    best_score, best_index = None, None
    for i in contenders:
        score = return_average_score(agents[i],runs=5)
        print("Score for elite i ", i, " is ", score)

        # Strict comparison: on a tie the earlier candidate is kept.
        if best_score is None or score > best_score:
            best_score, best_index = score, i

    print("Elite selected with index ",best_index, " and score", best_score)

    return copy.deepcopy(agents[best_index])
    

In [10]:
def play_agent(agent):
    """Render and record one CartPole-v1 episode played by ``agent``.

    The episode (at most 250 steps) is recorded into ``./video`` through gym's
    ``Monitor`` wrapper and the accumulated reward is printed. Failures are
    reported with ``print`` rather than raised, and the environment is closed
    in every case.

    Args:
        agent: Module mapping a ``(1, n_obs)`` float tensor to a row of action
            probabilities.
    """
    env = None
    env_record = None
    try:
        env = gym.make("CartPole-v1")
        env_record = Monitor(env, './video', force=True)
        # Number of discrete actions; this name was previously undefined in
        # this function, so every call failed with a NameError.
        game_actions = env.action_space.n
        observation = env_record.reset()

        r = 0
        for _ in range(250):
            env_record.render()
            inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
            output_probabilities = agent(inp).detach().numpy()[0]
            action = np.random.choice(range(game_actions), 1, p=output_probabilities).item()
            observation, reward, done, info = env_record.step(action)
            r = r + reward

            if done:
                break

        print("Rewards: ", r)

    except Exception as e:
        print(e.__doc__)
        # Python 3 exceptions have no ``.message`` attribute; print the
        # exception itself.
        print(e)
    finally:
        # Close whichever handle was actually created.
        if env_record is not None:
            env_record.close()
        elif env is not None:
            env.close()

In [11]:
def trainEVOModel(folderName, fileName):
    """Evolve the agents stored in ``./<folderName>`` for 10 generations.

    Each generation scores every agent (average of 10 episodes), keeps the top
    5 as parents, prints a summary and replaces the population with mutated
    children plus one elite.

    Autograd is disabled only for the duration of this function and the
    previous grad mode is restored on exit, so gradient-based training can run
    afterwards.

    Args:
        folderName: Folder (relative to the working directory) holding the
            ``.pth`` checkpoints that seed the population.
        fileName: Unused; kept for call compatibility.

    Returns:
        list: The final population (callers may ignore it).

    Raises:
        ValueError: If no agent could be loaded from the folder.
    """
    agents = get_initialized_agents('./' + folderName)
    if not agents:
        raise ValueError("No agents could be loaded from './" + folderName + "'")

    top_limit = 5 # Number of top agents to consider as parents
    generations = 10

    elite_index = None
    # Scoped replacement for a global torch.set_grad_enabled(False), which
    # would stay in effect after this function returns.
    with torch.no_grad():
        for generation in range(generations):
            rewards = run_agents_n_times(agents, 10) # Average of k runs

            sorted_parent_indexes = np.argsort(rewards)[::-1][:top_limit]
            print('\n')

            top_rewards = [rewards[best_parent] for best_parent in sorted_parent_indexes]

            print("Generation ", generation, " | Mean rewards: ", np.mean(rewards), " | Mean of top 5: ",np.mean(top_rewards[:5]))
            print("Top ",top_limit," scores", sorted_parent_indexes)
            print("Rewards for top: ",top_rewards)

            agents, elite_index = return_children(agents, sorted_parent_indexes, elite_index)

    return agents

# DQN (note: the training code below is a REINFORCE-style policy gradient, not a Q-network)

In [12]:
class Policy(nn.Module):
    """Policy network sized from the module-level ``env``.

    Architecture: Linear(obs_dim -> 124) -> ReLU -> Linear(124 -> n_actions)
    -> softmax. ``env`` must already exist when the class is instantiated.
    """

    def __init__(self):
        super().__init__()
        self.state_space = env.observation_space.shape[0]
        self.action_space = env.action_space.n

        hidden_units = 124
        self.affine1 = nn.Linear(self.state_space, hidden_units)
        self.affine2 = nn.Linear(hidden_units, self.action_space)

        # Episode buffers appended to while acting and cleared after each update.
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        """Return action probabilities for a 2-D batch ``x`` (softmax over dim 1)."""
        hidden = F.relu(self.affine1(x))
        return F.softmax(self.affine2(hidden), dim=1)

In [13]:
def select_action(state):
    """Sample an action for ``state`` from the global ``policy``.

    The log-probability of the sampled action is appended to
    ``policy.saved_log_probs`` for the later policy update.

    Args:
        state: NumPy observation array.

    Returns:
        int: The sampled action.
    """
    batch = torch.from_numpy(state).float().unsqueeze(0)
    dist = Categorical(policy(batch))
    sampled = dist.sample()
    policy.saved_log_probs.append(dist.log_prob(sampled))
    return sampled.item()

def sim_action(state):
    """Return the highest-probability action for ``state`` under the global ``policy``.

    Unlike ``select_action`` this neither samples nor records a log-probability.
    """
    batch = torch.from_numpy(state).float().unsqueeze(0)
    best = policy(batch).max(1)[1]
    return best.item()

def finish_episode():
    """Apply one policy-gradient update from the episode stored on ``policy``.

    Discounted returns are computed backwards from ``policy.rewards`` using the
    global ``gamma``, standardised, and multiplied by the negated saved
    log-probabilities to form the loss, which is optimised with the global
    ``optimizer``. Both episode buffers are cleared afterwards.

    Edge cases:
        * Nothing recorded: the buffers are cleared and no update is made.
        * A single step: the return is used unstandardised, because the sample
          standard deviation of one value is NaN and would corrupt every
          parameter.
    """
    if not policy.rewards or not policy.saved_log_probs:
        del policy.rewards[:]
        del policy.saved_log_probs[:]
        return

    # Accumulate from the last step backwards, then flip once. This avoids the
    # quadratic cost of repeatedly inserting at the front of a list.
    R = 0
    returns = []
    for r in reversed(policy.rewards):
        R = r + gamma * R
        returns.append(R)
    returns.reverse()

    returns = torch.tensor(returns)
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_loss = [-log_prob * R for log_prob, R in zip(policy.saved_log_probs, returns)]

    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()

    del policy.rewards[:]
    del policy.saved_log_probs[:]

def sim():
    """Play and render one greedy episode on the global ``env``.

    Actions come from ``sim_action``. The accumulated reward is printed when
    the episode ends; nothing is printed if it is still running after 9999
    steps.
    """
    state = env.reset()
    tot_reward = 0
    for _ in range(1, 10000):
        step_result = env.step(sim_action(state))
        state, reward, done = step_result[0], step_result[1], step_result[2]
        tot_reward += reward
        env.render()
        if not done:
            continue
        print(tot_reward)
        break

In [14]:
def trainDQNmodel():
    """Train the global ``policy`` on ``env`` for up to ``numEpisodes`` episodes.

    Despite the name, actions are sampled with ``select_action`` and the update
    is performed by ``finish_episode`` after every episode. An exponentially
    smoothed reward (weight 0.04 on the newest episode) is tracked; training
    stops early, closing ``env``, once it exceeds ``env.spec.reward_threshold``.
    Progress is printed every ``log_interval`` episodes.
    """
    smoothing = 0.04
    running_reward = 0
    for i_episode in range(numEpisodes):
        ep_reward = 0
        state = env.reset()
        # The step cap keeps a never-terminating episode from looping forever.
        for t in range(1, 10000):
            state, reward, done, _ = env.step(select_action(state))
            if render:
                env.render()
            policy.rewards.append(reward)
            ep_reward += reward
            if done:
                break

        running_reward = smoothing * ep_reward + (1 - smoothing) * running_reward
        finish_episode()

        should_log = i_episode % log_interval == 0
        if should_log:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))

        solved = running_reward > env.spec.reward_threshold
        if not solved:
            continue

        print("Solved! Running reward is now {} and "
              "the last episode runs to {} time steps!".format(running_reward, t))
        print("{},{}".format(i_episode, ep_reward))
        env.close()
        break

In [15]:
# Hyper-parameters and shared environment for the policy-gradient phase.
gamma = 0.95 # discount factor
seed  = 543
render = False
log_interval = 10

env = gym.make('CartPole-v1')
env.spec.reward_threshold = 200
numEpisodes = 15
numSuccessions = 10

# Apply the seed to every RNG the notebook draws from, so that a fresh
# Restart & Run All is repeatable: torch (weight init, action sampling),
# NumPy (evolutionary action sampling and mutation) and the environment.
torch.manual_seed(seed)
np.random.seed(seed)
if hasattr(env, "seed"):
    # Only older gym releases expose Env.seed().
    env.seed(seed)

In [16]:
# Numerical floor added to the return standard deviation in finish_episode.
eps = float(np.finfo(np.float32).eps)

# Shared policy network and its Adam optimiser (learning rate 0.01).
policy = Policy()
optimizer = optim.Adam(params=policy.parameters(), lr=1e-2)

In [None]:
# Alternate the two training phases: each succession saves numEpisodes
# checkpoints of the gradient-trained policy into its own folder, then uses
# those checkpoints as the starting population for the evolutionary phase.
for successions in range(numSuccessions):

    folderName = "Succession" + str(successions)
    # exist_ok avoids the check-then-create race and makes re-runs idempotent.
    os.makedirs(folderName, exist_ok=True)

    # Policy-gradient training needs autograd; make sure it is on even if an
    # earlier evolutionary phase left it disabled.
    torch.set_grad_enabled(True)

    for episodes in range(numEpisodes):
        trainDQNmodel()
        fileName   = "Episode" + str(episodes) + ".pth"
        torch.save(policy.state_dict(), os.path.join(folderName, fileName))

    trainEVOModel(folderName, fileName)

Episode 0	Last reward: 33.00	Average reward: 1.32
Episode 10	Last reward: 18.00	Average reward: 7.22
Episode 0	Last reward: 33.00	Average reward: 1.32
Episode 10	Last reward: 129.00	Average reward: 24.93
Episode 0	Last reward: 42.00	Average reward: 1.68
Episode 10	Last reward: 53.00	Average reward: 21.60
Episode 0	Last reward: 70.00	Average reward: 2.80
Episode 10	Last reward: 57.00	Average reward: 27.05
Episode 0	Last reward: 36.00	Average reward: 1.44
Episode 10	Last reward: 385.00	Average reward: 51.52
Episode 0	Last reward: 500.00	Average reward: 20.00
Episode 10	Last reward: 130.00	Average reward: 112.78
Episode 0	Last reward: 109.00	Average reward: 4.36
Episode 10	Last reward: 116.00	Average reward: 43.53
Episode 0	Last reward: 121.00	Average reward: 4.84
Episode 10	Last reward: 71.00	Average reward: 32.78
Episode 0	Last reward: 62.00	Average reward: 2.48
Episode 10	Last reward: 105.00	Average reward: 32.32
Episode 0	Last reward: 123.00	Average reward: 4.92
Episode 10	Last reward

In [None]:
# Display the succession folder name most recently assigned by the training loop.
folderName