In [1]:
import gym
import numpy as np
import torch
import matplotlib.pyplot as plt
import time

In [2]:
from gym.wrappers import Monitor

In [3]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [4]:
import math
import copy

In [5]:
class CartPoleAI(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Sequential(
                        nn.Linear(4,128, bias=True),
                        nn.ReLU(),
                        nn.Linear(128,2, bias=True),
                        nn.Softmax(dim=1)
                        )

                
        def forward(self, inputs):
            x = self.fc(inputs)
            return x

In [6]:
def init_weights(m):
    
        # nn.Conv2d weights are of shape [16, 1, 3, 3] i.e. # number of filters, 1, stride, stride
        # nn.Conv2d bias is of shape [16] i.e. # number of filters
        
        # nn.Linear weights are of shape [32, 24336] i.e. # number of input features, number of output features
        # nn.Linear bias is of shape [32] i.e. # number of output features
        
        if ((type(m) == nn.Linear) | (type(m) == nn.Conv2d)):
            torch.nn.init.xavier_uniform(m.weight)
            m.bias.data.fill_(0.00)
                

In [7]:
def return_random_agents(num_agents):
    
    agents = []
    for _ in range(num_agents):
        
        agent = CartPoleAI()
        
        for param in agent.parameters():
            param.requires_grad = False
            
        init_weights(agent)
        agents.append(agent)
        
        
    return agents
    

In [8]:
def run_agents(agents):
    
    reward_agents = []
    env = gym.make("CartPole-v0")
    
    for agent in agents:
        agent.eval()
    
        observation = env.reset()
        
        r=0
        s=0
        
        for _ in range(250):
            
            inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
            output_probabilities = agent(inp).detach().numpy()[0]
            action = np.random.choice(range(game_actions), 1, p=output_probabilities).item()
            new_observation, reward, done, info = env.step(action)
            r=r+reward
            
            s=s+1
            observation = new_observation

            if(done):
                break

        reward_agents.append(r)        
        #reward_agents.append(s)
        
    
    return reward_agents

In [9]:
def return_average_score(agent, runs):
    score = 0.
    for i in range(runs):
        score += run_agents([agent])[0]
    return score/runs

In [10]:
def run_agents_n_times(agents, runs):
    avg_score = []
    for agent in agents:
        avg_score.append(return_average_score(agent,runs))
    return avg_score

In [11]:
def mutate(agent):

    child_agent = copy.deepcopy(agent)
    
    mutation_power = 0.02 #hyper-parameter, set from https://arxiv.org/pdf/1712.06567.pdf
            
    for param in child_agent.parameters():
    
        if(len(param.shape)==4): #weights of Conv2D

            for i0 in range(param.shape[0]):
                for i1 in range(param.shape[1]):
                    for i2 in range(param.shape[2]):
                        for i3 in range(param.shape[3]):
                            
                            param[i0][i1][i2][i3]+= mutation_power * np.random.randn()
                                
                                    

        elif(len(param.shape)==2): #weights of linear layer
            for i0 in range(param.shape[0]):
                for i1 in range(param.shape[1]):
                    
                    param[i0][i1]+= mutation_power * np.random.randn()
                        

        elif(len(param.shape)==1): #biases of linear layer or conv layer
            for i0 in range(param.shape[0]):
                
                param[i0]+=mutation_power * np.random.randn()

    return child_agent

In [12]:
def return_children(agents, sorted_parent_indexes, elite_index):
    
    children_agents = []
    
    #first take selected parents from sorted_parent_indexes and generate N-1 children
    for i in range(len(agents)-1):
        
        selected_agent_index = sorted_parent_indexes[np.random.randint(len(sorted_parent_indexes))]
        children_agents.append(mutate(agents[selected_agent_index]))

    #now add one elite
    elite_child = add_elite(agents, sorted_parent_indexes, elite_index)
    children_agents.append(elite_child)
    elite_index=len(children_agents)-1 #it is the last one
    
    return children_agents, elite_index

In [13]:
def add_elite(agents, sorted_parent_indexes, elite_index=None, only_consider_top_n=10):
    
    candidate_elite_index = sorted_parent_indexes[:only_consider_top_n]
    
    if(elite_index is not None):
        candidate_elite_index = np.append(candidate_elite_index,[elite_index])
        
    top_score = None
    top_elite_index = None
    
    for i in candidate_elite_index:
        score = return_average_score(agents[i],runs=5)
        print("Score for elite i ", i, " is ", score)
        
        if(top_score is None):
            top_score = score
            top_elite_index = i
        elif(score > top_score):
            top_score = score
            top_elite_index = i
            
    print("Elite selected with index ",top_elite_index, " and score", top_score)
    
    child_agent = copy.deepcopy(agents[top_elite_index])
    return child_agent
    

In [14]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)

In [None]:
game_actions = 2 #2 actions possible: left or right

#disable gradients as we will not use them
torch.set_grad_enabled(False)

# initialize N number of agents
num_agents = 500
agents = return_random_agents(num_agents)

# How many top agents to consider as parents
top_limit = 20

# run evolution until X generations
generations = 1000

elite_index = None

for generation in range(generations):

    # return rewards of agents
    rewards = run_agents_n_times(agents, 3) #return average of 3 runs

    # sort by rewards
    sorted_parent_indexes = np.argsort(rewards)[::-1][:top_limit] #reverses and gives top values (argsort sorts by ascending by default) https://stackoverflow.com/questions/16486252/is-it-possible-to-use-argsort-in-descending-order
    print("")
    print("")
    
    top_rewards = []
    for best_parent in sorted_parent_indexes:
        top_rewards.append(rewards[best_parent])
    
    print("Generation ", generation, " | Mean rewards: ", np.mean(rewards), " | Mean of top 5: ",np.mean(top_rewards[:5]))
    #print(rewards)
    print("Top ",top_limit," scores", sorted_parent_indexes)
    print("Rewards for top: ",top_rewards)
    
    # setup an empty list for containing children agents
    children_agents, elite_index = return_children(agents, sorted_parent_indexes, elite_index)

    # kill all agents, and replace them with their children
    agents = children_agents



Generation  0  | Mean rewards:  21.888666666666666  | Mean of top 5:  45.8
Top  20  scores [ 83 288 153 440  17 190 486  48  71 350   1 197 188 156 220 464 293 112
 453 292]
Rewards for top:  [51.0, 47.333333333333336, 44.333333333333336, 43.666666666666664, 42.666666666666664, 42.666666666666664, 42.0, 41.0, 40.333333333333336, 40.0, 40.0, 40.0, 39.0, 39.0, 36.666666666666664, 36.666666666666664, 36.666666666666664, 36.333333333333336, 36.333333333333336, 35.666666666666664]
Score for elite i  83  is  33.6
Score for elite i  288  is  21.8
Score for elite i  153  is  18.4
Score for elite i  440  is  20.0
Score for elite i  17  is  24.0
Score for elite i  190  is  26.0
Score for elite i  486  is  16.6
Score for elite i  48  is  22.4
Score for elite i  71  is  27.8
Score for elite i  350  is  16.2
Elite selected with index  83  and score 33.6


Generation  1  | Mean rewards:  22.715333333333337  | Mean of top 5:  51.0
Top  20  scores [  3 250  62 258 143 325  90 281  30  26 176 201 236

Score for elite i  499  is  35.4
Elite selected with index  322  and score 51.6


Generation  9  | Mean rewards:  34.932  | Mean of top 5:  83.73333333333333
Top  20  scores [ 24  54  78 152  45 484 116 254 287 499 352 244 243 277  85 153 137 408
  58 209]
Rewards for top:  [107.33333333333333, 85.66666666666667, 76.33333333333333, 75.66666666666667, 73.66666666666667, 72.33333333333333, 71.33333333333333, 70.33333333333333, 69.33333333333333, 68.33333333333333, 68.0, 67.0, 66.66666666666667, 66.66666666666667, 66.0, 65.66666666666667, 64.33333333333333, 63.666666666666664, 62.333333333333336, 62.0]
Score for elite i  24  is  45.8
Score for elite i  54  is  26.2
Score for elite i  78  is  43.0
Score for elite i  152  is  23.6
Score for elite i  45  is  37.6
Score for elite i  484  is  49.2
Score for elite i  116  is  40.2
Score for elite i  254  is  32.4
Score for elite i  287  is  25.6
Score for elite i  499  is  57.4
Score for elite i  499  is  29.2
Elite selected with index  499  an



Generation  18  | Mean rewards:  50.012  | Mean of top 5:  109.46666666666667
Top  20  scores [ 25 181 498 327 318 119 363 307 229 258   5 101 158 173  94 281 341 112
 296 326]
Rewards for top:  [118.0, 112.66666666666667, 112.33333333333333, 107.0, 97.33333333333333, 95.66666666666667, 93.33333333333333, 93.0, 92.33333333333333, 91.33333333333333, 90.66666666666667, 90.66666666666667, 90.0, 89.0, 88.0, 87.66666666666667, 87.0, 86.66666666666667, 86.33333333333333, 86.0]
Score for elite i  25  is  43.2
Score for elite i  181  is  62.2
Score for elite i  498  is  63.8
Score for elite i  327  is  56.2
Score for elite i  318  is  70.0
Score for elite i  119  is  47.8
Score for elite i  363  is  49.2
Score for elite i  307  is  56.4
Score for elite i  229  is  40.8
Score for elite i  258  is  58.6
Score for elite i  499  is  64.6
Elite selected with index  318  and score 70.0


Generation  19  | Mean rewards:  51.23  | Mean of top 5:  109.93333333333332
Top  20  scores [  8 321 393 390 1



Generation  27  | Mean rewards:  64.978  | Mean of top 5:  139.93333333333334
Top  20  scores [132 458 247 299 343 230   1   5 375 174  72 273 494 168 416 390 271 221
 408 206]
Rewards for top:  [158.66666666666666, 147.33333333333334, 132.66666666666666, 131.66666666666666, 129.33333333333334, 127.33333333333333, 124.33333333333333, 120.0, 118.0, 117.33333333333333, 115.33333333333333, 115.0, 114.66666666666667, 114.33333333333333, 114.0, 114.0, 113.66666666666667, 113.66666666666667, 113.33333333333333, 112.33333333333333]
Score for elite i  132  is  83.2
Score for elite i  458  is  66.6
Score for elite i  247  is  70.8
Score for elite i  299  is  60.2
Score for elite i  343  is  38.8
Score for elite i  230  is  56.4
Score for elite i  1  is  73.2
Score for elite i  5  is  59.0
Score for elite i  375  is  60.0
Score for elite i  174  is  71.2
Score for elite i  499  is  43.4
Elite selected with index  132  and score 83.2


Generation  28  | Mean rewards:  66.63066666666667  | Mean 

Score for elite i  305  is  104.6
Score for elite i  186  is  53.2
Score for elite i  499  is  107.0
Elite selected with index  89  and score 117.0


Generation  36  | Mean rewards:  79.046  | Mean of top 5:  153.59999999999997
Top  20  scores [ 20 176 126 482 189 453 256  42  36 120 478 274  78 218 101  24 249 207
  38 306]
Rewards for top:  [162.66666666666666, 155.0, 151.66666666666666, 149.66666666666666, 149.0, 149.0, 146.0, 144.66666666666666, 140.66666666666666, 140.33333333333334, 140.33333333333334, 139.0, 138.0, 137.33333333333334, 137.33333333333334, 137.33333333333334, 135.33333333333334, 132.66666666666666, 130.66666666666666, 129.33333333333334]
Score for elite i  20  is  118.6
Score for elite i  176  is  105.8
Score for elite i  126  is  47.8
Score for elite i  482  is  110.2
Score for elite i  189  is  108.6
Score for elite i  453  is  63.4
Score for elite i  256  is  65.2
Score for elite i  42  is  76.6
Score for elite i  36  is  90.6
Score for elite i  120  is  60.2
S

In [None]:
def play_agent(agent):
    try: #try and exception block because, render hangs if an erorr occurs, we must do env.close to continue working    
        env = gym.make("CartPole-v0")
        
        env_record = Monitor(env, './video', force=True)
        observation = env_record.reset()
        last_observation = observation
        r=0
        for _ in range(250):
            env_record.render()
            inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
            output_probabilities = agent(inp).detach().numpy()[0]
            action = np.random.choice(range(game_actions), 1, p=output_probabilities).item()
            new_observation, reward, done, info = env_record.step(action)
            r=r+reward
            observation = new_observation

            if(done):
                break

        env_record.close()
        print("Rewards: ",r)

    except Exception as e:
        env_record.close()
        print(e.__doc__)
        print(e.message)        

In [None]:
play_agent(agents[96])