In [73]:
import gym
import numpy as np
import torch
import matplotlib.pyplot as plt
import time

from gym.wrappers import Monitor

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import math
import copy
import os

In [74]:
class CartPoleAgent(nn.Module):
    def __init__(self):
        super(CartPoleAgent, self).__init__()
        self.affine1 = nn.Linear(4, 124)
        # self.dropout = nn.Dropout(p=0.6)
        self.affine2 = nn.Linear(124, 2)

        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        x = self.affine1(x)
        # x = self.dropout(x)
        x = F.relu(x)
        action_scores = self.affine2(x)
        return F.softmax(action_scores, dim=1)

In [75]:
# Old code assuming random initialization

def init_weights(m):
    if ((type(m) == nn.Linear) | (type(m) == nn.Conv2d)):
        torch.nn.init.xavier_uniform(m.weight)
        m.bias.data.fill_(0.00)

def return_random_agents(num_agents):
    agents = []
    for _ in range(num_agents):
        
        agent = CartPoleAI()
        
        for param in agent.parameters():
            param.requires_grad = False
            
        init_weights(agent)
        agents.append(agent)
        
    return agents

In [76]:
def get_initialized_agents(dir='../models'):
    agents = []
    for path in os.listdir(dir):
        if path[-4:] == '.pth':
            try:
                model = CartPoleAgent()
                model.load_state_dict(torch.load(dir + '/' + path))
                agents.append(model)
            except Exception as e:
                print(e)
    return agents

get_initialized_agents('../models20')

[CartPoleAgent(
   (affine1): Linear(in_features=4, out_features=124, bias=True)
   (affine2): Linear(in_features=124, out_features=2, bias=True)
 ), CartPoleAgent(
   (affine1): Linear(in_features=4, out_features=124, bias=True)
   (affine2): Linear(in_features=124, out_features=2, bias=True)
 ), CartPoleAgent(
   (affine1): Linear(in_features=4, out_features=124, bias=True)
   (affine2): Linear(in_features=124, out_features=2, bias=True)
 ), CartPoleAgent(
   (affine1): Linear(in_features=4, out_features=124, bias=True)
   (affine2): Linear(in_features=124, out_features=2, bias=True)
 ), CartPoleAgent(
   (affine1): Linear(in_features=4, out_features=124, bias=True)
   (affine2): Linear(in_features=124, out_features=2, bias=True)
 )]

In [77]:
def run_agents(agents):
    reward_agents = []
    env = gym.make("CartPole-v1")
    env.spec.reward_threshold = 500
    
    for agent in agents:
        agent.eval()
    
        observation = env.reset()
        
        r, s = 0, 0
        for _ in range(250):
            
            inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
            output_probabilities = agent(inp).detach().numpy()[0]
            action = np.random.choice(range(game_actions), 1, p=output_probabilities).item()
            new_observation, reward, done, info = env.step(action)
            r = r + reward
            
            s = s + 1
            observation = new_observation

            if done:
                break

        reward_agents.append(r)        
        # reward_agents.append(s)
    
    return reward_agents

In [78]:
def return_average_score(agent, runs):
    score = 0.
    for i in range(runs):
        score += run_agents([agent])[0]
    return score / runs

In [79]:
def run_agents_n_times(agents, runs):
    return [return_average_score(agent, runs) for agent in agents]

In [80]:
def mutate(agent):
    child_agent = copy.deepcopy(agent)
    mutation_power = 0.02 # Set from https://arxiv.org/pdf/1712.06567.pdf
    for param in child_agent.parameters():
        if len(param.shape) == 4: # Weights of Conv2D
            for i0 in range(param.shape[0]):
                for i1 in range(param.shape[1]):
                    for i2 in range(param.shape[2]):
                        for i3 in range(param.shape[3]):
                            param[i0][i1][i2][i3] += mutation_power * np.random.randn()
        
        elif len(param.shape) == 2: # Weights of linear layer
            for i0 in range(param.shape[0]):
                for i1 in range(param.shape[1]):
                    param[i0][i1] += mutation_power * np.random.randn()
        
        elif len(param.shape) == 1: # Biases of linear layer or conv layer
            for i0 in range(param.shape[0]):
                
                param[i0] += mutation_power * np.random.randn()

    return child_agent

In [81]:
def return_children(agents, sorted_parent_indexes, elite_index):
    children_agents = []
    
    for i in range(len(agents)-1):
        selected_agent_index = sorted_parent_indexes[np.random.randint(len(sorted_parent_indexes))]
        children_agents.append(mutate(agents[selected_agent_index]))

    elite_child = add_elite(agents, sorted_parent_indexes, elite_index)
    children_agents.append(elite_child)
    elite_index = len(children_agents) - 1
    
    return children_agents, elite_index

In [82]:
def add_elite(agents, sorted_parent_indexes, elite_index=None, only_consider_top_n=10):
    candidate_elite_index = sorted_parent_indexes[:only_consider_top_n]
    
    if elite_index is not None:
        candidate_elite_index = np.append(candidate_elite_index,[elite_index])
        
    top_score = None
    top_elite_index = None
    
    for i in candidate_elite_index:
        score = return_average_score(agents[i],runs=5)
        print("Score for elite i ", i, " is ", score)
        
        if(top_score is None):
            top_score = score
            top_elite_index = i
        elif(score > top_score):
            top_score = score
            top_elite_index = i
            
    print("Elite selected with index ",top_elite_index, " and score", top_score)
    
    child_agent = copy.deepcopy(agents[top_elite_index])
    return child_agent
    

In [83]:
game_actions = 2
torch.set_grad_enabled(False)

agents = get_initialized_agents('../models40')
# agents = return_random_agents(5)

top_limit = 5 # Number of top agents to consider as parents
generations = 5

elite_index = None
for generation in range(generations):
    rewards = run_agents_n_times(agents, 10) # Average of k runs

    sorted_parent_indexes = np.argsort(rewards)[::-1][:top_limit]
    print('\n')
    
    top_rewards = []
    for best_parent in sorted_parent_indexes:
        top_rewards.append(rewards[best_parent])
    
    print("Generation ", generation, " | Mean rewards: ", np.mean(rewards), " | Mean of top 5: ",np.mean(top_rewards[:5]))
    # print(rewards)
    print("Top ",top_limit," scores", sorted_parent_indexes)
    print("Rewards for top: ",top_rewards)
    
    children_agents, elite_index = return_children(agents, sorted_parent_indexes, elite_index)
    agents = children_agents



Generation  0  | Mean rewards:  84.66  | Mean of top 5:  84.66
Top  5  scores [4 0 1 3 2]
Rewards for top:  [130.9, 94.5, 85.1, 59.3, 53.5]
Score for elite i  4  is  178.6
Score for elite i  0  is  96.0
Score for elite i  1  is  114.4
Score for elite i  3  is  68.2
Score for elite i  2  is  55.6
Elite selected with index  4  and score 178.6


Generation  1  | Mean rewards:  87.28  | Mean of top 5:  87.28
Top  5  scores [4 2 3 0 1]
Rewards for top:  [142.0, 86.8, 81.7, 76.1, 49.8]
Score for elite i  4  is  153.4
Score for elite i  2  is  46.4
Score for elite i  3  is  63.0
Score for elite i  0  is  75.2
Score for elite i  1  is  63.2
Score for elite i  4  is  107.4
Elite selected with index  4  and score 153.4


Generation  2  | Mean rewards:  90.2  | Mean of top 5:  90.2
Top  5  scores [4 2 0 3 1]
Rewards for top:  [144.6, 85.9, 81.3, 75.1, 64.1]
Score for elite i  4  is  139.2
Score for elite i  2  is  80.8
Score for elite i  0  is  50.8
Score for elite i  3  is  81.2
Score for elit

In [84]:
def play_agent(agent):
    try:
        env = gym.make("CartPole-v1")
        
        env_record = Monitor(env, './video', force=True)
        observation = env_record.reset()
        last_observation = observation
        
        r = 0
        for _ in range(250):
            env_record.render()
            inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
            output_probabilities = agent(inp).detach().numpy()[0]
            action = np.random.choice(range(game_actions), 1, p=output_probabilities).item()
            new_observation, reward, done, info = env_record.step(action)
            r=r+reward
            observation = new_observation

            if done:
                break

        env_record.close()
        print("Rewards: ", r)

    except Exception as e:
        env_record.close()
        print(e.__doc__)
        print(e.message)        

In [172]:
play_agent(agents[1])

Rewards:  200.0
