In [1]:
from toyCorewar import ToyCorewar
from environment import Env
from program_synthesis import Program, Instruction
from DQN_utils import LinearSchedule
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# ToyCorewar Characteristics
# NOTE(review): these presumably mirror values fixed by the ToyCorewar
# environment; confirm against toyCorewar/environment before changing any.
NUM_ACTIONS = 225  # number of Q-values produced by Dueling_DQN (width of its advantage head)
NUM_REGISTERS = 4  # with N_VARS, sets the variable-embedding width (N_VARS * NUM_REGISTERS)
MAX_LENGTH = 5  # upper bound on environment steps per episode in train() and assess()
N_INSTRUCTIONS = 4  # width of the instruction embedding fed to the first LSTM
N_VARS = 3  # see NUM_REGISTERS
N_VALS = 20  # width of the value embedding fed to the third LSTM
N_TARGETS = 4  # memory-state vector has N_TARGETS * 2 entries; also the loop bound in reward functions

# Dueling DQN

In [3]:
# Make float32 the default floating-point dtype for newly created tensors.
# torch.set_default_tensor_type is deprecated in recent PyTorch releases;
# torch.set_default_dtype is the supported way to select the default dtype.
torch.set_default_dtype(torch.float32)
class Dueling_DQN(nn.Module):
    """Dueling Q-network over a (program, memory) state.

    The program is presented as three sequences (instruction, variable and
    value embeddings), each summarised by its own 2-layer LSTM. The memory
    state vector goes through two fully connected layers. The four summaries
    are concatenated, processed by two shared fully connected layers and then
    split into a value stream and an advantage stream that are recombined into
    one Q-value per action.
    """

    def __init__(self):
        super(Dueling_DQN, self).__init__()

        h_size_a = 50
        h_size_b = 100
        h_size_c = 120
        s_size = N_TARGETS * 2

        # One recurrent encoder per program component.
        self.lstm_p_a = nn.LSTM(input_size=N_INSTRUCTIONS, hidden_size=h_size_a, num_layers=2)
        self.lstm_p_b = nn.LSTM(input_size=(N_VARS * NUM_REGISTERS), hidden_size=h_size_b, num_layers=2)
        self.lstm_p_c = nn.LSTM(input_size=N_VALS, hidden_size=h_size_c, num_layers=2)

        # Encoder for the memory-state vector.
        self.fc_s1 = nn.Linear(in_features=s_size, out_features=s_size)
        self.fc_s2 = nn.Linear(in_features=s_size, out_features=s_size)

        # Shared trunk applied to the concatenated encodings.
        self.fc1 = nn.Linear(in_features=(h_size_a + h_size_b + h_size_c + s_size), out_features=200)
        self.fc2 = nn.Linear(in_features=200, out_features=128)

        # Dueling heads: advantage (one output per action) and value (scalar).
        self.fc1_adv = nn.Linear(in_features=128, out_features=128)
        self.fc1_val = nn.Linear(in_features=128, out_features=128)
        self.fc2_adv = nn.Linear(in_features=128, out_features=NUM_ACTIONS)
        self.fc2_val = nn.Linear(in_features=128, out_features=1)

        self.relu = nn.ReLU()

    def forward(self, state):
        """Compute Q-values for every action.

        Args:
            state: 4-tuple ``(p_a, p_b, p_c, s)``. The first three are the
                program sequences, laid out as (seq_len, batch, features)
                because the LSTMs use the default ``batch_first=False``.
                ``s`` is the memory-state batch of shape
                (batch, N_TARGETS * 2).

        Returns:
            Tensor of shape (batch, NUM_ACTIONS) holding the Q-values.
        """
        p_a, p_b, p_c, s = state

        # Process instruction, variable and value embeddings in separate
        # 2-layer LSTMs, keeping only the final hidden states h_n, which have
        # shape (num_layers, batch, hidden).
        _, (p_a, _) = self.lstm_p_a(p_a.float())
        _, (p_b, _) = self.lstm_p_b(p_b.float())
        _, (p_c, _) = self.lstm_p_c(p_c.float())

        # Process the memory-state vector in 2 FC layers.
        s = self.relu(self.fc_s1(s.float()))
        s = self.relu(self.fc_s2(s))

        # Concatenate the top-layer hidden states (index 1 of 2 layers) with
        # the memory encoding and process in 2 FC layers.
        x = torch.cat((p_a[1], p_b[1], p_c[1], s), dim=1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))

        # Split processing in 2 streams: value and advantage.
        adv = self.relu(self.fc1_adv(x))
        val = self.relu(self.fc1_val(x))

        adv = self.fc2_adv(adv)
        val = self.fc2_val(val)

        # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).
        # The mean must be taken per sample over the action dimension; a
        # global mean would mix advantages of unrelated samples in a batch.
        x = val + adv - adv.mean(dim=1, keepdim=True)

        return x

## Helper functions

In [4]:
def state_to_tensors(state):
    """Convert one environment state into the tensors consumed by the network.

    Args:
        state: pair ``(prog_state, mem_state)``. ``prog_state`` is a sequence
            of ``(instruction, variable, value)`` triples; ``mem_state`` is
            any nested numeric structure accepted by ``torch.tensor``.

    Returns:
        Tuple ``(instr, var, val, mem)``. Each program tensor gets a new
        axis inserted at position 1; ``mem`` is flattened and given a
        leading axis of size 1.
    """
    program, memory = state

    # Transpose the per-step triples into one sequence per component, then
    # insert the singleton axis at position 1 on each of them.
    instr, var, val = (
        torch.tensor(component).unsqueeze(1) for component in zip(*program)
    )

    flat_memory = torch.tensor(memory).view(-1)
    mem = flat_memory.unsqueeze(0)

    return instr, var, val, mem

In [5]:
def batch_to_tensors(batch):
    """Stack several environment states into batched network inputs.

    Each state is converted with ``state_to_tensors``. The program tensors
    are concatenated along axis 1 and the memory tensors along axis 0.

    Args:
        batch: iterable of states, each accepted by ``state_to_tensors``.

    Returns:
        Tuple ``(instr, var, val, mem)`` of batched tensors.
    """
    converted = (state_to_tensors(item) for item in batch)
    instr_parts, var_parts, val_parts, mem_parts = zip(*converted)

    return (
        torch.cat(instr_parts, dim=1),
        torch.cat(var_parts, dim=1),
        torch.cat(val_parts, dim=1),
        torch.cat(mem_parts, dim=0),
    )

# Dueling double Q-learning algorithm

## Training (with hindsight experience replay)

In [6]:
from collections import deque, namedtuple
import random
import inspect
import os
import time

# One replay-buffer entry: the state an action was taken in, the action, the
# reward received, the resulting state and whether the episode ended.
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'reward', 'next_state', 'done'],
)

def train(Q, reward_func, M, verbose=False, log_dir=None):
    """Train ``Q`` in place with double Q-learning and experience replay.

    An ``Env`` is built from ``reward_func`` and run for ``M`` episodes of at
    most ``MAX_LENGTH`` steps. Actions are chosen epsilon-greedily, every
    transition is stored in a bounded replay buffer, and on selected episodes
    a minibatch is sampled to update ``Q``. Targets use a periodically
    synchronised copy of the network (``Q_target``) evaluated at the action
    that ``Q`` itself ranks best in the next state.

    Args:
        Q: the ``Dueling_DQN`` instance to train (modified in place).
        reward_func: reward function handed to ``Env``.
        M: number of training episodes.
        verbose: when true, print progress to the console every 100 episodes.
        log_dir: optional directory. When given, a ``logs`` file is written
            there (overwriting any previous one) with an ``assess`` report
            every 100 episodes, and a ``models`` sub-directory is created.

    Returns:
        None.
    """
    env = Env(reward_func)

    replay_buffer_size = 100000
    learning_starts = 100
    learning_freq = 4
    batch_size = 32
    num_actions = env.action_space_n

    log_freq = 100
    # NOTE(review): save_freq is not used anywhere below; periodic model
    # saving appears to have been planned but is not implemented.
    save_freq = 1000

    gamma = 0.99
    epsilon_schedule = LinearSchedule(schedule_timesteps=M, final_p=0.1)
    replay_buffer = deque(maxlen=replay_buffer_size)
    Q_target = Dueling_DQN()
    Q_target.load_state_dict(Q.state_dict())

    loss_function = torch.nn.MSELoss()
    optimizer = optim.RMSprop(Q.parameters())
    num_parameter_updates = 0
    target_update_freq = 1000

    if verbose:
        print("Starting training [reward = {}] for {} episodes...".format(
            reward_func.__name__, M))

    if log_dir is not None:
        # Create the directories before touching the log file, and tolerate
        # existing ones: previously a pre-existing "models" directory made
        # os.makedirs raise right after the log file had been truncated.
        model_dir = os.path.join(log_dir, "models")
        os.makedirs(model_dir, exist_ok=True)
        log_file = os.path.join(log_dir, "logs")
        with open(log_file, "w") as f:
            print("Starting training for {} episodes...".format(M), file=f)
            print("Reward function:\n\n{}\n\n\n".format(inspect.getsource(reward_func)), file=f)

    start_time = time.time()
    for episode in range(M):
        s = env.reset()

        for _ in range(MAX_LENGTH):
            # Select action with E-greedy policy (purely random while the
            # replay buffer is being warmed up).
            if episode < learning_starts or np.random.rand() < epsilon_schedule.value(episode):
                a = np.random.randint(num_actions)
            else:
                # Acting needs no gradients.
                with torch.no_grad():
                    a = Q(state_to_tensors(s)).argmax(1).item()

            # Submit chosen action to the environment
            s_prime, reward, done, info = env.step(a)

            # Store the effect of the action
            replay_buffer.append(Transition(s, a, reward, s_prime, done))

            # New state becomes current state
            s = s_prime

            # EXPERIENCE REPLAY
            # The buffer-size check keeps random.sample from raising if too
            # few transitions have been collected.
            if (episode > learning_starts
                    and episode % learning_freq == 0
                    and len(replay_buffer) >= batch_size):
                # Sample from the replay buffer
                transitions = random.sample(replay_buffer, batch_size)

                # Extract each batch of elements from the sample of transitions
                batch = Transition(*zip(*transitions))
                state_batch = batch_to_tensors(batch.state)
                action_batch = torch.LongTensor(batch.action).unsqueeze(1)
                reward_batch = torch.tensor(batch.reward, dtype=torch.float)
                next_state_batch = batch_to_tensors(batch.next_state)
                done_batch = torch.tensor(batch.done, dtype=torch.float)

                # Get the current network's estimations for the q-values of
                # all (state, action) pairs in the batch. squeeze(1) removes
                # only the gathered column, so the batch axis always survives.
                q_s_a = Q(state_batch).gather(1, action_batch).squeeze(1)

                # Calculate the corresponding target q-values. Targets are
                # constants for the optimiser, so no graph is built for them.
                with torch.no_grad():
                    # Double Q-learning: Q picks the action, Q_target scores it.
                    a_prime = Q(next_state_batch).argmax(1).unsqueeze(1)
                    q_s_a_prime = Q_target(next_state_batch).gather(1, a_prime).squeeze(1)
                    # No bootstrapping from terminal transitions.
                    q_s_a_prime = q_s_a_prime * (1 - done_batch)
                    target_q_s_a = reward_batch + gamma * q_s_a_prime

                # Backprop
                loss = loss_function(q_s_a, target_q_s_a)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                num_parameter_updates += 1

                # Update target DQN every once in a while
                if num_parameter_updates % target_update_freq == 0:
                    Q_target.load_state_dict(Q.state_dict())

            if done:
                break

        # Console output
        if verbose and episode % log_freq == 0:
            print("Episode {} completed".format(episode + 1))

        # Log output
        if log_dir is not None and episode % log_freq == 0:
            with open(log_file, "a") as f:
                print("Episode {}: [time:{}]\n".format(episode+1, time.time()-start_time), file=f)
                assess(Q, reward_func, file=f)
                print("\n\n\n", file=f)

        # Model saving
        # NOTE(review): not implemented; see save_freq above.

## Program synthesis (after training)

In [7]:
def assess(Q, reward_func, file=None):
    """Roll out the greedy policy of ``Q`` once and report the result.

    A fresh ``Env`` is created from ``reward_func`` and stepped with the
    arg-max action of ``Q`` until the environment signals completion or
    ``MAX_LENGTH`` steps have been taken. The environment's details are then
    printed.

    Args:
        Q: network mapping the tensors from ``state_to_tensors`` to Q-values.
        reward_func: reward function handed to ``Env``.
        file: forwarded to ``env.print_details``.

    Returns:
        The ``Env`` instance after the rollout.
    """
    env = Env(reward_func)
    state = env.reset()

    steps_taken = 0
    finished = False
    while steps_taken < MAX_LENGTH and not finished:
        greedy_action = Q(state_to_tensors(state)).argmax(1).item()
        # Only the next state and the termination flag are needed here.
        state, _, finished, _ = env.step(greedy_action)
        steps_taken += 1

    env.print_details(file=file)
    return env

## Reward functions

In [8]:
# Specific values for all registers, without division
def specific_register_values(cw):
    """Reward the program for bringing every register to a fixed target.

    The reward is the negated sum, over the first ``N_TARGETS`` registers, of
    the absolute difference between the register and its target value.

    Returns:
        Tuple ``(cw.registers, target_values, reward)``.
    """
    target_values = np.array([0, 10, 10, 20], dtype=int)
    total_distance = sum(
        abs(target_values[idx] - cw.registers[idx]) for idx in range(N_TARGETS)
    )
    return cw.registers, target_values, -total_distance

In [9]:
# Specific values for all registers, with division
def specific_register_values_division(cw):
    """Reward fixed register targets, scaling each error by its target.

    For each of the first ``N_TARGETS`` registers the difference to the
    target is divided by ``target + 1`` before taking the absolute value;
    the reward is the negated sum of these terms.

    Returns:
        Tuple ``(cw.registers, target_values, reward)``.
    """
    target_values = np.array([0, 10, 10, 20], dtype=int)

    def scaled_error(idx):
        # The +1 keeps the denominator non-zero for a target of 0.
        wanted = target_values[idx]
        return abs((wanted - cw.registers[idx]) / (wanted + 1))

    total_error = sum(scaled_error(idx) for idx in range(N_TARGETS))
    return cw.registers, target_values, -total_error

In [10]:
# Specific value for one register
def one_register_value(cw):
    """Reward the program for bringing register 3 as close to 55 as possible.

    Returns:
        Tuple ``(cw.registers, target_values, reward)`` where
        ``target_values`` is zero everywhere except at the targeted register
        and ``reward`` is the negated absolute distance to the target.
    """
    target = 55
    register = 3  # Reminder: register indexes start from 1
    slot = register - 1  # zero-based position inside the arrays

    target_values = np.zeros(4, dtype=int)
    target_values[slot] = target

    distance = abs(target - cw.registers[slot])
    return cw.registers, target_values, -distance

In [11]:
# Maximize the sum of all register values
def maximize_all_registers(cw):
    """Reward equal to the sum of all register values.

    Returns:
        Tuple ``(cw.registers, target, reward)``; ``target`` is an all-zero
        array of length 4.
    """
    registers = cw.registers
    return registers, np.zeros(4, dtype=int), registers.sum()

In [12]:
# Minimize the sum of all register values
def minimize_all_registers(cw):
    """Reward equal to the negated sum of all register values.

    Returns:
        Tuple ``(cw.registers, target, reward)``; ``target`` is an all-zero
        array of length 4.
    """
    registers = cw.registers
    return registers, np.zeros(4, dtype=int), -registers.sum()

In [13]:
# All reward functions defined above, in definition order. Passed to
# run_experiment_series to run one experiment per reward function.
reward_functions = [specific_register_values,
                   specific_register_values_division,
                   one_register_value,
                   maximize_all_registers,
                   minimize_all_registers]

# Code execution

In [55]:
# Quick sanity run: train a fresh network on the "minimize" reward for 2000
# episodes (no log directory, console progress only), then roll out its
# greedy policy once; assess() prints the environment details.
Q = Dueling_DQN()
train(Q, minimize_all_registers, 2000, verbose=True)
assess(Q, minimize_all_registers)

Starting training [reward = minimize_all_registers] for 2000 episodes...
Episode 1 completed
Episode 101 completed
Episode 201 completed
Episode 301 completed
Episode 401 completed
Episode 501 completed
Episode 601 completed
Episode 701 completed
Episode 801 completed
Episode 901 completed
Episode 1001 completed
Episode 1101 completed
Episode 1201 completed
Episode 1301 completed
Episode 1401 completed
Episode 1501 completed
Episode 1601 completed
Episode 1701 completed
Episode 1801 completed
Episode 1901 completed
sub 3  3  3    	[  0  0  0  0 ]     0
sub 3  3  3    	[  0  0  0  0 ]     0
sub 3  3  3    	[  0  0  0  0 ]     0
sub 3  3  3    	[  0  0  0  0 ]     0
sub 3  3  3    	[  0  0  0  0 ]     0
-------------------------------------
                      Total reward: 0


<program_synthesis.Program at 0x617064a58>

In [14]:
def run_experiment(reward_func, episodes, root_dir):
    """Train a fresh network on one reward function and save the result.

    A directory named after ``reward_func`` is created under ``root_dir``
    (it must not exist yet), training logs are written there by ``train``,
    and the final network weights are stored as ``models/final``.

    Args:
        reward_func: reward function to train against.
        episodes: number of training episodes.
        root_dir: parent directory for this experiment's output.
    """
    experiment_dir = os.path.join(root_dir, reward_func.__name__)
    os.makedirs(experiment_dir)

    network = Dueling_DQN()
    train(network, reward_func, episodes, log_dir=experiment_dir)

    torch.save(
        network.state_dict(),
        os.path.join(experiment_dir, "models", "final"),
    )

In [15]:
def run_experiment_series(name, reward_functions, episodes):
    """Run one experiment per reward function under ``Experiments/<name>``.

    Args:
        name: name of the series; ``Experiments/<name>`` must not exist yet.
        reward_functions: iterable of reward functions to train against.
        episodes: either a single int applied to every reward function, or a
            sequence with exactly one episode count per reward function.

    Raises:
        ValueError: if ``episodes`` is a sequence whose length differs from
            the number of reward functions. Checked before anything is
            written to disk.
        FileExistsError: if ``Experiments/<name>`` already exists.
    """
    reward_functions = list(reward_functions)
    if isinstance(episodes, int):
        episodes = [episodes] * len(reward_functions)
    else:
        episodes = list(episodes)
        # zip() would silently drop the unmatched experiments otherwise.
        if len(episodes) != len(reward_functions):
            raise ValueError(
                "episodes has {} entries but there are {} reward functions".format(
                    len(episodes), len(reward_functions)))

    os.makedirs("Experiments", exist_ok=True)
    root_dir = os.path.join("Experiments", name)
    # Deliberately no exist_ok: an existing series is never overwritten.
    os.makedirs(root_dir)

    for reward_func, ep in zip(reward_functions, episodes):
        run_experiment(reward_func, ep, root_dir)

In [17]:
# Train on every reward function for 300 episodes each, writing results under
# Experiments/helloWorld3. The series directory is created without exist_ok,
# so this cell raises if it is re-run without changing the series name.
run_experiment_series("helloWorld3", reward_functions, 300)