In [1]:
from logic import Game

import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
import copy
import concurrent.futures
import time
import json
import random

import matplotlib.pyplot as plt

In [2]:
class DQNTrainer():
    """Epsilon-greedy Q-learning trainer backed by a replay buffer.

    The trainer expects:
      * ``env`` to provide ``reset()``, ``step(action)`` returning
        ``(next_state, reward, done, info)`` and ``get_valid_moves()``;
      * ``agent`` to provide ``policy`` (a torch module), ``input_size`` and
        ``get_action(state, epsilon=..., training=..., illegal_actions=...)``.

    The same network (``agent.policy``) is used both for the prediction and
    for the bootstrapped target; there is no separate target network.
    """

    def __init__(
        self,
        env,
        agent,
        epsilon = 1,
        epsilon_decay = 0.999,
        epsilon_min = 0.3,
        replay_size = 1e6,
        batch_size = 32,
        gamma = 0.99,
        alpha = 0.001,
    ):
        """Store hyper-parameters and build the replay buffer and optimizer.

        Args:
            env: Environment to interact with.
            agent: Agent whose ``policy`` network is optimised.
            epsilon: Initial exploration rate.
            epsilon_decay: Multiplicative decay applied to epsilon after
                every training episode.
            epsilon_min: Lower bound for epsilon.
            replay_size: Capacity of the replay buffer.
            batch_size: Number of transitions sampled per gradient step.
                The default of 32 is the value that used to be hard-coded
                here regardless of what was passed, so calls that omit the
                argument train exactly as before.
            gamma: Discount factor.
            alpha: Learning rate handed to Adam.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        self.agent = agent
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.env = env
        # Honour the caller's value (it was previously overwritten with 32).
        self.batch_size = int(batch_size)
        if self.batch_size < 1:
            raise ValueError(f'batch_size must be >= 1, got {batch_size}')
        self.gamma = gamma
        self.alpha = alpha
        self.train_initialized = False

        self.memory = ReplayMemory(replay_size, self.agent.input_size)
        self.initialize_policy()

    def initialize_policy(self):
        """Create the MSE loss and the Adam optimizer for ``agent.policy``."""
        self.loss_fn = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.agent.policy.parameters(), lr=self.alpha)

    def train(self, episodes, eval_interval=100):
        """Run ``episodes`` training episodes, evaluating periodically.

        On the first call the score/eval histories are created and a baseline
        evaluation (100 episodes) is recorded and printed as iteration 0.
        Later calls append to the same ``score_list`` / ``eval_list``.

        Args:
            episodes: Number of episodes to play.
            eval_interval: Evaluate (100 greedy episodes) every this many
                episodes. A falsy value (``0``/``None``) disables the
                periodic evaluation instead of raising.
        """
        if not self.train_initialized:
            self.score_list = []
            self.eval_list = []
            mean_score = self.evaluate(100)
            self.eval_list.append(mean_score)
            print(f'Iteration {0}: {mean_score}')
            self.train_initialized = True

        for e in range(1, episodes + 1):
            s = self.env.reset().copy()
            score = 0
            done = 0
            while done == 0:
                a = self.agent.get_action(s, epsilon=self.epsilon, training=True)
                s1, r, done, _ = self.env.step(a)
                score += r
                self.memory.add_transition((s, a, r, s1, int(done)))
                # One gradient step per environment step.
                self.update()
                s = s1
            self.epsilon = max(self.epsilon_decay * self.epsilon, self.epsilon_min)
            self.score_list.append(score)
            # `e` starts at 1, so only the modulo test can ever fire.
            if eval_interval and e % eval_interval == 0:
                mean_score = self.evaluate(100)
                self.eval_list.append(mean_score)
                print(f'Iteration {e}: {mean_score}')

    def update(self):
        """Sample a batch from replay memory and take one optimizer step.

        The regression target is ``r + gamma * max_a Q(s1, a) * (1 - terminal)``
        with ``Q`` evaluated by ``agent.policy`` under ``torch.no_grad()``.

        Returns:
            float: The MSE loss of this step (callers may ignore it).
        """
        # The batch is capped by the buffer's capacity (memory.replay_size),
        # not by how many transitions have been stored so far.
        n_samples = min(self.memory.replay_size, self.batch_size)
        s, a, r, s1, t = self.memory.get_replay_sample(n_samples)

        s = torch.tensor(s).float()
        # gather() needs int64 indices shaped (batch, 1).
        a = torch.tensor(np.expand_dims(a, 1)).to(torch.int64)
        r = torch.tensor(np.expand_dims(r, axis=1))
        s1 = torch.tensor(s1).float()
        non_term = torch.tensor(np.expand_dims(1 - t, axis=1))

        # Calculate target (no gradient flows through the bootstrap value).
        with torch.no_grad():
            v1 = torch.max(self.agent.policy.forward(s1), 1, keepdim=True)[0]
        target = r + (self.gamma * v1 * non_term)
        target = target.float()

        # Calculate prediction for the actions that were actually taken.
        self.optimizer.zero_grad()
        pred = self.agent.policy.forward(s, train=True).gather(dim=1, index=a).float()

        # Perform gradient descent step.
        loss = self.loss_fn(pred, target)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def evaluate(self, episodes):
        """Play ``episodes`` greedy episodes and return the mean total reward.

        Moves not reported by ``env.get_valid_moves()`` (out of actions 0-3)
        are passed to the agent as illegal so it will not pick them.
        """
        score_arr = np.empty(episodes)
        for e in range(episodes):
            s = self.env.reset()
            score = 0
            done = False
            while not done:
                legal_moves = self.env.get_valid_moves()
                illegal_moves = [action for action in range(4) if action not in legal_moves]
                a = self.agent.get_action(s, training=False, illegal_actions=illegal_moves)
                s1, r, done, _ = self.env.step(a)
                score += r
                s = s1
            score_arr[e] = score
        return np.mean(score_arr)

In [3]:
class Agent():
    """Wraps a ``CNN`` Q-network and selects actions from its outputs."""

    def __init__(
        self,
        layers = [256, 256],
        action_size = 4,
        state_size = 4
        ):
        """Build the policy network from the given hyper-parameters.

        Args:
            layers: Widths of the dense layers of the policy network.
            action_size: Number of discrete actions (network outputs).
            state_size: Stored as ``input_size`` and passed to the network
                as ``input_dim``.
        """
        # Copy so instances never share (or mutate) the default list.
        self.layers = list(layers)
        self.output_size = action_size
        self.input_size = state_size

        # Forward the hyper-parameters; previously the network was always
        # built with CNN's own defaults, whatever was passed in here.
        self.policy = CNN(
            input_dim=state_size,
            output_dim=action_size,
            dense_dims=list(self.layers),
        )

    def get_action(self, state, epsilon=0, training=True, illegal_actions=()):
        """Choose an action for ``state``.

        * ``training=True``: epsilon-greedy. With probability ``epsilon`` a
          uniformly random action in ``[0, output_size)`` is returned,
          otherwise the arg-max of the network output. ``illegal_actions``
          is not consulted in this mode.
        * ``training=False``: greedy, with every action listed in
          ``illegal_actions`` pushed below the current minimum value so it
          cannot win the arg-max unless all actions are illegal.

        Args:
            state: A single state without batch dimension; a leading batch
                axis is added before the forward pass.
            epsilon: Exploration probability (training mode only).
            training: Selects between the two modes above.
            illegal_actions: Action indices to mask in evaluation mode.

        Returns:
            int: The chosen action index.
        """
        # Drawn unconditionally (also in evaluation mode) so the numpy random
        # stream is consumed exactly as before.
        val = np.random.random()
        batch = np.expand_dims(state, 0)

        if not training:
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                # clone() so masking never writes into the network's output.
                action_values = self.policy.forward(batch)[0, :].clone()
            masked_value = torch.min(action_values) - 1
            for action in illegal_actions:
                action_values[action] = masked_value
            return torch.argmax(action_values).item()

        if val >= epsilon:
            with torch.no_grad():
                return torch.argmax(self.policy.forward(batch)[0, :]).item()

        return np.random.randint(0, high=self.output_size)

In [4]:
class NeuralNetwork(nn.Module):
    """Fully connected network: (Linear -> ReLU -> BatchNorm1d)* -> Linear."""

    def __init__(self, input_dim, output_dim, layer_dims):
        """Store the dimensions and build the layer stack.

        Args:
            input_dim: Number of input features.
            output_dim: Number of outputs of the final linear layer.
            layer_dims: Iterable with the width of each hidden layer.
        """
        super(NeuralNetwork, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.layer_dims = layer_dims

        self.create_model()

    def create_model(self):
        """Populate ``self.module_list`` and print the resulting stack."""
        self.module_list = nn.ModuleList()
        input_dim = self.input_dim
        for layer_dim in self.layer_dims:
            self.module_list.append(nn.Linear(input_dim, layer_dim))
            self.module_list.append(nn.ReLU())
            self.module_list.append(nn.BatchNorm1d(layer_dim))
            input_dim = layer_dim
        self.module_list.append(nn.Linear(input_dim, self.output_dim))
        print(self.module_list)

    def forward(self, x, train=False):
        """Run ``x`` through the network.

        Args:
            x: A tensor or anything ``torch.tensor`` accepts (e.g. a numpy
                array or nested list); it is cast to float32.
            train: When true the layers are switched to training mode
                (BatchNorm uses batch statistics), otherwise to eval mode.
                The mode stays set after the call.

        Returns:
            torch.Tensor: Output of the last linear layer.
        """
        # isinstance (not an exact type check) keeps Tensor subclasses such
        # as nn.Parameter as they are; re-wrapping them with torch.tensor
        # would copy them and cut them out of the autograd graph.
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x)
        if not train:
            self.module_list.eval()
        else:
            self.module_list.train()
        out = x.float()
        for module in self.module_list:
            out = module(out)
        return out

    def get_weights(self):
        """Return all parameters flattened and concatenated into one array."""
        weights = []
        for param in self.parameters():
            # .cpu() makes this work for parameters living on another device.
            weights.append(param.detach().cpu().numpy().flatten())
        return np.concatenate(weights)
            
        
class CNN(nn.Module):
    """Three parallel single-channel convolutions feeding a dense head.

    The input is convolved independently with a (1,4), a (4,1) and a (2,2)
    kernel; the three feature maps are flattened, concatenated and passed
    through ReLU -> BatchNorm1d -> (Linear -> ReLU -> BatchNorm1d)* -> Linear.
    """

    def __init__(self, input_dim=4, output_dim=4, conv_dim=256, dense_dims=[256, 256]):
        """Store the dimensions and build the layers.

        Args:
            input_dim: Side length of the square input (input is expected as
                ``(batch, 1, input_dim, input_dim)``). Must be at least 4
                because two of the kernels span 4 cells.
            output_dim: Number of outputs of the final linear layer.
            conv_dim: Number of output channels of each convolution.
            dense_dims: Widths of the dense hidden layers.

        Raises:
            ValueError: If ``input_dim`` is smaller than 4.
        """
        super(CNN, self).__init__()
        if input_dim < 4:
            raise ValueError(f'input_dim must be >= 4, got {input_dim}')
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.conv_dim = conv_dim
        # Copy so the default list is never shared between instances.
        self.dense_dims = list(dense_dims)

        self.create_model()

    def create_model(self):
        """Create the convolutions and the dense stack, then print the stack."""
        self.conv1 = nn.Conv2d(1, self.conv_dim, (1,4))
        self.conv2 = nn.Conv2d(1, self.conv_dim, (4,1))
        self.conv3 = nn.Conv2d(1, self.conv_dim, (2,2))
        self.module_list = nn.ModuleList()

        # Output positions per channel for an n x n input (stride 1, no pad):
        #   (1,4) kernel -> n * (n - 3),  (4,1) kernel -> (n - 3) * n,
        #   (2,2) kernel -> (n - 1) ** 2.
        # For n == 4 this is the previously hard-coded 4 + 4 + 9.
        n = self.input_dim
        positions = 2 * n * (n - 3) + (n - 1) ** 2
        input_dim = self.conv_dim * positions

        self.module_list.append(nn.ReLU())
        self.module_list.append(nn.BatchNorm1d(input_dim))
        for layer_dim in self.dense_dims:
            self.module_list.append(nn.Linear(input_dim, layer_dim))
            self.module_list.append(nn.ReLU())
            self.module_list.append(nn.BatchNorm1d(layer_dim))
            input_dim = layer_dim
        self.module_list.append(nn.Linear(input_dim, self.output_dim))
        print(self.module_list)

    def forward(self, x, train=False):
        """Run ``x`` through the convolutions and the dense head.

        Args:
            x: A tensor or anything ``torch.tensor`` accepts; the first axis
                is treated as the batch axis and the value is cast to float32.
            train: When true the dense stack is switched to training mode
                (BatchNorm uses batch statistics), otherwise to eval mode.
                The mode stays set after the call.

        Returns:
            torch.Tensor: Output of the last linear layer, one row per input.
        """
        # isinstance keeps Tensor subclasses (e.g. nn.Parameter) attached to
        # autograd instead of copying them through torch.tensor.
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x)
        if not train:
            self.module_list.eval()
        else:
            self.module_list.train()
        x = x.float()
        m = x.size()[0]
        # Flatten each feature map to (batch, features) before concatenating.
        out1 = self.conv1(x).reshape(m,-1)
        out2 = self.conv2(x).reshape(m,-1)
        out3 = self.conv3(x).reshape(m,-1)
        out = torch.cat([out1, out2, out3], 1)
        for module in self.module_list:
            out = module(out)
        return out

    def get_weights(self):
        """Return all parameters flattened and concatenated into one array."""
        weights = []
        for param in self.parameters():
            # .cpu() makes this work for parameters living on another device.
            weights.append(param.detach().cpu().numpy().flatten())
        return np.concatenate(weights)
            

In [5]:
class ReplayMemory():
    """Fixed-capacity ring buffer of (state, action, reward, next_state, terminal)."""

    def __init__(self, replay_size, input_size):
        """Allocate the buffer.

        Args:
            replay_size: Capacity; converted with ``int()`` so values such as
                ``1e6`` are accepted.
            input_size: States are stored with shape
                ``(1, input_size, input_size)``.
        """
        self.replay_size = int(replay_size)
        self.input_size = input_size
        self.initialize_replay_memory()

    def initialize_replay_memory(self):
        """(Re)create empty storage arrays and reset the write position."""
        self.states = np.zeros((self.replay_size, 1, self.input_size,self.input_size))
        self.actions = np.zeros(self.replay_size)
        self.rewards = np.zeros(self.replay_size)
        self.next_states = np.zeros((self.replay_size, 1, self.input_size, self.input_size))
        self.terminals = np.zeros(self.replay_size)

        self.replay_index = 0      # next slot to write
        self.replay_full = False   # becomes True once the buffer has wrapped
        # Number of slots that hold real data. This used to start at 1, which
        # kept the sampling range one slot ahead of the data and let
        # never-written all-zero transitions leak into training batches.
        self.replay_max = 0

    def add_transition(self, transition):
        """Store ``(state, action, reward, next_state, terminal)``.

        The oldest entry is overwritten once the buffer is full.
        """
        i = self.replay_index

        self.states[i,:,:,:] = transition[0]
        self.actions[i] = transition[1]
        self.rewards[i] = transition[2]
        self.next_states[i,:,:,:] = transition[3]
        self.terminals[i] = transition[4]

        self.replay_index += 1
        self.replay_max = min(self.replay_max+1, self.replay_size)
        if self.replay_index == self.replay_size:
            self.replay_index = 0
            self.replay_full = True

    def get_replay_sample(self, samples, index=None):
        """Return a batch of stored transitions.

        Args:
            samples: Number of transitions to draw (uniformly, with
                replacement) when ``index`` is not given.
            index: Optional explicit indices to fetch instead of sampling.

        Returns:
            tuple: ``(states, actions, rewards, next_states, terminals)``
            arrays selected by the same indices.

        Raises:
            ValueError: If sampling is requested while the buffer is empty.
        """
        if index is None:
            if self.replay_max == 0:
                raise ValueError('cannot sample from an empty replay memory')
            index = np.random.randint(0,high=self.replay_max, size=samples)
        states = self.states[index,:,:,:]
        actions = self.actions[index]
        rewards = self.rewards[index]
        next_states = self.next_states[index,:,:,:]
        terminals = self.terminals[index]
        return (states, actions, rewards, next_states, terminals)

In [6]:
class TestEnv1():
    """One-dimensional corridor used to sanity-check the learner.

    The state is a one-hot integer vector of length ``size`` marking the
    current cell. Action 0 moves left, any other action moves right. Stepping
    off the right end ends the episode with reward +1, stepping off the left
    end ends it with reward -1; every other step gives reward 0.
    """

    def __init__(self, size=5, slip_prob=0):
        """
        Args:
            size: Number of cells in the corridor.
            slip_prob: Probability that a step goes in the opposite direction.
        """
        self.size = size
        self.state = np.zeros(size, dtype=int)
        self.slip_prob = slip_prob

    def reset(self):
        """Place the marker on a uniformly random cell and return a copy of the state."""
        index = np.random.randint(0, high=self.size)
        self.state = np.zeros(self.size, dtype=int)
        self.state[index] = 1
        self.score = 0
        return self.state.copy()

    def step(self, action):
        """Advance one step.

        Returns:
            tuple: ``(next_state, reward, done, info)``. On termination the
            returned state is an all-zero vector and the internal state is
            left unchanged; ``info`` is always ``None``.
        """
        direction = -1 if action == 0 else 1
        if random.random() < self.slip_prob:
            direction *= -1
        current_index = int(np.argmax(self.state))

        # The right-hand exit is the last cell of *this* corridor; it used to
        # be hard-coded as index 4, which only matched the default size of 5.
        last_index = self.size - 1
        if current_index == last_index and direction == 1:
            return np.zeros(self.size), 1, 1, None
        if current_index == 0 and direction == -1:
            return np.zeros(self.size), -1, 1, None

        self.state[current_index] = 0
        self.state[current_index + direction] = 1

        # Hand out a copy so callers cannot alias the internal state.
        return self.state.copy(), 0, 0, None
        

In [8]:
# Learning-rate sweep: train a fresh agent per alpha and keep its curves.
SEED = 0              # every run starts from the same random state
N_EPISODES = 1000     # training episodes per learning rate
EVAL_INTERVAL = 50    # evaluate every this many episodes
EPSILON_DECAY = 0.993

alpha_list = [1e-2, 1e-3, 1e-4, 1e-5]
results = {}
for alpha in alpha_list:
    # Re-seed all three generators used in this notebook so each run is
    # reproducible on its own and runs differ only in the learning rate.
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    env = Game()
    agent = Agent()
    trainer = DQNTrainer(env, agent, alpha=alpha, epsilon_decay=EPSILON_DECAY)
    trainer.train(N_EPISODES, eval_interval=EVAL_INTERVAL)
    results[alpha] = {
        'eval': trainer.eval_list,    # mean evaluation score per checkpoint
        'score': trainer.score_list,  # training score per episode
    }

ModuleList(
  (0): ReLU()
  (1): BatchNorm1d(4352, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Linear(in_features=4352, out_features=256, bias=True)
  (3): ReLU()
  (4): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): Linear(in_features=256, out_features=256, bias=True)
  (6): ReLU()
  (7): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (8): Linear(in_features=256, out_features=4, bias=True)
)
Iteration 0: 1341.68
Iteration 50: 988.72
Iteration 100: 1426.72
Iteration 150: 1312.72
Iteration 200: 1253.96
Iteration 250: 700.24
Iteration 300: 806.12
Iteration 350: 623.0
Iteration 400: 650.6
Iteration 450: 654.88
Iteration 500: 783.68
Iteration 550: 678.96
Iteration 600: 676.52
Iteration 650: 687.36
Iteration 700: 642.0
Iteration 750: 615.48
Iteration 800: 656.2
Iteration 850: 683.24
Iteration 900: 696.76
Iteration 950: 630.92
Iteration 1000: 731.48
ModuleList(
  (0): ReLU()
  (1): BatchNo