In [1]:
import gym
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
import copy
import concurrent.futures
import time
import json
import random

In [6]:
class Agent():
    def __init__(
        self, 
        alpha = 0.001,
        layers = [256, 256],
        input_dim = 8,
        output_dim = 4,
        gamma = 0.9,
        epsilon = 0.2
        ):
        
        self.alpha = alpha
        self.layers = layers
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.gamma = gamma
        self.epsilon = epsilon
        self.score_list = None
        self.env = gym.make("LunarLander-v2")
        
        self.initialize_policy()
        
    def initialize_policy(self):
        self.policy = NeuralNetwork(self.input_dim, self.output_dim, self.layers, self.alpha)
        self.loss_fn = nn.MSELoss()
        self.optimizer = torch.optim.SGD(self.policy.parameters(), lr=self.alpha)
    
    def train(self, iterations, episodes):
        if self.score_list is None:
            self.score_list = []
        previous_weights = self.policy.get_weights().flatten()
        for i in range(iterations):
            states = []
            returns = np.empty((1))
            iteration_scores = []
            for e in range(episodes):
                s = self.env.reset()
                temp_states = []
                rewards = []
                score = 0
                done = False
                while not done:
                    temp_states.append(s)
                    a = self.get_action(s)
                    s, r, done, _ = self.env.step(a)
                    rewards.append(r)
                    score += r
                iteration_scores.append(score)
                states += temp_states
                returns = np.concatenate((returns, self.compute_returns(rewards)))
            iteration_score_mean = np.mean(iteration_scores)
            iteration_score_std = np.std(iteration_scores)
            print(f'Iteration {i+1}: {np.mean(iteration_score_mean)} +/- {iteration_score_std}')
            self.update(np.array(states), returns)
            current_weights = self.policy.get_weights().flatten()
            delta = (np.sum((current_weights - previous_weights)**2)**(1/2))
            print(f'Weight change: {delta}')
            mean_weight_magnitude = np.mean(np.abs(current_weights))
            #print(f'Weight Magnitude (mean): {mean_weight_magnitude}')
        
            
    def get_action(self, state):
        random_number = random.random()
        if random_number >= self.epsilon:
            with torch.no_grad():
                action = torch.argmax(self.policy.forward(np.expand_dims(state, 0))[0,:]).item()
        else:
            action = random.randint(0, 3)
        return action
    
    
    def compute_returns(self, rewards):
        returns = np.zeros(len(rewards))
        return_val = 0
        for i, reward in enumerate(reversed(rewards)):
            return_val = reward + self.gamma*return_val
            returns[-(i+1)] = return_val
        return returns
    
    
    def custom_loss(self, action_probs, returns):
        actions = torch.max(action_probs, 1, keepdim=True)[0]
        log_actions = torch.log(actions)
        loss = torch.mean(log_actions * returns)
        return loss

        
    def update(self, states, returns):
        
        action_probs = self.policy.forward(states, train=True)
        
        # Perform gradient descent step
        self.optimizer.zero_grad()
        loss = self.custom_loss(action_probs, torch.tensor(returns*-1))
        loss.backward()
        self.optimizer.step()
        delta = loss.item()

In [7]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, layer_dims, alpha):
        super(NeuralNetwork, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.layer_dims = layer_dims
        self.alpha = alpha
        current_dim = input_dim
        
        self.create_model()
        
        
    def create_model(self):
        if len(self.layer_dims) == 0:
            self.model = nn.Sequential(
                nn.Linear(self.input_dim, self.output_dim),
                nn.Softmax()
            )
        
        elif len(self.layer_dims) == 1:
            self.model = nn.Sequential(
                nn.Linear(self.input_dim, self.layer_dims[0]),
                nn.ReLU(),
                nn.BatchNorm1d(self.layer_dims[0]),
                nn.Linear(self.layer_dims[0], self.output_dim),
                nn.Softmax(dim=1)
            )
        elif len(self.layer_dims) == 2:
            self.model = nn.Sequential(
                nn.Linear(self.input_dim, self.layer_dims[0]),
                nn.ReLU(),
                nn.BatchNorm1d(self.layer_dims[0]),
                nn.Linear(self.layer_dims[0], self.layer_dims[1]),
                nn.ReLU(),
                nn.BatchNorm1d(self.layer_dims[1]),
                nn.Linear(self.layer_dims[1], self.output_dim),
                nn.Softmax(dim=1)
            )
        elif len(self.layer_dims) == 3:
            self.model = nn.Sequential(
                nn.Linear(self.input_dim, self.layer_dims[0]),
                nn.ReLU(),
                nn.BatchNorm1d(self.layer_dims[0]),
                nn.Linear(self.layer_dims[0], self.layer_dims[1]),
                nn.ReLU(),
                nn.BatchNorm1d(self.layer_dims[1]),
                nn.Linear(self.layer_dims[1], self.layer_dims[2]),
                nn.ReLU(),
                nn.BatchNorm1d(self.layer_dims[2]),
                nn.Linear(self.layer_dims[2], self.output_dim),
                nn.Softmax(dim=1)
            )
        elif len(self.layer_dims) == 4:
            self.model = nn.Sequential(
                nn.Linear(self.input_dim, self.layer_dims[0]),
                nn.ReLU(),
                nn.BatchNorm1d(self.layer_dims[0]),
                nn.Linear(self.layer_dims[0], self.layer_dims[1]),
                nn.ReLU(),
                nn.BatchNorm1d(self.layer_dims[1]),
                nn.Linear(self.layer_dims[1], self.layer_dims[2]),
                nn.ReLU(),
                nn.BatchNorm1d(self.layer_dims[2]),
                nn.Linear(self.layer_dims[2], self.layer_dims[3]),
                nn.ReLU(),
                nn.BatchNorm1d(self.layer_dims[3]),
                nn.Linear(self.layer_dims[3], self.output_dim),
                nn.Softmax(dim=1)
            )
        
        
    def forward(self, x, train=False):
        if type(x) is not torch.Tensor:
            x = torch.tensor(x)
        if not train:
            self.model.eval()
        else:
            self.model.train()
        out = self.model(x.float())
        return out
    
    def get_weights(self):
        weights = []
        for param in self.parameters():
            weights.append(param.detach().numpy().flatten())
        return np.concatenate(weights)
            

In [14]:
agent = Agent(layers=[64, 64, 64], alpha=0.00001, gamma=0.99, epsilon=0)

In [15]:
agent.train(100, 128)

Iteration 1: -137.79608493066502 +/- 43.56486477254089
Weight change: 0.00188188845487748
Iteration 2: -128.5542502859878 +/- 36.51451626064602
Weight change: 0.0036189079171575116
Iteration 3: -126.62397702427462 +/- 42.45432953789809
Weight change: 0.005302916648469262
Iteration 4: -132.7613783818465 +/- 41.56189733572737
Weight change: 0.0068950122815384365
Iteration 5: -124.9286571904864 +/- 44.66894206622537
Weight change: 0.008449125601949022
Iteration 6: -137.65792061024848 +/- 44.230601828023495
Weight change: 0.010025439551364309
Iteration 7: -131.37654385065008 +/- 40.139693005549546
Weight change: 0.01177536308222397
Iteration 8: -132.35587757818058 +/- 40.3469354851024
Weight change: 0.01322420804906784
Iteration 9: -135.07787159572888 +/- 36.277051960891754
Weight change: 0.014962442380122521
Iteration 10: -129.32088805315436 +/- 44.23744134731829
Weight change: 0.016615557556858923
Iteration 11: -112.28800794277755 +/- 45.73130441678851
Weight change: 0.017950335946249972

Iteration 91: -277.39065062192674 +/- 120.56218163355027
Weight change: 0.14309748028960004
Iteration 92: -261.4643882243713 +/- 125.82006847015799
Weight change: 0.14395596737810495
Iteration 93: -283.1574876663189 +/- 124.92121380675444
Weight change: 0.14487698160401083
Iteration 94: -267.4499437267285 +/- 125.89927575495412
Weight change: 0.14587089503297487
Iteration 95: -280.2036519380629 +/- 126.74941202539635
Weight change: 0.14652150637178585
Iteration 96: -282.06152955715385 +/- 136.69716540840957
Weight change: 0.14745920066568538
Iteration 97: -271.9877397439667 +/- 126.8113999685332
Weight change: 0.14833549081268296
Iteration 98: -255.5866325293272 +/- 117.54884353840535
Weight change: 0.14895849650269577
Iteration 99: -281.4100585994205 +/- 131.63408785531786
Weight change: 0.1498356462640198
Iteration 100: -277.1747658466185 +/- 145.06735462768108
Weight change: 0.15018332424859476
