In [None]:
import gym
import random
import itertools
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

# ---------------------------------------------------------------------------
# Hyper-parameters and run configuration.
# ---------------------------------------------------------------------------

# Optimisation: mini-batch size and learning rate.
BATCH_SIZE = 128
LEARNING_RATE = 1e-4

# Number of episodes to train for.
TOTAL_EPISODE = 2000

# Linearly decayed epsilon-greedy exploration: epsilon starts at
# START_EPSILON, drops by EPSILON_DACAY_RATE for every episode played and is
# clamped from below at END_EPSILON.
# (The misspelt name EPSILON_DACAY_RATE is kept because other cells use it.)
START_EPSILON, END_EPSILON = 0.99, 0.05
EPSILON_DACAY_RATE = 0.02

# Discount factor (gamma) applied to the next-state value in the TD target.
GAMMA = 0.999

# Capacity of the experience pool.
POOL_SIZE = 10000

# The target net is refreshed once every this many episodes.
TARGET_UPDATE_INTERVAL = 2

# Whether to render the environment while training.
DO_RENDERING = False

In [None]:
# Q-network: a small fully-connected net is enough for this task.
# Input is a state vector of dimension 4; output is one Q value per action
# (dimension 2) for that state.
# NOTE: Large networks are very likely to degenerate, and perform much worse
# than tiny networks do.
class Network(nn.Module):
    """Three-layer fully-connected Q-network (4 -> 20 -> 20 -> 2)."""

    def __init__(self):
        super(Network, self).__init__()

        # Assigning an nn.Module attribute registers it as a sub-module under
        # that attribute name.
        self.layer1 = nn.Linear(4, 20)
        self.layer2 = nn.Linear(20, 20)
        self.layer3 = nn.Linear(20, 2)

    def forward(self, x):
        """Return the Q values for the state (or batch of states) ``x``."""
        # NOTE: Because the network is tiny, plain ReLU would leave too few
        # neurons alive for it to work, hence leaky ReLU.
        # NOTE(review): the activation is applied to the output layer as well;
        # confirm that squashing negative Q estimates is intended.
        for layer in (self.layer1, self.layer2, self.layer3):
            x = F.leaky_relu_(layer(x))
        return x

In [None]:
# The experience pool, which is vital for DQN.
# The pool here is a ring buffer.
class ExpPool(object):
    """Fixed-capacity store of transitions with uniform random sampling.

    Once ``capacity`` transitions have been stored, each new one overwrites
    the oldest slot.

    Args:
        capacity: Maximum number of transitions kept. When omitted, the
            module-level ``POOL_SIZE`` is used.

    Raises:
        ValueError: If the resolved capacity is not positive.
    """

    def __init__(self, capacity=None):
        # Resolve the default lazily so POOL_SIZE is only required when the
        # caller does not pass an explicit capacity.
        self.capacity = POOL_SIZE if capacity is None else capacity
        if self.capacity <= 0:
            raise ValueError('capacity must be positive, got %r' %
                             (self.capacity,))
        self.pool = []
        # Slot that the next transition will be written to once the pool is full.
        self.index = 0

    def add(self, state, action, next_state, reward):
        """Store one transition, overwriting the oldest one when full."""
        instance = Instance(state, action, next_state, reward)
        if len(self.pool) < self.capacity:
            self.pool.append(instance)
        else:
            self.pool[self.index] = instance

        self.index = (self.index + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct transitions chosen at random.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        return random.sample(self.pool, batch_size)

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.pool)


# One single instance in the pool.
# To use TD, the following 4 things are required.
class Instance(object):
    """A single transition: state, action, reward and next state.

    ``next_state`` is ``None`` for a transition that ended the episode.
    """

    def __init__(self, state, action, next_state, reward):
        self.state = state
        self.action = action
        self.reward = reward
        self.next_state = next_state

In [None]:
# All the initializations and declarations before training.

# Environment
env = gym.make('CartPole-v0')

# Device
device = torch.device('cpu')

# Two networks with the same architecture: the policy net is the one being
# optimised, the target net supplies the next-state values for the TD target
# and is only refreshed periodically, which makes training more stable.
# (This is a target network, not Double DQN: the max is taken over the target
# net's own outputs.)
policy_net = Network().to(device)
target_net = Network().to(device)

# Start both nets from identical weights so TD targets are never produced by
# an unrelated random initialisation before the first periodic sync.
target_net.load_state_dict(policy_net.state_dict())
# The target net is never trained directly; keep it in evaluation mode.
target_net.eval()

# Optimizer (only the policy net's parameters are optimised)
optimizer = torch.optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)

# Experience pool
exp_pool = ExpPool()

# Data used to draw plots
episode_total_loss = 0
avg_loss_array = []
episode_length_array = []

In [None]:
# Definition of functions


# The policy of the agent
def epsilon_greedy(state, current_episode):
    """Pick an action for ``state`` with linearly decayed epsilon-greedy.

    Args:
        state: Current observation, convertible to a float tensor and accepted
            by ``policy_net``.
        current_episode: Index of the episode being played; epsilon shrinks by
            ``EPSILON_DACAY_RATE`` per episode down to ``END_EPSILON``.

    Returns:
        A ``(1, 1)`` long tensor on ``device`` holding the chosen action index.
    """

    # Linear-decayed epsilon
    epsilon = max(START_EPSILON - current_episode * EPSILON_DACAY_RATE,
                  END_EPSILON)

    # Exploit: take the action whose Q value is largest.
    # The network is only queried here, never updated, so no gradient
    # bookkeeping is needed.
    if random.random() > epsilon:
        with torch.no_grad():
            q_values = policy_net(
                torch.tensor(state, device=device, dtype=torch.float))
            return torch.max(q_values, 0)[1].view(1, 1)

    # Explore: uniformly random action, created on the same device as the
    # greedy branch's result.
    return torch.tensor([[random.randrange(2)]],
                        device=device,
                        dtype=torch.long)


# Update model for one single time.
def update():
    """Run one gradient step of the policy net on a sampled mini-batch.

    Does nothing while the experience pool holds fewer than ``BATCH_SIZE``
    transitions. Otherwise it samples a batch, regresses the policy net's
    Q(s, a) onto the TD target ``reward + GAMMA * max_a' Q_target(s', a')``
    with a mean-squared-error loss and applies one optimizer step.

    Side effects:
        Adds the scalar loss of this step to the module-level
        ``episode_total_loss`` and updates ``policy_net`` through ``optimizer``.

    Returns:
        None.
    """
    global episode_total_loss

    # Do the update only if there are enough samples in the pool.
    if exp_pool.size() < BATCH_SIZE:
        return

    # Prepare the samples and convert them into tensors.
    batch = exp_pool.sample(BATCH_SIZE)
    state_batch = torch.tensor([inst.state for inst in batch],
                               device=device,
                               dtype=torch.float)
    action_batch = torch.tensor([inst.action for inst in batch],
                                device=device,
                                dtype=torch.long).view(-1, 1)
    reward_batch = torch.tensor([inst.reward for inst in batch],
                                device=device,
                                dtype=torch.float)

    # Q values of the sampled state-action pairs, from the policy net.
    q_values = policy_net(state_batch).gather(1, action_batch).view(-1)

    # State values of the next states (the max Q value of the next state),
    # from the target net.
    # NOTE: final transitions are vital; their next-state value MUST be 0, so
    # the vector starts as zeros and only non-final positions are filled in.
    next_state_values = torch.zeros(len(batch), device=device)
    non_final_index = [
        i for i, inst in enumerate(batch) if inst.next_state is not None
    ]
    # Guard: a batch made only of final transitions has nothing to evaluate.
    if non_final_index:
        next_state_batch = torch.tensor(
            [batch[i].next_state for i in non_final_index],
            device=device,
            dtype=torch.float)
        # The target net is not trained through this path, so no gradient
        # tracking is needed.
        with torch.no_grad():
            best_next_q = target_net(next_state_batch).max(1)[0]
        next_state_values[torch.tensor(non_final_index,
                                       device=device,
                                       dtype=torch.long)] = best_next_q

    # Target Q value (according to TD).
    target_q_values = GAMMA * next_state_values + reward_batch

    # Regression loss between current and target Q values.
    loss = F.mse_loss(q_values, target_q_values)

    # Accumulate a plain Python float: adding the tensor itself would keep
    # every step's autograd history alive until the episode ends.
    episode_total_loss += loss.item()

    # Update the model.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


# Draw one per-episode series on its own large figure.
def _plot_series(values, ylabel):
    """Plot ``values`` against the episode index with the given y-axis label."""
    plt.figure(figsize=(50, 20))
    plt.xticks(fontsize=25)
    plt.yticks(fontsize=25)
    plt.plot(values)
    plt.ylabel(ylabel, fontsize=35)
    plt.xlabel('episode', fontsize=35)
    plt.draw()


# Draw the analysis plots.
def draw_plot():
    """Draw two figures: average loss per episode and episode length.

    Reads the module-level ``avg_loss_array`` and ``episode_length_array``.
    """
    # Coerce to plain floats so entries that are scalar tensors (even ones
    # that still require grad) can be plotted.
    _plot_series([float(value) for value in avg_loss_array], 'loss')
    _plot_series(episode_length_array, 'length')

In [None]:
# Training process.
# The whole run is wrapped in try/finally so the environment is released even
# when training is interrupted or a step raises.
try:
    for episode in range(TOTAL_EPISODE):

        # Get the initial state.
        state = env.reset()

        for step in itertools.count():

            # Take an action.
            action = epsilon_greedy(state, episode)
            new_state, reward, done, _ = env.step(action.item())

            # Update the model.
            update()

            if DO_RENDERING:
                env.render()

            if done:
                # Save the final transition into the pool: no next state and
                # a reward of 0.
                exp_pool.add(state, action, None, 0)

                # Record data. float() keeps the history as plain numbers
                # whether the running total is an int, a float or a tensor.
                avg_loss_array.append(float(episode_total_loss) / (step + 1))
                episode_length_array.append(step + 1)
                episode_total_loss = 0

                # Update the target net.
                if 0 == episode % TARGET_UPDATE_INTERVAL:
                    target_net.load_state_dict(policy_net.state_dict())
                break

            # Save the transition into the pool and move on.
            exp_pool.add(state, action, new_state, reward)
            state = new_state

    # Draw the plots.
    draw_plot()
finally:
    env.close()
