In [1]:
import gym
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt
from moviepy.editor import ImageSequenceClip

# hyper parameters
EPISODES = 1000  # number of episodes
EPS_START = 0.9  # e-greedy threshold start value
EPS_END = 0.05  # e-greedy threshold end value
EPS_DECAY = 200  # e-greedy threshold decay
GAMMA = 0.75  # Q-learning discount factor
LR = 0.001  # NN optimizer learning rate
HIDDEN_LAYER = 164  # NN hidden layer size
BATCH_SIZE = 64  # Q-learning batch size

# if gpu is to be used
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
Tensor = FloatTensor


class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def push(self, transition):
        self.memory.append(transition)
        if len(self.memory) > self.capacity:
            del self.memory[0]

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

class Network(nn.Module):
    def __init__(self):
        nn.Module.__init__(self)
        self.l1 = nn.Linear(4, HIDDEN_LAYER)
        self.l2 = nn.Linear(HIDDEN_LAYER, 2)

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = self.l2(x)
        return x

env = gym.make('CartPole-v0').unwrapped

model = Network()
target = Network()
if use_cuda:
    model.cuda()
    target.cuda()
memory = ReplayMemory(10000)
optimizer = optim.Adam(model.parameters(), LR)
steps_done = 0
ed = []

# def plot_durations(d):
#     plt.figure(2)
#     plt.clf()
#     plt.title('Training...')
#     plt.xlabel('Episode')
#     plt.ylabel('Duration')
#     plt.plot(d)
#
#     plt.savefig('4_policy_gradient_score.png')

def select_action(state, train=True):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if train:
        if sample > eps_threshold:
            return model(Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
        else:
            return LongTensor([[random.randrange(2)]])
    else:
        return model(Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)

def run_episode(episode, env):
    state = env.reset()
    steps = 0
    while True:
        # env.render()
        action = select_action(FloatTensor([state]))
        next_state, reward, done, _ = env.step(action[0, 0].item())

        # negative reward when attempt ends
        if done:
            if steps < 30:
                reward -= 10
            else:
                reward = -1
        if steps > 100:
            reward += 1
        if steps > 200:
            reward += 1
        if steps > 300:
            reward += 1

        memory.push((FloatTensor([state]),
                     action,  # action is already a tensor
                     FloatTensor([next_state]),
                     FloatTensor([reward])))

        learn()

        state = next_state
        steps += 1

        if done or steps >= 1000:
            ed.append(steps)
            print("[Episode {:>5}]  steps: {:>5}".format(episode, steps))
            if sum(ed[-10:])/10 > 800:
                return True
            break
    return False

def learn():
    if len(memory) < BATCH_SIZE:
        return

    # random transition batch is taken from experience replay memory
    transitions = memory.sample(BATCH_SIZE)
    batch_state, batch_action, batch_next_state, batch_reward = zip(*transitions)

    batch_state = Variable(torch.cat(batch_state))
    batch_action = Variable(torch.cat(batch_action))
    batch_reward = Variable(torch.cat(batch_reward))
    batch_next_state = Variable(torch.cat(batch_next_state))

    # current Q values are estimated by NN for all actions
    current_q_values = model(batch_state).gather(1, batch_action)
    # expected Q values are estimated from actions which gives maximum Q value
    max_next_q_values = target(batch_next_state).detach().max(1)[0]
    expected_q_values = batch_reward + (GAMMA * max_next_q_values)

    # loss is measured from error between current and newly expected Q values
    loss = F.smooth_l1_loss(current_q_values, expected_q_values)

    # backpropagation of loss to NN
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

def botPlay():
    state = env.reset()
    steps = 0
    frames = []
    while True:
        frame = env.render(mode='rgb_array')
        frames.append(frame)
        action = select_action(FloatTensor([state]))
        next_state, reward, done, _ = env.step(action[0, 0])

        state = next_state
        steps += 1

        if done or steps >= 1000:
            break

    clip = ImageSequenceClip(frames, fps=20)
    clip.write_gif('4_policy_gradient_play.gif', fps=20)

for e in range(EPISODES):
    complete = run_episode(e, env)

    if complete:
        print('complete...!')
        break

    if (e+1) % 5 == 0:
        mp = list(target.parameters())
        mcp = list(model.parameters())
        n = len(mp)
        for i in range(0, n):
            mp[i].data[:] = mcp[i].data[:]


[Episode     0]  steps:    23
[Episode     1]  steps:    40
[Episode     2]  steps:    12
[Episode     3]  steps:    12
[Episode     4]  steps:    16
[Episode     5]  steps:    10
[Episode     6]  steps:    12
[Episode     7]  steps:    17
[Episode     8]  steps:    11
[Episode     9]  steps:    13
[Episode    10]  steps:    17
[Episode    11]  steps:    14
[Episode    12]  steps:    11
[Episode    13]  steps:    12
[Episode    14]  steps:    11
[Episode    15]  steps:    14
[Episode    16]  steps:     9
[Episode    17]  steps:    19
[Episode    18]  steps:    12
[Episode    19]  steps:    12
[Episode    20]  steps:     9
[Episode    21]  steps:     9
[Episode    22]  steps:     9
[Episode    23]  steps:    15
[Episode    24]  steps:    13
[Episode    25]  steps:    13
[Episode    26]  steps:     9
[Episode    27]  steps:    11
[Episode    28]  steps:    12
[Episode    29]  steps:    12
[Episode    30]  steps:     9
[Episode    31]  steps:    11
[Episode    32]  steps:    11
[Episode  

[Episode   274]  steps:    20
[Episode   275]  steps:     9
[Episode   276]  steps:    33
[Episode   277]  steps:    29
[Episode   278]  steps:    20
[Episode   279]  steps:    27
[Episode   280]  steps:    24
[Episode   281]  steps:    29
[Episode   282]  steps:    27
[Episode   283]  steps:    16
[Episode   284]  steps:    12
[Episode   285]  steps:    23
[Episode   286]  steps:    21
[Episode   287]  steps:    12
[Episode   288]  steps:    31
[Episode   289]  steps:    13
[Episode   290]  steps:    10
[Episode   291]  steps:     9
[Episode   292]  steps:    22
[Episode   293]  steps:    10
[Episode   294]  steps:     9
[Episode   295]  steps:    15
[Episode   296]  steps:    28
[Episode   297]  steps:    23
[Episode   298]  steps:    21
[Episode   299]  steps:    13
[Episode   300]  steps:    10
[Episode   301]  steps:    72
[Episode   302]  steps:    33
[Episode   303]  steps:    10
[Episode   304]  steps:    27
[Episode   305]  steps:    21
[Episode   306]  steps:    12
[Episode  

[Episode   551]  steps:    13
[Episode   552]  steps:    13
[Episode   553]  steps:    35
[Episode   554]  steps:    23
[Episode   555]  steps:    24
[Episode   556]  steps:    11
[Episode   557]  steps:     9
[Episode   558]  steps:    11
[Episode   559]  steps:    11
[Episode   560]  steps:    13
[Episode   561]  steps:    28
[Episode   562]  steps:    12
[Episode   563]  steps:    15
[Episode   564]  steps:    26
[Episode   565]  steps:    23
[Episode   566]  steps:    17
[Episode   567]  steps:    21
[Episode   568]  steps:    12
[Episode   569]  steps:     8
[Episode   570]  steps:     8
[Episode   571]  steps:    23
[Episode   572]  steps:     8
[Episode   573]  steps:    24
[Episode   574]  steps:    10
[Episode   575]  steps:    29
[Episode   576]  steps:    24
[Episode   577]  steps:    20
[Episode   578]  steps:    10
[Episode   579]  steps:    12
[Episode   580]  steps:    10
[Episode   581]  steps:    11
[Episode   582]  steps:    11
[Episode   583]  steps:    12
[Episode  

[Episode   826]  steps:    26
[Episode   827]  steps:    17
[Episode   828]  steps:    10
[Episode   829]  steps:    11
[Episode   830]  steps:    10
[Episode   831]  steps:    10
[Episode   832]  steps:    16
[Episode   833]  steps:     8
[Episode   834]  steps:     9
[Episode   835]  steps:    21
[Episode   836]  steps:    16
[Episode   837]  steps:    39
[Episode   838]  steps:    10
[Episode   839]  steps:    24
[Episode   840]  steps:    10
[Episode   841]  steps:    10
[Episode   842]  steps:     9
[Episode   843]  steps:    10
[Episode   844]  steps:     9
[Episode   845]  steps:    16
[Episode   846]  steps:    11
[Episode   847]  steps:    10
[Episode   848]  steps:     9
[Episode   849]  steps:     8
[Episode   850]  steps:    19
[Episode   851]  steps:    15
[Episode   852]  steps:     9
[Episode   853]  steps:    22
[Episode   854]  steps:    10
[Episode   855]  steps:    28
[Episode   856]  steps:     9
[Episode   857]  steps:    10
[Episode   858]  steps:    13
[Episode  