In [1]:
import gym
from gym import wrappers
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt
%config Completer.use_jedi = False

In [2]:
# hyper parameters
EPISODES = 200  # number of episodes
EPS_START = 0.9  # e-greedy threshold start value
EPS_END = 0.05  # e-greedy threshold end value
EPS_DECAY = 200  # e-greedy threshold decay
GAMMA = 0.8  # Q-learning discount factor
LR = 0.001  # NN optimizer learning rate
HIDDEN_LAYER = 256  # NN hidden layer size
BATCH_SIZE = 3  # Q-learning batch size

# if gpu is to be used
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
Tensor = FloatTensor



In [3]:
class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def push(self, transition):
        self.memory.append(transition)
        if len(self.memory) > self.capacity:
            del self.memory[0]

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

    

In [4]:
class Network(nn.Module):
    def __init__(self):
        nn.Module.__init__(self)
        self.l1 = nn.Linear(4, HIDDEN_LAYER)
        self.l2 = nn.Linear(HIDDEN_LAYER, 2)

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = self.l2(x)
        return x
    

In [5]:
env = gym.make('CartPole-v0')
#env = wrappers.Monitor(env, './tmp/cartpole-v0-1')

model = Network()
if use_cuda:
    model.cuda()
memory = ReplayMemory(10000)
optimizer = optim.Adam(model.parameters(), LR)
steps_done = 0
episode_durations = []



In [6]:
def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        return model(Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
    else:
        return LongTensor([[random.randrange(2)]])
    

In [7]:
def run_episode(e, environment):
    state = environment.reset()
    steps = 0
    while True:
        environment.render()
        action = select_action(FloatTensor([state]))
        next_state, reward, done, _ = environment.step(action[0, 0].item())

        # negative reward when attempt ends
        if done:
            reward = -1

        memory.push((FloatTensor([state]),
                     action,  # action is already a tensor
                     FloatTensor([next_state]),
                     FloatTensor([reward])))

       # learn()

        state = next_state
        steps += 1

        if done:
            print("{2} Episode {0} finished after {1} steps"
                  .format(e, steps, '\033[92m' if steps >= 195 else '\033[99m'))
            episode_durations.append(steps)
         #   plot_durations()
            break


In [8]:

memory.memory
memory.memory=[]

In [9]:
for e in range(1):
    run_episode(e,env)

[99m Episode 0 finished after 17 steps


  import sys


In [10]:
memory.memory

[(tensor([[0.0134, 0.0430, 0.0067, 0.0092]], device='cuda:0'),
  tensor([[1]], device='cuda:0'),
  tensor([[ 0.0143,  0.2381,  0.0069, -0.2814]], device='cuda:0'),
  tensor([1.], device='cuda:0')),
 (tensor([[ 0.0143,  0.2381,  0.0069, -0.2814]], device='cuda:0'),
  tensor([[0]], device='cuda:0'),
  tensor([[0.0191, 0.0428, 0.0013, 0.0135]], device='cuda:0'),
  tensor([1.], device='cuda:0')),
 (tensor([[0.0191, 0.0428, 0.0013, 0.0135]], device='cuda:0'),
  tensor([[0]], device='cuda:0'),
  tensor([[ 0.0199, -0.1523,  0.0015,  0.3065]], device='cuda:0'),
  tensor([1.], device='cuda:0')),
 (tensor([[ 0.0199, -0.1523,  0.0015,  0.3065]], device='cuda:0'),
  tensor([[0]], device='cuda:0'),
  tensor([[ 0.0169, -0.3474,  0.0077,  0.5997]], device='cuda:0'),
  tensor([1.], device='cuda:0')),
 (tensor([[ 0.0169, -0.3474,  0.0077,  0.5997]], device='cuda:0'),
  tensor([[1]], device='cuda:0'),
  tensor([[ 0.0099, -0.1524,  0.0197,  0.3095]], device='cuda:0'),
  tensor([1.], device='cuda:0')),
 (

In [11]:
transitions=memory.sample(BATCH_SIZE)
transitions

[(tensor([[-0.0358, -0.5467,  0.1009,  0.9870]], device='cuda:0'),
  tensor([[1]], device='cuda:0'),
  tensor([[-0.0468, -0.3530,  0.1206,  0.7276]], device='cuda:0'),
  tensor([1.], device='cuda:0')),
 (tensor([[-0.0648, -0.7462,  0.1563,  1.3876]], device='cuda:0'),
  tensor([[1]], device='cuda:0'),
  tensor([[-0.0797, -0.5533,  0.1841,  1.1476]], device='cuda:0'),
  tensor([1.], device='cuda:0')),
 (tensor([[-7.9909e-05, -5.4329e-01,  3.8027e-02,  9.0900e-01]], device='cuda:0'),
  tensor([[1]], device='cuda:0'),
  tensor([[-0.0109, -0.3487,  0.0562,  0.6285]], device='cuda:0'),
  tensor([1.], device='cuda:0'))]

In [12]:
batch_state,batch_action,batch_next_state,batch_reward=zip(*transitions)
batch_state,batch_action,batch_next_state,batch_reward

((tensor([[-0.0358, -0.5467,  0.1009,  0.9870]], device='cuda:0'),
  tensor([[-0.0648, -0.7462,  0.1563,  1.3876]], device='cuda:0'),
  tensor([[-7.9909e-05, -5.4329e-01,  3.8027e-02,  9.0900e-01]], device='cuda:0')),
 (tensor([[1]], device='cuda:0'),
  tensor([[1]], device='cuda:0'),
  tensor([[1]], device='cuda:0')),
 (tensor([[-0.0468, -0.3530,  0.1206,  0.7276]], device='cuda:0'),
  tensor([[-0.0797, -0.5533,  0.1841,  1.1476]], device='cuda:0'),
  tensor([[-0.0109, -0.3487,  0.0562,  0.6285]], device='cuda:0')),
 (tensor([1.], device='cuda:0'),
  tensor([1.], device='cuda:0'),
  tensor([1.], device='cuda:0')))

In [13]:
batch_state = Variable(torch.cat(batch_state))
batch_action = Variable(torch.cat(batch_action))
batch_reward = Variable(torch.cat(batch_reward))
batch_next_state = Variable(torch.cat(batch_next_state))

In [14]:
model_output=model(batch_state)
print("model_output=",model_output)


model_output= tensor([[-0.4473,  0.0666],
        [-0.4570,  0.0778],
        [-0.4510,  0.0590]], device='cuda:0', grad_fn=<AddmmBackward>)


In [15]:
current_q_values=model_output.gather(1,batch_action)

In [16]:
print("current_q_values=",current_q_values)

current_q_values= tensor([[0.0666],
        [0.0778],
        [0.0590]], device='cuda:0', grad_fn=<GatherBackward>)


In [17]:
max_next_q_values = model(batch_next_state).detach().max(1)[0]
expected_q_values = batch_reward + (GAMMA * max_next_q_values)

In [19]:
expected_q_values

tensor([1.0737, 1.0683, 1.0644], device='cuda:0')

In [20]:
loss = F.smooth_l1_loss(current_q_values, expected_q_values)

  """Entry point for launching an IPython kernel.


In [21]:
current_q_values

tensor([[0.0666],
        [0.0778],
        [0.0590]], device='cuda:0', grad_fn=<GatherBackward>)

In [26]:
current_q_values.detach()


tensor([[0.0666],
        [0.0778],
        [0.0590]], device='cuda:0')