In [1]:
import gym
from gym import wrappers
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [9]:
# hyper parameters
EPISODES = 200  # number of episodes
EPS_START = 0.9  # e-greedy threshold start value
EPS_END = 0.05  # e-greedy threshold end value
EPS_DECAY = 200  # e-greedy threshold decay
GAMMA = 0.8  # Q-learning discount factor
LR = 0.001  # NN optimizer learning rate
HIDDEN_LAYER = 256  # NN hidden layer size
BATCH_SIZE = 3  # Q-learning batch size

# if gpu is to be used
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
Tensor = FloatTensor



In [3]:
class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def push(self, transition):
        self.memory.append(transition)
        if len(self.memory) > self.capacity:
            del self.memory[0]

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

    

In [4]:
class Network(nn.Module):
    def __init__(self):
        nn.Module.__init__(self)
        self.l1 = nn.Linear(4, HIDDEN_LAYER)
        self.l2 = nn.Linear(HIDDEN_LAYER, 2)

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = self.l2(x)
        return x
    

In [13]:
env = gym.make('CartPole-v0')
#env = wrappers.Monitor(env, './tmp/cartpole-v0-1')

model = Network()
if use_cuda:
    model.cuda()
memory = ReplayMemory(10000)
optimizer = optim.Adam(model.parameters(), LR)
steps_done = 0
episode_durations = []



In [6]:
def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        return model(Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
    else:
        return LongTensor([[random.randrange(2)]])
    

In [16]:
def run_episode(e, environment):
    state = environment.reset()
    steps = 0
    while True:
        environment.render()
        action = select_action(FloatTensor([state]))
        next_state, reward, done, _ = environment.step(action[0, 0].item())

        # negative reward when attempt ends
        if done:
            reward = -1

        memory.push((FloatTensor([state]),
                     action,  # action is already a tensor
                     FloatTensor([next_state]),
                     FloatTensor([reward])))

       # learn()

        state = next_state
        steps += 1

        if done:
            print("{2} Episode {0} finished after {1} steps"
                  .format(e, steps, '\033[92m' if steps >= 195 else '\033[99m'))
            episode_durations.append(steps)
         #   plot_durations()
            break


In [14]:

memory.memory
memory.memory=[]

In [17]:
for e in range(1):
    run_episode(e,env)

[99m Episode 0 finished after 11 steps


  import sys


In [18]:
memory.memory

[(tensor([[-0.0334, -0.0122, -0.0461, -0.0060]], device='cuda:0'),
  tensor([[1]], device='cuda:0'),
  tensor([[-0.0336,  0.1835, -0.0462, -0.3128]], device='cuda:0'),
  tensor([1.], device='cuda:0')),
 (tensor([[-0.0336,  0.1835, -0.0462, -0.3128]], device='cuda:0'),
  tensor([[0]], device='cuda:0'),
  tensor([[-0.0299, -0.0109, -0.0524, -0.0351]], device='cuda:0'),
  tensor([1.], device='cuda:0')),
 (tensor([[-0.0299, -0.0109, -0.0524, -0.0351]], device='cuda:0'),
  tensor([[1]], device='cuda:0'),
  tensor([[-0.0302,  0.1849, -0.0531, -0.3438]], device='cuda:0'),
  tensor([1.], device='cuda:0')),
 (tensor([[-0.0302,  0.1849, -0.0531, -0.3438]], device='cuda:0'),
  tensor([[1]], device='cuda:0'),
  tensor([[-0.0265,  0.3807, -0.0600, -0.6528]], device='cuda:0'),
  tensor([1.], device='cuda:0')),
 (tensor([[-0.0265,  0.3807, -0.0600, -0.6528]], device='cuda:0'),
  tensor([[0]], device='cuda:0'),
  tensor([[-0.0188,  0.1865, -0.0731, -0.3796]], device='cuda:0'),
  tensor([1.], device='c

In [20]:
transitions=memory.sample(BATCH_SIZE)
transitions

[(tensor([[-0.0265,  0.3807, -0.0600, -0.6528]], device='cuda:0'),
  tensor([[0]], device='cuda:0'),
  tensor([[-0.0188,  0.1865, -0.0731, -0.3796]], device='cuda:0'),
  tensor([1.], device='cuda:0')),
 (tensor([[ 0.0196,  0.9713, -0.1414, -1.6584]], device='cuda:0'),
  tensor([[1]], device='cuda:0'),
  tensor([[ 0.0390,  1.1678, -0.1746, -1.9916]], device='cuda:0'),
  tensor([1.], device='cuda:0')),
 (tensor([[-0.0188,  0.1865, -0.0731, -0.3796]], device='cuda:0'),
  tensor([[1]], device='cuda:0'),
  tensor([[-0.0151,  0.3826, -0.0807, -0.6944]], device='cuda:0'),
  tensor([1.], device='cuda:0'))]

In [21]:
batch_state,batch_action,batch_next_state,batch_reward=zip(*transitions)
batch_state,batch_action,batch_next_state,batch_reward

((tensor([[-0.0265,  0.3807, -0.0600, -0.6528]], device='cuda:0'),
  tensor([[ 0.0196,  0.9713, -0.1414, -1.6584]], device='cuda:0'),
  tensor([[-0.0188,  0.1865, -0.0731, -0.3796]], device='cuda:0')),
 (tensor([[0]], device='cuda:0'),
  tensor([[1]], device='cuda:0'),
  tensor([[1]], device='cuda:0')),
 (tensor([[-0.0188,  0.1865, -0.0731, -0.3796]], device='cuda:0'),
  tensor([[ 0.0390,  1.1678, -0.1746, -1.9916]], device='cuda:0'),
  tensor([[-0.0151,  0.3826, -0.0807, -0.6944]], device='cuda:0')),
 (tensor([1.], device='cuda:0'),
  tensor([1.], device='cuda:0'),
  tensor([1.], device='cuda:0')))

In [23]:
batch_state = Variable(torch.cat(batch_state))
batch_action = Variable(torch.cat(batch_action))
batch_reward = Variable(torch.cat(batch_reward))
batch_next_state = Variable(torch.cat(batch_next_state))

TypeError: cat(): argument 'tensors' (position 1) must be tuple of Tensors, not Tensor