In [1]:
import gym
from gym import wrappers
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [2]:
# --- Training hyper-parameters ---
EPISODES = 200        # how many episodes to run
EPS_START = 0.9       # epsilon the e-greedy policy starts from
EPS_END = 0.05        # epsilon the e-greedy policy decays towards
EPS_DECAY = 200       # decay constant of the e-greedy threshold
GAMMA = 0.8           # discount factor for Q-learning
LR = 0.001            # learning rate of the NN optimizer
HIDDEN_LAYER = 256    # width of the NN hidden layer
BATCH_SIZE = 3        # transitions per Q-learning batch

# --- Tensor constructors: CUDA variants when a GPU is available, CPU otherwise ---
use_cuda = torch.cuda.is_available()
if use_cuda:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor
Tensor = FloatTensor  # default tensor type is the float one



  return torch._C._cuda_getDeviceCount() > 0


In [3]:
class ReplayMemory:
    """Bounded FIFO store of transitions with uniform random sampling.

    Transitions are kept in insertion order in ``self.memory`` (a plain
    list). When a push makes the list longer than ``capacity``, the oldest
    entry is dropped.
    """

    def __init__(self, capacity):
        """Create an empty buffer that retains at most ``capacity`` items."""
        self.memory = []
        self.capacity = capacity

    def push(self, transition):
        """Append ``transition``; evict the oldest entry on overflow."""
        buffer = self.memory
        buffer.append(transition)
        if self.capacity < len(buffer):
            buffer.pop(0)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: propagated from ``random.sample`` when ``batch_size``
                exceeds the number of stored transitions.
        """
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        stored = self.memory
        return len(stored)

    

In [4]:
class Network(nn.Module):
    """Fully connected network with one hidden ReLU layer.

    Maps an input of ``n_observations`` features to ``n_actions`` outputs.
    The defaults (4 inputs, 2 outputs, ``HIDDEN_LAYER`` hidden units)
    reproduce the sizes this notebook originally hard-coded.

    Args:
        n_observations: Number of input features per sample.
        n_actions: Number of output values per sample.
        hidden_size: Width of the hidden layer; ``None`` falls back to the
            module-level ``HIDDEN_LAYER`` constant.
    """

    def __init__(self, n_observations=4, n_actions=2, hidden_size=None):
        super().__init__()
        # Resolve the default lazily so the class can be defined before
        # (or without) the HIDDEN_LAYER constant when a size is passed.
        if hidden_size is None:
            hidden_size = HIDDEN_LAYER
        self.l1 = nn.Linear(n_observations, hidden_size)
        self.l2 = nn.Linear(hidden_size, n_actions)

    def forward(self, x):
        """Return the output-layer activations for the input batch ``x``."""
        hidden = F.relu(self.l1(x))
        return self.l2(hidden)
    

In [5]:
# Environment used throughout the notebook.
env = gym.make('CartPole-v0')
# Optional gym Monitor wrapper, currently disabled:
#env = wrappers.Monitor(env, './tmp/cartpole-v0-1')

# Bookkeeping shared by the cells below.
steps_done = 0          # number of actions selected so far
episode_durations = []  # step count of every finished episode

# Network (moved to the GPU when CUDA is available), replay buffer, optimizer.
model = Network()
model = model.cuda() if use_cuda else model
memory = ReplayMemory(capacity=10000)
optimizer = optim.Adam(model.parameters(), lr=LR)



In [6]:
def select_action(state):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    The exploration threshold decays exponentially from ``EPS_START``
    towards ``EPS_END`` as the global ``steps_done`` counter grows; the
    counter is incremented on every call.

    Args:
        state: Tensor holding the current observation as a batch of one
            (the caller passes ``FloatTensor([state])``).

    Returns:
        A 1x1 tensor containing the chosen action index: the arg-max of the
        model output when exploiting, otherwise a random index in {0, 1}.
    """
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        # Inference only: `Variable(..., volatile=True)` no longer disables
        # autograd (it merely warns), so use no_grad() to avoid building a
        # graph for every action selection.
        with torch.no_grad():
            q_values = model(state.type(FloatTensor))
        return q_values.max(1)[1].view(1, 1)
    # Exploration branch: uniformly random action.
    return LongTensor([[random.randrange(2)]])
    

In [7]:
def run_episode(e, environment):
    """Play one episode and record every transition in the replay memory.

    Args:
        e: Episode index; only used in the summary line printed at the end.
        environment: Object providing ``reset()``, ``render()`` and
            ``step(action)``, where ``step`` returns a 4-tuple of
            (next_state, reward, done, info).

    Side effects:
        Pushes one (state, action, next_state, reward) tuple of tensors per
        step into ``memory``, prints a summary line, and appends the step
        count to ``episode_durations``.
    """
    observation = environment.reset()
    step_count = 0
    finished = False
    while not finished:
        environment.render()
        action = select_action(FloatTensor([observation]))
        following, reward, finished, _ = environment.step(action[0, 0].item())

        # The step that ends the attempt gets a negative reward.
        if finished:
            reward = -1

        transition = (
            FloatTensor([observation]),
            action,  # already a tensor
            FloatTensor([following]),
            FloatTensor([reward]),
        )
        memory.push(transition)

        # NOTE: the learning step (learn()) is disabled in this notebook.
        observation = following
        step_count += 1

    # Pick the colour escape code based on whether the episode reached 195 steps.
    colour = '\033[92m' if step_count >= 195 else '\033[99m'
    print("{2} Episode {0} finished after {1} steps".format(e, step_count, colour))
    episode_durations.append(step_count)
    # NOTE: plotting (plot_durations()) is disabled as well.


In [8]:

# Empty the replay buffer so the episode run below starts from a clean
# history. (The bare `memory.memory` expression that used to precede this
# assignment was a no-op: it was not the cell's last expression, so nothing
# was displayed.)
memory.memory = []

In [9]:
# Play one episode (index 0) so the replay memory holds some transitions.
for e in range(1):
    run_episode(e, env)

[99m Episode 0 finished after 10 steps


  import sys


In [10]:
# Display the raw list of (state, action, next_state, reward) tuples stored
# by run_episode.
memory.memory

[(tensor([[-0.0424,  0.0421,  0.0168, -0.0414]]),
  tensor([[1]]),
  tensor([[-0.0416,  0.2370,  0.0160, -0.3287]]),
  tensor([1.])),
 (tensor([[-0.0416,  0.2370,  0.0160, -0.3287]]),
  tensor([[1]]),
  tensor([[-0.0369,  0.4319,  0.0094, -0.6163]]),
  tensor([1.])),
 (tensor([[-0.0369,  0.4319,  0.0094, -0.6163]]),
  tensor([[1]]),
  tensor([[-0.0282,  0.6268, -0.0029, -0.9060]]),
  tensor([1.])),
 (tensor([[-0.0282,  0.6268, -0.0029, -0.9060]]),
  tensor([[1]]),
  tensor([[-0.0157,  0.8220, -0.0210, -1.1996]]),
  tensor([1.])),
 (tensor([[-0.0157,  0.8220, -0.0210, -1.1996]]),
  tensor([[1]]),
  tensor([[ 7.5451e-04,  1.0174e+00, -4.5002e-02, -1.4988e+00]]),
  tensor([1.])),
 (tensor([[ 7.5451e-04,  1.0174e+00, -4.5002e-02, -1.4988e+00]]),
  tensor([[1]]),
  tensor([[ 0.0211,  1.2130, -0.0750, -1.8052]]),
  tensor([1.])),
 (tensor([[ 0.0211,  1.2130, -0.0750, -1.8052]]),
  tensor([[1]]),
  tensor([[ 0.0454,  1.4089, -0.1111, -2.1202]]),
  tensor([1.])),
 (tensor([[ 0.0454,  1.4089, -

In [11]:
# Draw BATCH_SIZE transitions at random from the replay memory and show them.
transitions = memory.sample(BATCH_SIZE)
transitions

[(tensor([[ 0.0211,  1.2130, -0.0750, -1.8052]]),
  tensor([[1]]),
  tensor([[ 0.0454,  1.4089, -0.1111, -2.1202]]),
  tensor([1.])),
 (tensor([[-0.0369,  0.4319,  0.0094, -0.6163]]),
  tensor([[1]]),
  tensor([[-0.0282,  0.6268, -0.0029, -0.9060]]),
  tensor([1.])),
 (tensor([[ 0.0454,  1.4089, -0.1111, -2.1202]]),
  tensor([[1]]),
  tensor([[ 0.0735,  1.6050, -0.1535, -2.4450]]),
  tensor([1.]))]

In [12]:
# Transpose the list of transitions into one tuple per field.
(batch_state,
 batch_action,
 batch_next_state,
 batch_reward) = zip(*transitions)
batch_state, batch_action, batch_next_state, batch_reward

((tensor([[ 0.0211,  1.2130, -0.0750, -1.8052]]),
  tensor([[-0.0369,  0.4319,  0.0094, -0.6163]]),
  tensor([[ 0.0454,  1.4089, -0.1111, -2.1202]])),
 (tensor([[1]]), tensor([[1]]), tensor([[1]])),
 (tensor([[ 0.0454,  1.4089, -0.1111, -2.1202]]),
  tensor([[-0.0282,  0.6268, -0.0029, -0.9060]]),
  tensor([[ 0.0735,  1.6050, -0.1535, -2.4450]])),
 (tensor([1.]), tensor([1.]), tensor([1.])))

In [13]:
# Concatenate the sampled transitions into one tensor per field.
# The fields are unpacked from `transitions` here instead of reusing the
# batch_* tuples, so re-running this cell yields the same tensors rather
# than concatenating an already-concatenated tensor. torch.cat already
# returns a Tensor, so the deprecated Variable wrapper is dropped.
states, actions, next_states, rewards = zip(*transitions)
batch_state = torch.cat(states)
batch_action = torch.cat(actions)
batch_reward = torch.cat(rewards)
batch_next_state = torch.cat(next_states)

In [14]:
# Forward pass: one row of model outputs per sampled state.
model_output = model(batch_state)
print("model_output=", model_output)


model_output= tensor([[ 0.0976,  0.6439],
        [-0.0355,  0.3416],
        [ 0.1115,  0.7360]], grad_fn=<AddmmBackward>)


In [16]:
# For each row, keep only the output belonging to the action that was taken.
current_q_values = model_output.gather(dim=1, index=batch_action)

In [17]:
# Show the gathered values (one per sampled transition).
print("current_q_values=",current_q_values)

current_q_values= tensor([[0.6439],
        [0.3416],
        [0.7360]], grad_fn=<GatherBackward>)
