In [1]:
import gym

import numpy as np
import tqdm, random
from collections import deque
import matplotlib.pyplot as plt   

import torch 
from torch import nn, optim
import torch.nn.functional as F

In [2]:
debug = True

env = gym.make('CartPole-v0')

n_episodes = 10_000
memory = deque(maxlen=3000)

lr = 0.01
batch_size = 256

gamma = torch.tensor(1.0, dtype=torch.float)

epsilon = 1.0
epsilon_decay = 0.999

copy_step = 10

In [3]:
class QModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.input_layer = nn.Linear(env.observation_space.shape[0], 16)
        self.h1 = nn.Linear(16,16)
        self.h2 = nn.Linear(16,16)
        self.out_layer = nn.Linear(16,env.action_space.n)

    def forward(self, inputs):
        h = F.relu(self.input_layer(inputs))
        h = F.relu(self.h1(h))
        h = F.relu(self.h2(h))
        out = self.out_layer(h)
        return out

model = QModel()
target_model = QModel()
target_model.load_state_dict(model.state_dict())
optimizer = optim.Adam(model.parameters(), lr)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.01 )

if debug:
    print(model(torch.atleast_2d(torch.tensor(env.observation_space.sample()))))

tensor([[-7.1579e+35, -3.0531e+35]], grad_fn=<AddmmBackward>)


In [4]:
def remember(state, action, reward, done, next_state):
    memory.append((state, action, reward, done, next_state))

def get_action(state):
    if np.random.normal() < epsilon:
        return env.action_space.sample()
    else:
        model.eval()
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.float).view(1,4)
            return np.argmax(model(state).cpu().numpy(), -1)[0]

def train():
    if len(memory) < batch_size:
        return
    
    xb = []
    yb = []

    for (state, action, reward, done, next_state) in random.sample(memory, batch_size):
        with torch.no_grad():
            pred_reward = model(torch.tensor(state, dtype=torch.float).view(1,4))
            next_reward = target_model(torch.tensor(next_state, dtype=torch.float).view(1,4))
            actual_reward = pred_reward[0]
            actual_reward[action] = torch.tensor(reward, dtype=torch.float)
            if not done:
                actual_reward[action] += torch.max(next_reward[0]) * gamma

        yb.append(actual_reward)
        xb.append(state)
        
    xb = torch.tensor(xb, dtype=torch.float)
    yb = torch.stack(yb)
    # print(xb,yb, sep='\n---\n', end = '\n###\n')
    model.train()
    y_hat = model(xb)
    loss = F.mse_loss(y_hat, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # scheduler.step()

def run_episode():

    iter = 0
    running_reward = 0
    state = env.reset()
    done = False

    while not done:
        iter += 1

        action = get_action(state)
        next_state, reward, done, _ = env.step(action)

        if done and iter <  env._max_episode_steps:
            reward = -1
        
        remember(state, action, reward, done, next_state)
        running_reward += reward

        state = next_state
    
    return running_reward

def run():

    global epsilon 

    pbar = tqdm.tqdm(range(n_episodes))
    mean_reward = deque(maxlen=100)

    for e in pbar:

        reward = run_episode()
        mean_reward.append(reward)

        pbar.set_postfix({
            'episode_reward' : reward,
            'mean (100 eps.)' : np.mean(mean_reward),
            'epsilon' : epsilon
        })

        train()
        epsilon *= epsilon_decay

        if e % copy_step == 0:
            target_model.load_state_dict(model.state_dict())
            global optimizer
            optimizer = optim.Adam(model.parameters(), optimizer.state_dict()['param_groups'][0]['lr'])


In [5]:
run()

 12%|█▏        | 1203/10000 [00:51<06:57, 21.05it/s, episode_reward=122, mean (100 eps.)=79.6, epsilon=0.3]