In [None]:
import gym
import gym.spaces
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

In [None]:
replay_memory = np.empty((0,5))

learning_rate = 0.001
epsilon_decay = 0.995
epsilon = 1.0
epsilon_min = 0.01

n_episodes = 1000
batch_size = 32
gamma = 0.95

env = gym.make('CartPole-v1')

state_size = env.observation_space.shape[0]
action_size = env.action_space.n

In [None]:
def model(state_size, action_size):
    layers = []
    layers.append(nn.Linear(state_size, 24))
    layers.append(nn.ReLU())
    layers.append(nn.Linear(24,24))
    layers.append(nn.ReLU())
    layers.append(nn.Linear(24, action_size))
    return nn.Sequential(*layers).double()

In [None]:
def act(model, action_size, state, epsilon):    
    if np.random.rand() <= epsilon:
        return np.random.randint(action_size)

    act_values = model(Variable(torch.from_numpy(state))).data.numpy()
    return np.argmax(act_values[0])

In [None]:
def remember(state, action, reward, next_state, done):
    global replay_memory
    data = [state, action, reward, next_state, done]
    replay_memory = np.append(replay_memory, np.array([data]), axis=0)    

In [None]:
def random_sample(data, batch_size):    
    rand_idx = np.random.choice(data.shape[0], batch_size, replace=False)
    return data[rand_idx, :]

In [None]:
def replay(model, batch_size):
    minibatch = random_sample(np.array(replay_memory), batch_size)
    
    for state, action, reward, next_state, done in minibatch:        
        optimizer.zero_grad()   # zero the gradient buffers
        output = model(Variable(torch.from_numpy(state)))        
        target = reward
        if not done:
            target = reward + gamma * np.amax(model(Variable(torch.from_numpy(next_state))).data.numpy())

        target_f = model(Variable(torch.from_numpy(state)))
        target_f[0][action] = target
        loss = cost(output, target_f)    
        loss.backward()
        optimizer.step()
    global epsilon, epsilon_min, epsilon_decay
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

In [8]:
p_rewards = []
network = model(action_size=action_size, state_size=state_size)

optimizer = optim.Adam(network.parameters(), lr=learning_rate)
cost = nn.MSELoss()

for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(500):
        env.render()
        action = act(model=network, state=state, action_size=action_size, epsilon=epsilon)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10
        
        next_state = np.reshape(next_state, [1, state_size])
        remember(state, action, reward, next_state, done)        
        state = next_state
        if done:
            p_rewards.append(time)
            print("episode: {}/{}, score: {}, e: {:.4}".format(e, n_episodes, time, epsilon))
            break
        if len(replay_memory) > batch_size:
            replay(network, batch_size)            

episode: 0/1000, score: 12, e: 1.0
episode: 1/1000, score: 24, e: 0.9752
episode: 2/1000, score: 12, e: 0.9183
episode: 3/1000, score: 61, e: 0.6764
episode: 4/1000, score: 29, e: 0.5849
episode: 5/1000, score: 18, e: 0.5344
episode: 6/1000, score: 23, e: 0.4762
episode: 7/1000, score: 143, e: 0.2326
episode: 8/1000, score: 100, e: 0.1409
episode: 9/1000, score: 99, e: 0.08576
episode: 10/1000, score: 79, e: 0.05772
episode: 11/1000, score: 175, e: 0.02401
episode: 12/1000, score: 127, e: 0.0127
episode: 13/1000, score: 141, e: 0.009986
episode: 14/1000, score: 101, e: 0.009986
episode: 15/1000, score: 64, e: 0.009986
episode: 16/1000, score: 140, e: 0.009986
episode: 17/1000, score: 233, e: 0.009986
episode: 18/1000, score: 195, e: 0.009986
episode: 19/1000, score: 226, e: 0.009986
episode: 20/1000, score: 166, e: 0.009986
episode: 21/1000, score: 156, e: 0.009986
episode: 22/1000, score: 169, e: 0.009986
episode: 23/1000, score: 191, e: 0.009986
episode: 24/1000, score: 174, e: 0.009

episode: 195/1000, score: 116, e: 0.009986
episode: 196/1000, score: 276, e: 0.009986
episode: 197/1000, score: 499, e: 0.009986
episode: 198/1000, score: 279, e: 0.009986
episode: 199/1000, score: 194, e: 0.009986
episode: 200/1000, score: 329, e: 0.009986
episode: 201/1000, score: 174, e: 0.009986
episode: 202/1000, score: 499, e: 0.009986
episode: 203/1000, score: 237, e: 0.009986
episode: 204/1000, score: 234, e: 0.009986
episode: 205/1000, score: 254, e: 0.009986
episode: 206/1000, score: 227, e: 0.009986
episode: 207/1000, score: 152, e: 0.009986
episode: 208/1000, score: 187, e: 0.009986
episode: 209/1000, score: 155, e: 0.009986
episode: 210/1000, score: 136, e: 0.009986
episode: 211/1000, score: 111, e: 0.009986
episode: 212/1000, score: 499, e: 0.009986
episode: 213/1000, score: 138, e: 0.009986
episode: 214/1000, score: 10, e: 0.009986
episode: 215/1000, score: 19, e: 0.009986
episode: 216/1000, score: 68, e: 0.009986
episode: 217/1000, score: 11, e: 0.009986
episode: 218/10

episode: 387/1000, score: 447, e: 0.009986
episode: 388/1000, score: 244, e: 0.009986
episode: 389/1000, score: 227, e: 0.009986
episode: 390/1000, score: 20, e: 0.009986
episode: 391/1000, score: 40, e: 0.009986
episode: 392/1000, score: 72, e: 0.009986
episode: 393/1000, score: 58, e: 0.009986
episode: 394/1000, score: 66, e: 0.009986
episode: 395/1000, score: 70, e: 0.009986
episode: 396/1000, score: 78, e: 0.009986
episode: 397/1000, score: 81, e: 0.009986
episode: 398/1000, score: 150, e: 0.009986
episode: 399/1000, score: 87, e: 0.009986
episode: 400/1000, score: 101, e: 0.009986
episode: 401/1000, score: 90, e: 0.009986
episode: 402/1000, score: 87, e: 0.009986
episode: 403/1000, score: 48, e: 0.009986
episode: 404/1000, score: 39, e: 0.009986
episode: 405/1000, score: 74, e: 0.009986
episode: 406/1000, score: 83, e: 0.009986
episode: 407/1000, score: 86, e: 0.009986
episode: 408/1000, score: 91, e: 0.009986
episode: 409/1000, score: 116, e: 0.009986
episode: 410/1000, score: 82

episode: 583/1000, score: 8, e: 0.009986
episode: 584/1000, score: 8, e: 0.009986
episode: 585/1000, score: 9, e: 0.009986
episode: 586/1000, score: 9, e: 0.009986
episode: 587/1000, score: 8, e: 0.009986
episode: 588/1000, score: 8, e: 0.009986
episode: 589/1000, score: 8, e: 0.009986
episode: 590/1000, score: 8, e: 0.009986
episode: 591/1000, score: 9, e: 0.009986
episode: 592/1000, score: 7, e: 0.009986
episode: 593/1000, score: 7, e: 0.009986
episode: 594/1000, score: 9, e: 0.009986
episode: 595/1000, score: 7, e: 0.009986
episode: 596/1000, score: 9, e: 0.009986
episode: 597/1000, score: 9, e: 0.009986
episode: 598/1000, score: 8, e: 0.009986
episode: 599/1000, score: 7, e: 0.009986
episode: 600/1000, score: 7, e: 0.009986
episode: 601/1000, score: 9, e: 0.009986
episode: 602/1000, score: 8, e: 0.009986
episode: 603/1000, score: 10, e: 0.009986
episode: 604/1000, score: 9, e: 0.009986
episode: 605/1000, score: 9, e: 0.009986
episode: 606/1000, score: 9, e: 0.009986
episode: 607/10

episode: 783/1000, score: 8, e: 0.009986
episode: 784/1000, score: 9, e: 0.009986
episode: 785/1000, score: 8, e: 0.009986
episode: 786/1000, score: 8, e: 0.009986
episode: 787/1000, score: 7, e: 0.009986
episode: 788/1000, score: 11, e: 0.009986
episode: 789/1000, score: 10, e: 0.009986
episode: 790/1000, score: 8, e: 0.009986
episode: 791/1000, score: 9, e: 0.009986
episode: 792/1000, score: 8, e: 0.009986
episode: 793/1000, score: 8, e: 0.009986
episode: 794/1000, score: 10, e: 0.009986
episode: 795/1000, score: 9, e: 0.009986
episode: 796/1000, score: 9, e: 0.009986
episode: 797/1000, score: 9, e: 0.009986
episode: 798/1000, score: 9, e: 0.009986
episode: 799/1000, score: 9, e: 0.009986
episode: 800/1000, score: 8, e: 0.009986
episode: 801/1000, score: 9, e: 0.009986
episode: 802/1000, score: 8, e: 0.009986
episode: 803/1000, score: 9, e: 0.009986
episode: 804/1000, score: 7, e: 0.009986
episode: 805/1000, score: 9, e: 0.009986
episode: 806/1000, score: 7, e: 0.009986
episode: 807/

episode: 983/1000, score: 8, e: 0.009986
episode: 984/1000, score: 9, e: 0.009986
episode: 985/1000, score: 9, e: 0.009986
episode: 986/1000, score: 8, e: 0.009986
episode: 987/1000, score: 7, e: 0.009986
episode: 988/1000, score: 8, e: 0.009986
episode: 989/1000, score: 8, e: 0.009986
episode: 990/1000, score: 8, e: 0.009986
episode: 991/1000, score: 8, e: 0.009986
episode: 992/1000, score: 10, e: 0.009986
episode: 993/1000, score: 9, e: 0.009986
episode: 994/1000, score: 7, e: 0.009986
episode: 995/1000, score: 9, e: 0.009986
episode: 996/1000, score: 8, e: 0.009986
episode: 997/1000, score: 8, e: 0.009986
episode: 998/1000, score: 8, e: 0.009986
episode: 999/1000, score: 7, e: 0.009986


In [None]:
plt.plot(p_rewards, '-|')
plt.xlabel("Episode")
plt.ylabel("Reward")
plt.show()