In [0]:
import torch
from collections import deque
import random
from torch.autograd import Variable
from cyber_env import cyber
import time

In [0]:
class DQN():
    """Small fully connected Q-network with an experience-replay training step.

    The model is ``Linear(n_state, n_hidden) -> ReLU -> Linear(n_hidden, n_action)``,
    optimised with Adam against a mean-squared-error loss.
    """

    def __init__(self, n_state, n_action, n_hidden=50, lr=0.05):
        """Create the network, the loss and the optimizer.

        Args:
            n_state: Number of input features of the first linear layer.
            n_action: Number of outputs of the last linear layer (one Q-value
                per action).
            n_hidden: Width of the single hidden layer.
            lr: Learning rate given to Adam.
        """
        self.criterion = torch.nn.MSELoss()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(n_state, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_action),
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    @staticmethod
    def _as_tensor(data):
        """Convert ``data`` to a floating-point tensor of the default dtype.

        Replaces the legacy ``torch.Tensor(data)`` constructor, which silently
        allocates an *uninitialised* tensor when handed a bare integer.
        """
        return torch.as_tensor(data, dtype=torch.get_default_dtype())

    def update(self, s, y):
        """Run one gradient step fitting the network output for ``s`` to ``y``.

        Args:
            s: A state or a batch (sequence) of states.
            y: Target Q-values with the same leading shape as the network
                output for ``s``.
        """
        y_pred = self.model(self._as_tensor(s))
        # Targets are plain data; the deprecated ``Variable`` wrapper is not needed.
        loss = self.criterion(y_pred, self._as_tensor(y))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def predict(self, s):
        """Return the network output for ``s`` without tracking gradients.

        Args:
            s: A state or a batch (sequence) of states.

        Returns:
            A tensor of Q-values produced under ``torch.no_grad()``.
        """
        with torch.no_grad():
            return self.model(self._as_tensor(s))

    def replay(self, memory, replay_size, gamma):
        """Sample ``replay_size`` transitions from ``memory`` and train on them.

        Does nothing while ``memory`` holds fewer than ``replay_size`` entries.

        Args:
            memory: Sequence of ``(state, action, next_state, reward, is_done)``
                tuples.
            replay_size: Number of transitions to sample (without replacement).
            gamma: Discount factor applied to the best next-state Q-value.
        """
        if len(memory) < replay_size:
            return

        batch = random.sample(memory, replay_size)
        states = [transition[0] for transition in batch]

        # One batched forward pass gives the current Q-values of every sampled
        # state; only the entry of the action actually taken is overwritten, so
        # the other actions contribute zero error to the loss.
        td_targets = self.predict(states).tolist()
        for target_row, (_, action, next_state, reward, is_done) in zip(td_targets, batch):
            if is_done:
                # Terminal transition: no future return to bootstrap from.
                target_row[action] = reward
            else:
                best_next_q = torch.max(self.predict(next_state)).item()
                target_row[action] = reward + gamma * best_next_q

        self.update(states, td_targets)

def gen_epsilon_greedy_policy(estimator, epsilon, n_action):
    """Build an epsilon-greedy action selector around a Q-value estimator.

    Args:
        estimator: Object exposing ``predict(state)`` that returns a tensor of
            Q-values.
        epsilon: Threshold compared against ``random.random()``; draws below it
            trigger a random action.
        n_action: Number of actions; random actions are drawn from
            ``0 .. n_action - 1`` inclusive.

    Returns:
        A function ``policy_function(state)`` returning an action index.
    """
    def policy_function(state):
        explore = random.random() < epsilon
        if explore:
            # Exploration: any action, chosen uniformly.
            return random.randint(0, n_action - 1)
        # Exploitation: index of the largest predicted Q-value.
        best = torch.argmax(estimator.predict(state))
        return best.item()

    return policy_function


def q_learning(env, estimator, n_episode, replay_size, gamma=1.0, epsilon=0.1, epsilon_decay=.99,
               n_action=None, memory=None, total_reward_episode=None):
    """Train ``estimator`` with epsilon-greedy Q-learning and experience replay.

    For every episode the environment is reset, actions are chosen by an
    epsilon-greedy policy, each transition is appended to ``memory`` and, after
    every non-terminal step, ``estimator.replay`` is invoked. Per-episode
    wall-clock time and reward are printed, then epsilon is decayed.

    Args:
        env: Environment exposing ``_reset()`` (returns a state) and
            ``_step(action)`` (returns ``(next_state, reward, is_done, info)``).
        estimator: Object exposing ``replay(memory, replay_size, gamma)`` and
            whatever ``gen_epsilon_greedy_policy`` needs from it.
        n_episode: Number of episodes to run.
        replay_size: Sample size forwarded to ``estimator.replay``.
        gamma: Discount factor forwarded to ``estimator.replay``.
        epsilon: Initial exploration rate.
        epsilon_decay: Multiplier applied to epsilon after each episode; the
            result is floored at 0.01.
        n_action: Number of actions for the policy. Defaults to the
            notebook-level global ``n_action``.
        memory: Replay buffer supporting ``append``. Defaults to the
            notebook-level global ``memory``.
        total_reward_episode: Mutable sequence with at least ``n_episode``
            entries; ``total_reward_episode[i]`` accumulates episode ``i``'s
            reward in place. Defaults to the notebook-level global of that name.

    Raises:
        NameError: If one of the last three arguments is omitted and the
            corresponding global is not defined.
    """
    def _from_notebook(name, value):
        # Earlier revisions read these names straight from notebook globals;
        # keep that working when the caller does not pass them explicitly.
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError("name {!r} is not defined".format(name)) from None

    n_action = _from_notebook("n_action", n_action)
    memory = _from_notebook("memory", memory)
    total_reward_episode = _from_notebook("total_reward_episode", total_reward_episode)

    for episode in range(n_episode):
        start = time.time()
        # The policy is rebuilt each episode so it picks up the decayed epsilon.
        policy = gen_epsilon_greedy_policy(estimator, epsilon, n_action)
        state = env._reset()
        is_done = False
        while not is_done:
            action = policy(state)
            next_state, reward, is_done, _ = env._step(action)
            total_reward_episode[episode] += reward
            memory.append((state, action, next_state, reward, is_done))

            if is_done:
                # The terminal transition is stored above but no replay step is
                # run for it here. The reset is kept from the original code; the
                # next episode resets again at its start.
                env._reset()
                break

            estimator.replay(memory, replay_size, gamma)
            state = next_state
        end = time.time()
        print(end - start)
        print('Episode: {}, total reward: {}, epsilon: {}'.format(episode, total_reward_episode[episode], epsilon))

        epsilon = max(epsilon * epsilon_decay, 0.01)

In [4]:
# Seed the RNGs this notebook draws from directly: `random` drives the
# epsilon-greedy choices and replay sampling, `torch` drives weight init.
# NOTE(review): any randomness inside `cyber` itself is not covered here - confirm
# whether the environment needs its own seed for fully repeatable runs.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

# Environment and problem dimensions.
env = cyber()
n_state = env.observation_space.n
n_action = env.action_space.n

# Network / optimizer hyperparameters.
n_hidden = 50
lr = 0.001
dqn = DQN(n_state, n_action, n_hidden, lr)

# Replay buffer: only the 100 most recent transitions are kept.
memory = deque(maxlen=100)

# Training schedule. `q_learning` reads `n_action`, `memory` and
# `total_reward_episode` from this cell's globals and fills the latter in place.
n_episode = 500
replay_size = 20
total_reward_episode = [0] * n_episode
q_learning(env, dqn, n_episode, replay_size, gamma=.9, epsilon=.3)

0.41179680824279785
Episode: 0, total reward: 33, epsilon: 0.3
0.25401735305786133
Episode: 1, total reward: -45, epsilon: 0.297
0.22507023811340332
Episode: 2, total reward: 31, epsilon: 0.29402999999999996
0.24963068962097168
Episode: 3, total reward: -5, epsilon: 0.29108969999999995
0.27486371994018555
Episode: 4, total reward: 29, epsilon: 0.28817880299999993
0.2178199291229248
Episode: 5, total reward: 35, epsilon: 0.28529701496999993
0.23909354209899902
Episode: 6, total reward: -35, epsilon: 0.28244404482029994
0.22369885444641113
Episode: 7, total reward: -21, epsilon: 0.27961960437209693
0.2690267562866211
Episode: 8, total reward: 27, epsilon: 0.276823408328376
0.26422667503356934
Episode: 9, total reward: 31, epsilon: 0.27405517424509224
0.23207998275756836
Episode: 10, total reward: -39, epsilon: 0.2713146225026413
0.22159051895141602
Episode: 11, total reward: 5, epsilon: 0.2686014762776149
0.26483702659606934
Episode: 12, total reward: -35, epsilon: 0.26591546151483875
0.