# Deep Q-Network (DQN)
---
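In this notebook, you will implement a DQN agent that plays the card game provided by the project's `infra.game` module (not OpenAI Gym's LunarLander-v2 environment, which this notebook does not use).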

### 1. Import the Necessary Packages

In [1]:
import random
import torch
import numpy as np
from infra.game import Game
from infra.util import getValidMoves
import csv
import math
from collections import deque
import matplotlib.pyplot as plt
from dqn_agent import Agent
from tqdm import tqdm_notebook
%matplotlib inline

### 3. Train the Agent with DQN

Run the code cell below to train the agent from scratch. You are welcome to change the default parameter values of the function to see whether you can get better performance!

Alternatively, you can skip to the next step below (**4. Watch a Smart Agent!**), to load the saved model weights from a pre-trained agent.

In [None]:
def dqn(n_episodes=20000, eps_start=0.5, eps_end=0.02, eps_decay=0.999):
    """Train the module-level ``agent`` with Deep Q-Learning, one game per episode.

    Every episode builds ``Game(i_episode)`` and plays it until the game is won
    or no valid move is left.  Each transition is handed to ``agent.step`` with
    the change in ``game.score`` as its reward.  After the last episode the
    local Q-network weights are written to ``checkpoint.pth``.

    NOTE(review): earlier commented-out code in this function encoded a 'deal'
    move as action 1080 and a stack-to-stack move as
    ``(f*9 + t - (f < t))*12 + ncards - 1`` (f/t are zero-based stack ids) --
    presumably the same encoding ``agent.act`` returns; confirm in dqn_agent.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon

    Returns
    =======
        cp (list of float): cumulative fraction of won games, sampled every 100 episodes
        numCompleted (int): total number of games won during training
    """
    cp = []                            # cumulative completion rate, one entry per 100 episodes
    scores_window = deque(maxlen=100)  # final scores of the last 100 games
    numCompleted = 0                   # number of games won so far
    eps = eps_start                    # initialize epsilon
    i_episode = 0                      # stays 0 when n_episodes <= 0, so the summary below still works
    for i_episode in tqdm_notebook(range(1, n_episodes+1)):
        game = Game(i_episode)
        game.createGame((0, 0, 0, 0))
        game.startGame()
        score = game.score
        moves = getValidMoves(game.getVisibleState())
        # play one game until it is won or no move is available
        while not game.won() and moves:
            state = np.array(game.getStateForDQN())
            action = agent.act(state, moves, eps)
            game.performMovesForDQN(action)
            next_state = np.array(game.getStateForDQN())
            reward = game.score - score   # reward is the score change caused by this move
            won = game.won()
            if won:
                numCompleted += 1
            # `won` doubles as the terminal flag of the transition
            agent.step(state, action, reward, next_state, won)
            moves = getValidMoves(game.getVisibleState())
            score = game.score
        scores_window.append(score)       # save most recent score
        eps = max(eps_end, eps_decay*eps) # decrease epsilon
        if i_episode % 100 == 0:
            completed = numCompleted/i_episode
            cp.append(completed)
            print('\rEpisode {}\tAverage Score: {:.2f} \tCompleted: {}, eps: {}, lr: {}'.format(i_episode, np.mean(scores_window), completed, eps, agent.optimizer.param_groups[0]['lr']))
    print('\nRunning all {:d} episodes!\tNumber of completed game: {:.2f}'.format(i_episode, numCompleted))
    torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
    return cp, numCompleted

# Build a fresh agent; dqn() looks it up as a module-level global.
agent = Agent(state_size=141, action_size=1081, seed=0)
# Optional warm start from previously saved weights:
# agent.qnetwork_local.load_state_dict(torch.load('checkpoint-greedy.pth'))
cp, numCompleted = dqn()
# Alternative run with a lower starting epsilon:
# cp, numCompleted = dqn(20000, 0.02)

# Plot the cumulative completion rate returned by dqn().
print(numCompleted)
fig, ax = plt.subplots()
# dqn() appends one value to cp every 100 episodes, so point k belongs to episode (k+1)*100.
episodes = (np.arange(len(cp)) + 1) * 100
ax.plot(episodes, cp)
ax.set_ylabel('Completion rate')
ax.set_xlabel('Episode #')
plt.show()

Episode 100	Average Score: 505.89 	Completed: 0.01, eps: 0.4523960735568548, lr: 0.005
Episode 200	Average Score: 514.33 	Completed: 0.02, eps: 0.409324414739318, lr: 0.005
Episode 300	Average Score: 532.51 	Completed: 0.02666666666666667, eps: 0.37035351607804984, lr: 0.005
Episode 400	Average Score: 512.19 	Completed: 0.025, eps: 0.33509295300337016, lr: 0.005


In [10]:
# Display the learning rate stored in the first parameter group of the agent's optimizer.
agent.optimizer.param_groups[0]['lr']

0.005

### 4. Watch a Smart Agent!

In the next code cell, you will load the trained weights from file to watch a smart agent!

In [16]:
# Restore the weights stored in 'checkpoint.pth' into the agent's local Q-network.
agent.qnetwork_local.load_state_dict(
    torch.load('checkpoint.pth')
)

# Set up game #10000 and show the initial layout.
game = Game(10000)
game.createGame((0, 0, 0, 0))
game.startGame()
print(game)

# Let the agent pick every move until the game is won or no valid move is left.
moves = getValidMoves(game.getVisibleState())
while True:
    if game.won() or not moves:
        break
    state = np.array(game.getStateForDQN())
    action = agent.act(state, moves)
    game.performMovesForDQN(action)
    moves = getValidMoves(game.getVisibleState())

# Show the final layout followed by a one-line summary of the game.
print(game)
print(
    'Game#%d [%s] Score: %d   Moves: %d   Completed: %d' % (
        game.id,
        'Won' if game.won() else 'Lost',
        game.score,
        game.moves,
        game.completed,
    )
)

# Baseline: replay game #10000, always taking the first move the game itself proposes.
game = Game(10000)
game.createGame((0, 0, 0, 0))
game.startGame()
moves = game.getValidMoves()
while True:
    if game.won() or not moves:
        break
    game.performMoves(moves[0])
    moves = game.getValidMoves()

# Show the final layout followed by a one-line summary of the game.
print(game)
print(
    'Game#%d [%s] Score: %d   Moves: %d   Completed: %d' % (
        game.id,
        'Won' if game.won() else 'Lost',
        game.score,
        game.moves,
        game.completed,
    )
)

-♤ 10- -♤  K- -♤  A- -♤  3- -♤  8- -♤  7- -♤  4- -♤ 10- -♤  9- -♤  6-
-♤  2- -♤  9- -♤  Q- -♤  5- -♤  3- -♤ 10- -♤  6- -♤  Q- -♤  5- -♤  5-
-♤  9- -♤  A- -♤  3- -♤  9- -♤  7- -♤  K- -♤  Q- -♤  3- -♤  2- -♤  K-
-♤  9- -♤  4- -♤ 10- -♤  8- -♤  A- -♤  8- -♤  7- -♤  7- -♤ 10- -♤  2-
-♤  7- -♤  3- -♤  K- -♤  Q-  ♤  8   ♤  3   ♤  9   ♤  7   ♤  Q   ♤  2 
 ♤ 10   ♤  A   ♤  Q   ♤  2                                           
Game#10000 Score: 500   Moves: 0   Completed: 0   Undealt: 5
-♤ 10- -♤  K- -♤  A- -♤  3- -♤  8- -♤  7- -♤  4- -♤ 10- -♤  9- -♤  6-
-♤  2- -♤  9- -♤  Q- -♤  5- -♤  3- -♤ 10- -♤  6- -♤  Q- -♤  5- -♤  5-
-♤  9- -♤  A- -♤  3- -♤  9- -♤  7- -♤  K- -♤  Q- -♤  3- -♤  2- -♤  K-
-♤  9- -♤  4- -♤ 10- -♤  8- -♤  A- -♤  8- -♤  7- -♤  7- -♤ 10- -♤  2-
-♤  7- -♤  3- -♤  K- -♤  Q-  ♤  8   ♤  3   ♤  9   ♤  7   ♤  Q   ♤  2 
 ♤ 10   ♤  A   ♤  Q   ♤  2   ♤  8   ♤  2   ♤  J   ♤  6   ♤  5   ♤  5 
 ♤ 10   ♤  6   ♤  J   ♤  J   ♤  4   ♤  K   ♤  J   ♤  5   ♤ 10   ♤  A 
 ♤  4   ♤  J          ♤  J   

In [4]:
# Evaluate the current agent on games 0..count-1 and log one CSV row per game.
high_score = 0   # best final score among won games
win_count = 0    # number of games won
count = 1000     # number of games to play
# agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))
try:
    # newline='' is required by the csv module so it controls line endings itself
    # (otherwise extra blank rows appear on platforms that translate '\n').
    with open('output/dqn-1suit.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('ID', 'Result', 'Score', '#Moves', '#Stacks'))
        for i in tqdm_notebook(range(count)):
            game = Game(i)
            game.createGame((0, 0, 0, 0))
            game.startGame()
            moves = getValidMoves(game.getVisibleState())
            # play until the game is won or no valid move is left
            while not game.won() and moves:
                state = np.array(game.getStateForDQN())
                # no eps argument: agent.act falls back to its own default (defined in dqn_agent)
                action = agent.act(state, moves)
                game.performMovesForDQN(action)
                if game.won():
                    high_score = max(high_score, game.score)
                    win_count += 1
                moves = getValidMoves(game.getVisibleState())
            writer.writerow([str(field) for field in (
                game.id, 'Won' if game.won() else 'Lost',
                game.score, game.moves, game.completed)])
except KeyboardInterrupt:
    # Deliberate: allow a manual stop and still print the summary below.
    # Note the summary keeps `count` as the denominator even after an early stop.
    pass
print('Won %d/%d games, high score is %d' % (win_count, count, high_score))



Won 42/1000 games, high score is 1200


In [19]:
# Evaluate the checkpointed agent on games 0..count-1 with eps=1 and log one CSV row per game.
high_score = 0   # best final score among won games
win_count = 0    # number of games won
count = 1000     # number of games to play
agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))
try:
    # NOTE(review): this is the same path the plain DQN evaluation cell writes to,
    # so running this cell overwrites that file -- confirm this is intended.
    # newline='' is required by the csv module so it controls line endings itself
    # (otherwise extra blank rows appear on platforms that translate '\n').
    with open('output/dqn-1suit.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('ID', 'Result', 'Score', '#Moves', '#Stacks'))
        for i in tqdm_notebook(range(count)):
            game = Game(i)
            game.createGame((0, 0, 0, 0))
            game.startGame()
            moves = getValidMoves(game.getVisibleState())
            # play until the game is won or no valid move is left
            while not game.won() and moves:
                state = np.array(game.getStateForDQN())
                # third argument is eps; with 1 the agent presumably always explores
                # (i.e. acts randomly) -- TODO confirm against Agent.act
                action = agent.act(state, moves, 1)
                game.performMovesForDQN(action)
                if game.won():
                    high_score = max(high_score, game.score)
                    win_count += 1
                moves = getValidMoves(game.getVisibleState())
            writer.writerow([str(field) for field in (
                game.id, 'Won' if game.won() else 'Lost',
                game.score, game.moves, game.completed)])
except KeyboardInterrupt:
    # Deliberate: allow a manual stop and still print the summary below.
    # Note the summary keeps `count` as the denominator even after an early stop.
    pass
print('Won %d/%d games, high score is %d' % (win_count, count, high_score))


Won 10/1000 games, high score is 1204


In [None]:

# Evaluate the baseline policy on games 0..count-1 and log one CSV row per game.
high_score = 0   # best final score among won games
win_count = 0    # number of games won
count = 1000     # number of games to play
try:
    # newline='' is required by the csv module so it controls line endings itself
    # (otherwise extra blank rows appear on platforms that translate '\n').
    with open('output/greedy-1suit.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('ID', 'Result', 'Score', '#Moves', '#Stacks'))
        for i in tqdm_notebook(range(count)):
            game = Game(i)
            game.createGame((0, 0, 0, 0))
            game.startGame()
            moves = game.getValidMoves()
            # play until the game is won or no valid move is left
            while not game.won() and moves:
                # take the first listed move, except about 1% of the time take the last one
                move = moves[0] if random.random() > 0.01 else moves[-1]
                game.performMoves(move)
                if game.won():
                    high_score = max(high_score, game.score)
                    win_count += 1
                moves = game.getValidMoves()
            writer.writerow([str(field) for field in (
                game.id, 'Won' if game.won() else 'Lost',
                game.score, game.moves, game.completed)])
except KeyboardInterrupt:
    # Deliberate: allow a manual stop and still print the summary below.
    # Note the summary keeps `count` as the denominator even after an early stop.
    pass
print('Won %d/%d games, high score is %d' % (win_count, count, high_score))