# Original 64 Player Pitt Goal Wall  

In [220]:
import numpy as np
import random
import sys

def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks)
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        for name, piece in self.components.items():
            pos = (layer,) + piece.pos
            displ_board[pos] = 1
            layer += 1

        for name, mask in self.masks.items():
            print('Hello')
            x,y = self.masks['boundary'].get_positions()
            z = np.repeat(layer,len(x))
            a = (z,x,y)
            displ_board[a] = 1
            layer += 1
            
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [221]:
class Gridworld:

    def __init__(self, size=4, mode='static'):
        if size >= 4:
            self.board = GridBoard(size=size)
        else:
            print("Minimum board size is 4. Initialized to size 4.")
            self.board = GridBoard(size=4)

        #Add pieces, positions will be updated later
        self.board.addPiece('Player','P',(0,0))
        self.board.addPiece('Goal','+',(1,0))
        self.board.addPiece('Pit','-',(2,0))
        self.board.addPiece('Wall','W',(3,0))

        if mode == 'static':
            self.initGridStatic()
        elif mode == 'player':
            self.initGridPlayer()
        else:
            self.initGridRand()

    #Initialize stationary grid, all items are placed deterministically
    def initGridStatic(self):
        #Setup static pieces
        self.board.components['Player'].pos = (0,3) #Row, Column
        self.board.components['Goal'].pos = (0,0)
        self.board.components['Pit'].pos = (0,1)
        self.board.components['Wall'].pos = (1,1)

    #Check if board is initialized appropriately (no overlapping pieces)
    #also remove impossible-to-win boards
    def validateBoard(self):
        valid = True

        player = self.board.components['Player']
        goal = self.board.components['Goal']
        wall = self.board.components['Wall']
        pit = self.board.components['Pit']

        all_positions = [piece for name,piece in self.board.components.items()]
        all_positions = [player.pos, goal.pos, wall.pos, pit.pos]
        if len(all_positions) > len(set(all_positions)):
            return False

        corners = [(0,0),(0,self.board.size), (self.board.size,0), (self.board.size,self.board.size)]
        #if player is in corner, can it move? if goal is in corner, is it blocked?
        if player.pos in corners or goal.pos in corners:
            val_move_pl = [self.validateMove('Player', addpos) for addpos in [(0,1),(1,0),(-1,0),(0,-1)]]
            val_move_go = [self.validateMove('Goal', addpos) for addpos in [(0,1),(1,0),(-1,0),(0,-1)]]
            if 0 not in val_move_pl or 0 not in val_move_go:
                #print(self.display())
                #print("Invalid board. Re-initializing...")
                valid = False

        return valid

    #Initialize player in random location, but keep wall, goal and pit stationary
    def initGridPlayer(self):
        #height x width x depth (number of pieces)
        self.initGridStatic()
        #place player
        self.board.components['Player'].pos = randPair(0,self.board.size)

        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridPlayer()

    #Initialize grid so that goal, pit, wall, player are all randomly placed
    def initGridRand(self):
        #height x width x depth (number of pieces)
        self.board.components['Player'].pos = randPair(0,self.board.size)
        self.board.components['Goal'].pos = randPair(0,self.board.size)
        self.board.components['Pit'].pos = randPair(0,self.board.size)
        self.board.components['Wall'].pos = randPair(0,self.board.size)

        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridRand()

    def validateMove(self, piece, addpos=(0,0)):
        outcome = 0 #0 is valid, 1 invalid, 2 lost game
        pit = self.board.components['Pit'].pos
        wall = self.board.components['Wall'].pos
        new_pos = addTuple(self.board.components[piece].pos, addpos)
        if new_pos == wall:
            outcome = 1 #block move, player can't move to wall
        elif max(new_pos) > (self.board.size-1):    #if outside bounds of board
            outcome = 1
        elif min(new_pos) < 0: #if outside bounds
            outcome = 1
        elif new_pos == pit:
            outcome = 2

        return outcome

    def makeMove(self, action):
        #need to determine what object (if any) is in the new grid spot the player is moving to
        #actions in {u,d,l,r}
        def checkMove(addpos):
            if self.validateMove('Player', addpos) in [0,2]:
                new_pos = addTuple(self.board.components['Player'].pos, addpos)
                self.board.movePiece('Player', new_pos)

        if action == 'u': #up
            checkMove((-1,0))
        elif action == 'd': #down
            checkMove((1,0))
        elif action == 'l': #left
            checkMove((0,-1))
        elif action == 'r': #right
            checkMove((0,1))
        else:
            pass

    def reward(self):
        if (self.board.components['Player'].pos == self.board.components['Pit'].pos):
            return -10
        elif (self.board.components['Player'].pos == self.board.components['Goal'].pos):
            return 10
        else:
            return -1

    def display(self):
        return self.board.render()

In [222]:
action_set = {
    0: 'u',
    1: 'd',
    2: 'l',
    3: 'r',
}

In [223]:
def test_model(model, mode='static', display=True):
    i = 0
    test_game = Gridworld(mode=mode)
    state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
    state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    while(status == 1): #A
        qval = model(state)
        qval_ = qval.data.numpy()
        action_ = np.argmax(qval_) #B
        action = action_set[action_]
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
        state = torch.from_numpy(state_).float()
        if display:
            print(test_game.display())
        reward = test_game.reward()
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [224]:
game = Gridworld(size=4, mode='static')
game.board.render_np()

4


array([[[0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[1, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]]], dtype=uint8)

In [225]:
game.makeMove('d')
game.board.render_np()

4


array([[[0, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[1, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]]], dtype=uint8)

In [226]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

l1 = 64
l2 = 150
l3 = 100
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3,l4)
)
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0

In [227]:
from collections import deque
epochs = 1000
losses = []
mem_size = 1000 #A
batch_size = 200 #B
replay = deque(maxlen=mem_size) #C
max_moves = 50 #D
h = 0
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1): 
        mov += 1
        qval = model(state1) #E
        qval_ = qval.data.numpy()
        if (random.random() < epsilon): #F
            action_ = np.random.randint(0,4)
        else:
            action_ = np.argmax(qval_)
        
        action = action_set[action_]
        game.makeMove(action)
        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
        state2 = torch.from_numpy(state2_).float()
        reward = game.reward()
        done = True if reward > 0 else False
        exp =  (state1, action_, reward, state2, done) #G
        replay.append(exp) #H
        state1 = state2
        
        if len(replay) > batch_size: #I
            minibatch = random.sample(replay, batch_size) #J
            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])
            
            Q1 = model(state1_batch) #L
            with torch.no_grad():
                Q2 = model(state2_batch) #M
            
            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            loss = loss_fn(X, Y.detach())
            print(i)
#             clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()

        if reward != -1 or mov > max_moves: #O
            status = 0
            mov = 0
losses = np.array(losses)


4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
15
4
15
4
15
4
15
4
15
4
15
4
15
4
4
16
4
16
4
16
4
16
4
16
4
16
4
16
4
16
4
16
4
16
4
16
4
16
4
16
4
16
4
16
4
16
4
16
4
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
17
4
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
18
4
4
19
4
19
4
19
4
19
4
19
4
19
4
19
4
19
4
19
4
4
20
4
20
4
20
4
20
4
20
4
20
4
2

4
107
4
107
4
107
4
107
4
107
4
4
108
4
108
4
108
4
108
4
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
109
4
4
110
4
110
4
110
4
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
111
4
4
112
4
112
4
112
4
112
4
112
4
112
4
112
4
112
4
112
4
4
113
4
113
4
113
4
113
4
4
114
4
114
4
114
4
114
4
114
4
114
4
114
4
114
4
114
4
114
4
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
115
4
4
116
4
116
4
11

186
4
186
4
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
187
4
4
188
4
4
189
4
189
4
189
4
189
4
189
4
189
4
189
4
4
190
4
4
191
4
191
4
191
4
191
4
191
4
4
192
4
4
193
4
193
4
193
4
193
4
193
4
193
4
193
4
193
4
193
4
193
4
193
4
193
4
193
4
193
4
193
4
193
4
193
4
4
194
4
194
4
4
195
4
195
4
195
4
195
4
195
4
195
4
195
4
195
4
195
4
4
196
4
196
4
196
4
196
4
4
197
4
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
198
4
4
199
4
199
4
4
200
4
200
4
200
4
200
4
4
201
4
201
4
201
4
201
4
201
4
201
4
201
4
201
4
201
4
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
202
4
4
203
4
203
4
203
4
203
4
203
4
203
4
4
204
4
204
4
204
4
204
4
204
4
204
4
204
4
204
4
204
4
204


4
291
4
291
4
291
4
291
4
291
4
291
4
4
292
4
292
4
292
4
292
4
292
4
292
4
292
4
292
4
292
4
292
4
292
4
292
4
292
4
292
4
292
4
292
4
4
293
4
293
4
293
4
293
4
293
4
4
294
4
294
4
294
4
294
4
294
4
294
4
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
295
4
4
296
4
296
4
296
4
296
4
296
4
296
4
296
4
4
297
4
297
4
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
298
4
4
299
4
299
4
299
4
299
4
299
4
299
4
299
4
299
4
299
4
299
4
299
4
299
4
299
4
4
300
4
300
4
300
4
300
4
300
4
300
4
300
4
300
4
300
4
300
4
300
4
300
4
300
4
300
4
300
4
300
4
300
4
300
4
4
301
4
301
4
301
4
301
4
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302
4
302


373
4
373
4
4
374
4
4
375
4
375
4
375
4
375
4
375
4
375
4
375
4
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
376
4
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
377
4
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
378
4
4
379
4
379
4
379
4
379
4
379
4
379
4
379
4
379
4
379
4
379


4
456
4
456
4
456
4
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
457
4
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
458
4
4
459
4
459
4
4
460
4
4
461
4
461
4
461
4
461
4
461
4
461
4
461
4
461
4
461
4
461
4
461
4
461
4
461
4
461
4
461
4
461
4
461
4
4
462
4
462
4
462
4
462
4
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
463
4
4
464
4
464
4
464
4
464
4
464
4
464
4
464
4
464
4
4
465
4
465
4
465
4
465
4
465
4
465
4
465
4
465
4
465
4
465
4
465
4
4
466
4
466
4
466
4
466
4
466
4
466
4
466
4
466
4
466
4
466
4
466
4
466
4
466
4
466
4


547
4
547
4
547
4
547
4
547
4
547
4
547
4
547
4
547
4
547
4
547
4
547
4
4
548
4
4
549
4
549
4
549
4
549
4
549
4
549
4
549
4
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
550
4
4
551
4
551
4
551
4
551
4
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
552
4
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
553
4
4
554
4
554
4
554
4
554
4
554
4
554
4
554
4
554
4
554
4
554
4
554
4
554
4
554
4
554
4
554
4
554
4
554
4
4
555
4
555
4
4
556
4
556
4
556
4
556
4
556
4
556
4
556
4
556
4
556
4
556
4
556
4
4
557
4
557
4
557
4
557
4
557
4
4
558
4
558
4
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
559
4
4
560
4
4
561
4
4
562


4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
631
4
4
632
4
632
4
632
4
4
633
4
633
4
4
634
4
634
4
634
4
634
4
634
4
634
4
634
4
4
635
4
635
4
635
4
635
4
635
4
635
4
4
636
4
636
4
4
637
4
637
4
637
4
4
638
4
638
4
638
4
638
4
638
4
4
639
4
639
4
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
640
4
4
641
4
641
4
641
4
641
4
641
4
641
4
4
642
4
4
643
4
643
4
643
4
643
4
643
4
643
4
643
4
643
4
4
644
4
644
4
644
4
644
4
644
4
644
4
4
645
4
645
4
645
4
645
4
645
4
645
4
645
4
4
646
4
646
4
646
4
646
4
646
4
646
4
646
4
646
4
646
4
646
4
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
647
4
4
648
4
648
4
648
4
648
4
648
4
648
4
648


4
722
4
722
4
4
723
4
723
4
723
4
723
4
723
4
723
4
723
4
723
4
723
4
723
4
723
4
723
4
723
4
723
4
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
724
4
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
725
4
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
726
4
4
727
4
727
4
4
728
4
728
4
728
4
728
4
728
4
728
4
728
4
728
4
728
4
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
729
4
4
730


4
815
4
4
816
4
816
4
816
4
816
4
4
817
4
817
4
817
4
817
4
817
4
817
4
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
818
4
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
819
4
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
820
4
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
821
4
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
822
4
4
823
4
823
4
823
4
823
4
823
4
823
4
823
4
823
4
823
4
823
4
823


900
4
900
4
900
4
900
4
4
901
4
901
4
901
4
901
4
901
4
901
4
901
4
901
4
901
4
901
4
901
4
901
4
901
4
901
4
901
4
4
902
4
902
4
902
4
902
4
902
4
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
903
4
4
904
4
904
4
904
4
904
4
904
4
904
4
4
905
4
905
4
905
4
905
4
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
906
4
4
907
4
907
4
907
4
907
4
907
4
907
4
907
4
907
4
907
4
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
908
4
4
909
4
909
4
909
4
909
4
909
4
909
4
909
4
909
4
909
4
909
4
909
4
909
4
909
4
909
4
4
910
4
4
911
4
911
4
911
4
911
4
911
4
911
4
911
4
911
4
911
4
911
4
4
912
4
912
4
912
4
912
4
4
91

4
977
4
977
4
4
978
4
978
4
978
4
978
4
978
4
978
4
978
4
978
4
978
4
978
4
978
4
978
4
978
4
978
4
978
4
978
4
4
979
4
979
4
979
4
979
4
979
4
979
4
979
4
979
4
979
4
979
4
4
980
4
980
4
980
4
980
4
4
981
4
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
982
4
4
983
4
983
4
983
4
983
4
983
4
983
4
983
4
983
4
983
4
983
4
983
4
983
4
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
984
4
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
985
4
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
986
4
98

In [101]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 684
0.684
0.684
Games played: 1000, # of wins: 671
0.671
1.355
Games played: 1000, # of wins: 672
0.672
2.027
Games played: 1000, # of wins: 661
0.661
2.688
Games played: 1000, # of wins: 682
0.682
3.37
Games played: 1000, # of wins: 667
0.667
4.037
Games played: 1000, # of wins: 649
0.649
4.686
Games played: 1000, # of wins: 682
0.682
5.368
Games played: 1000, # of wins: 694
0.694
6.062
Games played: 1000, # of wins: 673
0.673
6.735
Win percentage: 67.35%


# Without Mask

In [89]:
import numpy as np
import random
import sys

def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks)
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        for name, piece in self.components.items():
            pos = (layer,) + piece.pos
            displ_board[pos] = 1
            layer += 1
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [90]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

l1 = 64
l2 = 150
l3 = 100
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3,l4)
)
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0

In [91]:
from collections import deque
epochs = 1000
losses = []
mem_size = 1000 #A
batch_size = 200 #B
replay = deque(maxlen=mem_size) #C
max_moves = 50 #D
h = 0
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1): 
        mov += 1
        qval = model(state1) #E
        qval_ = qval.data.numpy()
        if (random.random() < epsilon): #F
            action_ = np.random.randint(0,4)
        else:
            action_ = np.argmax(qval_)
        
        action = action_set[action_]
        game.makeMove(action)
        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
        state2 = torch.from_numpy(state2_).float()
        reward = game.reward()
        done = True if reward > 0 else False
        exp =  (state1, action_, reward, state2, done) #G
        replay.append(exp) #H
        state1 = state2
        
        if len(replay) > batch_size: #I
            minibatch = random.sample(replay, batch_size) #J
            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])
            
            Q1 = model(state1_batch) #L
            with torch.no_grad():
                Q2 = model(state2_batch) #M
            
            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            loss = loss_fn(X, Y.detach())
            print(i, loss.item())
            clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()

        if reward != -1 or mov > max_moves: #O
            status = 0
            mov = 0
losses = np.array(losses)

999 0.0990048423409462


In [92]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 654
0.654
0.654
Games played: 1000, # of wins: 650
0.65
1.304
Games played: 1000, # of wins: 634
0.634
1.9380000000000002
Games played: 1000, # of wins: 642
0.642
2.58
Games played: 1000, # of wins: 650
0.65
3.23
Games played: 1000, # of wins: 653
0.653
3.883
Games played: 1000, # of wins: 646
0.646
4.529
Games played: 1000, # of wins: 629
0.629
5.1579999999999995
Games played: 1000, # of wins: 611
0.611
5.768999999999999
Games played: 1000, # of wins: 659
0.659
6.427999999999999
Win percentage: 64.27999999999999%


# Original 64 Pitt Player Goal Wall 

In [102]:
import numpy as np
import random
import sys

def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks)
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        pos = []
        for name, piece in self.components.items():
            if name == 'Player':
                pos = (1,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Pit':
                pos = (0,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Goal':
                pos = (2,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Wall':
                pos = (3,) + piece.pos
                displ_board[pos] = 1
                layer += 1

        for name, mask in self.masks.items():
            x,y = self.masks['boundary'].get_positions()
            z = np.repeat(layer,len(x))
            a = (z,x,y)
            displ_board[a] = 1
            layer += 1
            print(mask)
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [103]:
game = Gridworld(size=4, mode='random')
game.board.render_np()

array([[[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 0]],

       [[0, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 0]],

       [[0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]]], dtype=uint8)

In [104]:
game.makeMove('d')
game.board.render_np()

array([[[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 0]],

       [[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 0]],

       [[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 0]],

       [[0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]]], dtype=uint8)

In [105]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

l1 = 64
l2 = 150
l3 = 100
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3,l4)
)
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0

In [106]:
from collections import deque
epochs = 1000
losses = []
mem_size = 1000 #A
batch_size = 200 #B
replay = deque(maxlen=mem_size) #C
max_moves = 50 #D
h = 0
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1): 
        mov += 1
        qval = model(state1) #E
        qval_ = qval.data.numpy()
        if (random.random() < epsilon): #F
            action_ = np.random.randint(0,4)
        else:
            action_ = np.argmax(qval_)
        
        action = action_set[action_]
        game.makeMove(action)
        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
        state2 = torch.from_numpy(state2_).float()
        reward = game.reward()
        done = True if reward > 0 else False
        exp =  (state1, action_, reward, state2, done) #G
        replay.append(exp) #H
        state1 = state2
        
        if len(replay) > batch_size: #I
            minibatch = random.sample(replay, batch_size) #J
            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])
            
            Q1 = model(state1_batch) #L
            with torch.no_grad():
                Q2 = model(state2_batch) #M
            
            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            loss = loss_fn(X, Y.detach())
            print(i, loss.item())
            clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()

        if reward != -1 or mov > max_moves: #O
            status = 0
            mov = 0
losses = np.array(losses)

999 0.08431217819452286


In [107]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 655
0.655
0.655
Games played: 1000, # of wins: 658
0.658
1.3130000000000002
Games played: 1000, # of wins: 646
0.646
1.959
Games played: 1000, # of wins: 672
0.672
2.6310000000000002
Games played: 1000, # of wins: 672
0.672
3.3030000000000004
Games played: 1000, # of wins: 666
0.666
3.9690000000000003
Games played: 1000, # of wins: 640
0.64
4.609
Games played: 1000, # of wins: 654
0.654
5.263
Games played: 1000, # of wins: 624
0.624
5.887
Games played: 1000, # of wins: 653
0.653
6.539999999999999
Win percentage: 65.39999999999999%


# Original 64 Pitt Goal Wall Player

In [77]:
import numpy as np
import random
import sys

def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks)
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        pos = []
        for name, piece in self.components.items():
            if name == 'Player':
                pos = (3,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Pit':
                pos = (0,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Goal':
                pos = (1,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Wall':
                pos = (2,) + piece.pos
                displ_board[pos] = 1
                layer += 1
        print(self.masks.items())
        for name, mask in self.masks.items():
            x,y = self.masks['boundary'].get_positions()
            z = np.repeat(layer,len(x))
            a = (z,x,y)
            displ_board[a] = 1
            layer += 1
            print(mask)
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [78]:
game = Gridworld(size=4, mode='random')
game.board.render_np()

dict_items([])


array([[[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 0]],

       [[0, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 0, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 0]]], dtype=uint8)

In [108]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

l1 = 64
l2 = 150
l3 = 100
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3,l4)
)
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0

In [109]:
from collections import deque
epochs = 1000
losses = []
mem_size = 1000 #A
batch_size = 200 #B
replay = deque(maxlen=mem_size) #C
max_moves = 50 #D
h = 0
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1): 
        mov += 1
        qval = model(state1) #E
        qval_ = qval.data.numpy()
        if (random.random() < epsilon): #F
            action_ = np.random.randint(0,4)
        else:
            action_ = np.argmax(qval_)
        
        action = action_set[action_]
        game.makeMove(action)
        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
        state2 = torch.from_numpy(state2_).float()
        reward = game.reward()
        done = True if reward > 0 else False
        exp =  (state1, action_, reward, state2, done) #G
        replay.append(exp) #H
        state1 = state2
        
        if len(replay) > batch_size: #I
            minibatch = random.sample(replay, batch_size) #J
            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])
            
            Q1 = model(state1_batch) #L
            with torch.no_grad():
                Q2 = model(state2_batch) #M
            
            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            loss = loss_fn(X, Y.detach())
            print(i, loss.item())
            clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()

        if reward != -1 or mov > max_moves: #O
            status = 0
            mov = 0
losses = np.array(losses)

999 0.06660489737987518


In [110]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 600
0.6
0.6
Games played: 1000, # of wins: 628
0.628
1.228
Games played: 1000, # of wins: 623
0.623
1.851
Games played: 1000, # of wins: 610
0.61
2.461
Games played: 1000, # of wins: 621
0.621
3.082
Games played: 1000, # of wins: 621
0.621
3.703
Games played: 1000, # of wins: 604
0.604
4.3069999999999995
Games played: 1000, # of wins: 628
0.628
4.935
Games played: 1000, # of wins: 621
0.621
5.555999999999999
Games played: 1000, # of wins: 607
0.607
6.162999999999999
Win percentage: 61.629999999999995%


# Mask changed Original 64 Pitt Player Goal Wall 

In [114]:
import numpy as np
import random
import sys

def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks)
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        pos = []
        for name, piece in self.components.items():
            if name == 'Player':
                pos = (1,) + piece.pos
                displ_board[pos] = 1
            if name == 'Pit':
                pos = (0,) + piece.pos
                displ_board[pos] = 1
            if name == 'Goal':
                pos = (2,) + piece.pos
                displ_board[pos] = 1
            if name == 'Wall':
                pos = (3,) + piece.pos
                displ_board[pos] = 1

        for name, mask in self.masks.items():
            if name == 'Player':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(1,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
            if name == 'Pit':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(0,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
            if name == 'Goal':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(2,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
            if name == 'Wall':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(3,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [115]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

l1 = 64
l2 = 150
l3 = 100
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3,l4)
)
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0

In [116]:
from collections import deque
epochs = 1000
losses = []
mem_size = 1000 #A
batch_size = 200 #B
replay = deque(maxlen=mem_size) #C
max_moves = 50 #D
h = 0
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1): 
        mov += 1
        qval = model(state1) #E
        qval_ = qval.data.numpy()
        if (random.random() < epsilon): #F
            action_ = np.random.randint(0,4)
        else:
            action_ = np.argmax(qval_)
        
        action = action_set[action_]
        game.makeMove(action)
        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
        state2 = torch.from_numpy(state2_).float()
        reward = game.reward()
        done = True if reward > 0 else False
        exp =  (state1, action_, reward, state2, done) #G
        replay.append(exp) #H
        state1 = state2
        
        if len(replay) > batch_size: #I
            minibatch = random.sample(replay, batch_size) #J
            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])
            
            Q1 = model(state1_batch) #L
            with torch.no_grad():
                Q2 = model(state2_batch) #M
            
            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            loss = loss_fn(X, Y.detach())
            print(i, loss.item())
            clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()

        if reward != -1 or mov > max_moves: #O
            status = 0
            mov = 0
losses = np.array(losses)

999 0.029673662036657333


In [117]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 683
0.683
0.683
Games played: 1000, # of wins: 696
0.696
1.379
Games played: 1000, # of wins: 703
0.703
2.082
Games played: 1000, # of wins: 687
0.687
2.769
Games played: 1000, # of wins: 683
0.683
3.452
Games played: 1000, # of wins: 715
0.715
4.167
Games played: 1000, # of wins: 680
0.68
4.8469999999999995
Games played: 1000, # of wins: 676
0.676
5.523
Games played: 1000, # of wins: 703
0.703
6.226
Games played: 1000, # of wins: 690
0.69
6.916
Win percentage: 69.16%


# Mask changed Original 64 Pitt Goal Wall Player

In [119]:
import numpy as np
import random
import sys

def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks)
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        pos = []
        for name, piece in self.components.items():
            if name == 'Player':
                pos = (3,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Pit':
                pos = (0,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Goal':
                pos = (1,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Wall':
                pos = (2,) + piece.pos
                displ_board[pos] = 1
                layer += 1

        for name, mask in self.masks.items():
            if name == 'Player':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(3,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
            if name == 'Pit':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(0,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
            if name == 'Goal':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(1,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
            if name == 'Wall':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(2,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [120]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

l1 = 64
l2 = 150
l3 = 100
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3,l4)
)
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0

In [121]:
from collections import deque
epochs = 1000
losses = []
mem_size = 1000 #A
batch_size = 200 #B
replay = deque(maxlen=mem_size) #C
max_moves = 50 #D
h = 0
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1): 
        mov += 1
        qval = model(state1) #E
        qval_ = qval.data.numpy()
        if (random.random() < epsilon): #F
            action_ = np.random.randint(0,4)
        else:
            action_ = np.argmax(qval_)
        
        action = action_set[action_]
        game.makeMove(action)
        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
        state2 = torch.from_numpy(state2_).float()
        reward = game.reward()
        done = True if reward > 0 else False
        exp =  (state1, action_, reward, state2, done) #G
        replay.append(exp) #H
        state1 = state2
        
        if len(replay) > batch_size: #I
            minibatch = random.sample(replay, batch_size) #J
            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])
            
            Q1 = model(state1_batch) #L
            with torch.no_grad():
                Q2 = model(state2_batch) #M
            
            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            loss = loss_fn(X, Y.detach())
            print(i, loss.item())
            clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()

        if reward != -1 or mov > max_moves: #O
            status = 0
            mov = 0
losses = np.array(losses)

999 0.048999395221471786


In [122]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 698
0.698
0.698
Games played: 1000, # of wins: 658
0.658
1.3559999999999999
Games played: 1000, # of wins: 688
0.688
2.0439999999999996
Games played: 1000, # of wins: 668
0.668
2.7119999999999997
Games played: 1000, # of wins: 666
0.666
3.3779999999999997
Games played: 1000, # of wins: 676
0.676
4.053999999999999
Games played: 1000, # of wins: 671
0.671
4.725
Games played: 1000, # of wins: 679
0.679
5.404
Games played: 1000, # of wins: 688
0.688
6.092
Games played: 1000, # of wins: 709
0.709
6.800999999999999
Win percentage: 68.00999999999999%


# Mask changed Original 64 Goal Pitt Player Wall 

In [123]:
import numpy as np
import random
import sys

def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks)
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        pos = []
        for name, piece in self.components.items():
            if name == 'Player':
                pos = (2,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Pit':
                pos = (1,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Goal':
                pos = (0,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Wall':
                pos = (3,) + piece.pos
                displ_board[pos] = 1
                layer += 1

        for name, mask in self.masks.items():
            if name == 'Player':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(2,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
            if name == 'Pit':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(1,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
            if name == 'Goal':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(0,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
            if name == 'Wall':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(3,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [124]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

l1 = 64
l2 = 150
l3 = 100
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3,l4)
)
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0

In [125]:
from collections import deque
epochs = 1000
losses = []
mem_size = 1000 #A
batch_size = 200 #B
replay = deque(maxlen=mem_size) #C
max_moves = 50 #D
h = 0
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1): 
        mov += 1
        qval = model(state1) #E
        qval_ = qval.data.numpy()
        if (random.random() < epsilon): #F
            action_ = np.random.randint(0,4)
        else:
            action_ = np.argmax(qval_)
        
        action = action_set[action_]
        game.makeMove(action)
        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
        state2 = torch.from_numpy(state2_).float()
        reward = game.reward()
        done = True if reward > 0 else False
        exp =  (state1, action_, reward, state2, done) #G
        replay.append(exp) #H
        state1 = state2
        
        if len(replay) > batch_size: #I
            minibatch = random.sample(replay, batch_size) #J
            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])
            
            Q1 = model(state1_batch) #L
            with torch.no_grad():
                Q2 = model(state2_batch) #M
            
            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            loss = loss_fn(X, Y.detach())
            print(i, loss.item())
            clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()

        if reward != -1 or mov > max_moves: #O
            status = 0
            mov = 0
losses = np.array(losses)

999 0.0745161846280098


In [126]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 631
0.631
0.631
Games played: 1000, # of wins: 584
0.584
1.2149999999999999
Games played: 1000, # of wins: 644
0.644
1.859
Games played: 1000, # of wins: 598
0.598
2.457
Games played: 1000, # of wins: 624
0.624
3.081
Games played: 1000, # of wins: 630
0.63
3.711
Games played: 1000, # of wins: 628
0.628
4.3389999999999995
Games played: 1000, # of wins: 614
0.614
4.952999999999999
Games played: 1000, # of wins: 616
0.616
5.568999999999999
Games played: 1000, # of wins: 600
0.6
6.168999999999999
Win percentage: 61.68999999999999%


# Mask changed Original 64 Goal Pitt Player Wall Test 2

In [127]:
import numpy as np
import random
import sys

def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks)
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        pos = []
        for name, piece in self.components.items():
            if name == 'Player':
                pos = (2,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Pit':
                pos = (1,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Goal':
                pos = (0,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Wall':
                pos = (3,) + piece.pos
                displ_board[pos] = 1
                layer += 1

        for name, mask in self.masks.items():
            if name == 'Player':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(2,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
            if name == 'Pit':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(1,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
            if name == 'Goal':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(0,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
            if name == 'Wall':
                x,y = self.masks['boundary'].get_positions()
                z = np.repeat(3,len(x))
                a = (z,x,y)
                displ_board[a] = 1
                print(mask)
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [128]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

l1 = 64
l2 = 150
l3 = 100
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3,l4)
)
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0

In [129]:
from collections import deque
epochs = 1000
losses = []
mem_size = 1000 #A
batch_size = 200 #B
replay = deque(maxlen=mem_size) #C
max_moves = 50 #D
h = 0
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1): 
        mov += 1
        qval = model(state1) #E
        qval_ = qval.data.numpy()
        if (random.random() < epsilon): #F
            action_ = np.random.randint(0,4)
        else:
            action_ = np.argmax(qval_)
        
        action = action_set[action_]
        game.makeMove(action)
        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
        state2 = torch.from_numpy(state2_).float()
        reward = game.reward()
        done = True if reward > 0 else False
        exp =  (state1, action_, reward, state2, done) #G
        replay.append(exp) #H
        state1 = state2
        
        if len(replay) > batch_size: #I
            minibatch = random.sample(replay, batch_size) #J
            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])
            
            Q1 = model(state1_batch) #L
            with torch.no_grad():
                Q2 = model(state2_batch) #M
            
            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            loss = loss_fn(X, Y.detach())
            print(i, loss.item())
            clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()

        if reward != -1 or mov > max_moves: #O
            status = 0
            mov = 0
losses = np.array(losses)

999 0.0677986592054367


In [130]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 632
0.632
0.632
Games played: 1000, # of wins: 610
0.61
1.242
Games played: 1000, # of wins: 642
0.642
1.884
Games played: 1000, # of wins: 620
0.62
2.504
Games played: 1000, # of wins: 629
0.629
3.133
Games played: 1000, # of wins: 628
0.628
3.761
Games played: 1000, # of wins: 625
0.625
4.386
Games played: 1000, # of wins: 633
0.633
5.019
Games played: 1000, # of wins: 644
0.644
5.663
Games played: 1000, # of wins: 633
0.633
6.296
Win percentage: 62.96000000000001%


# More modified mask changed Original 64 Goal Pitt Player Wall

In [136]:
import numpy as np
import random
import sys

def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks)
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        pos = []
        for name, piece in self.components.items():
            if name == 'Player':
                pos = (2,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Pit':
                pos = (1,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Goal':
                pos = (0,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Wall':
                pos = (3,) + piece.pos
                displ_board[pos] = 1
                layer += 1
        i = 0
        layer = 4
        for name, mask in self.masks.items():
            if i == 0:
                if name == 'Player':
                    x,y = self.masks['boundary'].get_positions()
                    z = np.repeat(2,len(x))
                    a = (z,x,y)
                    displ_board[a] = 1
                    i += 1
                    print(mask)
                if name == 'Pit':
                    x,y = self.masks['boundary'].get_positions()
                    z = np.repeat(1,len(x))
                    a = (z,x,y)
                    displ_board[a] = 1
                    i += 1
                    print(mask)
                if name == 'Goal':
                    x,y = self.masks['boundary'].get_positions()
                    z = np.repeat(0,len(x))
                    a = (z,x,y)
                    displ_board[a] = 1
                    i += 1
                    print(mask)
                if name == 'Wall':
                    x,y = self.masks['boundary'].get_positions()
                    z = np.repeat(3,len(x))
                    a = (z,x,y)
                    displ_board[a] = 1
                    i += 1
                    print(mask)
            else:
                    x,y = self.masks['boundary'].get_positions()
                    z = np.repeat(layer,len(x))
                    a = (z,x,y)
                    displ_board[a] = 1
                    layer += 1
                    print(mask)
                    
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [137]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

l1 = 64
l2 = 150
l3 = 100
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3,l4)
)
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0

In [138]:
from collections import deque
epochs = 1000
losses = []
mem_size = 1000 #A
batch_size = 200 #B
replay = deque(maxlen=mem_size) #C
max_moves = 50 #D
h = 0
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1): 
        mov += 1
        qval = model(state1) #E
        qval_ = qval.data.numpy()
        if (random.random() < epsilon): #F
            action_ = np.random.randint(0,4)
        else:
            action_ = np.argmax(qval_)
        
        action = action_set[action_]
        game.makeMove(action)
        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
        state2 = torch.from_numpy(state2_).float()
        reward = game.reward()
        done = True if reward > 0 else False
        exp =  (state1, action_, reward, state2, done) #G
        replay.append(exp) #H
        state1 = state2
        
        if len(replay) > batch_size: #I
            minibatch = random.sample(replay, batch_size) #J
            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])
            
            Q1 = model(state1_batch) #L
            with torch.no_grad():
                Q2 = model(state2_batch) #M
            
            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            loss = loss_fn(X, Y.detach())
            print(i, loss.item())
            clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()

        if reward != -1 or mov > max_moves: #O
            status = 0
            mov = 0
losses = np.array(losses)

999 0.1491580605506897


In [139]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 625
0.625
0.625
Games played: 1000, # of wins: 640
0.64
1.2650000000000001
Games played: 1000, # of wins: 649
0.649
1.9140000000000001
Games played: 1000, # of wins: 625
0.625
2.539
Games played: 1000, # of wins: 656
0.656
3.1950000000000003
Games played: 1000, # of wins: 661
0.661
3.8560000000000003
Games played: 1000, # of wins: 636
0.636
4.492
Games played: 1000, # of wins: 655
0.655
5.147
Games played: 1000, # of wins: 663
0.663
5.8100000000000005
Games played: 1000, # of wins: 664
0.664
6.474
Win percentage: 64.74%


# 3 Layer Player + Pit, Goal + Wall

In [176]:
import numpy as np
import random
import sys

def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4movePiece
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                print(mask.get_positions())
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks) - 1
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        pos = []
        for name, piece in self.components.items():
            if name == 'Player':
                pos = (0,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Pit':
                pos = (0,) + piece.pos
                displ_board[pos] = 2
                layer += 1
            if name == 'Goal':
                pos = (1,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Wall':
                pos = (2,) + piece.pos
                displ_board[pos] = 1
                layer += 1
        for name, mask in self.masks.items():
                if name == 'Player':
                    x,y = self.masks['boundary'].get_positions()
                    z = np.repeat(0,len(x))
                    a = (z,x,y)
                    displ_board[a] = 1
                    i += 1
                    print(mask)
                if name == 'Pit':
                    x,y = self.masks['boundary'].get_positions()
                    z = np.repeat(0,len(x))
                    a = (z,x,y)
                    displ_board[a] = 2
                    i += 1
                    print(mask)
                if name == 'Goal':
                    x,y = self.masks['boundary'].get_positions()
                    z = np.repeat(1,len(x))
                    a = (z,x,y)
                    displ_board[a] = 1
                    i += 1
                    print(mask)
                if name == 'Wall':
                    x,y = self.masks['boundary'].get_positions()
                    z = np.repeat(2,len(x))
                    a = (z,x,y)
                    displ_board[a] = 1
                    i += 1
                    print(mask)
                    
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [177]:
class Gridworld:

    def __init__(self, size=4, mode='static'):
        if size >= 4:
            self.board = GridBoard(size=size)
        else:
            print("Minimum board size is 4. Initialized to size 4.")
            self.board = GridBoard(size=4)

        #Add pieces, positions will be updated later
        self.board.addPiece('Player','P',(0,0))
        self.board.addPiece('Goal','+',(1,0))
        self.board.addPiece('Pit','-',(2,0))
        self.board.addPiece('Wall','W',(3,0))

        if mode == 'static':
            self.initGridStatic()
        elif mode == 'player':
            self.initGridPlayer()
        else:
            self.initGridRand()

    #Initialize stationary grid, all items are placed deterministically
    def initGridStatic(self):
        #Setup static pieces
        self.board.components['Player'].pos = (0,3) #Row, Column
        self.board.components['Goal'].pos = (0,0)
        self.board.components['Pit'].pos = (0,1)
        self.board.components['Wall'].pos = (1,1)

    #Check if board is initialized appropriately (no overlapping pieces)
    #also remove impossible-to-win boards
    def validateBoard(self):
        valid = True

        player = self.board.components['Player']
        goal = self.board.components['Goal']
        wall = self.board.components['Wall']
        pit = self.board.components['Pit']

        all_positions = [piece for name,piece in self.board.components.items()]
        all_positions = [player.pos, goal.pos, wall.pos, pit.pos]
        if len(all_positions) > len(set(all_positions)):
            return False

        corners = [(0,0),(0,self.board.size), (self.board.size,0), (self.board.size,self.board.size)]
        #if player is in corner, can it move? if goal is in corner, is it blocked?
        if player.pos in corners or goal.pos in corners:
            val_move_pl = [self.validateMove('Player', addpos) for addpos in [(0,1),(1,0),(-1,0),(0,-1)]]
            val_move_go = [self.validateMove('Goal', addpos) for addpos in [(0,1),(1,0),(-1,0),(0,-1)]]
            if 0 not in val_move_pl or 0 not in val_move_go:
                #print(self.display())
                #print("Invalid board. Re-initializing...")
                valid = False

        return valid

    #Initialize player in random location, but keep wall, goal and pit stationary
    def initGridPlayer(self):
        #height x width x depth (number of pieces)
        self.initGridStatic()
        #place player
        self.board.components['Player'].pos = randPair(0,self.board.size)

        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridPlayer()

    #Initialize grid so that goal, pit, wall, player are all randomly placed
    def initGridRand(self):
        #height x width x depth (number of pieces)
        self.board.components['Player'].pos = randPair(0,self.board.size)
        self.board.components['Goal'].pos = randPair(0,self.board.size)
        self.board.components['Pit'].pos = randPair(0,self.board.size)
        self.board.components['Wall'].pos = randPair(0,self.board.size)

        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridRand()

    def validateMove(self, piece, addpos=(0,0)):
        outcome = 0 #0 is valid, 1 invalid, 2 lost game
        pit = self.board.components['Pit'].pos
        wall = self.board.components['Wall'].pos
        new_pos = addTuple(self.board.components[piece].pos, addpos)
        if new_pos == wall:
            outcome = 1 #block move, player can't move to wall
        elif max(new_pos) > (self.board.size-1):    #if outside bounds of board
            outcome = 1
        elif min(new_pos) < 0: #if outside bounds
            outcome = 1
        elif new_pos == pit:
            outcome = 2

        return outcome

    def makeMove(self, action):
        #need to determine what object (if any) is in the new grid spot the player is moving to
        #actions in {u,d,l,r}
        def checkMove(addpos):
            if self.validateMove('Player', addpos) in [0,2]:
                new_pos = addTuple(self.board.components['Player'].pos, addpos)
                self.board.movePiece('Player', new_pos)

        if action == 'u': #up
            checkMove((-1,0))
        elif action == 'd': #down
            checkMove((1,0))
        elif action == 'l': #left
            checkMove((0,-1))
        elif action == 'r': #right
            checkMove((0,1))
        else:
            pass

    def reward(self):
        if (self.board.components['Player'].pos == self.board.components['Pit'].pos):
            return -10
        elif (self.board.components['Player'].pos == self.board.components['Goal'].pos):
            return 10
        else:
            return -1

    def display(self):
        return self.board.render()

In [178]:
game = Gridworld(size=4, mode='random')
game.board.render_np()

array([[[0, 0, 0, 0],
        [0, 0, 2, 0],
        [0, 0, 0, 0],
        [0, 1, 0, 0]],

       [[0, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]]], dtype=uint8)

In [189]:
game.makeMove('d')

In [169]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

l1 = 48
l2 = 150
l3 = 100
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3,l4)
)
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0

In [170]:
from collections import deque
epochs = 1000
losses = []
mem_size = 1000 #A
batch_size = 200 #B
replay = deque(maxlen=mem_size) #C
max_moves = 50 #D
h = 0
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(1,48) + np.random.rand(1,48)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1): 
        mov += 1
        qval = model(state1) #E
        qval_ = qval.data.numpy()
        if (random.random() < epsilon): #F
            action_ = np.random.randint(0,4)
        else:
            action_ = np.argmax(qval_)
        
        action = action_set[action_]
        game.makeMove(action)
        state2_ = game.board.render_np().reshape(1,48) + np.random.rand(1,48)/100.0
        state2 = torch.from_numpy(state2_).float()
        reward = game.reward()
        done = True if reward > 0 else False
        exp =  (state1, action_, reward, state2, done) #G
        replay.append(exp) #H
        state1 = state2
        
        if len(replay) > batch_size: #I
            minibatch = random.sample(replay, batch_size) #J
            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])
            
            Q1 = model(state1_batch) #L
            with torch.no_grad():
                Q2 = model(state2_batch) #M
            
            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            loss = loss_fn(X, Y.detach())
            print(i, loss.item())
            clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()

        if reward != -1 or mov > max_moves: #O
            status = 0
            mov = 0
losses = np.array(losses)

999 0.20165178179740906


In [171]:
def test_model(model, mode='static', display=True):
    i = 0
    test_game = Gridworld(mode=mode)
    state_ = test_game.board.render_np().reshape(1,48) + np.random.rand(1,48)/10.0
    state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    while(status == 1): #A
        qval = model(state)
        qval_ = qval.data.numpy()
        action_ = np.argmax(qval_) #B
        action = action_set[action_]
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        state_ = test_game.board.render_np().reshape(1,48) + np.random.rand(1,48)/10.0
        state = torch.from_numpy(state_).float()
        if display:
            print(test_game.display())
        reward = test_game.reward()
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [172]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 285
0.285
0.285
Games played: 1000, # of wins: 255
0.255
0.54
Games played: 1000, # of wins: 287
0.287
0.827
Games played: 1000, # of wins: 300
0.3
1.127
Games played: 1000, # of wins: 268
0.268
1.395
Games played: 1000, # of wins: 264
0.264
1.659
Games played: 1000, # of wins: 250
0.25
1.909
Games played: 1000, # of wins: 268
0.268
2.177
Games played: 1000, # of wins: 285
0.285
2.462
Games played: 1000, # of wins: 247
0.247
2.709
Win percentage: 27.090000000000003%


# One layer

In [213]:
import numpy as np
import random
import sys

def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4movePiece
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                print(mask.get_positions())
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks) - 3
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        pos = []
        for name, piece in self.components.items():
            if name == 'Player':
                pos = (0,) + piece.pos
                displ_board[pos] = 1
                layer += 1
            if name == 'Pit':
                pos = (0,) + piece.pos
                displ_board[pos] = 2
                layer += 1
            if name == 'Goal':
                pos = (0,) + piece.pos
                displ_board[pos] = 3
                layer += 1
            if name == 'Wall':
                pos = (0,) + piece.pos
                displ_board[pos] = 4
                layer += 1
        for name, mask in self.masks.items():
                print('Entered')
                if name == 'Player':
                    x,y = self.masks['boundary'].get_positions()
                    z = np.repeat(0,len(x))
                    a = (z,x,y)
                    displ_board[a] = 1
                    print(mask)
                    print("Entered P")
                if name == 'Pit':
                    x,y = self.masks['boundary'].get_positions()
                    z = np.repeat(0,len(x))
                    a = (z,x,y)
                    displ_board[a] = 2
                    print(mask)
                    print("Entered P")
                if name == 'Goal':
                    x,y = self.masks['boundary'].get_positions()
                    z = np.repeat(0,len(x))
                    a = (z,x,y)
                    displ_board[a] = 3
                    print(mask)
                    print("Entered P")
                if name == 'Wall':
                    x,y = self.masks['boundary'].get_positions()
                    z = np.repeat(0,len(x))
                    a = (z,x,y)
                    displ_board[a] = 4
                    print(mask)
                    print("Entered P")
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [215]:
game = Gridworld(size=4, mode='random')
game.board.render_np()

array([[[3, 0, 0, 4],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 2, 0]]], dtype=uint8)

In [216]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

l1 = 16
l2 = 150
l3 = 100
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3,l4)
)
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 1.0

In [217]:
from collections import deque
epochs = 1000
losses = []
mem_size = 1000 #A
batch_size = 200 #B
replay = deque(maxlen=mem_size) #C
max_moves = 50 #D
h = 0
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(1,16) + np.random.rand(1,16)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1): 
        mov += 1
        qval = model(state1) #E
        qval_ = qval.data.numpy()
        if (random.random() < epsilon): #F
            action_ = np.random.randint(0,4)
        else:
            action_ = np.argmax(qval_)
        
        action = action_set[action_]
        game.makeMove(action)
        state2_ = game.board.render_np().reshape(1,16) + np.random.rand(1,16)/100.0
        state2 = torch.from_numpy(state2_).float()
        reward = game.reward()
        done = True if reward > 0 else False
        exp =  (state1, action_, reward, state2, done) #G
        replay.append(exp) #H
        state1 = state2
        
        if len(replay) > batch_size: #I
            minibatch = random.sample(replay, batch_size) #J
            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])
            
            Q1 = model(state1_batch) #L
            with torch.no_grad():
                Q2 = model(state2_batch) #M
            
            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            loss = loss_fn(X, Y.detach())
#             print(i, loss.item())
#             clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()

        if reward != -1 or mov > max_moves: #O
            status = 0
            mov = 0
losses = np.array(losses)

In [218]:
def test_model(model, mode='static', display=True):
    i = 0
    test_game = Gridworld(mode=mode)
    state_ = test_game.board.render_np().reshape(1,16) + np.random.rand(1,16)/10.0
    state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    while(status == 1): #A
        qval = model(state)
        qval_ = qval.data.numpy()
        action_ = np.argmax(qval_) #B
        action = action_set[action_]
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        state_ = test_game.board.render_np().reshape(1,16) + np.random.rand(1,16)/10.0
        state = torch.from_numpy(state_).float()
        if display:
            print(test_game.display())
        reward = test_game.reward()
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [219]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 106
0.106
0.106
Games played: 1000, # of wins: 116
0.116
0.222
Games played: 1000, # of wins: 128
0.128
0.35
Games played: 1000, # of wins: 105
0.105
0.45499999999999996
Games played: 1000, # of wins: 111
0.111
0.566
Games played: 1000, # of wins: 113
0.113
0.6789999999999999
Games played: 1000, # of wins: 98
0.098
0.7769999999999999
Games played: 1000, # of wins: 121
0.121
0.8979999999999999
Games played: 1000, # of wins: 103
0.103
1.001
Games played: 1000, # of wins: 106
0.106
1.107
Win percentage: 11.069999999999999%
