In [1]:
import numpy as np
import random
import sys

def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks)
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        for name, piece in self.components.items():
            pos = (layer,) + piece.pos
            displ_board[pos] = 1
            layer += 1

        for name, mask in self.masks.items():
            x,y = self.masks['boundary'].get_positions()
            z = np.repeat(layer,len(x))
            a = (z,x,y)
            displ_board[a] = 1
            layer += 1
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [2]:
class Gridworld:

    def __init__(self, size=4, mode='static'):
        if size >= 4:
            self.board = GridBoard(size=size)
        else:
            print("Minimum board size is 4. Initialized to size 4.")
            self.board = GridBoard(size=4)

        #Add pieces, positions will be updated later
        self.board.addPiece('Player','P',(0,0))
        self.board.addPiece('Goal','+',(0,3))
        self.board.addPiece('Pit','-',(0,3))
        self.board.addPiece('Wall','W',(2,3))

        if mode == 'static':
            self.initGridStatic()
        elif mode == 'player':
            self.initGridPlayer()
        else:
            self.initGridRand()

    #Initialize stationary grid, all items are placed deterministically
    def initGridStatic(self):
        #Setup static pieces
        self.board.components['Player'].pos = randPair(0,self.board.size) #Row, Column
        self.board.components['Goal'].pos = (3,0)
        self.board.components['Pit'].pos = (3,1)
        self.board.components['Wall'].pos = (3,2)
        
        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridStatic()


    #Check if board is initialized appropriately (no overlapping pieces)
    #also remove impossible-to-win boards
    def validateBoard(self):
        valid = True

        player = self.board.components['Player']
        goal = self.board.components['Goal']
        wall = self.board.components['Wall']
        pit = self.board.components['Pit']

        all_positions = [piece for name,piece in self.board.components.items()]
        all_positions = [player.pos, goal.pos, wall.pos, pit.pos]
        if len(all_positions) > len(set(all_positions)):
            return False

        corners = [(0,0),(0,self.board.size), (self.board.size,0), (self.board.size,self.board.size)]
        #if player is in corner, can it move? if goal is in corner, is it blocked?
        if player.pos in corners or goal.pos in corners:
            val_move_pl = [self.validateMove('Player', addpos) for addpos in [(0,1),(1,0),(-1,0),(0,-1)]]
            val_move_go = [self.validateMove('Goal', addpos) for addpos in [(0,1),(1,0),(-1,0),(0,-1)]]
            if 0 not in val_move_pl or 0 not in val_move_go:
                #print(self.display())
                #print("Invalid board. Re-initializing...")
                valid = False

        return valid

    #Initialize player in random location, but keep wall, goal and pit stationary
    def initGridPlayer(self):
        #height x width x depth (number of pieces)
        self.initGridStatic()
        #place player
        self.board.components['Player'].pos = randPair(0,self.board.size)

        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridPlayer()

    #Initialize grid so that goal, pit, wall, player are all randomly placed
    def initGridRand(self):
        #height x width x depth (number of pieces)
        self.board.components['Player'].pos = randPair(0,self.board.size)
        self.board.components['Goal'].pos = randPair(0,self.board.size)
        self.board.components['Pit'].pos = randPair(0,self.board.size)
        self.board.components['Wall'].pos = randPair(0,self.board.size)

        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridRand()

    def validateMove(self, piece, addpos=(0,0)):
        outcome = 0 #0 is valid, 1 invalid, 2 lost game
        pit = self.board.components['Pit'].pos
        wall = self.board.components['Wall'].pos
        new_pos = addTuple(self.board.components[piece].pos, addpos)
        if new_pos == wall:
            outcome = 1 #block move, player can't move to wall
        elif max(new_pos) > (self.board.size-1):    #if outside bounds of board
            outcome = 1
        elif min(new_pos) < 0: #if outside bounds
            outcome = 1
        elif new_pos == pit:
            outcome = 2

        return outcome

    def makeMove(self, action):
        #need to determine what object (if any) is in the new grid spot the player is moving to
        #actions in {u,d,l,r}
        def checkMove(addpos):
            if self.validateMove('Player', addpos) in [0,2]:
                new_pos = addTuple(self.board.components['Player'].pos, addpos)
                self.board.movePiece('Player', new_pos)

        if action == 'u': #up
            checkMove((-1,0))
        elif action == 'd': #down
            checkMove((1,0))
        elif action == 'l': #left
            checkMove((0,-1))
        elif action == 'r': #right
            checkMove((0,1))
        else:
            pass

    def reward(self):
        if (self.board.components['Player'].pos == self.board.components['Pit'].pos):
            return -10
        elif (self.board.components['Player'].pos == self.board.components['Goal'].pos):
            return 10
        else:
            return -1

    def display(self):
        return self.board.render()

In [3]:
action_set = {
    0: 'u',
    1: 'd',
    2: 'l',
    3: 'r',
}

In [4]:
def from2dto1d(pos):
    if pos == '(0, 0)':
        return 0
    if pos == '(0, 1)':
        return 1
    if pos == '(0, 2)':
        return 2
    if pos == '(0, 3)':
        return 3
    if pos == '(1, 0)':
        return 4
    if pos == '(1, 1)':
        return 5
    if pos == '(1, 2)':
        return 6
    if pos == '(1, 3)':
        return 7
    if pos == '(2, 0)':
        return 8
    if pos == '(2, 1)':
        return 9
    if pos == '(2, 2)':
        return 10
    if pos == '(2, 3)':
        return 11
    if pos == '(3, 0)':
        return 12
    if pos == '(3, 1)':
        return 13
    if pos == '(3, 2)':
        return 14
    if pos == '(3, 3)':
        return 15

In [4]:
def from1dto2d(pos):
    if pos == 0:
        return (0, 0)
    if pos == 1:
        return (0, 1)
    if pos == 2:
        return (0, 2)
    if pos == 3:
        return (0, 3)
    if pos == 4:
        return (1, 0)
    if pos == 5:
        return (1, 1)
    if pos == 6:
        return (1, 2)
    if pos == 7:
        return (1, 3)
    if pos == 8:
        return (2, 0)
    if pos == 9:
        return (2, 1)
    if pos == 10:
        return (2, 2)
    if pos == 11:
        return (2, 3)
    if pos == 12:
        return (3, 0)
    if pos == 13:
        return (3, 1)
    if pos == 14:
        return (3, 2)
    if pos == 15:
        return (3, 3)

In [5]:
def from_num_to_one_hot_encode(num):
    en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    if num == 0:
        en0 = 1
    if num == 1:
        en1 = 1
    if num == 2:
        en2 = 1
    if num == 3:
        en3 = 1
    if num == 4:
        en4 = 1
    if num == 5:
        en5 = 1
    if num == 6:
        en6 = 1
    if num == 7:
        en7 = 1
    if num == 8:
        en8 = 1
    if num == 9:
        en9 = 1
    if num == 10:
        en10 = 1
    if num == 11:
        en11 = 1
    if num == 12:
        en12 = 1
    if num == 13:
        en13 = 1
    if num == 14:
        en14 = 1
    if num == 15:
        en15 = 1
    return en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15

In [5]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

l1 = 64
l2 = 200
l3 = 120
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3,l4)
)
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 0.3

In [15]:
h = 0
w, h = 20, 3000000
matrix = [[0 for x in range(w)] for y in range(h)] 

h = 0
w, h = 8, 3000000
experience = [[0 for x in range(w)] for y in range(h)] 

In [6]:
from collections import deque
losses = []
mem_size = 1000 #A
batch_size = 200 #B
replay = deque(maxlen=mem_size) #C
max_moves = 50 #D

h = 0
w, h = 20, 300000
matrix = [[0 for x in range(w)] for y in range(h)] 

counter = 0
num = 0
epoch = 0
for i in range(0,15):
    for j in range(0,15):
        for k in range (0,15):
            for c in range (0,15):
                if j == i:
                    pass
                if k == j or k == i:
                    pass
                if c == k or c == i or c == j:
                    pass
                else:
                    game = Gridworld(size=4, mode='random')
                    game.board.components['Player'].pos = from1dto2d(c)
                    game.board.components['Goal'].pos = from1dto2d(k)
                    game.board.components['Wall'].pos = from1dto2d(j)
                    game.board.components['Pit'].pos = from1dto2d(i)
                    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
                    state1 = torch.from_numpy(state1_).float()
                    status = 1
                    mov = 0
                    while(status == 1): 
                        mov += 1
                        qval = model(state1) #E
                        qval_ = qval.data.numpy()

                        matrix[counter][9] = qval_[0][np.argmax(qval_)]
                        matrix[counter][10] = qval_[0][0]
                        matrix[counter][11] = qval_[0][1]
                        matrix[counter][12] = qval_[0][2]
                        matrix[counter][13] = qval_[0][3]


                        if (random.random() < epsilon): #F
                            matrix[counter][8] = 1 
                            action_ = np.random.randint(0,4)
                        else:
                            matrix[counter][8] = 0 
                            action_ = np.argmax(qval_)

                        action = action_set[action_]

                        matrix[counter][0] = epoch
                        matrix[counter][1] = (str(game.board.components['Player'].pos))
                        matrix[counter][2] = (str(game.board.components['Pit'].pos))
                        matrix[counter][3] = (str(game.board.components['Goal'].pos))
                        matrix[counter][4] = (str(game.board.components['Wall'].pos))
                        matrix[counter][5] = action_

                        game.makeMove(action)
                        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
                        state2 = torch.from_numpy(state2_).float()
                        next_state = model(state2)
                        next_state_ = next_state.data.numpy()

                        matrix[counter][18] = next_state_[0][np.argmax(next_state_)]
                        matrix[counter][14] = next_state_[0][0]
                        matrix[counter][15] = next_state_[0][1]
                        matrix[counter][16] = next_state_[0][2]
                        matrix[counter][17] = next_state_[0][3]
                        reward = game.reward()
                        matrix[counter][6] = reward
                        if reward == -1 or reward == -10: #N
                            Y = reward + (gamma * next_state_[0][np.argmax(next_state_)])
                        else:
                            Y = reward

                        matrix[counter][19] = Y
                        done = True if reward > 0 else False
                        matrix[counter][7] = (str(game.board.components['Player'].pos))
                        exp =  (state1, action_, reward, state2, done) #G
                        replay.append(exp) #H
                        state1 = state2


                        if len(replay) > batch_size: #I
                            minibatch = random.sample(replay, batch_size) #J
                            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
                            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
                            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
                            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
                            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])

                            Q1 = model(state1_batch) #L
                            with torch.no_grad():
                                Q2 = model(state2_batch) #M

                            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
                            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
                            loss = loss_fn(X, Y.detach())
                            print(epoch)
                            clear_output(wait=True)
                            optimizer.zero_grad()
                            loss.backward()
                            losses.append(loss.item())
                            optimizer.step()
                        counter += 1
                        if reward != -1 or mov > max_moves: #O
                            status = 0
                            mov = 0
                            epoch += 1
losses = np.array(losses)

KeyboardInterrupt: 

In [79]:
def lr_test_model(model_b, matrix,experience, mode='static', display=True):
    global counter
    i = 0
    test_game = Gridworld(mode=mode)
#     state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
#     state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    j = 0
    while(status == 1): #A
        p_curr = from2dto1d((str(test_game.board.components['Player'].pos)))
        pi = from2dto1d((str(test_game.board.components['Pit'].pos)))
        g = from2dto1d((str(test_game.board.components['Goal'].pos)))
        w = from2dto1d((str(test_game.board.components['Wall'].pos)))
        #q_current = model.predict([[p_curr,pi,g,w]])[0][0]
        
        q_value_current = []
        q_value_next = []
        q_value = []
        test_game.makeMove('u')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
       # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0][0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('d')
        
        test_game.makeMove('d')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
       # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0][0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('u')
        
        test_game.makeMove('l')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
       # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0][0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('r')
        
        test_game.makeMove('r')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
       # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0][0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('l')
        
#         print(q_value_current)
        
        
        action_ = np.argmax(q_value_next)
#         print(action_)
        action = action_set[action_]
        matrix[j][0] = p_curr
        matrix[j][1] = pi
        matrix[j][2] = g
        matrix[j][3] = w
        
        experience[counter][0] = p_curr
        experience[counter][1] = pi
        experience[counter][2] = g
        experience[counter][3] = w
        experience[counter][4] = action_
        
        
#         matrix[j][4] = q_current
#         matrix[j][5:9] = q_value_current
#         matrix[j][9] = max(q_value_current)
#         matrix[j][10:14] = q_value_next
#         matrix[j][14] = max(q_value_next)
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        p_next = from2dto1d((str(test_game.board.components['Player'].pos)))
        experience[counter][5] = p_next
        
#         state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
#         state = torch.from_numpy(state_).float()
#         if test_game.reward() == -1:
#             Y = test_game.reward() + 0.9 * max(q_value_next)
#         else:
#             Y = test_game.reward()
#         Y = q_value_next[np.argmax(q_value_current)]
#         matrix[j][15] = q_value[0]
#         matrix[j][16] = q_value[1]
#         matrix[j][17] = q_value[2]
#         matrix[j][18] = q_value[3]
#         matrix[j][19] = min(q_value)
        j += 1
        if display:
            print(test_game.display())
        reward = test_game.reward()
        experience[counter][6] = reward
        if reward > -1:
            experience[counter][7] = True
        else:
            experience[counter][7] = False
        experience[counter][8] =  q_value_next[action_]
        experience[counter][9] = q_value_next[0]
        experience[counter][10] = q_value_next[1]
        experience[counter][11] = q_value_next[2]
        experience[counter][12] = q_value_next[3]
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        counter += 1
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [10]:
def onlinetest_model(model, mode='static', display=True):
    i = 0
    test_game = Gridworld(mode=mode)
    state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
    state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    while(status == 1): #A
        qval = model(state)
        qval_ = qval.data.numpy()
        action_ = np.argmax(qval_) #B
        action = action_set[action_]
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
        state = torch.from_numpy(state_).float()
        if display:
            print(test_game.display())
        reward = test_game.reward()
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [11]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = onlinetest_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 949
0.949
0.949
Games played: 1000, # of wins: 965
0.965
1.914
Games played: 1000, # of wins: 951
0.951
2.8649999999999998
Games played: 1000, # of wins: 956
0.956
3.8209999999999997
Games played: 1000, # of wins: 951
0.951
4.771999999999999
Games played: 1000, # of wins: 950
0.95
5.7219999999999995
Games played: 1000, # of wins: 947
0.947
6.669
Games played: 1000, # of wins: 933
0.933
7.601999999999999
Games played: 1000, # of wins: 947
0.947
8.549
Games played: 1000, # of wins: 955
0.955
9.504
Win percentage: 95.03999999999999%


In [46]:
h = 0
w, h = 12, 3000000
experience_dqn = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [47]:
def dqn_test_model(model, matrix, experience_dqn, mode='static', display=True):
    global counter
    i = 0
    test_game = Gridworld(mode=mode)
    state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
    state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    while(status == 1): #A
        qval = model(state)
        qval_ = qval.data.numpy()
        action_ = np.argmax(qval_) #B
        action = action_set[action_]
        p_curr = from2dto1d((str(test_game.board.components['Player'].pos)))
        pi = from2dto1d((str(test_game.board.components['Pit'].pos)))
        g = from2dto1d((str(test_game.board.components['Goal'].pos)))
        w = from2dto1d((str(test_game.board.components['Wall'].pos)))
        experience_dqn[counter][0] = p_curr
        experience_dqn[counter][1] = pi
        experience_dqn[counter][2] = g
        experience_dqn[counter][3] = w
        
        experience_dqn[counter][4] =  qval_[0][action_]
        experience_dqn[counter][5] = qval_[0][0]
        experience_dqn[counter][6] = qval_[0][1]
        experience_dqn[counter][7] = qval_[0][2]
        experience_dqn[counter][8] = qval_[0][3]
        experience_dqn[counter][9] = action_
        
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        p_next = from2dto1d((str(test_game.board.components['Player'].pos)))
        experience_dqn[counter][10] = p_next
        state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
        state = torch.from_numpy(state_).float()
        if display:
            print(test_game.display())
        reward = test_game.reward()
        experience_dqn[counter][11] = reward
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        i += 1
        counter += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [56]:
dqn_test_model(model, matrix, experience_dqn, 'static')

Initial State:
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 ['+' '-' 'W' 'P']]
Move #: 0; Taking action: d
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 ['+' '-' 'W' 'P']]
Move #: 1; Taking action: d
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 ['+' '-' 'W' 'P']]
Move #: 2; Taking action: d
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 ['+' '-' 'W' 'P']]
Move #: 3; Taking action: d
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 ['+' '-' 'W' 'P']]
Move #: 4; Taking action: l
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 ['+' '-' 'W' 'P']]
Move #: 5; Taking action: d
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 ['+' '-' 'W' 'P']]
Move #: 6; Taking action: u
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' ' ' 'P']
 ['+' '-' 'W' ' ']]
Move #: 7; Taking action: l
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' 'P' ' ']
 ['+' '-' 'W' ' ']]
Move #: 8; Taking action: l
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [

True

In [49]:
win_num = 0
for i in range(0,1):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = dqn_test_model(model, matrix, experience_dqn, 'static', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num 
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 941
0.941
0.941
Win percentage: 94.1%


In [50]:
experience_dqn

[[1,
  13,
  12,
  14,
  5.1477304,
  2.7381797,
  5.1477304,
  3.0928195,
  2.818922,
  1,
  5,
  -1],
 [5,
  13,
  12,
  14,
  7.2005606,
  4.1897044,
  7.2005606,
  6.0265217,
  5.05708,
  1,
  9,
  -1],
 [9, 13, 12, 14, 7.985275, 5.3782983, 5.385748, 7.985275, 5.726155, 2, 8, -1],
 [8, 13, 12, 14, 9.645841, 6.049334, 9.645841, 7.658503, 7.2170644, 1, 12, 10],
 [5,
  13,
  12,
  14,
  6.8113513,
  3.6529794,
  6.8113513,
  5.9571495,
  4.8358283,
  1,
  9,
  -1],
 [9,
  13,
  12,
  14,
  7.6934943,
  4.557288,
  4.8107243,
  7.6934943,
  5.8251753,
  2,
  8,
  -1],
 [8,
  13,
  12,
  14,
  8.981035,
  5.6695447,
  8.981035,
  7.4318395,
  6.9907217,
  1,
  12,
  10],
 [3,
  13,
  12,
  14,
  3.0031178,
  0.4028201,
  3.0031178,
  2.1615727,
  1.439548,
  1,
  7,
  -1],
 [7,
  13,
  12,
  14,
  3.7910373,
  0.11770601,
  3.7910373,
  3.4168482,
  2.454812,
  1,
  11,
  -1],
 [11,
  13,
  12,
  14,
  4.688688,
  1.857086,
  3.0403688,
  4.688688,
  3.8676505,
  2,
  10,
  -1],
 [10,
 

In [58]:
import pandas as pd
column_names = ["Player", "Pit", "Goal", "Wall","Q_MAX", "Q_0", "Q_1","Q_2","Q_3", "Action", "Player_Next", "Reward"]
df_experience = pd.DataFrame(experience_dqn, columns = column_names)
df_experience = df_experience.loc[(df_experience != 0).any(axis=1)]
df_experience

Unnamed: 0,Player,Pit,Goal,Wall,Q_MAX,Q_0,Q_1,Q_2,Q_3,Action,Player_Next,Reward
0,1,13,12,14,5.147730,2.738180,5.147730,3.092819,2.818922,1,5,-1
1,5,13,12,14,7.200561,4.189704,7.200561,6.026522,5.057080,1,9,-1
2,9,13,12,14,7.985275,5.378298,5.385748,7.985275,5.726155,2,8,-1
3,8,13,12,14,9.645841,6.049334,9.645841,7.658503,7.217064,1,12,10
4,5,13,12,14,6.811351,3.652979,6.811351,5.957150,4.835828,1,9,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
4418,15,13,12,14,3.994129,3.994129,3.983859,3.807921,3.262913,0,11,-1
4419,11,13,12,14,4.804834,1.950031,3.223331,4.804834,3.973870,2,10,-1
4420,10,13,12,14,5.854650,5.120645,4.657995,5.854650,3.809538,2,9,-1
4421,9,13,12,14,7.345867,3.502544,4.407230,7.345867,5.565319,2,8,-1


In [59]:
df_experience = df_experience.drop_duplicates(keep='last',subset=['Player', 'Pit', 'Goal', 'Wall',"Player_Next"])
df_experience

Unnamed: 0,Player,Pit,Goal,Wall,Q_MAX,Q_0,Q_1,Q_2,Q_3,Action,Player_Next,Reward
4265,0,13,12,14,6.19767,4.872125,6.19767,4.403681,4.323163,1,4,-1
4316,3,13,12,14,3.071207,0.827102,3.071207,2.240221,1.595719,1,7,-1
4329,10,13,12,14,5.431682,5.431682,4.616965,5.313085,3.653417,0,6,-1
4330,6,13,12,14,4.858483,1.80925,4.842836,4.858483,2.586259,2,5,-1
4334,7,13,12,14,3.187635,-0.241995,3.129218,3.187635,2.025084,2,6,-1
4387,1,13,12,14,5.756451,3.277809,5.756451,3.63506,3.630623,1,5,-1
4397,2,13,12,14,5.215746,2.140071,5.215746,3.768744,2.59353,1,6,-1
4398,6,13,12,14,4.416037,3.18686,4.416037,4.059733,2.88062,1,10,-1
4402,4,13,12,14,8.734666,3.586729,8.734666,6.296149,5.685254,1,8,-1
4404,7,13,12,14,3.261198,0.320293,3.261198,3.044313,1.856465,1,11,-1


In [60]:
df_experience.to_csv('DQN_Policy.csv')

In [61]:
online = pd.read_csv("Online_Dataset(5000-10000).csv")

In [62]:
def calculate_smape(actual, predicted) -> float:
    if not all([isinstance(actual, np.ndarray), 
                isinstance(predicted, np.ndarray)]):
        actual, predicted = np.array(actual),
        np.array(predicted)
  
    return round(
        np.mean(
            np.abs(predicted - actual) / 
            ((np.abs(predicted) + np.abs(actual))/2)
        )*100, 2
    )

In [63]:
online.drop('Unnamed: 0', axis=1, inplace=True)
online

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,Q1_Current,Q2_Current,Q3_Current,Q4_Current,Q1_Next,Q2_Next,Q3_Next,Q4_Next,Q_Next_Max,Y
0,0,6,3,14,12,1,-1,10,0,8.452592,3.790069,8.452592,4.717931,5.029382,7.000268,10.203282,6.134435,7.117641,10.203282,8.182954
1,0,10,3,14,12,1,10,14,0,10.203282,7.000268,10.203282,6.134435,7.117641,7.472913,9.481479,9.008953,9.420471,9.481479,10.000000
2,1,8,13,14,4,1,-1,12,1,5.833664,2.322584,3.567182,3.874437,5.833664,3.781209,2.623182,3.403938,3.906078,3.906078,2.515470
3,1,12,13,14,4,1,-1,12,1,3.906078,3.781209,2.623182,3.403938,3.906078,3.742697,2.616387,3.370567,3.838040,3.838040,2.454236
4,1,12,13,14,4,3,-10,13,1,3.838040,3.742697,2.616387,3.370567,3.838040,5.941330,6.442807,3.411242,8.147532,8.147532,-2.667221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22515,4998,6,3,13,1,1,-1,10,0,5.842478,2.497576,5.842478,5.246665,2.757470,4.389945,7.810893,7.655078,2.499007,7.810893,6.029804
22516,4998,10,3,13,1,1,-1,14,0,7.821082,4.406665,7.821082,7.695354,2.487689,5.645626,8.178638,9.581903,4.737934,9.581903,7.623713
22517,4998,14,3,13,1,2,10,13,0,9.636883,5.658291,8.187815,9.636883,4.742816,6.240669,7.947538,7.641408,6.690786,7.947538,10.000000
22518,4999,7,3,9,13,3,-1,7,1,6.307261,0.962413,6.117292,6.307261,4.638416,0.750458,6.031558,6.322142,4.590769,6.322142,4.689928


In [64]:
from sklearn.preprocessing import OneHotEncoder
x = online[["Current_State"]]
y = OneHotEncoder().fit_transform(x).toarray()
print(y)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [65]:
dataset = pd.DataFrame({'Encode_0': y[:, 0], 'Encode_1': y[:, 1], 'Encode_2': y[:, 2], 'Encode_3': y[:, 3], 'Encode_4': y[:, 4], 'Encode_5': y[:, 5], 'Encode_6': y[:, 6], 'Encode_7': y[:, 7], 'Encode_8': y[:, 8], 'Encode_9': y[:, 9], 'Encode_10': y[:, 10], 'Encode_11': y[:, 11], 'Encode_12': y[:, 12], 'Encode_13': y[:, 13], 'Encode_14': y[:, 14], 'Encode_15': y[:, 15]})

In [66]:
online['Encode_0'] = dataset["Encode_0"]
online['Encode_1'] = dataset["Encode_1"]
online['Encode_2'] = dataset["Encode_2"]
online['Encode_3'] = dataset["Encode_3"]
online['Encode_4'] = dataset["Encode_4"]
online['Encode_5'] = dataset["Encode_5"]
online['Encode_6'] = dataset["Encode_6"]
online['Encode_7'] = dataset["Encode_7"]
online['Encode_8'] = dataset["Encode_8"]
online['Encode_9'] = dataset["Encode_9"]
online['Encode_10'] = dataset["Encode_10"]
online['Encode_11'] = dataset["Encode_11"]
online['Encode_12'] = dataset["Encode_12"]
online['Encode_13'] = dataset["Encode_13"]
online['Encode_14'] = dataset["Encode_14"]
online['Encode_15'] = dataset["Encode_15"]
online

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,...,Encode_6,Encode_7,Encode_8,Encode_9,Encode_10,Encode_11,Encode_12,Encode_13,Encode_14,Encode_15
0,0,6,3,14,12,1,-1,10,0,8.452592,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,10,3,14,12,1,10,14,0,10.203282,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1,8,13,14,4,1,-1,12,1,5.833664,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,12,13,14,4,1,-1,12,1,3.906078,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1,12,13,14,4,3,-10,13,1,3.838040,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22515,4998,6,3,13,1,1,-1,10,0,5.842478,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22516,4998,10,3,13,1,1,-1,14,0,7.821082,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
22517,4998,14,3,13,1,2,10,13,0,9.636883,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22518,4999,7,3,9,13,3,-1,7,1,6.307261,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
pc = online[online["Epsilon_Boolean"] == 0]
pc

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,...,Encode_6,Encode_7,Encode_8,Encode_9,Encode_10,Encode_11,Encode_12,Encode_13,Encode_14,Encode_15
0,0,6,3,14,12,1,-1,10,0,8.452592,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,10,3,14,12,1,10,14,0,10.203282,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,2,13,0,5,1,0,-1,9,0,8.848181,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,2,9,0,5,1,0,10,5,0,10.177234,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,3,5,2,8,1,1,-1,9,0,9.151602,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22513,4997,14,1,15,12,3,10,15,0,10.571692,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22514,4998,7,3,13,1,2,-1,6,0,3.782595,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22515,4998,6,3,13,1,1,-1,10,0,5.842478,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22516,4998,10,3,13,1,1,-1,14,0,7.821082,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [69]:
pf = pc.drop_duplicates(keep='last',subset=[ 'Current_State', 'Pit_Position', 'Goal_Position', 'Wall_Position', 'Next_State'])
pf

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,...,Encode_6,Encode_7,Encode_8,Encode_9,Encode_10,Encode_11,Encode_12,Encode_13,Encode_14,Encode_15
0,0,6,3,14,12,1,-1,10,0,8.452592,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,10,3,14,12,1,10,14,0,10.203282,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,2,13,0,5,1,0,-1,9,0,8.848181,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,2,9,0,5,1,0,10,5,0,10.177234,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13,5,15,11,14,2,2,10,14,0,10.531881,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22513,4997,14,1,15,12,3,10,15,0,10.571692,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22514,4998,7,3,13,1,2,-1,6,0,3.782595,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22515,4998,6,3,13,1,1,-1,10,0,5.842478,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22516,4998,10,3,13,1,1,-1,14,0,7.821082,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [71]:
pf = pf.reset_index()

In [72]:
X = pf[['Encode_0','Encode_1','Encode_2','Encode_3','Encode_4','Encode_5','Encode_6','Encode_7','Encode_8','Encode_9','Encode_10',
        'Encode_11','Encode_12','Encode_13','Encode_14','Encode_15','Pit_Position', 'Goal_Position', 'Wall_Position', 'Reward']].values
y = pf[['Y']].values

In [73]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/5, random_state = 0)
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train, y_train)

# Predicting the Test set results
y_pred = reg.predict(X_test)
calculate_smape(y_test, y_pred)

22.75

In [80]:
h = 0
w, h = 20, 3000000
matrix = [[0 for x in range(w)] for y in range(h)] 

h = 0
w, h = 13, 3000000
experience = [[0 for x in range(w)] for y in range(h)] 
global count
count = 0

In [81]:
lr_test_model(reg, matrix, experience, 'static')

Initial State:
[[' ' ' ' ' ' ' ']
 [' ' 'P' ' ' ' ']
 [' ' ' ' ' ' ' ']
 ['+' '-' 'W' ' ']]
Move #: 0; Taking action: d
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' 'P' ' ' ' ']
 ['+' '-' 'W' ' ']]
Move #: 1; Taking action: r
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' 'P' ' ']
 ['+' '-' 'W' ' ']]
Move #: 2; Taking action: l
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' 'P' ' ' ' ']
 ['+' '-' 'W' ' ']]
Move #: 3; Taking action: r
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' 'P' ' ']
 ['+' '-' 'W' ' ']]
Move #: 4; Taking action: l
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' 'P' ' ' ' ']
 ['+' '-' 'W' ' ']]
Move #: 5; Taking action: r
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' 'P' ' ']
 ['+' '-' 'W' ' ']]
Move #: 6; Taking action: l
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' 'P' ' ' ' ']
 ['+' '-' 'W' ' ']]
Move #: 7; Taking action: r
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' 'P' ' ']
 ['+' '-' 'W' ' ']]
Move #: 8; Taking action: l
[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [

False

In [82]:
win_num = 0
for i in range(0,1):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = lr_test_model(reg, matrix, experience, 'static', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num 
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 85
0.085
0.085
Win percentage: 8.5%


In [83]:
import pandas as pd
column_names = ["Player", "Pit", "Goal", "Wall", "Action", "Player_Next", "Reward", "Done", "Q_MAX", "Q_0", "Q_1", "Q_2", 'Q_3']
df_experience_lr = pd.DataFrame(experience, columns = column_names)
df_experience_lr = df_experience_lr.loc[(df_experience_lr != 0).any(axis=1)]
df_experience_lr

Unnamed: 0,Player,Pit,Goal,Wall,Action,Player_Next,Reward,Done,Q_MAX,Q_0,Q_1,Q_2,Q_3
19420,5,13,12,14,1,9,-1,False,6.447574,5.577848,6.447574,5.562833,6.395176
19421,9,13,12,14,3,10,-1,False,6.407895,6.310153,2.339450,5.683438,6.407895
19422,10,13,12,14,2,9,-1,False,6.447574,6.395176,6.407895,6.447574,5.675828
19423,9,13,12,14,3,10,-1,False,6.407895,6.310153,2.339450,5.683438,6.407895
19424,10,13,12,14,2,9,-1,False,6.447574,6.395176,6.407895,6.447574,5.675828
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34156,10,13,12,14,2,9,-1,False,6.447574,6.395176,6.407895,6.447574,5.675828
34157,9,13,12,14,3,10,-1,False,6.407895,6.310153,2.339450,5.683438,6.407895
34158,10,13,12,14,2,9,-1,False,6.447574,6.395176,6.407895,6.447574,5.675828
34159,9,13,12,14,3,10,-1,False,6.407895,6.310153,2.339450,5.683438,6.407895


In [84]:
df_experience_lr = df_experience_lr.drop_duplicates(keep='last',subset=['Player', 'Pit', 'Goal', 'Wall',"Player_Next"])
df_experience_lr

Unnamed: 0,Player,Pit,Goal,Wall,Action,Player_Next,Reward,Done,Q_MAX,Q_0,Q_1,Q_2,Q_3
33791,0,13,12,14,3,1,-1,False,5.577848,4.906998,5.562833,4.906998,5.577848
33839,15,13,12,14,0,11,-1,False,5.675828,5.675828,5.073291,5.073291,5.073291
33855,1,13,12,14,1,5,-1,False,6.310153,5.577848,6.310153,4.906998,5.539522
33920,8,13,12,14,1,12,10,True,9.011333,5.562833,9.011333,5.683438,6.447574
34049,2,13,12,14,1,6,-1,False,6.395176,5.539522,6.395176,5.577848,4.87391
34097,4,13,12,14,3,5,-1,False,6.310153,4.906998,5.683438,5.562833,6.310153
34098,5,13,12,14,1,9,-1,False,6.447574,5.577848,6.447574,5.562833,6.395176
34129,11,13,12,14,2,10,-1,False,6.407895,5.583927,5.073291,6.407895,5.675828
34145,3,13,12,14,1,7,-1,False,5.583927,4.87391,5.583927,5.539522,4.87391
34146,7,13,12,14,2,6,-1,False,6.395176,4.87391,5.675828,6.395176,5.583927


In [85]:
df_experience.to_csv('LR_Policy_Comparison.csv')

In [86]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = lr_test_model(reg, matrix, experience, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 533
0.533
0.533
Games played: 1000, # of wins: 554
0.554
1.0870000000000002
Games played: 1000, # of wins: 551
0.551
1.6380000000000003
Games played: 1000, # of wins: 541
0.541
2.1790000000000003
Games played: 1000, # of wins: 546
0.546
2.7250000000000005
Games played: 1000, # of wins: 556
0.556
3.2810000000000006
Games played: 1000, # of wins: 539
0.539
3.8200000000000007
Games played: 1000, # of wins: 493
0.493
4.313000000000001
Games played: 1000, # of wins: 562
0.562
4.875000000000001
Games played: 1000, # of wins: 566
0.566
5.441000000000001
Win percentage: 54.410000000000004%


In [87]:
X = pf[['Encode_0','Encode_1','Encode_2','Encode_3','Encode_4','Encode_5','Encode_6','Encode_7','Encode_8','Encode_9','Encode_10',
        'Encode_11','Encode_12','Encode_13','Encode_14','Encode_15','Pit_Position', 'Goal_Position', 'Wall_Position', 'Reward']].values
y = pf[['Q_Max']].values

In [88]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/5, random_state = 0)
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train, y_train)

# Predicting the Test set results
y_pred = reg.predict(X_test)
calculate_smape(y_test, y_pred)

21.48

In [90]:
win_num = 0
for i in range(0,1):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = lr_test_model(reg, matrix, experience, mode='static', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 74
0.074
0.074
Win percentage: 0.74%


In [91]:
pf['Loss'] = abs(pf['Q_Max'] - pf["Y"])

In [92]:
pf

Unnamed: 0,index,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,...,Encode_7,Encode_8,Encode_9,Encode_10,Encode_11,Encode_12,Encode_13,Encode_14,Encode_15,Loss
0,0,0,6,3,14,12,1,-1,10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.269638
1,1,0,10,3,14,12,1,10,14,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.203282
2,5,2,13,0,5,1,0,-1,9,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.688670
3,6,2,9,0,5,1,0,10,5,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177234
4,13,5,15,11,14,2,2,10,14,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.531881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11082,22513,4997,14,1,15,12,3,10,15,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.571692
11083,22514,4998,7,3,13,1,2,-1,6,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.461421
11084,22515,4998,6,3,13,1,1,-1,10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.187325
11085,22516,4998,10,3,13,1,1,-1,14,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.197369


In [109]:
X = pf[['Encode_0','Encode_1','Encode_2','Encode_3','Encode_4','Encode_5','Encode_6','Encode_7','Encode_8','Encode_9','Encode_10',
        'Encode_11','Encode_12','Encode_13','Encode_14','Encode_15','Pit_Position', 'Goal_Position', 'Wall_Position', 'Reward']].values
y = pf[['Loss']].values

In [110]:
def loss_test_model(model_b, matrix,experience, mode='static', display=True):
    global counter
    i = 0
    test_game = Gridworld(mode=mode)
#     state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
#     state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    j = 0
    while(status == 1): #A
        p_curr = from2dto1d((str(test_game.board.components['Player'].pos)))
        pi = from2dto1d((str(test_game.board.components['Pit'].pos)))
        g = from2dto1d((str(test_game.board.components['Goal'].pos)))
        w = from2dto1d((str(test_game.board.components['Wall'].pos)))
        #q_current = model.predict([[p_curr,pi,g,w]])[0][0]
        
        q_value_current = []
        q_value_next = []
        q_value = []
        test_game.makeMove('u')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
       # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0][0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('d')
        
        test_game.makeMove('d')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
       # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0][0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('u')
        
        test_game.makeMove('l')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
       # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0][0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('r')
        
        test_game.makeMove('r')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
       # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0][0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('l')
        
#         print(q_value_current)
        
        
        action_ = np.argmin(q_value_next)
#         print(action_)
        action = action_set[action_]
        matrix[j][0] = p_curr
        matrix[j][1] = pi
        matrix[j][2] = g
        matrix[j][3] = w
        
        experience[counter][0] = p_curr
        experience[counter][1] = pi
        experience[counter][2] = g
        experience[counter][3] = w
        experience[counter][4] = action_
        
        
#         matrix[j][4] = q_current
#         matrix[j][5:9] = q_value_current
#         matrix[j][9] = max(q_value_current)
#         matrix[j][10:14] = q_value_next
#         matrix[j][14] = max(q_value_next)
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        p_next = from2dto1d((str(test_game.board.components['Player'].pos)))
        experience[counter][5] = p_next
        
#         state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
#         state = torch.from_numpy(state_).float()
#         if test_game.reward() == -1:
#             Y = test_game.reward() + 0.9 * max(q_value_next)
#         else:
#             Y = test_game.reward()
#         Y = q_value_next[np.argmax(q_value_current)]
#         matrix[j][15] = q_value[0]
#         matrix[j][16] = q_value[1]
#         matrix[j][17] = q_value[2]
#         matrix[j][18] = q_value[3]
#         matrix[j][19] = min(q_value)
        j += 1
        if display:
            print(test_game.display())
        reward = test_game.reward()
        experience[counter][6] = reward
        if reward > -1:
            experience[counter][7] = True
        else:
            experience[counter][7] = False
        experience[counter][8] =  q_value_next[action_]
        experience[counter][9] = q_value_next[0]
        experience[counter][10] = q_value_next[1]
        experience[counter][11] = q_value_next[2]
        experience[counter][12] = q_value_next[3]
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        counter += 1
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [111]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/5, random_state = 0)
from sklearn.linear_model import LinearRegression
reg_loss = LinearRegression()
reg_loss.fit(X_train, y_train)

# Predicting the Test set results
y_pred = reg.predict(X_test)
calculate_smape(y_test, y_pred)

73.82

In [114]:


h = 0
w, h = 13, 3000000
experience_loss = [[0 for x in range(w)] for y in range(h)] 

global count
count = 0

In [115]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = loss_test_model(reg_loss, matrix, experience_loss, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 550
0.55
0.55
Games played: 1000, # of wins: 536
0.536
1.086
Games played: 1000, # of wins: 527
0.527
1.613
Games played: 1000, # of wins: 547
0.547
2.16
Games played: 1000, # of wins: 533
0.533
2.693
Games played: 1000, # of wins: 575
0.575
3.268
Games played: 1000, # of wins: 504
0.504
3.772
Games played: 1000, # of wins: 520
0.52
4.292
Games played: 1000, # of wins: 517
0.517
4.809
Games played: 1000, # of wins: 566
0.566
5.375
Win percentage: 53.75%


In [120]:
res = pf[(pf["Pit_Position"] == 13) & (pf["Goal_Position"] == 12)  & (pf["Wall_Position"] == 14)]

In [124]:
res

Unnamed: 0,index,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,...,Encode_7,Encode_8,Encode_9,Encode_10,Encode_11,Encode_12,Encode_13,Encode_14,Encode_15,Loss
6541,14393,3109,7,13,12,14,1,-1,11,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.488372
6542,14394,3109,11,13,12,14,2,-1,10,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.495987
6543,14395,3109,10,13,12,14,2,-1,9,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.176584
10563,21665,4800,5,13,12,14,1,-1,9,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153598
11002,22401,4970,7,13,12,14,3,-1,7,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.313464
11003,22403,4970,3,13,12,14,3,-1,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.266188
11004,22404,4970,3,13,12,14,2,-1,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369237
11005,22407,4970,2,13,12,14,2,-1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.187058
11006,22410,4970,1,13,12,14,2,-1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003901
11007,22411,4970,0,13,12,14,1,-1,4,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.360051


In [125]:
res.to_csv('LR_DATA_For Policy.csv')