In [18]:
import numpy as np
import random
import sys

def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks)
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        for name, piece in self.components.items():
            pos = (layer,) + piece.pos
            displ_board[pos] = 1
            layer += 1

        for name, mask in self.masks.items():
            x,y = self.masks['boundary'].get_positions()
            z = np.repeat(layer,len(x))
            a = (z,x,y)
            displ_board[a] = 1
            layer += 1
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [19]:
class Gridworld:

    def __init__(self, size=4, mode='static'):
        if size >= 4:
            self.board = GridBoard(size=size)
        else:
            print("Minimum board size is 4. Initialized to size 4.")
            self.board = GridBoard(size=4)

        #Add pieces, positions will be updated later
        self.board.addPiece('Player','P',(0,0))
        self.board.addPiece('Goal','+',(1,0))
        self.board.addPiece('Pit','-',(2,0))
        self.board.addPiece('Wall','W',(3,0))

        if mode == 'static':
            self.initGridStatic()
        elif mode == 'player':
            self.initGridPlayer()
        else:
            self.initGridRand()

    #Initialize stationary grid, all items are placed deterministically
    def initGridStatic(self):
        #Setup static pieces
        self.board.components['Player'].pos = (0,3) #Row, Column
        self.board.components['Goal'].pos = (0,0)
        self.board.components['Pit'].pos = (0,1)
        self.board.components['Wall'].pos = (1,1)

    #Check if board is initialized appropriately (no overlapping pieces)
    #also remove impossible-to-win boards
    def validateBoard(self):
        valid = True

        player = self.board.components['Player']
        goal = self.board.components['Goal']
        wall = self.board.components['Wall']
        pit = self.board.components['Pit']

        all_positions = [piece for name,piece in self.board.components.items()]
        all_positions = [player.pos, goal.pos, wall.pos, pit.pos]
        if len(all_positions) > len(set(all_positions)):
            return False

        corners = [(0,0),(0,self.board.size), (self.board.size,0), (self.board.size,self.board.size)]
        #if player is in corner, can it move? if goal is in corner, is it blocked?
        if player.pos in corners or goal.pos in corners:
            val_move_pl = [self.validateMove('Player', addpos) for addpos in [(0,1),(1,0),(-1,0),(0,-1)]]
            val_move_go = [self.validateMove('Goal', addpos) for addpos in [(0,1),(1,0),(-1,0),(0,-1)]]
            if 0 not in val_move_pl or 0 not in val_move_go:
                #print(self.display())
                #print("Invalid board. Re-initializing...")
                valid = False

        return valid

    #Initialize player in random location, but keep wall, goal and pit stationary
    def initGridPlayer(self):
        #height x width x depth (number of pieces)
        self.initGridStatic()
        #place player
        self.board.components['Player'].pos = randPair(0,self.board.size)

        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridPlayer()

    #Initialize grid so that goal, pit, wall, player are all randomly placed
    def initGridRand(self):
        #height x width x depth (number of pieces)
        self.board.components['Player'].pos = randPair(0,self.board.size)
        self.board.components['Goal'].pos = randPair(0,self.board.size)
        self.board.components['Pit'].pos = randPair(0,self.board.size)
        self.board.components['Wall'].pos = randPair(0,self.board.size)

        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridRand()

    def validateMove(self, piece, addpos=(0,0)):
        outcome = 0 #0 is valid, 1 invalid, 2 lost game
        pit = self.board.components['Pit'].pos
        wall = self.board.components['Wall'].pos
        new_pos = addTuple(self.board.components[piece].pos, addpos)
        if new_pos == wall:
            outcome = 1 #block move, player can't move to wall
        elif max(new_pos) > (self.board.size-1):    #if outside bounds of board
            outcome = 1
        elif min(new_pos) < 0: #if outside bounds
            outcome = 1
        elif new_pos == pit:
            outcome = 2

        return outcome

    def makeMove(self, action):
        #need to determine what object (if any) is in the new grid spot the player is moving to
        #actions in {u,d,l,r}
        def checkMove(addpos):
            if self.validateMove('Player', addpos) in [0,2]:
                new_pos = addTuple(self.board.components['Player'].pos, addpos)
                self.board.movePiece('Player', new_pos)

        if action == 'u': #up
            checkMove((-1,0))
        elif action == 'd': #down
            checkMove((1,0))
        elif action == 'l': #left
            checkMove((0,-1))
        elif action == 'r': #right
            checkMove((0,1))
        else:
            pass

    def reward(self):
        if (self.board.components['Player'].pos == self.board.components['Pit'].pos):
            return -10
        elif (self.board.components['Player'].pos == self.board.components['Goal'].pos):
            return 10
        else:
            return -1

    def display(self):
        return self.board.render()

In [20]:
action_set = {
    0: 'u',
    1: 'd',
    2: 'l',
    3: 'r',
}

In [21]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

l1 = 64
l2 = 200
l3 = 120
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3,l4)
)
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 0.3

In [22]:
from collections import deque
epochs = 5000
losses = []
mem_size = 1000 #A
batch_size = 200 #B
replay = deque(maxlen=mem_size) #C
max_moves = 50 #D

h = 0
w, h = 20, 300000
matrix = [[0 for x in range(w)] for y in range(h)] 

e = 0
e,r= 10, 3000000 
experience = [[0 for x in range(e)] for y in range(r)] 

y = 0
u,y= 7, 3000000 
q_table = [[0 for x in range(u)] for y in range(y)] 
q_counter = 0 

counter = 0
num = 0
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1): 
        mov += 1
        qval = model(state1) #E
        qval_ = qval.data.numpy()
        
        matrix[counter][9] = qval_[0][np.argmax(qval_)]
        matrix[counter][10] = qval_[0][0]
        matrix[counter][11] = qval_[0][1]
        matrix[counter][12] = qval_[0][2]
        matrix[counter][13] = qval_[0][3]
        
        for k in range(0,4):
            q_table[q_counter][0] = i
            q_table[q_counter][1] = (str(game.board.components['Player'].pos))
            q_table[q_counter][2] = (str(game.board.components['Pit'].pos))
            q_table[q_counter][3] = (str(game.board.components['Goal'].pos))
            q_table[q_counter][4] = (str(game.board.components['Wall'].pos))
            q_table[q_counter][5] = k
            q_table[q_counter][6] = qval_[0][k]
            q_counter+=1
            
        if (random.random() < epsilon): #F
            matrix[counter][8] = 1 
            action_ = np.random.randint(0,4)
        else:
            matrix[counter][8] = 0 
            action_ = np.argmax(qval_)
        
        action = action_set[action_]
        
        matrix[counter][0] = i
        matrix[counter][1] = (str(game.board.components['Player'].pos))
        matrix[counter][2] = (str(game.board.components['Pit'].pos))
        matrix[counter][3] = (str(game.board.components['Goal'].pos))
        matrix[counter][4] = (str(game.board.components['Wall'].pos))
        matrix[counter][5] = action_
        
        game.makeMove(action)
        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
        state2 = torch.from_numpy(state2_).float()
        next_state = model(state2)
        next_state_ = next_state.data.numpy()
        
        matrix[counter][18] = next_state_[0][np.argmax(next_state_)]
        matrix[counter][14] = next_state_[0][0]
        matrix[counter][15] = next_state_[0][1]
        matrix[counter][16] = next_state_[0][2]
        matrix[counter][17] = next_state_[0][3]
        reward = game.reward()
        matrix[counter][6] = reward
        if reward == -1 or reward == -10: #N
            Y = reward + (gamma * next_state_[0][np.argmax(next_state_)])
        else:
            Y = reward
            
        matrix[counter][19] = Y
        done = True if reward > 0 else False
        matrix[counter][7] = (str(game.board.components['Player'].pos))
        exp =  (state1, action_, reward, state2, done) #G
        replay.append(exp) #H
        state1 = state2
        
        
        
        if len(replay) > batch_size: #I
            minibatch = random.sample(replay, batch_size) #J
            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])
            
            Q1 = model(state1_batch) #L
            with torch.no_grad():
                Q2 = model(state2_batch) #M
            
            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            loss = loss_fn(X, Y.detach())
            print(i)
#             if (i >= 4500):
#                 for j in range (0, batch_size):
#                     experience[num][0] = i
#                     experience[num][1] = state1_batch[j].numpy()
#                     experience[num][2] = action_batch[j].item()
#                     experience[num][3] = reward_batch[j].item()
#                     experience[num][4] = state2_batch[j].numpy()
#                     experience[num][5] = done_batch[j].item()
#                     experience[num][6] = Q1[j].detach().numpy()
#                     experience[num][7] = Q2[j].detach().numpy()
#                     experience[num][8] = Y[j].item()
#                     experience[num][9] = X[j].item()
#                     num += 1
            clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()
        counter += 1
        if reward != -1 or mov > max_moves: #O
            status = 0
            mov = 0
losses = np.array(losses)

4999


In [11]:
q_table

[[0, '(2, 1)', '(3, 1)', '(3, 3)', '(0, 1)', 0, 0.03592627],
 [0, '(2, 1)', '(3, 1)', '(3, 3)', '(0, 1)', 1, 0.108402416],
 [0, '(2, 1)', '(3, 1)', '(3, 3)', '(0, 1)', 2, 0.036907464],
 [0, '(2, 1)', '(3, 1)', '(3, 3)', '(0, 1)', 3, 0.017709576],
 [0, '(2, 0)', '(3, 1)', '(3, 3)', '(0, 1)', 0, 0.042650506],
 [0, '(2, 0)', '(3, 1)', '(3, 3)', '(0, 1)', 1, 0.12379443],
 [0, '(2, 0)', '(3, 1)', '(3, 3)', '(0, 1)', 2, 0.059843518],
 [0, '(2, 0)', '(3, 1)', '(3, 3)', '(0, 1)', 3, 0.029764444],
 [0, '(3, 0)', '(3, 1)', '(3, 3)', '(0, 1)', 0, 0.04561755],
 [0, '(3, 0)', '(3, 1)', '(3, 3)', '(0, 1)', 1, 0.12730353],
 [0, '(3, 0)', '(3, 1)', '(3, 3)', '(0, 1)', 2, 0.0404923],
 [0, '(3, 0)', '(3, 1)', '(3, 3)', '(0, 1)', 3, 0.025522701],
 [0, '(2, 0)', '(3, 1)', '(3, 3)', '(0, 1)', 0, 0.042276006],
 [0, '(2, 0)', '(3, 1)', '(3, 3)', '(0, 1)', 1, 0.12374428],
 [0, '(2, 0)', '(3, 1)', '(3, 3)', '(0, 1)', 2, 0.05909939],
 [0, '(2, 0)', '(3, 1)', '(3, 3)', '(0, 1)', 3, 0.029744856],
 [0, '(2, 1)', '

In [12]:
import pandas as pd
column_names = ["Epochs","Current_State", "Pit_Position", "Goal_Position", "Wall_Position", "Action", "Q_value"]
df = pd.DataFrame(q_table, columns = column_names)
df = df.loc[(df != 0).any(axis=1)]
df

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Q_value
0,0,"(2, 1)","(3, 1)","(3, 3)","(0, 1)",0,0.035926
1,0,"(2, 1)","(3, 1)","(3, 3)","(0, 1)",1,0.108402
2,0,"(2, 1)","(3, 1)","(3, 3)","(0, 1)",2,0.036907
3,0,"(2, 1)","(3, 1)","(3, 3)","(0, 1)",3,0.017710
4,0,"(2, 0)","(3, 1)","(3, 3)","(0, 1)",0,0.042651
...,...,...,...,...,...,...,...
137939,4999,"(2, 0)","(0, 1)","(3, 1)","(0, 0)",3,9.188289
137940,4999,"(3, 0)","(0, 1)","(3, 1)","(0, 0)",0,8.201901
137941,4999,"(3, 0)","(0, 1)","(3, 1)","(0, 0)",1,9.030347
137942,4999,"(3, 0)","(0, 1)","(3, 1)","(0, 0)",2,9.910853


In [17]:
matrix

[[0,
  '(2, 0)',
  '(1, 2)',
  '(0, 1)',
  '(3, 3)',
  3,
  -1,
  '(2, 1)',
  0,
  -0.0050479732,
  -0.042233966,
  -0.016494315,
  -0.013858455,
  -0.0050479732,
  -0.058048114,
  -0.020728547,
  0.00090735033,
  -0.027855178,
  0.00090735033,
  -1],
 [0,
  '(2, 1)',
  '(1, 2)',
  '(0, 1)',
  '(3, 3)',
  2,
  -1,
  '(2, 0)',
  0,
  0.00090735033,
  -0.058048114,
  -0.020728547,
  0.00090735033,
  -0.027855178,
  -0.042253375,
  -0.016714249,
  -0.013994893,
  -0.005112931,
  -0.005112931,
  -1],
 [0,
  '(2, 0)',
  '(1, 2)',
  '(0, 1)',
  '(3, 3)',
  1,
  -1,
  '(3, 0)',
  1,
  -0.005112931,
  -0.042253375,
  -0.016714249,
  -0.013994893,
  -0.005112931,
  -0.07393616,
  -0.0015625726,
  -0.00231513,
  -0.050331123,
  -0.0015625726,
  -1],
 [0,
  '(3, 0)',
  '(1, 2)',
  '(0, 1)',
  '(3, 3)',
  1,
  -1,
  '(3, 0)',
  0,
  -0.0015625726,
  -0.07393616,
  -0.0015625726,
  -0.00231513,
  -0.050331123,
  -0.073955745,
  -0.00176547,
  -0.0021139868,
  -0.050305657,
  -0.00176547,
  -1],
 [0

In [13]:
experience[2]

[4500,
 array([3.2539605e-03, 9.7015774e-04, 7.2274408e-03, 7.5357137e-03,
        7.3235007e-03, 7.1853027e-03, 9.7252671e-03, 3.6755535e-03,
        1.0041515e+00, 7.2642164e-03, 7.0681907e-03, 9.0014646e-03,
        3.9145285e-03, 3.7193100e-03, 2.6582205e-03, 9.8369492e-04,
        6.9013084e-03, 6.8926043e-03, 6.4846114e-03, 9.6267257e-03,
        6.7130802e-03, 7.6192454e-03, 8.7181106e-03, 6.6959769e-03,
        5.7528266e-03, 7.2048665e-03, 3.9622462e-03, 3.1661226e-03,
        5.1939827e-03, 4.5041484e-03, 1.0037941e+00, 2.4061492e-03,
        8.3033945e-03, 6.0667624e-03, 1.3720625e-03, 3.3434054e-03,
        4.5473198e-03, 1.6682047e-03, 9.8228827e-03, 3.0641505e-03,
        8.3222594e-03, 9.5135011e-03, 1.2286160e-03, 7.1313875e-03,
        7.0047881e-03, 1.0030392e+00, 4.6320846e-03, 6.2114592e-03,
        6.1559742e-03, 6.3625011e-03, 6.0470263e-03, 2.9625061e-03,
        9.2503019e-03, 4.1746474e-03, 7.1051200e-03, 7.0976121e-03,
        5.9057627e-04, 1.0046518e+00, 4.9

In [23]:
matrix

[[0,
  '(0, 3)',
  '(1, 3)',
  '(1, 1)',
  '(2, 2)',
  0,
  -1,
  '(0, 3)',
  0,
  0.025503702,
  0.025503702,
  -0.02880814,
  -0.05968801,
  -0.09679721,
  0.025506958,
  -0.029396646,
  -0.05887635,
  -0.096399225,
  0.025506958,
  -0.9770437374711036],
 [0,
  '(0, 3)',
  '(1, 3)',
  '(1, 1)',
  '(2, 2)',
  0,
  -1,
  '(0, 3)',
  0,
  0.025506958,
  0.025506958,
  -0.029396646,
  -0.05887635,
  -0.096399225,
  0.025310554,
  -0.029185627,
  -0.05897473,
  -0.09692076,
  0.025310554,
  -0.9772205017507076],
 [0,
  '(0, 3)',
  '(1, 3)',
  '(1, 1)',
  '(2, 2)',
  0,
  -1,
  '(0, 3)',
  0,
  0.025310554,
  0.025310554,
  -0.029185627,
  -0.05897473,
  -0.09692076,
  0.025258556,
  -0.029254213,
  -0.05908372,
  -0.096569814,
  0.025258556,
  -0.9772672995924949],
 [0,
  '(0, 3)',
  '(1, 3)',
  '(1, 1)',
  '(2, 2)',
  0,
  -1,
  '(0, 3)',
  0,
  0.025258556,
  0.025258556,
  -0.029254213,
  -0.05908372,
  -0.096569814,
  0.025271365,
  -0.029087517,
  -0.059040077,
  -0.09714595,
  0.025

In [15]:
import pandas as pd
column_names = ["Epochs","Current_State",  "Action",  "Reward", "Next_State", "Done_Boolean", 
                "Q-Current", "Q-Next", "Y", "X"
                ]
df = pd.DataFrame(experience, columns = column_names)


In [16]:
df= df[df['Epochs'] != 0]

In [17]:
df

Unnamed: 0,Epochs,Current_State,Action,Reward,Next_State,Done_Boolean,Q-Current,Q-Next,Y,X
0,4500,"[0.008007733, 0.009518548, 0.00027466935, 0.00...",2.0,-1.0,"[0.0020411045, 0.0045970655, 0.0018630754, 0.0...",0.0,"[4.3824067, 3.251664, 4.6633377, 6.3426304]","[4.223745, 3.1851935, 4.5819945, 6.2175064]",4.595756,4.663338
1,4500,"[0.0072316937, 0.0042586084, 0.00784287, 0.006...",0.0,-1.0,"[0.006537159, 0.0002867183, 0.00012923105, 0.0...",0.0,"[8.301725, 0.60131323, 6.3039174, 4.110693]","[10.371106, 5.831517, 8.135586, 4.8211718]",8.333995,8.301725
2,4500,"[0.0032539605, 0.00097015774, 0.007227441, 0.0...",3.0,-1.0,"[0.0048687514, 0.006160582, 0.0030966785, 0.00...",0.0,"[2.126464, 1.0499609, 2.239248, 1.1695688]","[2.1127906, 1.0743947, 2.2515953, 1.2407337]",1.026436,1.169569
3,4500,"[0.007025323, 0.0026434937, 0.0041451245, 0.00...",3.0,-1.0,"[0.008336444, 0.0030398597, 0.0049631037, 0.00...",0.0,"[2.6179202, 3.7666523, 2.4381506, 4.9389353]","[3.5367153, 6.892476, 3.1914701, 5.3697224]",5.203228,4.938935
4,4500,"[0.007767832, 0.0063852174, 0.0074349227, 0.00...",1.0,-1.0,"[0.005770394, 0.0022664063, 0.00053195155, 0.0...",0.0,"[2.034112, 4.4193277, 4.04026, 1.9478679]","[3.0766172, 4.379309, 6.116813, 4.5215044]",4.505132,4.419328
...,...,...,...,...,...,...,...,...,...,...
443195,4999,"[0.0018970773, 0.0066727498, 1.0092647, 0.0071...",1.0,-10.0,"[0.005642313, 0.0033716604, 0.004429705, 0.000...",0.0,"[-2.6020896, -8.185914, -3.6095724, -3.681721]","[-6.1712103, -1.0318992, 1.5723869, -3.768174]",-8.584852,-8.185914
443196,4999,"[0.0062436424, 0.0094558615, 0.00820832, 0.008...",2.0,-1.0,"[0.0021442291, 0.00028203957, 0.0063584824, 0....",0.0,"[3.105644, 4.960172, 4.994894, 3.1819112]","[3.3093066, 6.637617, 6.585969, 4.787982]",4.973855,4.994894
443197,4999,"[0.0022087647, 0.003021355, 0.003702806, 0.005...",2.0,-1.0,"[0.0066556972, 0.008289577, 0.0045121717, 0.00...",0.0,"[5.8290334, 3.2023811, 5.9338555, 0.15436226]","[7.683959, 4.275585, 7.29659, 4.1320457]",5.915563,5.933856
443198,4999,"[0.009340141, 0.0013026751, 0.0018482549, 0.00...",0.0,-1.0,"[0.0007070488, 0.008925953, 0.0047548045, 0.00...",0.0,"[7.9756384, 5.1795044, 7.2091713, 6.2919846]","[6.755589, 5.8122826, 10.087701, 7.892994]",8.078931,7.975638


In [18]:
df.to_csv('Experience Replay Dataset.csv')

In [28]:
def test_model(model, mode='static', display=True):
    i = 0
    test_game = Gridworld(mode=mode)
    state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
    state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    while(status == 1): #A
        qval = model(state)
        qval_ = qval.data.numpy()
        action_ = np.argmax(qval_) #B
        action = action_set[action_]
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
        state = torch.from_numpy(state_).float()
        if display:
            print(test_game.display())
        reward = test_game.reward()
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [29]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 924
0.924
0.924
Games played: 1000, # of wins: 928
0.928
1.852
Games played: 1000, # of wins: 927
0.927
2.779
Games played: 1000, # of wins: 921
0.921
3.7
Games played: 1000, # of wins: 924
0.924
4.6240000000000006
Games played: 1000, # of wins: 940
0.94
5.564
Games played: 1000, # of wins: 925
0.925
6.489
Games played: 1000, # of wins: 939
0.939
7.428
Games played: 1000, # of wins: 922
0.922
8.35
Games played: 1000, # of wins: 930
0.93
9.28
Win percentage: 92.8%


In [24]:
import pandas as pd
column_names = ["Epochs","Current_State", "Pit_Position", "Goal_Position", "Wall_Position", "Action",  "Reward", "Next_State", "Epsilon_Boolean", 
                "Q_Max", "Q1_Current", "Q2_Current", "Q3_Current", "Q4_Current", "Q1_Next", "Q2_Next", "Q3_Next", "Q4_Next","Q_Next_Max", "Y"
                ]
df = pd.DataFrame(matrix, columns = column_names)
df = df.loc[(df != 0).any(axis=1)]

In [25]:
df['Current_State'] = df['Current_State'].replace(['(0, 0)'],0)
df['Current_State'] = df['Current_State'].replace(['(0, 1)'],1)
df['Current_State'] = df['Current_State'].replace(['(0, 2)'],2)
df['Current_State'] = df['Current_State'].replace(['(0, 3)'],3)
df['Current_State'] = df['Current_State'].replace(['(1, 0)'],4)
df['Current_State'] = df['Current_State'].replace(['(1, 1)'],5)
df['Current_State'] = df['Current_State'].replace(['(1, 2)'],6)
df['Current_State'] = df['Current_State'].replace(['(1, 3)'],7)
df['Current_State'] = df['Current_State'].replace(['(2, 0)'],8)
df['Current_State'] = df['Current_State'].replace(['(2, 1)'],9)
df['Current_State'] = df['Current_State'].replace(['(2, 2)'],10)
df['Current_State'] = df['Current_State'].replace(['(2, 3)'],11)
df['Current_State'] = df['Current_State'].replace(['(3, 0)'],12)
df['Current_State'] = df['Current_State'].replace(['(3, 1)'],13)
df['Current_State'] = df['Current_State'].replace(['(3, 2)'],14)
df['Current_State'] = df['Current_State'].replace(['(3, 3)'],15)

df['Next_State'] = df['Next_State'].replace(['(0, 0)'],0)
df['Next_State'] = df['Next_State'].replace(['(0, 1)'],1)
df['Next_State'] = df['Next_State'].replace(['(0, 2)'],2)
df['Next_State'] = df['Next_State'].replace(['(0, 3)'],3)
df['Next_State'] = df['Next_State'].replace(['(1, 0)'],4)
df['Next_State'] = df['Next_State'].replace(['(1, 1)'],5)
df['Next_State'] = df['Next_State'].replace(['(1, 2)'],6)
df['Next_State'] = df['Next_State'].replace(['(1, 3)'],7)
df['Next_State'] = df['Next_State'].replace(['(2, 0)'],8)
df['Next_State'] = df['Next_State'].replace(['(2, 1)'],9)
df['Next_State'] = df['Next_State'].replace(['(2, 2)'],10)
df['Next_State'] = df['Next_State'].replace(['(2, 3)'],11)
df['Next_State'] = df['Next_State'].replace(['(3, 0)'],12)
df['Next_State'] = df['Next_State'].replace(['(3, 1)'],13)
df['Next_State'] = df['Next_State'].replace(['(3, 2)'],14)
df['Next_State'] = df['Next_State'].replace(['(3, 3)'],15)

df['Goal_Position'] = df['Goal_Position'].replace(['(0, 0)'],0)
df['Goal_Position'] = df['Goal_Position'].replace(['(0, 1)'],1)
df['Goal_Position'] = df['Goal_Position'].replace(['(0, 2)'],2)
df['Goal_Position'] = df['Goal_Position'].replace(['(0, 3)'],3)
df['Goal_Position'] = df['Goal_Position'].replace(['(1, 0)'],4)
df['Goal_Position'] = df['Goal_Position'].replace(['(1, 1)'],5)
df['Goal_Position'] = df['Goal_Position'].replace(['(1, 2)'],6)
df['Goal_Position'] = df['Goal_Position'].replace(['(1, 3)'],7)
df['Goal_Position'] = df['Goal_Position'].replace(['(2, 0)'],8)
df['Goal_Position'] = df['Goal_Position'].replace(['(2, 1)'],9)
df['Goal_Position'] = df['Goal_Position'].replace(['(2, 2)'],10)
df['Goal_Position'] = df['Goal_Position'].replace(['(2, 3)'],11)
df['Goal_Position'] = df['Goal_Position'].replace(['(3, 0)'],12)
df['Goal_Position'] = df['Goal_Position'].replace(['(3, 1)'],13)
df['Goal_Position'] = df['Goal_Position'].replace(['(3, 2)'],14)
df['Goal_Position'] = df['Goal_Position'].replace(['(3, 3)'],15)

df['Pit_Position'] = df['Pit_Position'].replace(['(0, 0)'],0)
df['Pit_Position'] = df['Pit_Position'].replace(['(0, 1)'],1)
df['Pit_Position'] = df['Pit_Position'].replace(['(0, 2)'],2)
df['Pit_Position'] = df['Pit_Position'].replace(['(0, 3)'],3)
df['Pit_Position'] = df['Pit_Position'].replace(['(1, 0)'],4)
df['Pit_Position'] = df['Pit_Position'].replace(['(1, 1)'],5)
df['Pit_Position'] = df['Pit_Position'].replace(['(1, 2)'],6)
df['Pit_Position'] = df['Pit_Position'].replace(['(1, 3)'],7)
df['Pit_Position'] = df['Pit_Position'].replace(['(2, 0)'],8)
df['Pit_Position'] = df['Pit_Position'].replace(['(2, 1)'],9)
df['Pit_Position'] = df['Pit_Position'].replace(['(2, 2)'],10)
df['Pit_Position'] = df['Pit_Position'].replace(['(2, 3)'],11)
df['Pit_Position'] = df['Pit_Position'].replace(['(3, 0)'],12)
df['Pit_Position'] = df['Pit_Position'].replace(['(3, 1)'],13)
df['Pit_Position'] = df['Pit_Position'].replace(['(3, 2)'],14)
df['Pit_Position'] = df['Pit_Position'].replace(['(3, 3)'],15)

df['Wall_Position'] = df['Wall_Position'].replace(['(1, 1)'],5)
df['Wall_Position'] = df['Wall_Position'].replace(['(0, 0)'],0)
df['Wall_Position'] = df['Wall_Position'].replace(['(0, 1)'],1)
df['Wall_Position'] = df['Wall_Position'].replace(['(0, 2)'],2)
df['Wall_Position'] = df['Wall_Position'].replace(['(0, 3)'],3)
df['Wall_Position'] = df['Wall_Position'].replace(['(1, 0)'],4)
df['Wall_Position'] = df['Wall_Position'].replace(['(1, 1)'],5)
df['Wall_Position'] = df['Wall_Position'].replace(['(1, 2)'],6)
df['Wall_Position'] = df['Wall_Position'].replace(['(1, 3)'],7)
df['Wall_Position'] = df['Wall_Position'].replace(['(2, 0)'],8)
df['Wall_Position'] = df['Wall_Position'].replace(['(2, 1)'],9)
df['Wall_Position'] = df['Wall_Position'].replace(['(2, 2)'],10)
df['Wall_Position'] = df['Wall_Position'].replace(['(2, 3)'],11)
df['Wall_Position'] = df['Wall_Position'].replace(['(3, 0)'],12)
df['Wall_Position'] = df['Wall_Position'].replace(['(3, 1)'],13)
df['Wall_Position'] = df['Wall_Position'].replace(['(3, 2)'],14)
df['Wall_Position'] = df['Wall_Position'].replace(['(3, 3)'],15)

In [26]:
df

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,Q1_Current,Q2_Current,Q3_Current,Q4_Current,Q1_Next,Q2_Next,Q3_Next,Q4_Next,Q_Next_Max,Y
0,0,3,7,5,10,0,-1,3,0,0.025504,0.025504,-0.028808,-0.059688,-0.096797,0.025507,-0.029397,-0.058876,-0.096399,0.025507,-0.977044
1,0,3,7,5,10,0,-1,3,0,0.025507,0.025507,-0.029397,-0.058876,-0.096399,0.025311,-0.029186,-0.058975,-0.096921,0.025311,-0.977221
2,0,3,7,5,10,0,-1,3,0,0.025311,0.025311,-0.029186,-0.058975,-0.096921,0.025259,-0.029254,-0.059084,-0.096570,0.025259,-0.977267
3,0,3,7,5,10,0,-1,3,0,0.025259,0.025259,-0.029254,-0.059084,-0.096570,0.025271,-0.029088,-0.059040,-0.097146,0.025271,-0.977256
4,0,3,7,5,10,0,-1,3,0,0.025271,0.025271,-0.029088,-0.059040,-0.097146,0.025396,-0.029485,-0.058923,-0.096761,0.025396,-0.977144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34655,4998,6,3,1,8,2,-1,5,0,7.985765,6.543100,5.417122,7.985765,3.816025,9.737021,5.751934,6.621516,6.852162,9.737021,7.763319
34656,4998,5,3,1,8,0,10,1,0,9.757797,9.757797,5.713107,6.605927,6.872090,8.405054,8.555333,9.564618,9.528024,9.564618,10.000000
34657,4999,10,8,3,14,0,-1,6,0,6.479850,6.479850,2.173633,3.403223,4.824383,8.252207,4.815167,5.064015,8.466753,8.466753,6.620078
34658,4999,6,8,3,14,3,-1,7,0,8.470242,8.273722,4.820699,5.066678,8.470242,10.759487,5.722989,5.832384,7.741495,10.759487,8.683538


In [27]:
df.to_csv('Online Dataset.csv')

In [25]:
px = df[df["Epochs"] == 0]
px

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,Q1_Current,Q2_Current,Q3_Current,Q4_Current,Q1_Next,Q2_Next,Q3_Next,Q4_Next
0,0,9,13,15,1,2,-1,8,1,0.108402,0.035926,0.108402,0.036907,0.01771,0.042651,0.123794,0.059844,0.029764
1,0,8,13,15,1,1,-1,12,0,0.123794,0.042651,0.123794,0.059844,0.029764,0.045618,0.127304,0.040492,0.025523
2,0,12,13,15,1,0,-1,8,1,0.127304,0.045618,0.127304,0.040492,0.025523,0.042276,0.123744,0.059099,0.029745
3,0,8,13,15,1,3,-1,9,1,0.123744,0.042276,0.123744,0.059099,0.029745,0.03563,0.108014,0.036395,0.017554
4,0,9,13,15,1,1,-10,13,0,0.108014,0.03563,0.108014,0.036395,0.017554,0.032679,0.118293,0.045409,0.017385


In [26]:
pc = df[df["Epsilon_Boolean"] == 0]
pc

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,Q1_Current,Q2_Current,Q3_Current,Q4_Current,Q1_Next,Q2_Next,Q3_Next,Q4_Next
1,0,8,13,15,1,1,-1,12,0,0.123794,0.042651,0.123794,0.059844,0.029764,0.045618,0.127304,0.040492,0.025523
4,0,9,13,15,1,1,-10,13,0,0.108014,0.035630,0.108014,0.036395,0.017554,0.032679,0.118293,0.045409,0.017385
5,1,11,13,3,7,1,-1,15,0,0.078054,0.061446,0.078054,0.055022,-0.022881,0.057225,0.072453,0.032065,-0.035964
6,1,15,13,3,7,1,-1,15,0,0.072453,0.057225,0.072453,0.032065,-0.035964,0.057361,0.073211,0.031677,-0.036029
8,1,15,13,3,7,1,-1,15,0,0.073290,0.057288,0.073290,0.031928,-0.036297,0.057360,0.072935,0.031787,-0.036132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34480,4997,10,4,6,9,0,10,6,0,9.665089,9.665089,7.087432,5.057520,6.142529,6.756205,5.681049,6.925500,8.358397
34481,4998,11,14,8,15,2,-1,10,0,5.284176,1.598961,0.085791,5.284176,3.155251,3.115969,-2.888023,7.756437,4.292966
34482,4998,10,14,8,15,2,-1,9,0,7.754542,3.096023,-2.874245,7.754542,4.265141,5.813448,5.861845,10.081732,6.863310
34483,4998,9,14,8,15,2,10,8,0,10.080821,5.794363,5.884357,10.080821,6.834452,7.030609,7.304423,8.274137,6.244328


In [27]:
def foo(x):
    m = pd.Series.mode(x)
    if len(m) == 1: 
        return m

df = df.groupby(['Current_State', 'Next_State', 'Pit_Position', 'Goal_Position', 'Wall_Position', 'Q_Max']).Action.apply(foo).reset_index(level=1, drop=True).reset_index()

In [41]:
df

Current_State  Next_State  Pit_Position  Goal_Position  Wall_Position  Q_Max       
0              0           1             5              8              -5.488395  0    0
                                                                       -4.943810  0    0
                                                                       -4.491105  0    0
                                                                       -4.184905  0    0
                                                                       -4.030087  0    0
                                                                                      ..
15             15          14            13             3               4.189399  0    1
                                                        11              1.650604  0    0
                                                                        1.773155  0    0
                                                                        1.784922  0    0
                          

In [42]:
df = df.drop_duplicates(keep='last',subset=['Current_State', 'Pit_Position', 'Goal_Position', 'Wall_Position'])
df

TypeError: drop_duplicates() got an unexpected keyword argument 'subset'

In [23]:
print(j)

199


In [24]:
df.describe()

Unnamed: 0,Current_State,Pit_Position,Goal_Position,Wall_Position,Q_Max,level_5,Action
count,12939.0,12939.0,12939.0,12939.0,12939.0,12939.0,12939.0
mean,7.488909,7.550815,7.45985,7.474689,6.259128,0.0,1.602983
std,4.240537,4.553416,4.726872,4.57799,3.499149,0.0,1.08131
min,0.0,0.0,0.0,0.0,-19.146124,0.0,0.0
25%,4.0,4.0,3.0,4.0,4.578652,0.0,1.0
50%,8.0,8.0,7.0,7.0,6.797669,0.0,1.0
75%,11.0,12.0,12.0,11.0,8.714315,0.0,3.0
max,15.0,15.0,15.0,15.0,14.948278,0.0,3.0


In [25]:
df.drop('level_5', axis=1, inplace=True)

In [26]:
df.to_csv('For Offline RL.csv')

In [27]:
df = df.reset_index(drop=True)

In [28]:
df

Unnamed: 0,Current_State,Pit_Position,Goal_Position,Wall_Position,Q_Max,Action
0,0,9,8,4,3.868886,0
1,0,11,10,6,-7.845963,0
2,0,1,2,6,4.032922,3
3,0,1,11,2,3.295507,3
4,0,1,12,14,6.359116,3
...,...,...,...,...,...,...
12934,15,14,9,6,-3.075736,1
12935,15,14,10,2,6.072558,1
12936,15,14,12,10,1.508140,1
12937,15,14,13,3,4.189399,1


In [29]:
df.describe()

Unnamed: 0,Current_State,Pit_Position,Goal_Position,Wall_Position,Q_Max,Action
count,12939.0,12939.0,12939.0,12939.0,12939.0,12939.0
mean,7.488909,7.550815,7.45985,7.474689,6.259128,1.602983
std,4.240537,4.553416,4.726872,4.57799,3.499149,1.08131
min,0.0,0.0,0.0,0.0,-19.146124,0.0
25%,4.0,4.0,3.0,4.0,4.578652,1.0
50%,8.0,8.0,7.0,7.0,6.797669,1.0
75%,11.0,12.0,12.0,11.0,8.714315,3.0
max,15.0,15.0,15.0,15.0,14.948278,3.0


In [30]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [31]:
df["Current_State"].size

12939

In [32]:

w, h = 4, df["Current_State"].size
sample_list = [[0 for x in range(w)] for y in range(h)] 

for i in range(0,df["Current_State"].size):
    sample_list[i][0] = df["Current_State"][i] 
    sample_list[i][1] = df["Pit_Position"][i]
    sample_list[i][2] = df["Goal_Position"][i]
    sample_list[i][3] = df["Wall_Position"][i]

In [33]:
sample_list

[[0, 9, 8, 4],
 [0, 11, 10, 6],
 [0, 1, 2, 6],
 [0, 1, 11, 2],
 [0, 1, 12, 14],
 [0, 1, 14, 12],
 [0, 2, 3, 6],
 [0, 2, 3, 11],
 [0, 2, 3, 15],
 [0, 2, 6, 13],
 [0, 2, 9, 5],
 [0, 2, 13, 7],
 [0, 3, 1, 6],
 [0, 3, 1, 8],
 [0, 3, 1, 12],
 [0, 3, 1, 13],
 [0, 3, 5, 6],
 [0, 3, 5, 8],
 [0, 3, 5, 13],
 [0, 3, 9, 2],
 [0, 3, 10, 5],
 [0, 3, 10, 7],
 [0, 3, 11, 9],
 [0, 3, 11, 10],
 [0, 3, 11, 13],
 [0, 3, 13, 2],
 [0, 3, 13, 11],
 [0, 3, 13, 12],
 [0, 3, 14, 7],
 [0, 3, 15, 13],
 [0, 4, 2, 7],
 [0, 4, 5, 7],
 [0, 4, 5, 8],
 [0, 4, 6, 11],
 [0, 4, 6, 13],
 [0, 4, 6, 14],
 [0, 4, 7, 5],
 [0, 4, 7, 12],
 [0, 4, 9, 15],
 [0, 4, 11, 10],
 [0, 4, 11, 15],
 [0, 4, 12, 2],
 [0, 4, 12, 7],
 [0, 4, 13, 15],
 [0, 4, 14, 11],
 [0, 4, 14, 12],
 [0, 4, 15, 14],
 [0, 5, 1, 3],
 [0, 5, 1, 9],
 [0, 5, 1, 13],
 [0, 5, 1, 14],
 [0, 5, 2, 7],
 [0, 5, 2, 10],
 [0, 5, 2, 11],
 [0, 5, 3, 6],
 [0, 5, 3, 7],
 [0, 5, 3, 12],
 [0, 5, 3, 13],
 [0, 5, 4, 14],
 [0, 5, 6, 7],
 [0, 5, 6, 12],
 [0, 5, 6, 15],
 [0, 5, 7, 10

In [34]:
df["State_current"] = sample_list
df["State_next"] = sample_list1

NameError: name 'sample_list1' is not defined

In [None]:

df

In [None]:
df

In [None]:
np.asarray(df["State_next"])

In [None]:
df

In [None]:
X = df[['Current_State','Pit_Position', 'Goal_Position', 'Wall_Position']].values
y = df[['Q_Max']].values

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/3, random_state = 0)

In [None]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

In [None]:
ind = np.argwhere(y_pred==np.amax(y_pred,1, keepdims=True))
index = []
for i in ind:
    index.append(i[1])
index

In [None]:
y_pred

In [None]:
y_test

In [None]:
ind = np.argwhere(y_test==np.amax(y_test,1, keepdims=True))
index2 = []
for i in ind:
    index2.append(i[1])
index2

In [None]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_pred)

In [None]:
from sklearn.metrics import mean_absolute_error as mae
mae(y_test, y_pred)

In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
rmse

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

In [None]:
from sklearn.metrics import accuracy_score
print(accuracy_score( index2, index))

In [None]:
X

In [None]:
y

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

model.fit(X_train, y_train.ravel())
y_pred = model.predict(X_test)

In [None]:
y_test

In [None]:
y_pred