In [1]:
import numpy as np
import random
import sys

def randPair(s,e):
    """Draw two independent random integers from the half-open range [s, e).

    Returns them as a 2-tuple, drawn in order (first element first).
    """
    first = np.random.randint(s, e)
    second = np.random.randint(s, e)
    return first, second

class BoardPiece:
    """A single named piece that occupies one cell of the board."""

    def __init__(self, name, code, pos):
        # name: identifier of the piece
        # code: character drawn on the rendered board
        # pos:  2-tuple board coordinate, e.g. (1,4)
        self.name, self.code, self.pos = name, code, pos

class BoardMask:
    """A named set of board cells, stored as an array whose non-zero entries are the members."""

    def __init__(self, name, mask, code):
        self.name, self.mask, self.code = name, mask, code

    def get_positions(self):
        """Return the indices of the non-zero mask entries as a tuple of index arrays."""
        occupied = np.nonzero(self.mask)
        return occupied

def zip_positions2d(positions):
    """Pair up a (first_indices, second_indices) tuple into a list of 2-tuples.

    `positions` must unpack into exactly two iterables, such as the result of
    `np.nonzero` on a 2D array.
    """
    first_axis, second_axis = positions
    return [(a, b) for a, b in zip(first_axis, second_axis)]

class GridBoard:
    """Square board that tracks named pieces and named masks.

    Pieces live in `self.components` (name -> BoardPiece) and occupy a single
    cell each. Masks live in `self.masks` (name -> BoardMask) and mark a set of
    cells that pieces are not allowed to move onto.
    """

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4
        self.components = {} #name : board piece
        self.masks = {} #name : board mask

    def addPiece(self, name, code, pos=(0,0)):
        """Create a piece and register it under `name` (replacing any existing one)."""
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        """Register a mask under `name`."""
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        """Move piece `name` to `pos` unless `pos` is covered by any mask.

        A blocked move is silently ignored; the piece keeps its position.
        """
        for mask in self.masks.values():
            if pos in zip_positions2d(mask.get_positions()):
                return
        self.components[name].pos = pos

    def delPiece(self, name):
        """Remove piece `name` from the board.

        Raises KeyError if no piece with that name exists.
        """
        # Use the argument, not the literal string 'name'.
        del self.components[name]

    def render(self):
        """Return a (size, size) array of strings: piece/mask codes, ' ' for empty cells.

        Masks are drawn after pieces, so a mask code wins on a shared cell.
        """
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for piece in self.components.values():
            displ_board[piece.pos] = piece.code

        for mask in self.masks.values():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        """Return a uint8 array of shape (n_layers, size, size) with one layer per piece/mask.

        Piece layers come first (in insertion order of `self.components`),
        followed by one layer per mask; occupied cells are set to 1.
        """
        num_pieces = len(self.components) + len(self.masks)
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        for piece in self.components.values():
            pos = (layer,) + piece.pos
            displ_board[pos] = 1
            layer += 1

        for mask in self.masks.values():
            # Draw the mask being iterated (not a hard-coded 'boundary' entry).
            x, y = mask.get_positions()
            displ_board[layer, x, y] = 1
            layer += 1
        return displ_board

def addTuple(a,b):
    """Element-wise sum of two sequences, returned as a tuple.

    The result is as long as the shorter input.
    """
    return tuple(map(sum, zip(a, b)))

In [2]:
class Gridworld:
    """Gridworld game with four pieces: Player, Goal, Pit and Wall.

    `mode` selects the initial layout:
      * 'static' - every piece is placed at a fixed position
      * 'player' - fixed Goal/Pit/Wall, random Player
      * anything else - all four pieces are placed randomly
    Boards smaller than 4 are replaced by a 4 x 4 board.
    """

    def __init__(self, size=4, mode='static'):
        if size >= 4:
            self.board = GridBoard(size=size)
        else:
            print("Minimum board size is 4. Initialized to size 4.")
            self.board = GridBoard(size=4)

        #Add pieces, positions will be updated later
        self.board.addPiece('Player','P',(0,0))
        self.board.addPiece('Goal','+',(1,0))
        self.board.addPiece('Pit','-',(2,0))
        self.board.addPiece('Wall','W',(3,0))

        if mode == 'static':
            self.initGridStatic()
        elif mode == 'player':
            self.initGridPlayer()
        else:
            self.initGridRand()

    #Initialize stationary grid, all items are placed deterministically
    def initGridStatic(self):
        """Place all four pieces at their fixed positions."""
        #Setup static pieces
        self.board.components['Player'].pos = (0,3) #Row, Column
        self.board.components['Goal'].pos = (0,0)
        self.board.components['Pit'].pos = (0,1)
        self.board.components['Wall'].pos = (1,1)

    #Check if board is initialized appropriately (no overlapping pieces)
    #also remove impossible-to-win boards
    def validateBoard(self):
        """Return True if the current layout is acceptable, False otherwise.

        A layout is rejected when two pieces share a cell, or when the Player
        or the Goal sits in a corner and either of them has no neighbouring
        cell that `validateMove` reports as valid (outcome 0).
        """
        player = self.board.components['Player']
        goal = self.board.components['Goal']
        wall = self.board.components['Wall']
        pit = self.board.components['Pit']

        all_positions = [player.pos, goal.pos, wall.pos, pit.pos]
        if len(all_positions) > len(set(all_positions)):
            return False

        # Valid indices run from 0 to size-1, so the far corners are at size-1
        # (using `size` here would never match any real position).
        last = self.board.size - 1
        corners = [(0,0), (0,last), (last,0), (last,last)]
        #if player is in corner, can it move? if goal is in corner, is it blocked?
        if player.pos in corners or goal.pos in corners:
            steps = [(0,1),(1,0),(-1,0),(0,-1)]
            val_move_pl = [self.validateMove('Player', addpos) for addpos in steps]
            val_move_go = [self.validateMove('Goal', addpos) for addpos in steps]
            if 0 not in val_move_pl or 0 not in val_move_go:
                return False

        return True

    #Initialize player in random location, but keep wall, goal and pit stationary
    def initGridPlayer(self):
        """Use the static layout but re-draw the Player position until the board is valid."""
        # Loop instead of recursing so repeated rejections cannot exhaust the call stack.
        while True:
            self.initGridStatic()
            #place player
            self.board.components['Player'].pos = randPair(0,self.board.size)
            if self.validateBoard():
                break

    #Initialize grid so that goal, pit, wall, player are all randomly placed
    def initGridRand(self):
        """Re-draw all four piece positions at random until the board is valid."""
        # Loop instead of recursing so repeated rejections cannot exhaust the call stack.
        while True:
            self.board.components['Player'].pos = randPair(0,self.board.size)
            self.board.components['Goal'].pos = randPair(0,self.board.size)
            self.board.components['Pit'].pos = randPair(0,self.board.size)
            self.board.components['Wall'].pos = randPair(0,self.board.size)
            if self.validateBoard():
                break

    def validateMove(self, piece, addpos=(0,0)):
        """Classify moving `piece` by the offset `addpos`.

        Returns 0 for a valid move, 1 for a blocked move (wall or off the
        board) and 2 for a move onto the pit.
        """
        outcome = 0 #0 is valid, 1 invalid, 2 lost game
        pit = self.board.components['Pit'].pos
        wall = self.board.components['Wall'].pos
        new_pos = addTuple(self.board.components[piece].pos, addpos)
        if new_pos == wall:
            outcome = 1 #block move, player can't move to wall
        elif max(new_pos) > (self.board.size-1):    #if outside bounds of board
            outcome = 1
        elif min(new_pos) < 0: #if outside bounds
            outcome = 1
        elif new_pos == pit:
            outcome = 2

        return outcome

    def makeMove(self, action):
        """Move the Player one cell for action 'u', 'd', 'l' or 'r'.

        Blocked moves and unknown actions leave the board unchanged. Moving
        onto the pit is allowed (the loss shows up through `reward`).
        """
        #actions in {u,d,l,r}
        offsets = {
            'u': (-1,0), #up
            'd': (1,0),  #down
            'l': (0,-1), #left
            'r': (0,1),  #right
        }
        addpos = offsets.get(action)
        if addpos is None:
            return
        if self.validateMove('Player', addpos) in [0,2]:
            new_pos = addTuple(self.board.components['Player'].pos, addpos)
            self.board.movePiece('Player', new_pos)

    def reward(self):
        """Return -10 if the Player is on the Pit, 10 if on the Goal, otherwise -1."""
        if (self.board.components['Player'].pos == self.board.components['Pit'].pos):
            return -10
        elif (self.board.components['Player'].pos == self.board.components['Goal'].pos):
            return 10
        else:
            return -1

    def display(self):
        """Return the character rendering of the board."""
        return self.board.render()

In [3]:
# Maps the network's output index to a move letter: 0 -> up, 1 -> down, 2 -> left, 3 -> right.
action_set = dict(enumerate(('u', 'd', 'l', 'r')))

In [4]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

# Layer widths: l1 inputs -> l2 -> l3 -> l4 outputs.
l1, l2, l3, l4 = 64, 200, 120, 4

# Fully connected network with two ReLU hidden layers.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=l1, out_features=l2),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=l2, out_features=l3),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=l3, out_features=l4),
)

# Mean-squared-error loss, optimized with Adam.
learning_rate = 1e-3
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9    # discount factor used in the training target
epsilon = 0.3  # probability of taking a random action during training

In [5]:
from collections import deque
epochs = 5000
losses = []
mem_size = 1000 #A maximum number of experiences kept in the replay buffer
batch_size = 200 #B number of experiences sampled per training step
replay = deque(maxlen=mem_size) #C oldest experiences are dropped automatically
max_moves = 50 #D an episode is cut off once it exceeds this many moves

# Logging tables. They are grown one row at a time instead of being
# pre-allocated with millions of zero rows: that wasted memory and could
# overflow (`experience` may receive up to 500 * 51 * 200 rows, more than the
# 3,000,000 that used to be reserved). Downstream cells that drop all-zero
# rows still work; they simply find nothing to drop.
w = 18 # columns per `matrix` row (one row per environment step)
matrix = []

e = 10 # columns per `experience` row (one row per sampled minibatch item)
experience = []

u = 7 # columns per `q_table` row (one row per step and action)
q_table = []
q_counter = 0 # number of rows in q_table

counter = 0 # number of rows in matrix (total environment steps)
num = 0 # number of rows in experience
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    # Flattened board plus a little noise.
    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1):
        mov += 1
        qval = model(state1) #E Q-values of the current state
        qval_ = qval.data.numpy()

        # One `matrix` row per step; columns match the names used when the
        # table is turned into a DataFrame later in the notebook.
        row = [0] * w
        row[9] = qval_[0][np.argmax(qval_)]
        row[10] = qval_[0][0]
        row[11] = qval_[0][1]
        row[12] = qval_[0][2]
        row[13] = qval_[0][3]

        # One `q_table` row per action for the current state.
        for k in range(0,4):
            q_table.append([
                i,
                (str(game.board.components['Player'].pos)),
                (str(game.board.components['Pit'].pos)),
                (str(game.board.components['Goal'].pos)),
                (str(game.board.components['Wall'].pos)),
                k,
                qval_[0][k],
            ])
            q_counter+=1

        if (random.random() < epsilon): #F epsilon-greedy action selection
            row[8] = 1
            action_ = np.random.randint(0,4)
        else:
            row[8] = 0
            action_ = np.argmax(qval_)

        action = action_set[action_]

        row[0] = i
        row[1] = (str(game.board.components['Player'].pos))
        row[2] = (str(game.board.components['Pit'].pos))
        row[3] = (str(game.board.components['Goal'].pos))
        row[4] = (str(game.board.components['Wall'].pos))
        row[5] = action_

        game.makeMove(action)
        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
        state2 = torch.from_numpy(state2_).float()
        # Q-values of the next state, computed for logging only.
        next_state = model(state2)
        next_state_ = next_state.data.numpy()
        row[14] = next_state_[0][0]
        row[15] = next_state_[0][1]
        row[16] = next_state_[0][2]
        row[17] = next_state_[0][3]
        reward = game.reward()
        row[6] = reward
        # Any reward other than -1 ends the episode (see #O), so both the goal
        # and the pit are terminal. Marking only wins as done made the target
        # for pit transitions bootstrap from a state that has no future.
        done = reward != -1
        row[7] = (str(game.board.components['Player'].pos))
        exp =  (state1, action_, reward, state2, done) #G
        replay.append(exp) #H
        state1 = state2

        if len(replay) > batch_size: #I train only once the buffer holds more than one batch
            minibatch = random.sample(replay, batch_size) #J
            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])

            Q1 = model(state1_batch) #L
            with torch.no_grad():
                Q2 = model(state2_batch) #M

            # Target: reward plus discounted best next Q-value, zeroed for terminal transitions.
            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
            # Prediction: Q-value of the action that was actually taken.
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            loss = loss_fn(X, Y.detach())
            print(i)
            if (i >= 4500):
                # Record every sampled transition of the last epochs.
                for j in range (0, batch_size):
                    experience.append([
                        i,
                        state1_batch[j].numpy(),
                        action_batch[j].item(),
                        reward_batch[j].item(),
                        state2_batch[j].numpy(),
                        done_batch[j].item(),
                        Q1[j].detach().numpy(),
                        Q2[j].detach().numpy(),
                        Y[j].item(),
                        X[j].item(),
                    ])
                    num += 1
            clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()
        matrix.append(row)
        counter += 1
        if reward != -1 or mov > max_moves: #O episode over: goal/pit reached or move limit exceeded
            status = 0
            mov = 0
losses = np.array(losses)

4999


In [6]:
# Preview only the first rows; echoing the whole table floods the notebook output.
q_table[:16]

[[0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 0, -0.06512834],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 1, -0.0845169],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 2, 0.05415383],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 3, 0.09179444],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 0, -0.065415725],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 1, -0.08441857],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 2, 0.05396694],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 3, 0.092020586],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 0, -0.0650423],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 1, -0.08407344],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 2, 0.054280397],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 3, 0.092051096],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 0, -0.06542055],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 1, -0.08404404],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 2, 0.054130096],
 [0, '(2, 0)', '(2, 2)', '(3, 0)', '(2, 1)', 3, 0.09203468],
 [0, '(2, 0)'

In [7]:
import pandas as pd
# Tabulate the per-action Q-value log and keep only rows with at least one non-zero entry.
column_names = [
    "Epochs",
    "Current_State",
    "Pit_Position",
    "Goal_Position",
    "Wall_Position",
    "Action",
    "Q_value",
]
df = pd.DataFrame(q_table, columns=column_names)
df = df[df.ne(0).any(axis=1)]
df

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Q_value
0,0,"(2, 0)","(2, 2)","(3, 0)","(2, 1)",0,-0.065128
1,0,"(2, 0)","(2, 2)","(3, 0)","(2, 1)",1,-0.084517
2,0,"(2, 0)","(2, 2)","(3, 0)","(2, 1)",2,0.054154
3,0,"(2, 0)","(2, 2)","(3, 0)","(2, 1)",3,0.091794
4,0,"(2, 0)","(2, 2)","(3, 0)","(2, 1)",0,-0.065416
...,...,...,...,...,...,...,...
132255,4999,"(2, 1)","(3, 3)","(3, 2)","(3, 1)",3,8.909819
132256,4999,"(2, 2)","(3, 3)","(3, 2)","(3, 1)",0,3.301888
132257,4999,"(2, 2)","(3, 3)","(3, 2)","(3, 1)",1,9.178448
132258,4999,"(2, 2)","(3, 3)","(3, 2)","(3, 1)",2,5.485474


In [8]:
# Inspect a single logged row of `experience` (index 2).
experience[2]

[4500,
 array([7.7631292e-03, 4.2517977e-03, 8.8438811e-03, 4.1921209e-03,
        7.0260494e-04, 6.5940199e-03, 9.4555458e-03, 3.2473539e-03,
        3.4193119e-03, 6.3995942e-03, 3.1324025e-04, 1.0063483e+00,
        4.7888085e-03, 3.6719581e-03, 5.5684294e-03, 2.6754129e-03,
        3.9946954e-03, 5.6048427e-03, 6.0588792e-03, 6.0100281e-03,
        8.0156315e-04, 2.2768923e-03, 1.5409586e-03, 4.0452266e-03,
        4.9272585e-03, 5.1835612e-03, 5.2264179e-03, 6.6643874e-03,
        3.6845654e-03, 7.8712907e-03, 4.1357232e-03, 1.0010842e+00,
        9.0588219e-03, 2.9065479e-03, 5.2964962e-03, 7.1283523e-03,
        9.2062363e-03, 7.6659792e-03, 2.7577614e-03, 6.9696540e-03,
        1.0363866e-03, 7.4231522e-03, 2.5033187e-03, 6.4283349e-03,
        1.0047539e+00, 1.5868293e-03, 4.9389568e-03, 4.0099057e-03,
        4.3813596e-03, 4.6467106e-03, 1.0047823e+00, 1.9141302e-03,
        7.8359935e-03, 8.0491146e-03, 7.6686023e-03, 8.1224255e-03,
        2.8502767e-03, 9.3399379e-03, 6.9

In [9]:
# Preview only the first rows; echoing the whole table floods the notebook output.
matrix[:5]

[[0,
  '(2, 0)',
  '(2, 2)',
  '(3, 0)',
  '(2, 1)',
  3,
  -1,
  '(2, 0)',
  0,
  0.09179444,
  -0.06512834,
  -0.0845169,
  0.05415383,
  0.09179444,
  -0.065415725,
  -0.08441857,
  0.05396694,
  0.092020586],
 [0,
  '(2, 0)',
  '(2, 2)',
  '(3, 0)',
  '(2, 1)',
  3,
  -1,
  '(2, 0)',
  0,
  0.092020586,
  -0.065415725,
  -0.08441857,
  0.05396694,
  0.092020586,
  -0.0650423,
  -0.08407344,
  0.054280397,
  0.092051096],
 [0,
  '(2, 0)',
  '(2, 2)',
  '(3, 0)',
  '(2, 1)',
  3,
  -1,
  '(2, 0)',
  0,
  0.092051096,
  -0.0650423,
  -0.08407344,
  0.054280397,
  0.092051096,
  -0.06542055,
  -0.08404404,
  0.054130096,
  0.09203468],
 [0,
  '(2, 0)',
  '(2, 2)',
  '(3, 0)',
  '(2, 1)',
  3,
  -1,
  '(2, 0)',
  0,
  0.09203468,
  -0.06542055,
  -0.08404404,
  0.054130096,
  0.09203468,
  -0.0655403,
  -0.084577076,
  0.054153644,
  0.09247054],
 [0,
  '(2, 0)',
  '(2, 2)',
  '(3, 0)',
  '(2, 1)',
  3,
  -1,
  '(2, 0)',
  0,
  0.09247054,
  -0.0655403,
  -0.084577076,
  0.054153644,
  

In [10]:
import pandas as pd
# Tabulate the replay samples logged in `experience`, one column per logged field.
column_names = [
    "Epochs",
    "Current_State",
    "Action",
    "Reward",
    "Next_State",
    "Done_Boolean",
    "Q-Current",
    "Q-Next",
    "Y",
    "X",
]
df = pd.DataFrame(data=experience, columns=column_names)


In [11]:
# Keep only the rows whose Epochs value is non-zero.
df = df.loc[df['Epochs'].ne(0)]

In [12]:
# Display the current `df`.
df

Unnamed: 0,Epochs,Current_State,Action,Reward,Next_State,Done_Boolean,Q-Current,Q-Next,Y,X
0,4500,"[0.0050391452, 0.0012472264, 0.00079970376, 0....",0.0,-1.0,"[0.007370607, 0.0059509976, 0.008966369, 0.002...",0.0,"[6.9700756, 5.5493155, 3.1397123, 5.2080145]","[4.8217955, 3.700978, 5.7583013, 8.1700735]",6.353066,6.970076
1,4500,"[0.003955518, 0.004775499, 0.0024701217, 0.003...",0.0,10.0,"[1.0060511, 0.005905763, 0.0078038787, 0.00401...",1.0,"[9.971596, 5.651679, 8.977305, 7.321216]","[7.9829597, 7.489813, 8.365583, 7.2007494]",10.000000,9.971596
2,4500,"[0.007763129, 0.0042517977, 0.008843881, 0.004...",3.0,-1.0,"[0.00838205, 0.0019965447, 0.0069523277, 0.006...",0.0,"[6.1132336, 10.292167, 6.407116, 8.146861]","[6.073174, 10.2623, 6.4127407, 8.118126]",8.236070,8.146861
3,4500,"[0.003546124, 0.001351108, 0.0015873745, 0.008...",2.0,-1.0,"[0.009520578, 0.008496764, 0.004745849, 0.0096...",0.0,"[0.8139766, 6.5675917, 8.22075, 3.7249482]","[5.7074895, 10.056242, 6.677065, 4.709913]",8.050617,8.220750
4,4500,"[0.005662606, 0.0058006146, 0.00512436, 0.0056...",0.0,-1.0,"[0.0059546134, 0.005328042, 0.0058663245, 0.00...",0.0,"[6.3577538, 4.6830287, 10.098886, 6.5953164]","[4.660234, 7.9770474, 3.9066114, 3.5302372]",6.179343,6.357754
...,...,...,...,...,...,...,...,...,...,...
448395,4999,"[0.0062368326, 0.0054944614, 0.008802659, 0.00...",0.0,-1.0,"[0.0019541613, 0.0019006933, 0.007402317, 0.00...",0.0,"[7.6072435, 5.9327374, 2.4020913, 2.8861272]","[9.652337, 5.794551, 6.037756, 6.492542]",7.687103,7.607244
448396,4999,"[0.0015580722, 0.0096820295, 0.00056141656, 0....",1.0,-1.0,"[0.008244314, 0.0024049245, 0.007257939, 0.008...",0.0,"[1.29854, 0.29084328, 0.25390595, 1.2850877]","[1.4372032, 0.268999, 0.2551936, 1.3186085]",0.293483,0.290843
448397,4999,"[0.002617883, 0.0028160107, 0.0009678596, 0.00...",2.0,10.0,"[0.006577461, 0.0075890105, 0.007017012, 0.007...",1.0,"[6.9008703, 6.620319, 10.0191345, 5.4295993]","[6.6176634, 7.3596163, 8.886859, 8.207654]",10.000000,10.019135
448398,4999,"[0.006945565, 0.008005309, 0.0006289227, 0.003...",2.0,-1.0,"[0.009543014, 0.00082073105, 0.00035376285, 0....",0.0,"[2.8145716, 1.7802843, 2.8684247, 2.2560003]","[4.3692126, 2.740865, -0.47274107, 1.5714526]",2.932291,2.868425


In [13]:
def test_model(model, mode='static', display=True):
    i = 0
    test_game = Gridworld(mode=mode)
    state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
    state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    while(status == 1): #A
        qval = model(state)
        qval_ = qval.data.numpy()
        action_ = np.argmax(qval_) #B
        action = action_set[action_]
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
        state = torch.from_numpy(state_).float()
        if display:
            print(test_game.display())
        reward = test_game.reward()
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [14]:
# Estimate the win rate on random boards: play `n_rounds` rounds of
# `max_games` games each and average the per-round win fractions.
n_rounds = 10
max_games = 1000
win_num = 0 # running sum of per-round win fractions
for round_idx in range(n_rounds):
    wins = 0
    # A separate index name keeps the inner loop from overwriting the outer one.
    for game_idx in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
# Divide by the same round count the loop used.
win_num = win_num / n_rounds
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 913
0.913
0.913
Games played: 1000, # of wins: 899
0.899
1.812
Games played: 1000, # of wins: 908
0.908
2.72
Games played: 1000, # of wins: 910
0.91
3.6300000000000003
Games played: 1000, # of wins: 924
0.924
4.554
Games played: 1000, # of wins: 911
0.911
5.465
Games played: 1000, # of wins: 913
0.913
6.378
Games played: 1000, # of wins: 914
0.914
7.292
Games played: 1000, # of wins: 917
0.917
8.209
Games played: 1000, # of wins: 919
0.919
9.128
Win percentage: 91.28%


In [36]:
import pandas as pd
# Tabulate the per-step log in `matrix` and keep only rows with at least one non-zero entry.
column_names = [
    "Epochs", "Current_State", "Pit_Position", "Goal_Position", "Wall_Position",
    "Action", "Reward", "Next_State", "Epsilon_Boolean",
    "Q_Max",
    "Q1_Current", "Q2_Current", "Q3_Current", "Q4_Current",
    "Q1_Next", "Q2_Next", "Q3_Next", "Q4_Next",
]
df = pd.DataFrame(data=matrix, columns=column_names)
df = df[df.ne(0).any(axis=1)]

In [37]:
# Re-code every "(row, col)" position string as a single cell number,
# row * 4 + col, so "(0, 0)" -> 0, "(0, 1)" -> 1, ..., "(3, 3)" -> 15.
# One mapping applied to each position column replaces the long list of
# copy-pasted per-value replace calls.
position_codes = {
    '({}, {})'.format(row, col): 4 * row + col
    for row in range(4)
    for col in range(4)
}
for position_column in ['Current_State', 'Next_State', 'Goal_Position', 'Pit_Position', 'Wall_Position']:
    df[position_column] = df[position_column].replace(position_codes)

In [38]:
# Display `df` after the position columns have been re-coded as numbers.
df

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,Q1_Current,Q2_Current,Q3_Current,Q4_Current,Q1_Next,Q2_Next,Q3_Next,Q4_Next
0,0,8,10,12,9,3,-1,8,0,0.091794,-0.065128,-0.084517,0.054154,0.091794,-0.065416,-0.084419,0.053967,0.092021
1,0,8,10,12,9,3,-1,8,0,0.092021,-0.065416,-0.084419,0.053967,0.092021,-0.065042,-0.084073,0.054280,0.092051
2,0,8,10,12,9,3,-1,8,0,0.092051,-0.065042,-0.084073,0.054280,0.092051,-0.065421,-0.084044,0.054130,0.092035
3,0,8,10,12,9,3,-1,8,0,0.092035,-0.065421,-0.084044,0.054130,0.092035,-0.065540,-0.084577,0.054154,0.092471
4,0,8,10,12,9,3,-1,8,0,0.092471,-0.065540,-0.084577,0.054154,0.092471,-0.065336,-0.084862,0.054133,0.091836
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33060,4999,2,15,14,13,1,-1,6,0,6.129316,3.491375,6.129316,2.507191,2.216899,3.118028,8.377428,3.581647,4.995778
33061,4999,6,15,14,13,2,-1,5,1,8.360590,3.091562,8.360590,3.576749,5.000494,2.115431,6.833839,4.550787,5.135643
33062,4999,5,15,14,13,1,-1,9,0,6.809101,2.090010,6.809101,4.536398,5.135134,2.390096,4.991859,4.299376,8.889229
33063,4999,9,15,14,13,3,-1,10,0,8.909819,2.361672,4.966386,4.306787,8.909819,3.343138,9.182156,5.461194,5.354160


In [39]:
# Write the current `df` to 'Online Dataset.csv' (relative path, so it lands in the working directory).
df.to_csv('Online Dataset.csv')

In [19]:
# Rows logged during the very first epoch (Epochs == 0).
px = df.loc[df["Epochs"].eq(0)]
px

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,Q1_Current,Q2_Current,Q3_Current,Q4_Current,Q1_Next,Q2_Next,Q3_Next,Q4_Next
0,0,8,10,12,9,3,-1,8,0,0.091794,-0.065128,-0.084517,0.054154,0.091794,-0.065416,-0.084419,0.053967,0.092021
1,0,8,10,12,9,3,-1,8,0,0.092021,-0.065416,-0.084419,0.053967,0.092021,-0.065042,-0.084073,0.05428,0.092051
2,0,8,10,12,9,3,-1,8,0,0.092051,-0.065042,-0.084073,0.05428,0.092051,-0.065421,-0.084044,0.05413,0.092035
3,0,8,10,12,9,3,-1,8,0,0.092035,-0.065421,-0.084044,0.05413,0.092035,-0.06554,-0.084577,0.054154,0.092471
4,0,8,10,12,9,3,-1,8,0,0.092471,-0.06554,-0.084577,0.054154,0.092471,-0.065336,-0.084862,0.054133,0.091836
5,0,8,10,12,9,3,-1,8,0,0.091836,-0.065336,-0.084862,0.054133,0.091836,-0.06535,-0.083599,0.053776,0.092189
6,0,8,10,12,9,1,10,12,1,0.092189,-0.06535,-0.083599,0.053776,0.092189,-0.05737,-0.080531,0.029166,0.101437


In [20]:
# Rows whose Epsilon_Boolean flag is 0 (the training loop sets it to 1 for random actions).
pc = df.loc[df["Epsilon_Boolean"].eq(0)]
pc

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,Q1_Current,Q2_Current,Q3_Current,Q4_Current,Q1_Next,Q2_Next,Q3_Next,Q4_Next
0,0,8,10,12,9,3,-1,8,0,0.091794,-0.065128,-0.084517,0.054154,0.091794,-0.065416,-0.084419,0.053967,0.092021
1,0,8,10,12,9,3,-1,8,0,0.092021,-0.065416,-0.084419,0.053967,0.092021,-0.065042,-0.084073,0.054280,0.092051
2,0,8,10,12,9,3,-1,8,0,0.092051,-0.065042,-0.084073,0.054280,0.092051,-0.065421,-0.084044,0.054130,0.092035
3,0,8,10,12,9,3,-1,8,0,0.092035,-0.065421,-0.084044,0.054130,0.092035,-0.065540,-0.084577,0.054154,0.092471
4,0,8,10,12,9,3,-1,8,0,0.092471,-0.065540,-0.084577,0.054154,0.092471,-0.065336,-0.084862,0.054133,0.091836
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33058,4998,2,13,3,9,3,10,3,0,10.648301,9.667162,8.104548,8.723117,10.648301,9.750537,7.236125,9.528656,9.365549
33060,4999,2,15,14,13,1,-1,6,0,6.129316,3.491375,6.129316,2.507191,2.216899,3.118028,8.377428,3.581647,4.995778
33062,4999,5,15,14,13,1,-1,9,0,6.809101,2.090010,6.809101,4.536398,5.135134,2.390096,4.991859,4.299376,8.889229
33063,4999,9,15,14,13,3,-1,10,0,8.909819,2.361672,4.966386,4.306787,8.909819,3.343138,9.182156,5.461194,5.354160


In [43]:
def foo(x):
    """Return the mode of ``x`` only when it is unambiguous.

    Parameters
    ----------
    x : pandas.Series
        Values to summarise (here: the actions recorded for one group).

    Returns
    -------
    pandas.Series or None
        A one-element Series holding the single most frequent value, or
        ``None`` when there is no unique mode (a tie, or no values at all).
    """
    modes = pd.Series.mode(x)
    if len(modes) != 1:
        # Zero or several candidates: there is no single winner to report.
        return None
    return modes

# For every distinct combination of the six key columns, keep the unambiguous
# most frequent Action (see foo). The first reset_index discards index level 1
# (Next_State); the second turns the remaining index levels back into columns.
# NOTE(review): this rebinds `df`, so the cell cannot be re-run on its own
# result — the traceback recorded below comes from such a re-run state.
df = (
    df.groupby(
        [
            'Current_State',
            'Next_State',
            'Pit_Position',
            'Goal_Position',
            'Wall_Position',
            'Q_Max',
        ]
    )
    .Action.apply(foo)
    .reset_index(level=1, drop=True)
    .reset_index()
)

AttributeError: 'SeriesGroupBy' object has no attribute 'Action'

In [41]:
df

Current_State  Next_State  Pit_Position  Goal_Position  Wall_Position  Q_Max       
0              0           1             5              8              -5.488395  0    0
                                                                       -4.943810  0    0
                                                                       -4.491105  0    0
                                                                       -4.184905  0    0
                                                                       -4.030087  0    0
                                                                                      ..
15             15          14            13             3               4.189399  0    1
                                                        11              1.650604  0    0
                                                                        1.773155  0    0
                                                                        1.784922  0    0
                          

In [42]:
# Keep a single row per (Current_State, Pit_Position, Goal_Position,
# Wall_Position) combination: the last one in the current row order.
# NOTE(review): the TypeError recorded below indicates `df` did not accept
# `subset` when this cell was last run (presumably it was a Series then) —
# this cell depends on the execution order of the cells around it.
df = df.drop_duplicates(
    subset=['Current_State', 'Pit_Position', 'Goal_Position', 'Wall_Position'],
    keep='last',
)
df

TypeError: drop_duplicates() got an unexpected keyword argument 'subset'

In [23]:
print(j)

199


In [24]:
df.describe()

Unnamed: 0,Current_State,Pit_Position,Goal_Position,Wall_Position,Q_Max,level_5,Action
count,12939.0,12939.0,12939.0,12939.0,12939.0,12939.0,12939.0
mean,7.488909,7.550815,7.45985,7.474689,6.259128,0.0,1.602983
std,4.240537,4.553416,4.726872,4.57799,3.499149,0.0,1.08131
min,0.0,0.0,0.0,0.0,-19.146124,0.0,0.0
25%,4.0,4.0,3.0,4.0,4.578652,0.0,1.0
50%,8.0,8.0,7.0,7.0,6.797669,0.0,1.0
75%,11.0,12.0,12.0,11.0,8.714315,0.0,3.0
max,15.0,15.0,15.0,15.0,14.948278,0.0,3.0


In [25]:
# Remove the `level_5` helper column (constant 0 in the describe() output above).
# Rebinding instead of mutating in place, together with errors='ignore', keeps
# this cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='level_5', errors='ignore')

In [26]:
# Persist the de-duplicated table to disk.
# NOTE(review): no `index=` argument is given, so the current index labels are
# written too (pandas default), and the index is only reset in the following
# cell — confirm the consumer of this file expects that extra column.
df.to_csv('For Offline RL.csv')

In [27]:
# Renumber the rows 0..n-1; drop=True discards the old index labels instead of
# inserting them as a new column.
df = df.reset_index(drop=True)

In [28]:
df

Unnamed: 0,Current_State,Pit_Position,Goal_Position,Wall_Position,Q_Max,Action
0,0,9,8,4,3.868886,0
1,0,11,10,6,-7.845963,0
2,0,1,2,6,4.032922,3
3,0,1,11,2,3.295507,3
4,0,1,12,14,6.359116,3
...,...,...,...,...,...,...
12934,15,14,9,6,-3.075736,1
12935,15,14,10,2,6.072558,1
12936,15,14,12,10,1.508140,1
12937,15,14,13,3,4.189399,1


In [29]:
df.describe()

Unnamed: 0,Current_State,Pit_Position,Goal_Position,Wall_Position,Q_Max,Action
count,12939.0,12939.0,12939.0,12939.0,12939.0,12939.0
mean,7.488909,7.550815,7.45985,7.474689,6.259128,1.602983
std,4.240537,4.553416,4.726872,4.57799,3.499149,1.08131
min,0.0,0.0,0.0,0.0,-19.146124,0.0
25%,4.0,4.0,3.0,4.0,4.578652,1.0
50%,8.0,8.0,7.0,7.0,6.797669,1.0
75%,11.0,12.0,12.0,11.0,8.714315,3.0
max,15.0,15.0,15.0,15.0,14.948278,3.0


In [30]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [31]:
df["Current_State"].size

12939

In [32]:

# Build one [Current_State, Pit_Position, Goal_Position, Wall_Position] list
# per row of df. Selecting the four columns and converting them in one step
# works for any index (no reliance on labels being exactly 0..n-1) and avoids
# a Python-level loop with one scalar lookup per cell.
# NOTE(review): assumes the four columns share an integer dtype, as the
# displayed table suggests; mixed dtypes would be upcast by `.values`.
sample_list = df[
    ["Current_State", "Pit_Position", "Goal_Position", "Wall_Position"]
].values.tolist()

In [33]:
sample_list

[[0, 9, 8, 4],
 [0, 11, 10, 6],
 [0, 1, 2, 6],
 [0, 1, 11, 2],
 [0, 1, 12, 14],
 [0, 1, 14, 12],
 [0, 2, 3, 6],
 [0, 2, 3, 11],
 [0, 2, 3, 15],
 [0, 2, 6, 13],
 [0, 2, 9, 5],
 [0, 2, 13, 7],
 [0, 3, 1, 6],
 [0, 3, 1, 8],
 [0, 3, 1, 12],
 [0, 3, 1, 13],
 [0, 3, 5, 6],
 [0, 3, 5, 8],
 [0, 3, 5, 13],
 [0, 3, 9, 2],
 [0, 3, 10, 5],
 [0, 3, 10, 7],
 [0, 3, 11, 9],
 [0, 3, 11, 10],
 [0, 3, 11, 13],
 [0, 3, 13, 2],
 [0, 3, 13, 11],
 [0, 3, 13, 12],
 [0, 3, 14, 7],
 [0, 3, 15, 13],
 [0, 4, 2, 7],
 [0, 4, 5, 7],
 [0, 4, 5, 8],
 [0, 4, 6, 11],
 [0, 4, 6, 13],
 [0, 4, 6, 14],
 [0, 4, 7, 5],
 [0, 4, 7, 12],
 [0, 4, 9, 15],
 [0, 4, 11, 10],
 [0, 4, 11, 15],
 [0, 4, 12, 2],
 [0, 4, 12, 7],
 [0, 4, 13, 15],
 [0, 4, 14, 11],
 [0, 4, 14, 12],
 [0, 4, 15, 14],
 [0, 5, 1, 3],
 [0, 5, 1, 9],
 [0, 5, 1, 13],
 [0, 5, 1, 14],
 [0, 5, 2, 7],
 [0, 5, 2, 10],
 [0, 5, 2, 11],
 [0, 5, 3, 6],
 [0, 5, 3, 7],
 [0, 5, 3, 12],
 [0, 5, 3, 13],
 [0, 5, 4, 14],
 [0, 5, 6, 7],
 [0, 5, 6, 12],
 [0, 5, 6, 15],
 [0, 5, 7, 10

In [34]:
# Store each row's feature list in a new column.
df["State_current"] = sample_list
# FIXME(review): `sample_list1` is not defined in any cell visible here and the
# recorded output of this cell is a NameError — presumably the cell that built
# it was deleted. Restore that cell or remove this assignment.
df["State_next"] = sample_list1

NameError: name 'sample_list1' is not defined

In [None]:

df

In [None]:
df

In [None]:
np.asarray(df["State_next"])

In [None]:
df

In [None]:
# Feature matrix: the four state/position columns, one row per sample.
X = df[['Current_State','Pit_Position', 'Goal_Position', 'Wall_Position']].values
# Target: Q_Max, selected with a one-element column list so y stays 2-D (n, 1).
y = df[['Q_Max']].values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out one third of the rows for testing; random_state is pinned so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/3, random_state = 0)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit of the target on the training features.
regressor = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows, compared against y_test in later cells.
y_pred = regressor.predict(X_test)

In [None]:
# Column index of the largest value in each row of y_pred.
# np.argmax returns exactly one index per row (the first one on ties), so the
# resulting list always has one entry per row and lines up row-for-row with
# the equivalent list built from y_test.
# NOTE(review): if y_pred has a single column (y is built from Q_Max alone),
# every entry is 0 — confirm whether a multi-column target was intended.
index = np.argmax(y_pred, axis=1).tolist()
index

In [None]:
y_pred

In [None]:
y_test

In [None]:
# Column index of the largest value in each row of y_test.
# np.argmax returns exactly one index per row (the first one on ties), so the
# resulting list always has one entry per row and lines up row-for-row with
# the equivalent list built from y_pred.
# NOTE(review): if y_test has a single column (y is built from Q_Max alone),
# every entry is 0 — confirm whether a multi-column target was intended.
index2 = np.argmax(y_test, axis=1).tolist()
index2

In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute difference between the held-out targets and the predictions.
mean_absolute_error(y_test, y_pred)

In [None]:
from sklearn.metrics import mean_absolute_error as mae
# NOTE(review): same function and same arguments as the preceding cell, only
# imported under the alias `mae` — this cell looks redundant.
mae(y_test, y_pred)

In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error
# Root-mean-squared error: square root of the mean squared error, so the value
# is in the same units as the target.
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
rmse

In [None]:
from sklearn.metrics import r2_score
# Coefficient of determination (R^2) of the predictions on the held-out rows.
r2_score(y_test, y_pred)

In [None]:
from sklearn.metrics import accuracy_score
# Share of rows where the per-row argmax column of y_test (index2) matches that
# of y_pred (index).
# NOTE(review): with a single-column target both lists can only contain 0, so
# this score would be trivially 1.0 — confirm the metric is meaningful here.
print(accuracy_score( index2, index))

In [None]:
X

In [None]:
y

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the feature columns; X is overwritten with the scaled values.
# NOTE(review): the scaler is fitted on all of X before the train/test split in
# the next cell, so the held-out rows contribute to the scaling statistics —
# consider fitting on X_train only and applying transform() to X_test.
sc = StandardScaler()
X = sc.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split
# Re-split the (now scaled) X with 20% of the rows held out; random_state is
# pinned so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
from sklearn.linear_model import LogisticRegression
# NOTE(review): y is built from the Q_Max column, whose values in the tables
# above are floating-point, while LogisticRegression is a classifier that
# expects discrete class labels. Fitting is expected to be rejected for a
# continuous target — confirm the intended target (perhaps the Action column).
model = LogisticRegression()

# ravel() flattens the (n, 1) target into the 1-D shape that fit() expects.
model.fit(X_train, y_train.ravel())
y_pred = model.predict(X_test)

In [None]:
y_test

In [None]:
y_pred