# Clean code of MLP

**Content**:
1. [Online Tree Building](#online)
2. [Functions](#functions)
3. [Testing of the Online Model](#testing_online)
4. []

## 1. Online Tree Building <a id=online>

In [1]:
import numpy as np
import random
import sys

def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks)
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        for name, piece in self.components.items():
            pos = (layer,) + piece.pos
            displ_board[pos] = 1
            layer += 1

        for name, mask in self.masks.items():
            x,y = self.masks['boundary'].get_positions()
            z = np.repeat(layer,len(x))
            a = (z,x,y)
            displ_board[a] = 1
            layer += 1
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [2]:
class Gridworld:

    def __init__(self, size=4, mode='static'):
        if size >= 4:
            self.board = GridBoard(size=size)
        else:
            print("Minimum board size is 4. Initialized to size 4.")
            self.board = GridBoard(size=4)

        #Add pieces, positions will be updated later
        self.board.addPiece('Player','P',(0,0))
        self.board.addPiece('Goal','+',(0,3))
        self.board.addPiece('Pit','-',(0,3))
        self.board.addPiece('Wall','W',(2,3))

        if mode == 'static':
            self.initGridStatic()
        elif mode == 'player':
            self.initGridPlayer()
        else:
            self.initGridRand()

    #Initialize stationary grid, all items are placed deterministically
    def initGridStatic(self):
        #Setup static pieces
        self.board.components['Player'].pos = randPair(0,self.board.size) #Row, Column
        self.board.components['Goal'].pos = (3,0)
        self.board.components['Pit'].pos = (3,1)
        self.board.components['Wall'].pos = (3,2)
        
        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridStatic()


    #Check if board is initialized appropriately (no overlapping pieces)
    #also remove impossible-to-win boards
    def validateBoard(self):
        valid = True

        player = self.board.components['Player']
        goal = self.board.components['Goal']
        wall = self.board.components['Wall']
        pit = self.board.components['Pit']

        all_positions = [piece for name,piece in self.board.components.items()]
        all_positions = [player.pos, goal.pos, wall.pos, pit.pos]
        if len(all_positions) > len(set(all_positions)):
            return False

        corners = [(0,0),(0,self.board.size), (self.board.size,0), (self.board.size,self.board.size)]
        #if player is in corner, can it move? if goal is in corner, is it blocked?
        if player.pos in corners or goal.pos in corners:
            val_move_pl = [self.validateMove('Player', addpos) for addpos in [(0,1),(1,0),(-1,0),(0,-1)]]
            val_move_go = [self.validateMove('Goal', addpos) for addpos in [(0,1),(1,0),(-1,0),(0,-1)]]
            if 0 not in val_move_pl or 0 not in val_move_go:
                #print(self.display())
                #print("Invalid board. Re-initializing...")
                valid = False

        return valid

    #Initialize player in random location, but keep wall, goal and pit stationary
    def initGridPlayer(self):
        #height x width x depth (number of pieces)
        self.initGridStatic()
        #place player
        self.board.components['Player'].pos = randPair(0,self.board.size)

        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridPlayer()

    #Initialize grid so that goal, pit, wall, player are all randomly placed
    def initGridRand(self):
        #height x width x depth (number of pieces)
        self.board.components['Player'].pos = randPair(0,self.board.size)
        self.board.components['Goal'].pos = randPair(0,self.board.size)
        self.board.components['Pit'].pos = randPair(0,self.board.size)
        self.board.components['Wall'].pos = randPair(0,self.board.size)

        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridRand()

    def validateMove(self, piece, addpos=(0,0)):
        outcome = 0 #0 is valid, 1 invalid, 2 lost game
        pit = self.board.components['Pit'].pos
        wall = self.board.components['Wall'].pos
        new_pos = addTuple(self.board.components[piece].pos, addpos)
        if new_pos == wall:
            outcome = 1 #block move, player can't move to wall
        elif max(new_pos) > (self.board.size-1):    #if outside bounds of board
            outcome = 1
        elif min(new_pos) < 0: #if outside bounds
            outcome = 1
        elif new_pos == pit:
            outcome = 2

        return outcome

    def makeMove(self, action):
        #need to determine what object (if any) is in the new grid spot the player is moving to
        #actions in {u,d,l,r}
        def checkMove(addpos):
            if self.validateMove('Player', addpos) in [0,2]:
                new_pos = addTuple(self.board.components['Player'].pos, addpos)
                self.board.movePiece('Player', new_pos)

        if action == 'u': #up
            checkMove((-1,0))
        elif action == 'd': #down
            checkMove((1,0))
        elif action == 'l': #left
            checkMove((0,-1))
        elif action == 'r': #right
            checkMove((0,1))
        else:
            pass

    def reward(self):
        if (self.board.components['Player'].pos == self.board.components['Pit'].pos):
            return -10
        elif (self.board.components['Player'].pos == self.board.components['Goal'].pos):
            return 10
        else:
            return -1

    def display(self):
        return self.board.render()

In [3]:
action_set = {
    0: 'u',
    1: 'd',
    2: 'l',
    3: 'r',
}

In [4]:
import numpy as np
import torch
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

l1 = 64
l2 = 200
l3 = 120
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3,l4)
)
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 0.3

In [5]:
from collections import deque
epochs = 5000
losses = []
mem_size = 1000 #A
batch_size = 200 #B
replay = deque(maxlen=mem_size) #C
max_moves = 50 #D

h = 0
w, h = 20, 300000
matrix = [[0 for x in range(w)] for y in range(h)] 

e = 0
e,r= 10, 3000000 
experience = [[0 for x in range(e)] for y in range(r)] 

y = 0
u,y= 7, 3000000 
q_table = [[0 for x in range(u)] for y in range(y)] 
q_counter = 0 

counter = 0
num = 0
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1): 
        mov += 1
        qval = model(state1) #E
        qval_ = qval.data.numpy()
        
        matrix[counter][9] = qval_[0][np.argmax(qval_)]
        matrix[counter][10] = qval_[0][0]
        matrix[counter][11] = qval_[0][1]
        matrix[counter][12] = qval_[0][2]
        matrix[counter][13] = qval_[0][3]
        
        for k in range(0,4):
            q_table[q_counter][0] = i
            q_table[q_counter][1] = (str(game.board.components['Player'].pos))
            q_table[q_counter][2] = (str(game.board.components['Pit'].pos))
            q_table[q_counter][3] = (str(game.board.components['Goal'].pos))
            q_table[q_counter][4] = (str(game.board.components['Wall'].pos))
            q_table[q_counter][5] = k
            q_table[q_counter][6] = qval_[0][k]
            q_counter+=1
            
        if (random.random() < epsilon): #F
            matrix[counter][8] = 1 
            action_ = np.random.randint(0,4)
        else:
            matrix[counter][8] = 0 
            action_ = np.argmax(qval_)
        
        action = action_set[action_]
        
        matrix[counter][0] = i
        matrix[counter][1] = (str(game.board.components['Player'].pos))
        matrix[counter][2] = (str(game.board.components['Pit'].pos))
        matrix[counter][3] = (str(game.board.components['Goal'].pos))
        matrix[counter][4] = (str(game.board.components['Wall'].pos))
        matrix[counter][5] = action_
        
        game.makeMove(action)
        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
        state2 = torch.from_numpy(state2_).float()
        next_state = model(state2)
        next_state_ = next_state.data.numpy()
        
        matrix[counter][18] = next_state_[0][np.argmax(next_state_)]
        matrix[counter][14] = next_state_[0][0]
        matrix[counter][15] = next_state_[0][1]
        matrix[counter][16] = next_state_[0][2]
        matrix[counter][17] = next_state_[0][3]
        reward = game.reward()
        matrix[counter][6] = reward
        if reward == -1 or reward == -10: #N
            Y = reward + (gamma * next_state_[0][np.argmax(next_state_)])
        else:
            Y = reward
            
        matrix[counter][19] = Y
        done = True if reward > 0 else False
        matrix[counter][7] = (str(game.board.components['Player'].pos))
        exp =  (state1, action_, reward, state2, done) #G
        replay.append(exp) #H
        state1 = state2
        
        
        
        if len(replay) > batch_size: #I
            minibatch = random.sample(replay, batch_size) #J
            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in minibatch]) #K
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in minibatch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in minibatch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in minibatch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in minibatch])
            
            Q1 = model(state1_batch) #L
            with torch.no_grad():
                Q2 = model(state2_batch) #M
            
            Y = reward_batch + gamma * ((1 - done_batch) * torch.max(Q2,dim=1)[0]) #N
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            loss = loss_fn(X, Y.detach())
            print(i)
#             if (i >= 4500):
#                 for j in range (0, batch_size):
#                     experience[num][0] = i
#                     experience[num][1] = state1_batch[j].numpy()
#                     experience[num][2] = action_batch[j].item()
#                     experience[num][3] = reward_batch[j].item()
#                     experience[num][4] = state2_batch[j].numpy()
#                     experience[num][5] = done_batch[j].item()
#                     experience[num][6] = Q1[j].detach().numpy()
#                     experience[num][7] = Q2[j].detach().numpy()
#                     experience[num][8] = Y[j].item()
#                     experience[num][9] = X[j].item()
#                     num += 1
            clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()
        counter += 1
        if reward != -1 or mov > max_moves: #O
            status = 0
            mov = 0
losses = np.array(losses)

4999


## 2. Functions <a id = "functions">

In [6]:
"""
from2dto1d:
    Args: 
        pos(tuple):2d position of the objects(Player, Wall, Goal, Pit) in the gridworld
    Return:
        return(integer): 1d position of the objects(Player, Wall, Goal, Pit)in the gridworld
""" 
def from2dto1d(pos):
    if pos == '(0, 0)':
        return 0
    if pos == '(0, 1)':
        return 1
    if pos == '(0, 2)':
        return 2
    if pos == '(0, 3)':
        return 3
    if pos == '(1, 0)':
        return 4
    if pos == '(1, 1)':
        return 5
    if pos == '(1, 2)':
        return 6
    if pos == '(1, 3)':
        return 7
    if pos == '(2, 0)':
        return 8
    if pos == '(2, 1)':
        return 9
    if pos == '(2, 2)':
        return 10
    if pos == '(2, 3)':
        return 11
    if pos == '(3, 0)':
        return 12
    if pos == '(3, 1)':
        return 13
    if pos == '(3, 2)':
        return 14
    if pos == '(3, 3)':
        return 15

In [7]:
def from1dto2d(pos):
    if pos == 0:
        return (0, 0)
    if pos == 1:
        return (0, 1)
    if pos == 2:
        return (0, 2)
    if pos == 3:
        return (0, 3)
    if pos == 4:
        return (1, 0)
    if pos == 5:
        return (1, 1)
    if pos == 6:
        return (1, 2)
    if pos == 7:
        return (1, 3)
    if pos == 8:
        return (2, 0)
    if pos == 9:
        return (2, 1)
    if pos == 10:
        return (2, 2)
    if pos == 11:
        return (2, 3)
    if pos == 12:
        return (3, 0)
    if pos == 13:
        return (3, 1)
    if pos == 14:
        return (3, 2)
    if pos == 15:
        return (3, 3)

In [8]:
"""
from_num_to_one_hot_encode:
    Args: 
        num(int): number from 0 to 15 representing the state of the objects(Player, Wall, Goal, Pit) in the gridworld
    Return:
        return(tuple): encoded binary code with the size of 16-bit 
"""
def from_num_to_one_hot_encode(num):
    en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    if num == 0:
        en0 = 1
    if num == 1:
        en1 = 1
    if num == 2:
        en2 = 1
    if num == 3:
        en3 = 1
    if num == 4:
        en4 = 1
    if num == 5:
        en5 = 1
    if num == 6:
        en6 = 1
    if num == 7:
        en7 = 1
    if num == 8:
        en8 = 1
    if num == 9:
        en9 = 1
    if num == 10:
        en10 = 1
    if num == 11:
        en11 = 1
    if num == 12:
        en12 = 1
    if num == 13:
        en13 = 1
    if num == 14:
        en14 = 1
    if num == 15:
        en15 = 1
    return en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15

## 3. Testing of the Online Model <a id = "testing_online">

In [9]:
def dqn_test_model(model, experience_dqn, mode='static', display=True):
    global counter
    i = 0
    test_game = Gridworld(mode=mode)
    state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
    state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    while(status == 1): #A
        qval = model(state)
        qval_ = qval.data.numpy()
        action_ = np.argmax(qval_) #B
        action = action_set[action_]
        p_curr = from2dto1d((str(test_game.board.components['Player'].pos)))
        pi = from2dto1d((str(test_game.board.components['Pit'].pos)))
        g = from2dto1d((str(test_game.board.components['Goal'].pos)))
        w = from2dto1d((str(test_game.board.components['Wall'].pos)))
        experience_dqn[counter][0] = p_curr
        experience_dqn[counter][1] = pi
        experience_dqn[counter][2] = g
        experience_dqn[counter][3] = w
        
        experience_dqn[counter][4] =  qval_[0][action_]
        experience_dqn[counter][5] = qval_[0][0]
        experience_dqn[counter][6] = qval_[0][1]
        experience_dqn[counter][7] = qval_[0][2]
        experience_dqn[counter][8] = qval_[0][3]
        experience_dqn[counter][9] = action_
        
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        p_next = from2dto1d((str(test_game.board.components['Player'].pos)))
        experience_dqn[counter][10] = p_next
        state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
        state = torch.from_numpy(state_).float()
        if display:
            print(test_game.display())
        reward = test_game.reward()
        experience_dqn[counter][11] = reward
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        i += 1
        counter += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [10]:
h = 0
w, h = 12, 3000000
experience_dqn = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [11]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = dqn_test_model(model, experience_dqn, random, display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num /10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 912
0.912
0.912
Games played: 1000, # of wins: 920
0.92
1.832
Games played: 1000, # of wins: 927
0.927
2.7590000000000003
Games played: 1000, # of wins: 919
0.919
3.6780000000000004
Games played: 1000, # of wins: 929
0.929
4.607
Games played: 1000, # of wins: 917
0.917
5.524
Games played: 1000, # of wins: 918
0.918
6.442
Games played: 1000, # of wins: 926
0.926
7.368
Games played: 1000, # of wins: 920
0.92
8.288
Games played: 1000, # of wins: 925
0.925
9.213000000000001
Win percentage: 92.13000000000001%


In [12]:
h = 0
w, h = 12, 300000
experience_dqn = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [13]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = dqn_test_model(model, experience_dqn, random, display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num /10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 898
0.898
0.898
Games played: 1000, # of wins: 925
0.925
1.823
Games played: 1000, # of wins: 924
0.924
2.747
Games played: 1000, # of wins: 933
0.933
3.6799999999999997
Games played: 1000, # of wins: 932
0.932
4.612
Games played: 1000, # of wins: 928
0.928
5.54
Games played: 1000, # of wins: 919
0.919
6.459
Games played: 1000, # of wins: 930
0.93
7.388999999999999
Games played: 1000, # of wins: 932
0.932
8.321
Games played: 1000, # of wins: 939
0.939
9.26
Win percentage: 92.6%


#### Accuracy on Random mode: _95%_

## Dataset Obtained After the DQN

In [14]:
experience_dqn

[[6,
  10,
  13,
  14,
  4.436979,
  0.221092,
  -1.5787429,
  4.436979,
  3.5086904,
  2,
  5,
  -1],
 [5,
  10,
  13,
  14,
  7.5383697,
  0.1893733,
  7.5383697,
  4.579906,
  3.4692807,
  1,
  9,
  -1],
 [9,
  10,
  13,
  14,
  8.941427,
  2.132322,
  8.941427,
  6.9907417,
  0.15165722,
  1,
  13,
  10],
 [8,
  9,
  11,
  7,
  2.8720546,
  2.4566233,
  1.3845295,
  1.7330121,
  2.8720546,
  3,
  9,
  -10],
 [11, 0, 2, 13, 5.926456, 5.926456, 3.7795606, 5.2674127, 4.923854, 0, 7, -1],
 [7, 0, 2, 13, 7.230853, 7.230853, 4.6406302, 6.603965, 5.9377375, 0, 3, -1],
 [3, 0, 2, 13, 8.346738, 6.9066, 5.8361917, 8.346738, 6.5197954, 2, 2, 10],
 [3,
  13,
  8,
  10,
  4.1001897,
  3.4247534,
  4.1001897,
  3.8710742,
  2.7850974,
  1,
  7,
  -1],
 [7, 13, 8, 10, 5.847109, 2.008796, 1.1508228, 5.847109, 3.9503448, 2, 6, -1],
 [6,
  13,
  8,
  10,
  6.6094656,
  3.9184604,
  4.389516,
  6.6094656,
  4.1808367,
  2,
  5,
  -1],
 [5,
  13,
  8,
  10,
  8.1612425,
  6.050896,
  6.3795524,
  8.16

In [15]:
import pandas as pd
column_names = ["Player", "Pit", "Goal", "Wall","Q_MAX", "Q_0", "Q_1","Q_2","Q_3", "Action", "Player_Next", "Reward"]
df_experience = pd.DataFrame(experience_dqn, columns = column_names)
df_experience = df_experience.loc[(df_experience != 0).any(axis=1)]
df_experience

Unnamed: 0,Player,Pit,Goal,Wall,Q_MAX,Q_0,Q_1,Q_2,Q_3,Action,Player_Next,Reward
0,6,10,13,14,4.436979,0.221092,-1.578743,4.436979,3.508690,2,5,-1
1,5,10,13,14,7.538370,0.189373,7.538370,4.579906,3.469281,1,9,-1
2,9,10,13,14,8.941427,2.132322,8.941427,6.990742,0.151657,1,13,10
3,8,9,11,7,2.872055,2.456623,1.384529,1.733012,2.872055,3,9,-10
4,11,0,2,13,5.926456,5.926456,3.779561,5.267413,4.923854,0,7,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
36787,1,11,14,4,3.971183,2.474085,3.971183,2.796935,2.273870,1,5,-1
36788,5,11,14,4,6.153851,3.057463,6.153851,3.846052,5.264953,1,9,-1
36789,9,11,14,4,7.586272,4.451817,7.586272,5.057436,6.999943,1,13,-1
36790,13,11,14,4,10.406075,5.493226,7.396718,4.364619,10.406075,3,14,10


In [16]:
df_experience[df_experience["Reward"]==-10]

Unnamed: 0,Player,Pit,Goal,Wall,Q_MAX,Q_0,Q_1,Q_2,Q_3,Action,Player_Next,Reward
3,8,9,11,7,2.872055,2.456623,1.384529,1.733012,2.872055,3,9,-10
12,1,2,3,15,5.756944,3.204739,4.638635,2.436749,5.756944,3,2,-10
84,3,7,15,10,3.055430,2.386948,3.055430,2.055154,2.626266,1,7,-10
360,15,14,13,8,5.252655,3.033700,3.563577,5.252655,3.720776,2,14,-10
433,8,9,11,5,2.249962,0.963063,1.238381,0.168977,2.249962,3,9,-10
...,...,...,...,...,...,...,...,...,...,...,...,...
34557,5,6,2,1,4.802199,4.736699,3.171475,2.521498,4.802199,3,6,-10
35294,8,9,6,3,4.408339,4.238504,2.983244,3.085101,4.408339,3,9,-10
35634,8,12,13,9,4.590396,2.994666,4.590396,4.459345,4.322078,1,12,-10
35639,8,9,3,4,0.965541,0.951049,0.838674,0.487703,0.965541,3,9,-10


In [17]:
game = Gridworld(size=4, mode='random')
game.display()
game.board.components['Player'].pos = (0,0)
game.display()

array([['P', '-', ' ', ' '],
       [' ', ' ', '+', ' '],
       [' ', ' ', 'W', ' '],
       [' ', ' ', ' ', ' ']], dtype='<U2')

## Uniform Dataset Collection On Training

In [18]:
from collections import deque


h = 0
w, h = 21, 100000
matrix = [[0 for x in range(w)] for y in range(h)] 

counter = 0
num = 0
epoch = 0
epsilon = 0
for i in range(0,16):
    for j in range(0,16):
        for k in range (0,16):
            for c in range (0,16):
                if  c != j and k != i and k!= j and i != j:
                    game = Gridworld(size=4, mode='random')
                    game.board.components['Player'].pos = from1dto2d(c)
                    game.board.components['Goal'].pos = from1dto2d(k)
                    game.board.components['Wall'].pos = from1dto2d(j)
                    game.board.components['Pit'].pos = from1dto2d(i)
                    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
                    state1 = torch.from_numpy(state1_).float()
                    status = 1
                    qval = model(state1) #E
                    qval_ = qval.data.numpy()

                    matrix[counter][9] = qval_[0][np.argmax(qval_)]
                    matrix[counter][10] = qval_[0][0]
                    matrix[counter][11] = qval_[0][1]
                    matrix[counter][12] = qval_[0][2]
                    matrix[counter][13] = qval_[0][3]


                    if (random.random() < epsilon): #F
                        matrix[counter][8] = 1 
                        action_ = np.random.randint(0,4)
                    else:
                        matrix[counter][8] = 0 
                        action_ = np.argmax(qval_)

                    action = action_set[action_]

                    matrix[counter][0] = epoch
                    matrix[counter][1] = from2dto1d(str(game.board.components['Player'].pos))
                    matrix[counter][2] = from2dto1d(str(game.board.components['Pit'].pos))
                    matrix[counter][3] = from2dto1d(str(game.board.components['Goal'].pos))
                    matrix[counter][4] = from2dto1d(str(game.board.components['Wall'].pos))
                    matrix[counter][5] = action_
                    
                    reward = game.reward()
                    matrix[counter][20] = reward
                    
                    game.makeMove(action)
                    state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
                    state2 = torch.from_numpy(state2_).float()
                    next_state = model(state2)
                    next_state_ = next_state.data.numpy()

                    matrix[counter][18] = next_state_[0][np.argmax(next_state_)]
                    matrix[counter][14] = next_state_[0][0]
                    matrix[counter][15] = next_state_[0][1]
                    matrix[counter][16] = next_state_[0][2]
                    matrix[counter][17] = next_state_[0][3]
                    reward = game.reward()
                    matrix[counter][6] = reward
                    if reward == -1 or reward == -10: #N
                        Y = reward + (gamma * next_state_[0][np.argmax(next_state_)])
                    else:
                        Y = reward

                    matrix[counter][19] = Y
                    done = True if reward > 0 else False
                    matrix[counter][7] = from2dto1d(str(game.board.components['Player'].pos))
                    state1 = state2
                    print(epoch)
                    clear_output(wait=True)
                    counter += 1
                    epoch += 1

50399


In [19]:
matrix

[[0,
  0,
  0,
  2,
  1,
  3,
  -10,
  0,
  0,
  5.549575,
  3.7275846,
  4.591288,
  3.7898169,
  5.549575,
  3.741947,
  4.60478,
  3.758387,
  5.5917177,
  5.5917177,
  -4.967454051971435,
  -10],
 [1,
  2,
  0,
  2,
  1,
  1,
  -1,
  6,
  0,
  7.4897456,
  6.1121187,
  7.4897456,
  1.8905668,
  6.901395,
  9.616634,
  6.651252,
  5.878662,
  6.4507656,
  9.616634,
  7.654970932006837,
  10],
 [2,
  3,
  0,
  2,
  1,
  2,
  10,
  2,
  0,
  9.1284075,
  7.6296344,
  5.2751126,
  9.1284075,
  7.2674665,
  6.1573677,
  7.5361238,
  1.7933986,
  6.878743,
  7.5361238,
  10,
  -1],
 [3,
  4,
  0,
  2,
  1,
  3,
  -1,
  5,
  0,
  5.525115,
  -0.3560225,
  4.170507,
  4.578406,
  5.525115,
  6.4417944,
  5.456789,
  4.2000837,
  7.5309114,
  7.5309114,
  5.777820301055908,
  -1],
 [4,
  5,
  0,
  2,
  1,
  3,
  -1,
  6,
  0,
  7.460454,
  6.332054,
  5.3881097,
  4.154821,
  7.460454,
  9.765363,
  6.8344126,
  6.041483,
  6.573189,
  9.765363,
  7.7888264656066895,
  -1],
 [5,
  6,
  0,
 

In [20]:
import pandas as pd
column_names = ["Epochs","Current_State", "Pit_Position", "Goal_Position", "Wall_Position", "Action",  "Reward", "Next_State", "Epsilon_Boolean", 
                "Q_Max", "Q1_Current", "Q2_Current", "Q3_Current", "Q4_Current", "Q1_Next", "Q2_Next", "Q3_Next", "Q4_Next","Q_Next_Max", "Y", "Current_Reward"
                ]
df = pd.DataFrame(matrix, columns = column_names)
df = df.loc[(df != 0).any(axis=1)]

In [21]:
df

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,...,Q2_Current,Q3_Current,Q4_Current,Q1_Next,Q2_Next,Q3_Next,Q4_Next,Q_Next_Max,Y,Current_Reward
0,0,0,0,2,1,3,-10,0,0,5.549575,...,4.591288,3.789817,5.549575,3.741947,4.604780,3.758387,5.591718,5.591718,-4.967454,-10
1,1,2,0,2,1,1,-1,6,0,7.489746,...,7.489746,1.890567,6.901395,9.616634,6.651252,5.878662,6.450766,9.616634,7.654971,10
2,2,3,0,2,1,2,10,2,0,9.128407,...,5.275113,9.128407,7.267467,6.157368,7.536124,1.793399,6.878743,7.536124,10.000000,-1
3,3,4,0,2,1,3,-1,5,0,5.525115,...,4.170507,4.578406,5.525115,6.441794,5.456789,4.200084,7.530911,7.530911,5.777820,-1
4,4,5,0,2,1,3,-1,6,0,7.460454,...,5.388110,4.154821,7.460454,9.765363,6.834413,6.041483,6.573189,9.765363,7.788826,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50395,50395,10,15,13,14,2,-1,9,0,8.503292,...,7.462064,8.503292,6.897066,3.967404,10.355541,6.813191,7.236180,10.355541,8.319987,-1
50396,50396,11,15,13,14,2,-1,10,0,6.954754,...,2.446715,6.954754,4.659037,4.523438,7.372555,8.440053,6.941505,8.440053,6.596048,-1
50397,50397,12,15,13,14,3,10,13,0,9.970391,...,7.699754,7.663779,9.970391,6.476510,8.845600,10.435889,5.373288,10.435889,10.000000,-1
50398,50398,13,15,13,14,2,-1,12,0,10.481168,...,8.897238,10.481168,5.423398,6.392642,7.615518,7.595974,9.902771,9.902771,7.912494,10


In [66]:
df.to_csv('UniformDataset.csv')

In [23]:
df[df["Reward"] == -10]

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,...,Q2_Current,Q3_Current,Q4_Current,Q1_Next,Q2_Next,Q3_Next,Q4_Next,Q_Next_Max,Y,Current_Reward
0,0,0,0,2,1,3,-10,0,0,5.549575,...,4.591288,3.789817,5.549575,3.741947,4.604780,3.758387,5.591718,5.591718,-4.967454,-10
15,15,0,0,3,1,3,-10,0,0,4.436290,...,3.909576,2.922956,4.436290,2.261247,3.919589,3.014143,4.489211,4.489211,-5.959710,-10
60,60,0,0,6,1,3,-10,0,0,5.907141,...,4.736316,4.505623,5.907141,3.919048,4.776436,4.474711,5.828372,5.828372,-4.754465,-10
75,75,0,0,7,1,3,-10,0,0,3.686614,...,3.137179,2.290276,3.686614,2.153476,3.145100,2.377677,3.768331,3.768331,-6.608502,-10
720,720,0,0,8,4,1,-10,0,0,5.688527,...,5.688527,4.129992,3.893270,3.571620,5.787607,4.201667,3.923309,5.787607,-4.791154,-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50279,50279,15,15,5,14,3,-10,15,0,2.909825,...,1.886392,2.468117,2.909825,2.527474,1.904485,2.447905,2.863519,2.863519,-7.422833,-10
50324,50324,15,15,8,14,2,-10,15,0,2.258207,...,2.197757,2.258207,1.079436,1.349171,2.090620,2.127683,1.068417,2.127683,-8.085086,-10
50339,50339,15,15,9,14,2,-10,15,0,3.761820,...,2.008564,3.761820,2.700319,2.933564,2.120845,3.910921,2.658796,3.910921,-6.480171,-10
50384,50384,15,15,12,14,2,-10,15,0,4.135138,...,2.586971,4.135138,2.119261,3.079973,2.553863,4.114085,2.192336,4.114085,-6.297324,-10


In [24]:
df[df['Reward'] == 10]

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,...,Q2_Current,Q3_Current,Q4_Current,Q1_Next,Q2_Next,Q3_Next,Q4_Next,Q_Next_Max,Y,Current_Reward
2,2,3,0,2,1,2,10,2,0,9.128407,...,5.275113,9.128407,7.267467,6.157368,7.536124,1.793399,6.878743,7.536124,10.0,-1
5,5,6,0,2,1,0,10,2,0,9.709291,...,6.764973,5.941126,6.568333,6.092171,7.455241,1.927353,6.827739,7.455241,10.0,-1
16,16,2,0,3,1,3,10,3,0,10.238266,...,7.286027,0.641021,10.238266,7.888554,7.936424,5.400782,8.267491,8.267491,10.0,-1
17,17,3,0,3,1,3,10,3,0,8.102343,...,7.776753,5.296433,8.102343,7.749496,7.830593,5.383177,8.167582,8.167582,10.0,10
21,21,7,0,3,1,0,10,3,0,11.125080,...,6.833360,5.596282,9.588528,7.735203,7.800159,5.332862,8.133412,8.133412,10.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50378,50378,8,15,12,14,1,10,12,0,9.698362,...,9.698362,8.274734,6.514931,4.652517,5.615434,4.619531,4.691898,5.615434,10.0,-1
50382,50382,12,15,12,14,1,10,12,0,5.600005,...,5.600005,4.643267,4.733867,4.732127,5.652909,4.688465,4.826358,5.652909,10.0,10
50383,50383,13,15,12,14,2,10,12,0,10.980048,...,8.228654,10.980048,4.027821,4.731445,5.632502,4.690788,4.796225,5.632502,10.0,-1
50394,50394,9,15,13,14,1,10,13,0,10.340865,...,10.340865,6.855905,7.256612,6.595668,9.009781,10.625491,5.607327,10.625491,10.0,-1


In [25]:
def dqn_test_model(model, experience_dqn, mode='static', display=True):
    global counter
    i = 0
    test_game = Gridworld(mode=mode)
    state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
    state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    while(status == 1): #A
        qval = model(state)
        qval_ = qval.data.numpy()
        action_ = np.argmax(qval_) #B
        action = action_set[action_]
        p_curr = from2dto1d((str(test_game.board.components['Player'].pos)))
        pi = from2dto1d((str(test_game.board.components['Pit'].pos)))
        g = from2dto1d((str(test_game.board.components['Goal'].pos)))
        w = from2dto1d((str(test_game.board.components['Wall'].pos)))
        experience_dqn[counter][0] = p_curr
        experience_dqn[counter][1] = pi
        experience_dqn[counter][2] = g
        experience_dqn[counter][3] = w
        
        experience_dqn[counter][4] =  qval_[0][action_]
        experience_dqn[counter][5] = qval_[0][0]
        experience_dqn[counter][6] = qval_[0][1]
        experience_dqn[counter][7] = qval_[0][2]
        experience_dqn[counter][8] = qval_[0][3]
        experience_dqn[counter][9] = action_
        
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        p_next = from2dto1d((str(test_game.board.components['Player'].pos)))
        experience_dqn[counter][10] = p_next
        state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
        state = torch.from_numpy(state_).float()
        if display:
            print(test_game.display())
        reward = test_game.reward()
        experience_dqn[counter][11] = reward
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        i += 1
        counter += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [26]:
h = 0
w, h = 12, 3000000
experience_dqn = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [27]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = dqn_test_model(model, experience_dqn, random, display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num /10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 927
0.927
0.927
Games played: 1000, # of wins: 925
0.925
1.852
Games played: 1000, # of wins: 903
0.903
2.755
Games played: 1000, # of wins: 924
0.924
3.679
Games played: 1000, # of wins: 911
0.911
4.59
Games played: 1000, # of wins: 915
0.915
5.505
Games played: 1000, # of wins: 928
0.928
6.433
Games played: 1000, # of wins: 921
0.921
7.354
Games played: 1000, # of wins: 926
0.926
8.28
Games played: 1000, # of wins: 911
0.911
9.190999999999999
Win percentage: 91.91%


## Transformer and Offline Data

In [28]:
online = pd.read_csv("Online_Dataset(5000-10000).csv")

In [29]:
from sklearn.preprocessing import OneHotEncoder
x = online[["Current_State"]]
y = OneHotEncoder().fit_transform(x).toarray()
dataset = pd.DataFrame({'Encode_0': y[:, 0], 'Encode_1': y[:, 1], 'Encode_2': y[:, 2], 'Encode_3': y[:, 3], 'Encode_4': y[:, 4], 'Encode_5': y[:, 5], 'Encode_6': y[:, 6], 'Encode_7': y[:, 7], 'Encode_8': y[:, 8], 'Encode_9': y[:, 9], 'Encode_10': y[:, 10], 'Encode_11': y[:, 11], 'Encode_12': y[:, 12], 'Encode_13': y[:, 13], 'Encode_14': y[:, 14], 'Encode_15': y[:, 15]})
online['Encode_0'] = dataset["Encode_0"]
online['Encode_1'] = dataset["Encode_1"]
online['Encode_2'] = dataset["Encode_2"]
online['Encode_3'] = dataset["Encode_3"]
online['Encode_4'] = dataset["Encode_4"]
online['Encode_5'] = dataset["Encode_5"]
online['Encode_6'] = dataset["Encode_6"]
online['Encode_7'] = dataset["Encode_7"]
online['Encode_8'] = dataset["Encode_8"]
online['Encode_9'] = dataset["Encode_9"]
online['Encode_10'] = dataset["Encode_10"]
online['Encode_11'] = dataset["Encode_11"]
online['Encode_12'] = dataset["Encode_12"]
online['Encode_13'] = dataset["Encode_13"]
online['Encode_14'] = dataset["Encode_14"]
online['Encode_15'] = dataset["Encode_15"]
online

Unnamed: 0.1,Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,...,Encode_6,Encode_7,Encode_8,Encode_9,Encode_10,Encode_11,Encode_12,Encode_13,Encode_14,Encode_15
0,0,0,6,3,14,12,1,-1,10,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,10,3,14,12,1,10,14,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2,1,8,13,14,4,1,-1,12,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,1,12,13,14,4,1,-1,12,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,4,1,12,13,14,4,3,-10,13,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22515,22515,4998,6,3,13,1,1,-1,10,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22516,22516,4998,10,3,13,1,1,-1,14,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
22517,22517,4998,14,3,13,1,2,10,13,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22518,22518,4999,7,3,9,13,3,-1,7,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
class CustomFeatureEngineering(BaseEstimator, TransformerMixin):
    """My custom description
    Args:
        include_interaction (bool): True if I want to combine Department and JobRole
    Returns:

    """
    def __init__(self, epsilon_case=False, duplicate_case = False):
        self.epsilon_case = epsilon_case
        self.duplicate_case = duplicate_case
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        X_copy = X.copy()
        if 'Unnamed: 0' in set(X_copy.columns):
            X_copy.drop('Unnamed: 0', axis=1, inplace=True)            
        
        if self.epsilon_case:
            if 'Epsilon_Boolean' in set(X_copy.columns): 
                X_copy = X_copy[X_copy["Epsilon_Boolean"] == 0]
        
        if self.duplicate_case:
            if 'Current_State' in set(X_copy.columns) and 'Pit_Position' in set(X_copy.columns) and 'Goal_Position' in set(X_copy.columns) and 'Wall_Position' in set(X_copy.columns) and 'Next_State' in set(X_copy.columns):  
                X_copy = X_copy.drop_duplicates(keep='last',subset=[ 'Current_State', 'Pit_Position', 'Goal_Position', 'Wall_Position', 'Next_State'])
        
        return X_copy

In [31]:
X_transformed = CustomFeatureEngineering(epsilon_case=True,duplicate_case=True).transform(df)

In [32]:
X_transformed

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,...,Q2_Current,Q3_Current,Q4_Current,Q1_Next,Q2_Next,Q3_Next,Q4_Next,Q_Next_Max,Y,Current_Reward
0,0,0,0,2,1,3,-10,0,0,5.549575,...,4.591288,3.789817,5.549575,3.741947,4.604780,3.758387,5.591718,5.591718,-4.967454,-10
1,1,2,0,2,1,1,-1,6,0,7.489746,...,7.489746,1.890567,6.901395,9.616634,6.651252,5.878662,6.450766,9.616634,7.654971,10
2,2,3,0,2,1,2,10,2,0,9.128407,...,5.275113,9.128407,7.267467,6.157368,7.536124,1.793399,6.878743,7.536124,10.000000,-1
3,3,4,0,2,1,3,-1,5,0,5.525115,...,4.170507,4.578406,5.525115,6.441794,5.456789,4.200084,7.530911,7.530911,5.777820,-1
4,4,5,0,2,1,3,-1,6,0,7.460454,...,5.388110,4.154821,7.460454,9.765363,6.834413,6.041483,6.573189,9.765363,7.788826,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50395,50395,10,15,13,14,2,-1,9,0,8.503292,...,7.462064,8.503292,6.897066,3.967404,10.355541,6.813191,7.236180,10.355541,8.319987,-1
50396,50396,11,15,13,14,2,-1,10,0,6.954754,...,2.446715,6.954754,4.659037,4.523438,7.372555,8.440053,6.941505,8.440053,6.596048,-1
50397,50397,12,15,13,14,3,10,13,0,9.970391,...,7.699754,7.663779,9.970391,6.476510,8.845600,10.435889,5.373288,10.435889,10.000000,-1
50398,50398,13,15,13,14,2,-1,12,0,10.481168,...,8.897238,10.481168,5.423398,6.392642,7.615518,7.595974,9.902771,9.902771,7.912494,10


## Mlp

In [33]:
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(hidden_layer_sizes=(200,120), max_iter=300, activation = 'relu', solver='adam', random_state=123)

In [34]:
from sklearn.model_selection import train_test_split

X_transformed = CustomFeatureEngineering(epsilon_case=True,duplicate_case=True).transform(online)
X = X_transformed[['Encode_0','Encode_1','Encode_2','Encode_3','Encode_4','Encode_5','Encode_6','Encode_7','Encode_8','Encode_9','Encode_10',
        'Encode_11','Encode_12','Encode_13','Encode_14','Encode_15','Pit_Position', 'Goal_Position', 'Wall_Position', 'Reward']]
y = X_transformed[['Y']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/5, random_state = 0)

In [35]:
X_train.values

array([[ 0.,  0.,  0., ...,  6., 14., 10.],
       [ 0.,  0.,  0., ..., 11.,  9., 10.],
       [ 0.,  0.,  1., ...,  1.,  6., 10.],
       ...,
       [ 0.,  0.,  0., ...,  2.,  4., -1.],
       [ 0.,  0.,  0., ..., 12.,  2., -1.],
       [ 0.,  0.,  0., ...,  1.,  6., -1.]])

In [36]:
mlp.fit(X_train.values,y_train.values.ravel())

In [37]:
X_train

Unnamed: 0,Encode_0,Encode_1,Encode_2,Encode_3,Encode_4,Encode_5,Encode_6,Encode_7,Encode_8,Encode_9,Encode_10,Encode_11,Encode_12,Encode_13,Encode_14,Encode_15,Pit_Position,Goal_Position,Wall_Position,Reward
14898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,13,6,14,10
20299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13,11,9,10
5993,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15,1,6,10
1094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,12,3,10
15324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,7,11,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10971,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,13,10,-1
7665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,8,12,-1
20462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6,2,4,-1
22072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,8,12,2,-1


In [38]:
def calculate_smape(actual, predicted) -> float:
    if not all([isinstance(actual, np.ndarray), 
                isinstance(predicted, np.ndarray)]):
        actual, predicted = np.array(actual),
        np.array(predicted)
  
    return round(
        np.mean(
            np.abs(predicted - actual) / 
            ((np.abs(predicted) + np.abs(actual))/2)
        )*100, 2
    )

In [39]:
y_pred = mlp.predict(X_test.values)

In [40]:
calculate_smape(y_test.values.ravel(),y_pred)

17.47

## Accuracy Results

In [41]:
def mlp_test_model(model_b, experience, mode='static', display=True):
    global counter
    i = 0
    test_game = Gridworld(mode=mode)
#     state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
#     state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    j = 0
    while(status == 1): #A
        p_curr = from2dto1d((str(test_game.board.components['Player'].pos)))
        pi = from2dto1d((str(test_game.board.components['Pit'].pos)))
        g = from2dto1d((str(test_game.board.components['Goal'].pos)))
        w = from2dto1d((str(test_game.board.components['Wall'].pos)))
        #q_current = model.predict([[p_curr,pi,g,w]])[0][0]
        
#         q_value_current = []
        q_value_next = []
#         q_value = []
        test_game.makeMove('u')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
       # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('d')
        
        test_game.makeMove('d')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
       # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('u')
        
        test_game.makeMove('l')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
       # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('r')
        
        test_game.makeMove('r')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
        # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('l')
        
#         print(q_value_next)
        
        
        action_ = np.argmax(q_value_next)
#         print(action_)
        action = action_set[action_]
        
        experience[counter][0] = p_curr
        experience[counter][1] = pi
        experience[counter][2] = g
        experience[counter][3] = w
        experience[counter][4] = action_
        
    
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        p_next = from2dto1d((str(test_game.board.components['Player'].pos)))
        experience[counter][5] = p_next
        
        j += 1
        if display:
            print(test_game.display())
        reward = test_game.reward()
        experience[counter][6] = reward
        if reward > -1:
            experience[counter][7] = True
        else:
            experience[counter][7] = False
        experience[counter][8] =  q_value_next[action_]
        experience[counter][9] = q_value_next[0]
        experience[counter][10] = q_value_next[1]
        experience[counter][11] = q_value_next[2]
        experience[counter][12] = q_value_next[3]
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        counter += 1
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [42]:
h = 0
w, h = 13, 3000000
experience_mlp = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [43]:
mlp_test_model(mlp, experience_mlp, 'random')

Initial State:
[[' ' ' ' '-' '+']
 [' ' ' ' ' ' ' ']
 ['W' 'P' ' ' ' ']
 [' ' ' ' ' ' ' ']]
Move #: 0; Taking action: r
[[' ' ' ' '-' '+']
 [' ' ' ' ' ' ' ']
 ['W' ' ' 'P' ' ']
 [' ' ' ' ' ' ' ']]
Move #: 1; Taking action: u
[[' ' ' ' '-' '+']
 [' ' ' ' 'P' ' ']
 ['W' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']]
Move #: 2; Taking action: d
[[' ' ' ' '-' '+']
 [' ' ' ' ' ' ' ']
 ['W' ' ' 'P' ' ']
 [' ' ' ' ' ' ' ']]
Move #: 3; Taking action: u
[[' ' ' ' '-' '+']
 [' ' ' ' 'P' ' ']
 ['W' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']]
Move #: 4; Taking action: d
[[' ' ' ' '-' '+']
 [' ' ' ' ' ' ' ']
 ['W' ' ' 'P' ' ']
 [' ' ' ' ' ' ' ']]
Move #: 5; Taking action: u
[[' ' ' ' '-' '+']
 [' ' ' ' 'P' ' ']
 ['W' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']]
Move #: 6; Taking action: d
[[' ' ' ' '-' '+']
 [' ' ' ' ' ' ' ']
 ['W' ' ' 'P' ' ']
 [' ' ' ' ' ' ' ']]
Move #: 7; Taking action: u
[[' ' ' ' '-' '+']
 [' ' ' ' 'P' ' ']
 ['W' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']]
Move #: 8; Taking action: d
[[' ' ' ' '-' '+']
 [' ' ' ' ' ' ' ']
 [

False

In [44]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = mlp_test_model(mlp, experience_mlp, 'random', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 597
0.597
0.597
Games played: 1000, # of wins: 578
0.578
1.1749999999999998
Games played: 1000, # of wins: 579
0.579
1.7539999999999998
Games played: 1000, # of wins: 585
0.585
2.3389999999999995
Games played: 1000, # of wins: 571
0.571
2.9099999999999993
Games played: 1000, # of wins: 586
0.586
3.495999999999999
Games played: 1000, # of wins: 583
0.583
4.078999999999999
Games played: 1000, # of wins: 568
0.568
4.6469999999999985
Games played: 1000, # of wins: 596
0.596
5.2429999999999986
Games played: 1000, # of wins: 607
0.607
5.849999999999999
Win percentage: 58.499999999999986%


## Another MLP

In [45]:
mlp_1 = MLPRegressor(hidden_layer_sizes=(200,120), max_iter=300, activation = 'relu', solver='adam', random_state=123)
from sklearn.model_selection import train_test_split

X_transformed = CustomFeatureEngineering(epsilon_case=True,duplicate_case=True).transform(online)
X = X_transformed[['Encode_0','Encode_1','Encode_2','Encode_3','Encode_4','Encode_5','Encode_6','Encode_7','Encode_8','Encode_9','Encode_10',
        'Encode_11','Encode_12','Encode_13','Encode_14','Encode_15','Pit_Position', 'Goal_Position', 'Wall_Position', 'Reward']]
y = X_transformed[['Y']]
mlp_1.fit(X.values,y.values.ravel())

In [46]:
h = 0
w, h = 13, 3000000
experience_mlp = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [47]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = mlp_test_model(mlp_1, experience_mlp, 'random', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 623
0.623
0.623
Games played: 1000, # of wins: 593
0.593
1.216
Games played: 1000, # of wins: 580
0.58
1.7959999999999998
Games played: 1000, # of wins: 604
0.604
2.4
Games played: 1000, # of wins: 615
0.615
3.0149999999999997
Games played: 1000, # of wins: 610
0.61
3.6249999999999996
Games played: 1000, # of wins: 586
0.586
4.210999999999999
Games played: 1000, # of wins: 633
0.633
4.843999999999999
Games played: 1000, # of wins: 613
0.613
5.456999999999999
Games played: 1000, # of wins: 604
0.604
6.060999999999999
Win percentage: 60.609999999999985%


## MAX - Q MLP

In [48]:
mlp_2 = MLPRegressor(hidden_layer_sizes=(200,120), max_iter=300, activation = 'relu', solver='adam', random_state=123)
from sklearn.model_selection import train_test_split

X_transformed = CustomFeatureEngineering(epsilon_case=True,duplicate_case=True).transform(online)
X = X_transformed[['Encode_0','Encode_1','Encode_2','Encode_3','Encode_4','Encode_5','Encode_6','Encode_7','Encode_8','Encode_9','Encode_10',
        'Encode_11','Encode_12','Encode_13','Encode_14','Encode_15','Pit_Position', 'Goal_Position', 'Wall_Position', 'Reward']]
y = X_transformed[['Q_Max']]
mlp_2.fit(X.values,y.values.ravel())

In [49]:
X_transformed

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,...,Encode_6,Encode_7,Encode_8,Encode_9,Encode_10,Encode_11,Encode_12,Encode_13,Encode_14,Encode_15
0,0,6,3,14,12,1,-1,10,0,8.452592,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,10,3,14,12,1,10,14,0,10.203282,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,2,13,0,5,1,0,-1,9,0,8.848181,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,2,9,0,5,1,0,10,5,0,10.177234,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13,5,15,11,14,2,2,10,14,0,10.531881,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22513,4997,14,1,15,12,3,10,15,0,10.571692,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22514,4998,7,3,13,1,2,-1,6,0,3.782595,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22515,4998,6,3,13,1,1,-1,10,0,5.842478,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22516,4998,10,3,13,1,1,-1,14,0,7.821082,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [50]:
h = 0
w, h = 13, 3000000
experience_mlp = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [51]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = mlp_test_model(mlp_2, experience_mlp, 'random', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 573
0.573
0.573
Games played: 1000, # of wins: 613
0.613
1.186
Games played: 1000, # of wins: 585
0.585
1.771
Games played: 1000, # of wins: 620
0.62
2.391
Games played: 1000, # of wins: 579
0.579
2.9699999999999998
Games played: 1000, # of wins: 599
0.599
3.569
Games played: 1000, # of wins: 575
0.575
4.144
Games played: 1000, # of wins: 609
0.609
4.753
Games played: 1000, # of wins: 594
0.594
5.347
Games played: 1000, # of wins: 605
0.605
5.952
Win percentage: 59.519999999999996%


## More Data MLP

In [52]:
from sklearn.preprocessing import OneHotEncoder
x = df[["Current_State"]]
y = OneHotEncoder().fit_transform(x).toarray()
dataset = pd.DataFrame({'Encode_0': y[:, 0], 'Encode_1': y[:, 1], 'Encode_2': y[:, 2], 'Encode_3': y[:, 3], 'Encode_4': y[:, 4], 'Encode_5': y[:, 5], 'Encode_6': y[:, 6], 'Encode_7': y[:, 7], 'Encode_8': y[:, 8], 'Encode_9': y[:, 9], 'Encode_10': y[:, 10], 'Encode_11': y[:, 11], 'Encode_12': y[:, 12], 'Encode_13': y[:, 13], 'Encode_14': y[:, 14], 'Encode_15': y[:, 15]})
df['Encode_0'] = dataset["Encode_0"]
df['Encode_1'] = dataset["Encode_1"]
df['Encode_2'] = dataset["Encode_2"]
df['Encode_3'] = dataset["Encode_3"]
df['Encode_4'] = dataset["Encode_4"]
df['Encode_5'] = dataset["Encode_5"]
df['Encode_6'] = dataset["Encode_6"]
df['Encode_7'] = dataset["Encode_7"]
df['Encode_8'] = dataset["Encode_8"]
df['Encode_9'] = dataset["Encode_9"]
df['Encode_10'] = dataset["Encode_10"]
df['Encode_11'] = dataset["Encode_11"]
df['Encode_12'] = dataset["Encode_12"]
df['Encode_13'] = dataset["Encode_13"]
df['Encode_14'] = dataset["Encode_14"]
df['Encode_15'] = dataset["Encode_15"]
df

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,...,Encode_6,Encode_7,Encode_8,Encode_9,Encode_10,Encode_11,Encode_12,Encode_13,Encode_14,Encode_15
0,0,0,0,2,1,3,-10,0,0,5.549575,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,0,2,1,1,-1,6,0,7.489746,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,3,0,2,1,2,10,2,0,9.128407,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,4,0,2,1,3,-1,5,0,5.525115,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,5,0,2,1,3,-1,6,0,7.460454,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50395,50395,10,15,13,14,2,-1,9,0,8.503292,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50396,50396,11,15,13,14,2,-1,10,0,6.954754,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50397,50397,12,15,13,14,3,10,13,0,9.970391,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50398,50398,13,15,13,14,2,-1,12,0,10.481168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [53]:
mlp_3 = MLPRegressor(hidden_layer_sizes=(200,120), max_iter=300, activation = 'relu', solver='adam', random_state=123)
from sklearn.model_selection import train_test_split

X_transform = CustomFeatureEngineering(epsilon_case=True,duplicate_case=True).transform(df)
X = X_transformed[['Encode_0','Encode_1','Encode_2','Encode_3','Encode_4','Encode_5','Encode_6','Encode_7','Encode_8','Encode_9','Encode_10',
        'Encode_11','Encode_12','Encode_13','Encode_14','Encode_15','Pit_Position', 'Goal_Position', 'Wall_Position', 'Reward']]
y = X_transformed[['Q_Max']]
mlp_3.fit(X.values,y.values.ravel())

In [54]:
X_transform

Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,Q_Max,...,Encode_6,Encode_7,Encode_8,Encode_9,Encode_10,Encode_11,Encode_12,Encode_13,Encode_14,Encode_15
0,0,0,0,2,1,3,-10,0,0,5.549575,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,0,2,1,1,-1,6,0,7.489746,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,3,0,2,1,2,10,2,0,9.128407,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,4,0,2,1,3,-1,5,0,5.525115,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,5,0,2,1,3,-1,6,0,7.460454,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50395,50395,10,15,13,14,2,-1,9,0,8.503292,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50396,50396,11,15,13,14,2,-1,10,0,6.954754,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50397,50397,12,15,13,14,3,10,13,0,9.970391,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50398,50398,13,15,13,14,2,-1,12,0,10.481168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [55]:
X_transformed.to_csv('UniformDataset.csv')

In [56]:
X_transformed.groupby(['Pit_Position'])['Pit_Position'].count().sort_values(ascending=False) 

Pit_Position
10    818
6     774
4     731
0     714
1     711
14    708
8     706
9     689
15    686
2     682
11    680
5     669
12    647
13    633
7     628
3     611
Name: Pit_Position, dtype: int64

In [57]:
X_transformed.groupby(['Current_State'])['Current_State'].count().sort_values(ascending=False) 

Current_State
5     1065
10    1042
9     1039
6     1029
8      686
13     681
2      675
4      663
7      660
11     656
14     654
1      618
3      413
12     405
0      402
15     399
Name: Current_State, dtype: int64

In [58]:
X_transformed.groupby(['Goal_Position'])['Goal_Position'].count().sort_values(ascending=False) 

Goal_Position
15    892
2     794
12    785
0     782
3     764
4     743
14    737
11    704
1     693
7     690
13    645
8     601
5     592
9     588
10    543
6     534
Name: Goal_Position, dtype: int64

In [59]:
X_transformed.groupby(['Wall_Position'])['Wall_Position'].count().sort_values(ascending=False) 

Wall_Position
1     772
14    766
13    765
5     758
4     739
6     728
12    713
11    711
9     672
8     665
3     655
2     643
15    638
10    636
7     625
0     601
Name: Wall_Position, dtype: int64

In [60]:
X_transformed.groupby([ 'Reward'])['Reward'].count().sort_values(ascending=False) 

Reward
-1     7964
 10    3094
-10      29
Name: Reward, dtype: int64

In [61]:
X_transformed.groupby([ 'Action'])['Action'].count().sort_values(ascending=False) 

Action
3    2859
0    2794
2    2734
1    2700
Name: Action, dtype: int64

In [62]:
h = 0
w, h = 13, 3000000
experience_mlp = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [63]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = mlp_test_model(mlp_3, experience_mlp, 'random', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 608
0.608
0.608
Games played: 1000, # of wins: 603
0.603
1.2109999999999999
Games played: 1000, # of wins: 576
0.576
1.787
Games played: 1000, # of wins: 582
0.582
2.3689999999999998
Games played: 1000, # of wins: 593
0.593
2.9619999999999997
Games played: 1000, # of wins: 596
0.596
3.558
Games played: 1000, # of wins: 586
0.586
4.144
Games played: 1000, # of wins: 576
0.576
4.72
Games played: 1000, # of wins: 592
0.592
5.311999999999999
Games played: 1000, # of wins: 581
0.581
5.892999999999999
Win percentage: 58.92999999999999%


In [64]:
mlp_4 = MLPRegressor(hidden_layer_sizes=(200,120), max_iter=300, activation = 'relu', solver='adam', random_state=123)
from sklearn.model_selection import train_test_split

X_transform = CustomFeatureEngineering(epsilon_case=True,duplicate_case=True).transform(df)
X = X_transformed[['Encode_0','Encode_1','Encode_2','Encode_3','Encode_4','Encode_5','Encode_6','Encode_7','Encode_8','Encode_9','Encode_10',
        'Encode_11','Encode_12','Encode_13','Encode_14','Encode_15','Pit_Position', 'Goal_Position', 'Wall_Position', 'Reward']]
y = X_transformed[['Y']]
mlp_4.fit(X.values,y.values.ravel())

In [65]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = mlp_test_model(mlp_4, experience_mlp, 'random', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 605
0.605
0.605
Games played: 1000, # of wins: 599
0.599
1.204
Games played: 1000, # of wins: 591
0.591
1.795
Games played: 1000, # of wins: 612
0.612
2.407
Games played: 1000, # of wins: 621
0.621
3.028
Games played: 1000, # of wins: 611
0.611
3.6390000000000002
Games played: 1000, # of wins: 600
0.6
4.239
Games played: 1000, # of wins: 592
0.592
4.8309999999999995
Games played: 1000, # of wins: 564
0.564
5.395
Games played: 1000, # of wins: 575
0.575
5.97
Win percentage: 59.699999999999996%
