# Dataset_Transformsation

**Content**:
1. [Libraries](#libraries)
2. [Gridworld Creation](#gridworld)
3. [Functions](#funct)
4. [Uniform Dataset Transformations](#transformations)
5. [MLP code](#mlp)
6. [Dataset Manipulation](#manipulation)
7. [Bellman Operator](#bellman)

## 1. Libraries <a id = "libraries"> 

In [1]:
import numpy as np
import random
import sys
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

## 2. Gridworld Creation <a id = "gridworld">

In [2]:
def randPair(s,e):
    return np.random.randint(s,e), np.random.randint(s,e)

class BoardPiece:

    def __init__(self, name, code, pos):
        self.name = name #name of the piece
        self.code = code #an ASCII character to display on the board
        self.pos = pos #2-tuple e.g. (1,4)

class BoardMask:

    def __init__(self, name, mask, code):
        self.name = name
        self.mask = mask
        self.code = code

    def get_positions(self): #returns tuple of arrays
        return np.nonzero(self.mask)

def zip_positions2d(positions): #positions is tuple of two arrays
    x,y = positions
    return list(zip(x,y))

class GridBoard:

    def __init__(self, size=4):
        self.size = size #Board dimensions, e.g. 4 x 4
        self.components = {} #name : board piece
        self.masks = {}

    def addPiece(self, name, code, pos=(0,0)):
        newPiece = BoardPiece(name, code, pos)
        self.components[name] = newPiece

    #basically a set of boundary elements
    def addMask(self, name, mask, code):
        #mask is a 2D-numpy array with 1s where the boundary elements are
        newMask = BoardMask(name, mask, code)
        self.masks[name] = newMask

    def movePiece(self, name, pos):
        move = True
        for _, mask in self.masks.items():
            if pos in zip_positions2d(mask.get_positions()):
                move = False
        if move:
            self.components[name].pos = pos

    def delPiece(self, name):
        del self.components['name']

    def render(self):
        dtype = '<U2'
        displ_board = np.zeros((self.size, self.size), dtype=dtype)
        displ_board[:] = ' '

        for name, piece in self.components.items():
            displ_board[piece.pos] = piece.code

        for name, mask in self.masks.items():
            displ_board[mask.get_positions()] = mask.code

        return displ_board

    def render_np(self):
        num_pieces = len(self.components) + len(self.masks)
        displ_board = np.zeros((num_pieces, self.size, self.size), dtype=np.uint8)
        layer = 0
        for name, piece in self.components.items():
            pos = (layer,) + piece.pos
            displ_board[pos] = 1
            layer += 1

        for name, mask in self.masks.items():
            x,y = self.masks['boundary'].get_positions()
            z = np.repeat(layer,len(x))
            a = (z,x,y)
            displ_board[a] = 1
            layer += 1
        return displ_board

def addTuple(a,b):
    return tuple([sum(x) for x in zip(a,b)])

In [3]:
class Gridworld:

    def __init__(self, size=4, mode='static'):
        if size >= 4:
            self.board = GridBoard(size=size)
        else:
            print("Minimum board size is 4. Initialized to size 4.")
            self.board = GridBoard(size=4)

        #Add pieces, positions will be updated later
        self.board.addPiece('Player','P',(0,0))
        self.board.addPiece('Goal','+',(0,3))
        self.board.addPiece('Pit','-',(0,3))
        self.board.addPiece('Wall','W',(2,3))

        if mode == 'static':
            self.initGridStatic()
        elif mode == 'player':
            self.initGridPlayer()
        else:
            self.initGridRand()

    #Initialize stationary grid, all items are placed deterministically
    def initGridStatic(self):
        #Setup static pieces
        self.board.components['Player'].pos = randPair(0,self.board.size) #Row, Column
        self.board.components['Goal'].pos = (3,0)
        self.board.components['Pit'].pos = (3,1)
        self.board.components['Wall'].pos = (3,2)
        
        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridStatic()


    #Check if board is initialized appropriately (no overlapping pieces)
    #also remove impossible-to-win boards
    def validateBoard(self):
        valid = True

        player = self.board.components['Player']
        goal = self.board.components['Goal']
        wall = self.board.components['Wall']
        pit = self.board.components['Pit']

        all_positions = [piece for name,piece in self.board.components.items()]
        all_positions = [player.pos, goal.pos, wall.pos, pit.pos]
        if len(all_positions) > len(set(all_positions)):
            return False

        corners = [(0,0),(0,self.board.size), (self.board.size,0), (self.board.size,self.board.size)]
        #if player is in corner, can it move? if goal is in corner, is it blocked?
        if player.pos in corners or goal.pos in corners:
            val_move_pl = [self.validateMove('Player', addpos) for addpos in [(0,1),(1,0),(-1,0),(0,-1)]]
            val_move_go = [self.validateMove('Goal', addpos) for addpos in [(0,1),(1,0),(-1,0),(0,-1)]]
            if 0 not in val_move_pl or 0 not in val_move_go:
                #print(self.display())
                #print("Invalid board. Re-initializing...")
                valid = False

        return valid

    #Initialize player in random location, but keep wall, goal and pit stationary
    def initGridPlayer(self):
        #height x width x depth (number of pieces)
        self.initGridStatic()
        #place player
        self.board.components['Player'].pos = randPair(0,self.board.size)

        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridPlayer()

    #Initialize grid so that goal, pit, wall, player are all randomly placed
    def initGridRand(self):
        #height x width x depth (number of pieces)
        self.board.components['Player'].pos = randPair(0,self.board.size)
        self.board.components['Goal'].pos = randPair(0,self.board.size)
        self.board.components['Pit'].pos = randPair(0,self.board.size)
        self.board.components['Wall'].pos = randPair(0,self.board.size)

        if (not self.validateBoard()):
            #print('Invalid grid. Rebuilding..')
            self.initGridRand()

    def validateMove(self, piece, addpos=(0,0)):
        outcome = 0 #0 is valid, 1 invalid, 2 lost game
        pit = self.board.components['Pit'].pos
        wall = self.board.components['Wall'].pos
        new_pos = addTuple(self.board.components[piece].pos, addpos)
        if new_pos == wall:
            outcome = 1 #block move, player can't move to wall
        elif max(new_pos) > (self.board.size-1):    #if outside bounds of board
            outcome = 1
        elif min(new_pos) < 0: #if outside bounds
            outcome = 1
        elif new_pos == pit:
            outcome = 2

        return outcome

    def makeMove(self, action):
        #need to determine what object (if any) is in the new grid spot the player is moving to
        #actions in {u,d,l,r}
        def checkMove(addpos):
            if self.validateMove('Player', addpos) in [0,2]:
                new_pos = addTuple(self.board.components['Player'].pos, addpos)
                self.board.movePiece('Player', new_pos)

        if action == 'u': #up
            checkMove((-1,0))
        elif action == 'd': #down
            checkMove((1,0))
        elif action == 'l': #left
            checkMove((0,-1))
        elif action == 'r': #right
            checkMove((0,1))
        else:
            pass

    def reward(self):
        if (self.board.components['Player'].pos == self.board.components['Pit'].pos):
            return -10
        elif (self.board.components['Player'].pos == self.board.components['Goal'].pos):
            return 10
        else:
            return -1

    def display(self):
        return self.board.render()

In [4]:
action_set = {
    0: 'u',
    1: 'd',
    2: 'l',
    3: 'r',
}

## 3. Functions <a id = 'funct'> 

In [5]:
"""
from2dto1d:
    Args: 
        pos(tuple):2d position of the objects(Player, Wall, Goal, Pit) in the gridworld
    Return:
        return(integer): 1d position of the objects(Player, Wall, Goal, Pit)in the gridworld
""" 
def from2dto1d(pos):
    if pos == '(0, 0)':
        return 0
    if pos == '(0, 1)':
        return 1
    if pos == '(0, 2)':
        return 2
    if pos == '(0, 3)':
        return 3
    if pos == '(1, 0)':
        return 4
    if pos == '(1, 1)':
        return 5
    if pos == '(1, 2)':
        return 6
    if pos == '(1, 3)':
        return 7
    if pos == '(2, 0)':
        return 8
    if pos == '(2, 1)':
        return 9
    if pos == '(2, 2)':
        return 10
    if pos == '(2, 3)':
        return 11
    if pos == '(3, 0)':
        return 12
    if pos == '(3, 1)':
        return 13
    if pos == '(3, 2)':
        return 14
    if pos == '(3, 3)':
        return 15

In [6]:
def from1dto2d(pos):
    if pos == 0:
        return (0, 0)
    if pos == 1:
        return (0, 1)
    if pos == 2:
        return (0, 2)
    if pos == 3:
        return (0, 3)
    if pos == 4:
        return (1, 0)
    if pos == 5:
        return (1, 1)
    if pos == 6:
        return (1, 2)
    if pos == 7:
        return (1, 3)
    if pos == 8:
        return (2, 0)
    if pos == 9:
        return (2, 1)
    if pos == 10:
        return (2, 2)
    if pos == 11:
        return (2, 3)
    if pos == 12:
        return (3, 0)
    if pos == 13:
        return (3, 1)
    if pos == 14:
        return (3, 2)
    if pos == 15:
        return (3, 3)

In [7]:
"""
from_num_to_one_hot_encode:
    Args: 
        num(int): number from 0 to 15 representing the state of the objects(Player, Wall, Goal, Pit) in the gridworld
    Return:
        return(tuple): encoded binary code with the size of 16-bit 
"""
def from_num_to_one_hot_encode(num):
    en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    if num == 0:
        en0 = 1
    if num == 1:
        en1 = 1
    if num == 2:
        en2 = 1
    if num == 3:
        en3 = 1
    if num == 4:
        en4 = 1
    if num == 5:
        en5 = 1
    if num == 6:
        en6 = 1
    if num == 7:
        en7 = 1
    if num == 8:
        en8 = 1
    if num == 9:
        en9 = 1
    if num == 10:
        en10 = 1
    if num == 11:
        en11 = 1
    if num == 12:
        en12 = 1
    if num == 13:
        en13 = 1
    if num == 14:
        en14 = 1
    if num == 15:
        en15 = 1
    return en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15

## 4. Uniform Dataset Transformations <a id = "transformations">

In [8]:
df = pd.read_csv("UniformDataset.csv")
df.columns

Index(['Unnamed: 0', 'Epochs', 'Current_State', 'Pit_Position',
       'Goal_Position', 'Wall_Position', 'Action', 'Reward', 'Next_State',
       'Epsilon_Boolean', 'Q_Max', 'Q1_Current', 'Q2_Current', 'Q3_Current',
       'Q4_Current', 'Q1_Next', 'Q2_Next', 'Q3_Next', 'Q4_Next', 'Q_Next_Max',
       'Y', 'Current_Reward', 'Encode_0', 'Encode_1', 'Encode_2', 'Encode_3',
       'Encode_4', 'Encode_5', 'Encode_6', 'Encode_7', 'Encode_8', 'Encode_9',
       'Encode_10', 'Encode_11', 'Encode_12', 'Encode_13', 'Encode_14',
       'Encode_15'],
      dtype='object')

In [9]:
df

Unnamed: 0.1,Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,...,Encode_6,Encode_7,Encode_8,Encode_9,Encode_10,Encode_11,Encode_12,Encode_13,Encode_14,Encode_15
0,0,0,0,0,2,1,3,-10,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,2,0,2,1,1,-1,6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2,3,0,2,1,2,10,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,3,4,0,2,1,3,-1,5,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,4,5,0,2,1,3,-1,6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50395,50395,50395,10,15,13,14,2,-1,9,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50396,50396,50396,11,15,13,14,2,-1,10,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50397,50397,50397,12,15,13,14,3,10,13,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50398,50398,50398,13,15,13,14,2,-1,12,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [10]:
new_df = df[['Current_State', 'Pit_Position', 'Goal_Position', 'Wall_Position','Current_Reward','Q_Max', 
             'Q1_Current', 'Q2_Current', 'Q3_Current','Q4_Current']].copy()
new_df

Unnamed: 0,Current_State,Pit_Position,Goal_Position,Wall_Position,Current_Reward,Q_Max,Q1_Current,Q2_Current,Q3_Current,Q4_Current
0,0,0,2,1,-10,5.549575,3.727585,4.591288,3.789817,5.549575
1,2,0,2,1,10,7.489746,6.112119,7.489746,1.890567,6.901395
2,3,0,2,1,-1,9.128407,7.629634,5.275113,9.128407,7.267467
3,4,0,2,1,-1,5.525115,-0.356023,4.170507,4.578406,5.525115
4,5,0,2,1,-1,7.460454,6.332054,5.388110,4.154821,7.460454
...,...,...,...,...,...,...,...,...,...,...
50395,10,15,13,14,-1,8.503292,4.584646,7.462064,8.503292,6.897066
50396,11,15,13,14,-1,6.954754,3.595412,2.446715,6.954754,4.659037
50397,12,15,13,14,-1,9.970391,6.419645,7.699754,7.663779,9.970391
50398,13,15,13,14,10,10.481168,6.489353,8.897238,10.481168,5.423398


### 4.1 One Hot Encoder

In [11]:
x = df[["Current_State"]]
y = OneHotEncoder().fit_transform(x).toarray()
dataset = pd.DataFrame({'Encode_0': y[:, 0], 'Encode_1': y[:, 1], 'Encode_2': y[:, 2], 'Encode_3': y[:, 3], 'Encode_4': y[:, 4], 'Encode_5': y[:, 5], 'Encode_6': y[:, 6], 'Encode_7': y[:, 7], 'Encode_8': y[:, 8], 'Encode_9': y[:, 9], 'Encode_10': y[:, 10], 'Encode_11': y[:, 11], 'Encode_12': y[:, 12], 'Encode_13': y[:, 13], 'Encode_14': y[:, 14], 'Encode_15': y[:, 15]})
df['Encode_0'] = dataset["Encode_0"]
df['Encode_1'] = dataset["Encode_1"]
df['Encode_2'] = dataset["Encode_2"]
df['Encode_3'] = dataset["Encode_3"]
df['Encode_4'] = dataset["Encode_4"]
df['Encode_5'] = dataset["Encode_5"]
df['Encode_6'] = dataset["Encode_6"]
df['Encode_7'] = dataset["Encode_7"]
df['Encode_8'] = dataset["Encode_8"]
df['Encode_9'] = dataset["Encode_9"]
df['Encode_10'] = dataset["Encode_10"]
df['Encode_11'] = dataset["Encode_11"]
df['Encode_12'] = dataset["Encode_12"]
df['Encode_13'] = dataset["Encode_13"]
df['Encode_14'] = dataset["Encode_14"]
df['Encode_15'] = dataset["Encode_15"]

## 5. MLP Code <a id = "mlp">

### 5.1 With Reward

In [12]:
mlp = MLPRegressor(hidden_layer_sizes=(200,120), max_iter=300, activation = 'relu', solver='adam', random_state=123)

In [13]:
X = df[['Encode_0','Encode_1','Encode_2','Encode_3','Encode_4','Encode_5','Encode_6','Encode_7','Encode_8','Encode_9','Encode_10',
        'Encode_11','Encode_12','Encode_13','Encode_14','Encode_15','Pit_Position', 'Goal_Position', 'Wall_Position', 'Reward']]
y = df[['Y']]
mlp.fit(X.values,y.values.ravel())

In [14]:
def mlp_test_model(model_b, experience, mode='static', display=True):
    global counter
    i = 0
    test_game = Gridworld(mode=mode)
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    j = 0
    while(status == 1): #A
        p_curr = from2dto1d((str(test_game.board.components['Player'].pos)))
        pi = from2dto1d((str(test_game.board.components['Pit'].pos)))
        g = from2dto1d((str(test_game.board.components['Goal'].pos)))
        w = from2dto1d((str(test_game.board.components['Wall'].pos)))

        q_value_next = []

        test_game.makeMove('u')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        
        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0])
            test_game.makeMove('d')
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('d')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        
        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0])
            test_game.makeMove('u')
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('l')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        

        if p_curr != p:
            test_game.makeMove('r')
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0])
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('r')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        

        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0])
            test_game.makeMove('l')
        else:
            q_value_next.append(-10)
        
#         print(q_value_next)
        
        
        action_ = np.argmax(q_value_next)
#         print(action_)
        action = action_set[action_]
        
        experience[counter][0] = p_curr
        experience[counter][1] = pi
        experience[counter][2] = g
        experience[counter][3] = w
        experience[counter][4] = action_
        
    
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        p_next = from2dto1d((str(test_game.board.components['Player'].pos)))
        experience[counter][5] = p_next
        
        j += 1
        if display:
            print(test_game.display())
        reward = test_game.reward()
        experience[counter][6] = reward
        if reward > -1:
            experience[counter][7] = True
        else:
            experience[counter][7] = False
        experience[counter][8] =  q_value_next[action_]
        experience[counter][9] = q_value_next[0]
        experience[counter][10] = q_value_next[1]
        experience[counter][11] = q_value_next[2]
        experience[counter][12] = q_value_next[3]
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        counter += 1
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [15]:
h = 0
w, h = 13, 3000000
experience_mlp = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [16]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = mlp_test_model(mlp, experience_mlp, 'random', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 753
0.753
0.753
Games played: 1000, # of wins: 774
0.774
1.5270000000000001
Games played: 1000, # of wins: 762
0.762
2.289
Games played: 1000, # of wins: 758
0.758
3.047
Games played: 1000, # of wins: 768
0.768
3.8150000000000004
Games played: 1000, # of wins: 780
0.78
4.595000000000001
Games played: 1000, # of wins: 773
0.773
5.368
Games played: 1000, # of wins: 788
0.788
6.156000000000001
Games played: 1000, # of wins: 751
0.751
6.907000000000001
Games played: 1000, # of wins: 756
0.756
7.663000000000001
Win percentage: 76.63000000000001%


### 5.2 Without Reward

In [17]:
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(hidden_layer_sizes=(200,120), max_iter=300, activation = 'relu', solver='adam', random_state=123)

In [18]:
from sklearn.model_selection import train_test_split

X = df[['Encode_0','Encode_1','Encode_2','Encode_3','Encode_4','Encode_5','Encode_6','Encode_7','Encode_8','Encode_9','Encode_10',
        'Encode_11','Encode_12','Encode_13','Encode_14','Encode_15','Pit_Position', 'Goal_Position', 'Wall_Position']]
y = df[['Y']]
mlp.fit(X.values,y.values.ravel())

In [19]:
h = 0
w, h = 13, 3000000
experience_mlp = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [20]:
def mlp_test_model(model_b, experience, mode='static', display=True):
    global counter
    i = 0
    test_game = Gridworld(mode=mode)
#     state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
#     state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    j = 0
    while(status == 1): #A
        p_curr = from2dto1d((str(test_game.board.components['Player'].pos)))
        pi = from2dto1d((str(test_game.board.components['Pit'].pos)))
        g = from2dto1d((str(test_game.board.components['Goal'].pos)))
        w = from2dto1d((str(test_game.board.components['Wall'].pos)))
        #q_current = model.predict([[p_curr,pi,g,w]])[0][0]
        
#         q_value_current = []
        q_value_next = []
#         q_value = []
        test_game.makeMove('u')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
       # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w]])[0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('d')
        
        test_game.makeMove('d')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
       # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w]])[0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('u')
        
        test_game.makeMove('l')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
       # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w]])[0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('r')
        
        test_game.makeMove('r')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)

        reward = test_game.reward()
        # q_value_current.append(model.predict([[p,pi,g,w]])[0][0])
        q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w]])[0])
        #q_value.append(abs(model_b.predict([[p,pi,g,w,reward]])[0][0]-q_current))
        if p_curr != p:
            test_game.makeMove('l')
        
#         print(q_value_next)
        
        
        action_ = np.argmax(q_value_next)
#         print(action_)
        action = action_set[action_]
        
        experience[counter][0] = p_curr
        experience[counter][1] = pi
        experience[counter][2] = g
        experience[counter][3] = w
        experience[counter][4] = action_
        
    
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        p_next = from2dto1d((str(test_game.board.components['Player'].pos)))
        experience[counter][5] = p_next
        
        j += 1
        if display:
            print(test_game.display())
        reward = test_game.reward()
        experience[counter][6] = reward
        if reward > -1:
            experience[counter][7] = True
        else:
            experience[counter][7] = False
        experience[counter][8] =  q_value_next[action_]
        experience[counter][9] = q_value_next[0]
        experience[counter][10] = q_value_next[1]
        experience[counter][11] = q_value_next[2]
        experience[counter][12] = q_value_next[3]
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        counter += 1
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [21]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = mlp_test_model(mlp, experience_mlp, 'random', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 212
0.212
0.212
Games played: 1000, # of wins: 211
0.211
0.423
Games played: 1000, # of wins: 206
0.206
0.629
Games played: 1000, # of wins: 206
0.206
0.835
Games played: 1000, # of wins: 229
0.229
1.064
Games played: 1000, # of wins: 200
0.2
1.264
Games played: 1000, # of wins: 213
0.213
1.477
Games played: 1000, # of wins: 201
0.201
1.6780000000000002
Games played: 1000, # of wins: 201
0.201
1.8790000000000002
Games played: 1000, # of wins: 211
0.211
2.0900000000000003
Win percentage: 20.900000000000002%


### 5.3 Predict on Action

In [22]:
df.columns

Index(['Unnamed: 0', 'Epochs', 'Current_State', 'Pit_Position',
       'Goal_Position', 'Wall_Position', 'Action', 'Reward', 'Next_State',
       'Epsilon_Boolean', 'Q_Max', 'Q1_Current', 'Q2_Current', 'Q3_Current',
       'Q4_Current', 'Q1_Next', 'Q2_Next', 'Q3_Next', 'Q4_Next', 'Q_Next_Max',
       'Y', 'Current_Reward', 'Encode_0', 'Encode_1', 'Encode_2', 'Encode_3',
       'Encode_4', 'Encode_5', 'Encode_6', 'Encode_7', 'Encode_8', 'Encode_9',
       'Encode_10', 'Encode_11', 'Encode_12', 'Encode_13', 'Encode_14',
       'Encode_15'],
      dtype='object')

In [23]:
mlp = MLPClassifier(hidden_layer_sizes=(200,120), max_iter=300, activation = 'relu', solver='adam', random_state=123)

In [24]:
X = df[['Encode_0','Encode_1','Encode_2','Encode_3','Encode_4','Encode_5','Encode_6','Encode_7','Encode_8','Encode_9','Encode_10',
        'Encode_11','Encode_12','Encode_13','Encode_14','Encode_15','Pit_Position', 'Goal_Position', 'Wall_Position']]
y = df[['Action']]
mlp.fit(X.values,y.values.ravel())

In [25]:
h = 0
w, h = 7, 3000000
experience_action_mlp = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [26]:
def mlp_test_model(model_b, experience, mode='static', display=True):
    global counter
    i = 0
    test_game = Gridworld(mode=mode)
#     state_ = test_game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
#     state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    j = 0
    while(status == 1): #A
        p_curr = from2dto1d((str(test_game.board.components['Player'].pos)))
        pi = from2dto1d((str(test_game.board.components['Pit'].pos)))
        g = from2dto1d((str(test_game.board.components['Goal'].pos)))
        w = from2dto1d((str(test_game.board.components['Wall'].pos)))
        en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p_curr)
        action_ = model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w]])[0]
        action = action_set[action_]
        
        experience[counter][0] = p_curr
        experience[counter][1] = pi
        experience[counter][2] = g
        experience[counter][3] = w
        experience[counter][4] = action_
        
    
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        p_next = from2dto1d((str(test_game.board.components['Player'].pos)))

        j += 1
        if display:
            print(test_game.display())
        reward = test_game.reward()
        experience[counter][5] = reward
        if reward > -1:
            experience[counter][6] = True
        else:
            experience[counter][6] = False
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        counter += 1
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    win = True if status == 2 else False
    return win

In [27]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = mlp_test_model(mlp, experience_action_mlp, 'random', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 875
0.875
0.875
Games played: 1000, # of wins: 845
0.845
1.72
Games played: 1000, # of wins: 868
0.868
2.588
Games played: 1000, # of wins: 886
0.886
3.474
Games played: 1000, # of wins: 856
0.856
4.33
Games played: 1000, # of wins: 856
0.856
5.186
Games played: 1000, # of wins: 874
0.874
6.06
Games played: 1000, # of wins: 855
0.855
6.914999999999999
Games played: 1000, # of wins: 868
0.868
7.7829999999999995
Games played: 1000, # of wins: 864
0.864
8.647
Win percentage: 86.47%


## 6. Dataset Manipulation <a id = "manipulation"> 

In [28]:
df.columns

Index(['Unnamed: 0', 'Epochs', 'Current_State', 'Pit_Position',
       'Goal_Position', 'Wall_Position', 'Action', 'Reward', 'Next_State',
       'Epsilon_Boolean', 'Q_Max', 'Q1_Current', 'Q2_Current', 'Q3_Current',
       'Q4_Current', 'Q1_Next', 'Q2_Next', 'Q3_Next', 'Q4_Next', 'Q_Next_Max',
       'Y', 'Current_Reward', 'Encode_0', 'Encode_1', 'Encode_2', 'Encode_3',
       'Encode_4', 'Encode_5', 'Encode_6', 'Encode_7', 'Encode_8', 'Encode_9',
       'Encode_10', 'Encode_11', 'Encode_12', 'Encode_13', 'Encode_14',
       'Encode_15'],
      dtype='object')

In [29]:
action_set = {
    0: 'u',
    1: 'd',
    2: 'l',
    3: 'r',
}

In [30]:
df

Unnamed: 0.1,Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,...,Encode_6,Encode_7,Encode_8,Encode_9,Encode_10,Encode_11,Encode_12,Encode_13,Encode_14,Encode_15
0,0,0,0,0,2,1,3,-10,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,2,0,2,1,1,-1,6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2,3,0,2,1,2,10,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,3,4,0,2,1,3,-1,5,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,4,5,0,2,1,3,-1,6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50395,50395,50395,10,15,13,14,2,-1,9,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50396,50396,50396,11,15,13,14,2,-1,10,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50397,50397,50397,12,15,13,14,3,10,13,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50398,50398,50398,13,15,13,14,2,-1,12,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [31]:
df.shape[0]

50400

In [32]:
counter = 0

h = 0
w, h = 6, 3000000
matrix = [[0 for x in range(w)] for y in range(h)] 

for i in range (0, df.shape[0]):
    game = Gridworld(size=4, mode='random')
    game.board.components['Player'].pos = from1dto2d(df["Current_State"][i])
    game.board.components['Goal'].pos = from1dto2d(df["Goal_Position"][i])
    game.board.components['Wall'].pos = from1dto2d(df["Wall_Position"][i])
    game.board.components['Pit'].pos = from1dto2d(df["Pit_Position"][i])
    
    pi = from2dto1d((str(game.board.components['Player'].pos)))
    game.makeMove('u')
    p = from2dto1d((str(game.board.components['Player'].pos)))
    if p == pi:
        pass
    else:
        matrix[counter][0] = from2dto1d((str(game.board.components['Player'].pos)))
        matrix[counter][1] = from2dto1d((str(game.board.components['Goal'].pos)))
        matrix[counter][2] = from2dto1d((str(game.board.components['Wall'].pos)))
        matrix[counter][3] = from2dto1d((str(game.board.components['Pit'].pos)))
        matrix[counter][4] = df["Q1_Current"][i]
        reward = game.reward()
        matrix[counter][5] = reward
        game.makeMove('d')
        counter += 1 
        
    pi = from2dto1d((str(game.board.components['Player'].pos)))    
    game.makeMove('d')
    p = from2dto1d((str(game.board.components['Player'].pos)))
    if p == pi:
        pass
    else:
        matrix[counter][0] = from2dto1d((str(game.board.components['Player'].pos)))
        matrix[counter][1] = from2dto1d((str(game.board.components['Goal'].pos)))
        matrix[counter][2] = from2dto1d((str(game.board.components['Wall'].pos)))
        matrix[counter][3] = from2dto1d((str(game.board.components['Pit'].pos)))
        matrix[counter][4] = df["Q2_Current"][i]
        reward = game.reward()
        matrix[counter][5] = reward
        game.makeMove('u')
        counter += 1 
        
    pi = from2dto1d((str(game.board.components['Player'].pos)))    
    game.makeMove('l')
    p = from2dto1d((str(game.board.components['Player'].pos)))
    if p == pi:
        pass
    else:
        matrix[counter][0] = from2dto1d((str(game.board.components['Player'].pos)))
        matrix[counter][1] = from2dto1d((str(game.board.components['Goal'].pos)))
        matrix[counter][2] = from2dto1d((str(game.board.components['Wall'].pos)))
        matrix[counter][3] = from2dto1d((str(game.board.components['Pit'].pos)))
        matrix[counter][4] = df["Q3_Current"][i]
        reward = game.reward()
        matrix[counter][5] = reward
        game.makeMove('r')
        counter += 1 
        
    pi = from2dto1d((str(game.board.components['Player'].pos)))   
    
    game.makeMove('r')
    p = from2dto1d((str(game.board.components['Player'].pos)))
    if p == pi:
        pass
    else:
        matrix[counter][0] = from2dto1d((str(game.board.components['Player'].pos)))
        matrix[counter][1] = from2dto1d((str(game.board.components['Goal'].pos)))
        matrix[counter][2] = from2dto1d((str(game.board.components['Wall'].pos)))
        matrix[counter][3] = from2dto1d((str(game.board.components['Pit'].pos)))
        matrix[counter][4] = df["Q4_Current"][i]
        reward = game.reward()
        matrix[counter][5] = reward
        game.makeMove('l')
        counter += 1 

In [33]:
column_names = ["Player",  "Goal", "Wall", "Pit", "Q_value", "Reward"]
df_matrix = pd.DataFrame(matrix, columns = column_names)
df_matrix = df_matrix.loc[(df_matrix != 0).any(axis=1)]
df_matrix

Unnamed: 0,Player,Goal,Wall,Pit,Q_value,Reward
0,4,2,1,0,4.591288,-1
1,6,2,1,0,7.489746,-1
2,3,2,1,0,6.901395,-1
3,7,2,1,0,5.275113,-1
4,2,2,1,0,9.128407,10
...,...,...,...,...,...,...
141115,8,13,14,15,6.419645,-1
141116,13,13,14,15,9.970391,10
141117,9,13,14,15,6.489353,-1
141118,12,13,14,15,10.481168,-1


In [34]:
df_matrix.to_csv('ImprovedDataset.csv')

In [35]:
original_df = df_matrix.copy()
original_df

Unnamed: 0,Player,Goal,Wall,Pit,Q_value,Reward
0,4,2,1,0,4.591288,-1
1,6,2,1,0,7.489746,-1
2,3,2,1,0,6.901395,-1
3,7,2,1,0,5.275113,-1
4,2,2,1,0,9.128407,10
...,...,...,...,...,...,...
141115,8,13,14,15,6.419645,-1
141116,13,13,14,15,9.970391,10
141117,9,13,14,15,6.489353,-1
141118,12,13,14,15,10.481168,-1


In [37]:
no_dup_df

NameError: name 'no_dup_df' is not defined

In [38]:
new_df

Unnamed: 0,Current_State,Pit_Position,Goal_Position,Wall_Position,Current_Reward,Q_Max,Q1_Current,Q2_Current,Q3_Current,Q4_Current
0,0,0,2,1,-10,5.549575,3.727585,4.591288,3.789817,5.549575
1,2,0,2,1,10,7.489746,6.112119,7.489746,1.890567,6.901395
2,3,0,2,1,-1,9.128407,7.629634,5.275113,9.128407,7.267467
3,4,0,2,1,-1,5.525115,-0.356023,4.170507,4.578406,5.525115
4,5,0,2,1,-1,7.460454,6.332054,5.388110,4.154821,7.460454
...,...,...,...,...,...,...,...,...,...,...
50395,10,15,13,14,-1,8.503292,4.584646,7.462064,8.503292,6.897066
50396,11,15,13,14,-1,6.954754,3.595412,2.446715,6.954754,4.659037
50397,12,15,13,14,-1,9.970391,6.419645,7.699754,7.663779,9.970391
50398,13,15,13,14,10,10.481168,6.489353,8.897238,10.481168,5.423398


### 6.1 One hot Encoder 

In [None]:
x = original_df[["Player"]]
y = OneHotEncoder().fit_transform(x).toarray()
dataset = pd.DataFrame({'Encode_0': y[:, 0], 'Encode_1': y[:, 1], 'Encode_2': y[:, 2], 'Encode_3': y[:, 3], 'Encode_4': y[:, 4], 'Encode_5': y[:, 5], 'Encode_6': y[:, 6], 'Encode_7': y[:, 7], 'Encode_8': y[:, 8], 'Encode_9': y[:, 9], 'Encode_10': y[:, 10], 'Encode_11': y[:, 11], 'Encode_12': y[:, 12], 'Encode_13': y[:, 13], 'Encode_14': y[:, 14], 'Encode_15': y[:, 15]})
original_df['Encode_0'] = dataset["Encode_0"]
original_df['Encode_1'] = dataset["Encode_1"]
original_df['Encode_2'] = dataset["Encode_2"]
original_df['Encode_3'] = dataset["Encode_3"]
original_df['Encode_4'] = dataset["Encode_4"]
original_df['Encode_5'] = dataset["Encode_5"]
original_df['Encode_6'] = dataset["Encode_6"]
original_df['Encode_7'] = dataset["Encode_7"]
original_df['Encode_8'] = dataset["Encode_8"]
original_df['Encode_9'] = dataset["Encode_9"]
original_df['Encode_10'] = dataset["Encode_10"]
original_df['Encode_11'] = dataset["Encode_11"]
original_df['Encode_12'] = dataset["Encode_12"]
original_df['Encode_13'] = dataset["Encode_13"]
original_df['Encode_14'] = dataset["Encode_14"]
original_df['Encode_15'] = dataset["Encode_15"]
original_df

### 6.2 Testing on MLP

#### 6.2.1 Testing with One hot Encoder 

In [None]:
mlp = MLPRegressor(hidden_layer_sizes=(200,120), max_iter=500, activation = 'relu', solver='adam', random_state=123)

In [None]:
X = original_df[['Encode_0','Encode_1','Encode_2','Encode_3','Encode_4','Encode_5','Encode_6','Encode_7','Encode_8',
                 'Encode_9','Encode_10','Encode_11','Encode_12','Encode_13','Encode_14','Encode_15','Pit', 'Goal', 'Wall', 'Reward']]
y = original_df[['Q_value']]
mlp.fit(X.values,y.values.ravel())

In [None]:
def mlp_test_model(model_b, experience, mode='static', display=True):
    global counter
    i = 0
    test_game = Gridworld(mode=mode)
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    j = 0
    while(status == 1): #A
        p_curr = from2dto1d((str(test_game.board.components['Player'].pos)))
        pi = from2dto1d((str(test_game.board.components['Pit'].pos)))
        g = from2dto1d((str(test_game.board.components['Goal'].pos)))
        w = from2dto1d((str(test_game.board.components['Wall'].pos)))

        q_value_next = []

        test_game.makeMove('u')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        
        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0])
            test_game.makeMove('d')
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('d')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        
        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0])
            test_game.makeMove('u')
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('l')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        

        if p_curr != p:
            test_game.makeMove('r')
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0])
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('r')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        

        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15,pi,g,w,reward]])[0])
            test_game.makeMove('l')
        else:
            q_value_next.append(-10)
        
#         print(q_value_next)
        
        
        action_ = np.argmax(q_value_next)
#         print(action_)
        action = action_set[action_]
        
        experience[counter][0] = p_curr
        experience[counter][1] = pi
        experience[counter][2] = g
        experience[counter][3] = w
        experience[counter][4] = action_
        
    
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        p_next = from2dto1d((str(test_game.board.components['Player'].pos)))
        experience[counter][5] = p_next
        
        j += 1
        if display:
            print(test_game.display())
        reward = test_game.reward()
        experience[counter][6] = reward
        if reward > -1:
            experience[counter][7] = True
        else:
            experience[counter][7] = False
        experience[counter][8] =  q_value_next[action_]
        experience[counter][9] = q_value_next[0]
        experience[counter][10] = q_value_next[1]
        experience[counter][11] = q_value_next[2]
        experience[counter][12] = q_value_next[3]
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        counter += 1
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [None]:
h = 0
w, h = 13, 3000000
experience_mlp = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [None]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = mlp_test_model(mlp, experience_mlp, 'random', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

#### 6.2.2 Testing without One hot Encoder

In [None]:
mlp = MLPRegressor(hidden_layer_sizes=(200,120), max_iter=300, activation = 'relu', solver='adam', random_state=123)

In [None]:
X = original_df[['Player','Pit', 'Goal', 'Wall', 'Reward']]
y = original_df[['Q_value']]
mlp.fit(X.values,y.values.ravel())

In [None]:
def mlp_test_model(model_b, experience, mode='static', display=True):
    global counter
    i = 0
    test_game = Gridworld(mode=mode)
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    j = 0
    while(status == 1): #A
        p_curr = from2dto1d((str(test_game.board.components['Player'].pos)))
        pi = from2dto1d((str(test_game.board.components['Pit'].pos)))
        g = from2dto1d((str(test_game.board.components['Goal'].pos)))
        w = from2dto1d((str(test_game.board.components['Wall'].pos)))

        q_value_next = []

        test_game.makeMove('u')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        
        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[p,pi,g,w,reward]])[0])
            test_game.makeMove('d')
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('d')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        
        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[p,pi,g,w,reward]])[0])
            test_game.makeMove('u')
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('l')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        

        if p_curr != p:
            test_game.makeMove('r')
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[p,pi,g,w,reward]])[0])
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('r')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        

        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[p,pi,g,w,reward]])[0])
            test_game.makeMove('l')
        else:
            q_value_next.append(-10)
        
#         print(q_value_next)
        
        
        action_ = np.argmax(q_value_next)
#         print(action_)
        action = action_set[action_]
        
        experience[counter][0] = p_curr
        experience[counter][1] = pi
        experience[counter][2] = g
        experience[counter][3] = w
        experience[counter][4] = action_
        
    
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        p_next = from2dto1d((str(test_game.board.components['Player'].pos)))
        experience[counter][5] = p_next
        
        j += 1
        if display:
            print(test_game.display())
        reward = test_game.reward()
        experience[counter][6] = reward
        if reward > -1:
            experience[counter][7] = True
        else:
            experience[counter][7] = False
        experience[counter][8] =  q_value_next[action_]
        experience[counter][9] = q_value_next[0]
        experience[counter][10] = q_value_next[1]
        experience[counter][11] = q_value_next[2]
        experience[counter][12] = q_value_next[3]
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        counter += 1
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [None]:
h = 0
w, h = 13, 3000000
experience_mlp = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [None]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = mlp_test_model(mlp, experience_mlp, 'random', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

## 7. Bellman Opeartor  <a id = "bellman"> 

In [39]:
original_df

Unnamed: 0,Player,Goal,Wall,Pit,Q_value,Reward
0,4,2,1,0,4.591288,-1
1,6,2,1,0,7.489746,-1
2,3,2,1,0,6.901395,-1
3,7,2,1,0,5.275113,-1
4,2,2,1,0,9.128407,10
...,...,...,...,...,...,...
141115,8,13,14,15,6.419645,-1
141116,13,13,14,15,9.970391,10
141117,9,13,14,15,6.489353,-1
141118,12,13,14,15,10.481168,-1


In [40]:
df

Unnamed: 0.1,Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,...,Encode_6,Encode_7,Encode_8,Encode_9,Encode_10,Encode_11,Encode_12,Encode_13,Encode_14,Encode_15
0,0,0,0,0,2,1,3,-10,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,2,0,2,1,1,-1,6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2,3,0,2,1,2,10,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,3,4,0,2,1,3,-1,5,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,4,5,0,2,1,3,-1,6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50395,50395,50395,10,15,13,14,2,-1,9,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50396,50396,50396,11,15,13,14,2,-1,10,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50397,50397,50397,12,15,13,14,3,10,13,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50398,50398,50398,13,15,13,14,2,-1,12,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [41]:
xx = df.drop_duplicates(keep='last',subset=[ 'Current_State', 'Goal_Position', 'Wall_Position', 'Pit_Position'])
xx

Unnamed: 0.1,Unnamed: 0,Epochs,Current_State,Pit_Position,Goal_Position,Wall_Position,Action,Reward,Next_State,Epsilon_Boolean,...,Encode_6,Encode_7,Encode_8,Encode_9,Encode_10,Encode_11,Encode_12,Encode_13,Encode_14,Encode_15
0,0,0,0,0,2,1,3,-10,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,2,0,2,1,1,-1,6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2,3,0,2,1,2,10,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,3,4,0,2,1,3,-1,5,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,4,5,0,2,1,3,-1,6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50395,50395,50395,10,15,13,14,2,-1,9,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50396,50396,50396,11,15,13,14,2,-1,10,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50397,50397,50397,12,15,13,14,3,10,13,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50398,50398,50398,13,15,13,14,2,-1,12,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [42]:
duplicate = original_df[original_df.duplicated(['Player', 'Goal', 'Wall', 'Pit'])]
duplicate

Unnamed: 0,Player,Goal,Wall,Pit,Q_value,Reward
9,4,2,1,0,4.154821,-1
10,6,2,1,0,7.460454,-1
11,2,2,1,0,9.709291,10
13,5,2,1,0,5.941126,-1
14,7,2,1,0,6.568333,-1
...,...,...,...,...,...,...
141115,8,13,14,15,6.419645,-1
141116,13,13,14,15,9.970391,10
141117,9,13,14,15,6.489353,-1
141118,12,13,14,15,10.481168,-1


In [43]:
original_df

Unnamed: 0,Player,Goal,Wall,Pit,Q_value,Reward
0,4,2,1,0,4.591288,-1
1,6,2,1,0,7.489746,-1
2,3,2,1,0,6.901395,-1
3,7,2,1,0,5.275113,-1
4,2,2,1,0,9.128407,10
...,...,...,...,...,...,...
141115,8,13,14,15,6.419645,-1
141116,13,13,14,15,9.970391,10
141117,9,13,14,15,6.489353,-1
141118,12,13,14,15,10.481168,-1


In [44]:
no_dup_df = original_df.drop_duplicates(keep='last',subset=[ 'Player', 'Goal', 'Wall', 'Pit'])

In [45]:
no_dup_df

Unnamed: 0,Player,Goal,Wall,Pit,Q_value,Reward
5,0,2,1,0,-0.356023,-10
11,2,2,1,0,9.709291,10
15,3,2,1,0,7.962252,-1
18,4,2,1,0,2.647438,-1
21,5,2,1,0,6.384776,-1
...,...,...,...,...,...,...
141115,8,13,14,15,6.419645,-1
141116,13,13,14,15,9.970391,10
141117,9,13,14,15,6.489353,-1
141118,12,13,14,15,10.481168,-1


### 7.1 No duplicate dataset Testing 

In [46]:
mlp = MLPRegressor(hidden_layer_sizes=(200,120), max_iter=300, activation = 'relu', solver='adam', random_state=123)

In [47]:
X = no_dup_df[['Player','Pit', 'Goal', 'Wall', 'Reward']]
y = no_dup_df[['Q_value']]
mlp.fit(X.values,y.values.ravel())

In [48]:
h = 0
w, h = 13, 3000000
experience_mlp = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [49]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = mlp_test_model(mlp, experience_mlp, 'random', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

ValueError: X has 19 features, but MLPRegressor is expecting 5 features as input.

In [50]:
no_dup_df[(no_dup_df["Player"] == 12) & (no_dup_df["Goal"] == 8)  & (no_dup_df["Wall"] == 2) & (no_dup_df["Pit"] == 9)]

Unnamed: 0,Player,Goal,Wall,Pit,Q_value,Reward
80913,12,8,2,9,2.310449,-1


In [51]:
x = no_dup_df[(no_dup_df["Player"] == 12) & (no_dup_df["Goal"] == 8)  & (no_dup_df["Wall"] == 2) & (no_dup_df["Pit"] == 9)].Q_value
a = []
a.append(x)

In [52]:
dc = no_dup_df[(no_dup_df["Player"] == 12) & (no_dup_df["Goal"] == 8)  & (no_dup_df["Wall"] == 2) & (no_dup_df["Pit"] == 9)]['Q_value'].values[0]
dc

2.3104491233825684

In [53]:
x = no_dup_df[(no_dup_df["Player"] == 13) & (no_dup_df["Goal"] == 8)  & (no_dup_df["Wall"] == 2) & (no_dup_df["Pit"] == 9)].Q_value
a.append(x)
x = no_dup_df[(no_dup_df["Player"] == 14) & (no_dup_df["Goal"] == 8)  & (no_dup_df["Wall"] == 2) & (no_dup_df["Pit"] == 9)].Q_value
a.append(x)
x = no_dup_df[(no_dup_df["Player"] == 8) & (no_dup_df["Goal"] == 8)  & (no_dup_df["Wall"] == 2) & (no_dup_df["Pit"] == 9)].Q_value
a.append(x)

In [54]:
a

[80913    2.310449
 Name: Q_value, dtype: float64,
 80916    2.799523
 Name: Q_value, dtype: float64,
 80919    1.749157
 Name: Q_value, dtype: float64,
 80910    6.302371
 Name: Q_value, dtype: float64]

In [55]:
np.argmax(a)

3

In [56]:
no_dup_df.shape[0]

50400

In [57]:
no_dup_df = no_dup_df.reset_index()
no_dup_df

Unnamed: 0,index,Player,Goal,Wall,Pit,Q_value,Reward
0,5,0,2,1,0,-0.356023,-10
1,11,2,2,1,0,9.709291,10
2,15,3,2,1,0,7.962252,-1
3,18,4,2,1,0,2.647438,-1
4,21,5,2,1,0,6.384776,-1
...,...,...,...,...,...,...,...
50395,141115,8,13,14,15,6.419645,-1
50396,141116,13,13,14,15,9.970391,10
50397,141117,9,13,14,15,6.489353,-1
50398,141118,12,13,14,15,10.481168,-1


In [58]:
no_dup_df["Player"]

0         0
1         2
2         3
3         4
4         5
         ..
50395     8
50396    13
50397     9
50398    12
50399    11
Name: Player, Length: 50400, dtype: int64

In [59]:
no_dup_df[(no_dup_df["Player"] == 12) & (no_dup_df["Goal"] == 1)  & (no_dup_df["Wall"] == 2) & (no_dup_df["Pit"] == 3)]['Q_value'].values[0]

2.95438814163208

In [60]:
type(no_dup_df["Goal"][6])

numpy.int64

In [61]:
no_dup_df

Unnamed: 0,index,Player,Goal,Wall,Pit,Q_value,Reward
0,5,0,2,1,0,-0.356023,-10
1,11,2,2,1,0,9.709291,10
2,15,3,2,1,0,7.962252,-1
3,18,4,2,1,0,2.647438,-1
4,21,5,2,1,0,6.384776,-1
...,...,...,...,...,...,...,...
50395,141115,8,13,14,15,6.419645,-1
50396,141116,13,13,14,15,9.970391,10
50397,141117,9,13,14,15,6.489353,-1
50398,141118,12,13,14,15,10.481168,-1


### 7.2 Bellman Operator Dataset Creation

In [62]:
counter = 0

h = 0
w, h = 17, 300000
matrix = [[0 for x in range(w)] for y in range(h)] 

for i in range (0, no_dup_df.shape[0]):
    game = Gridworld(size=4, mode='random')
    game.board.components['Player'].pos = from1dto2d(no_dup_df["Player"][i])
    game.board.components['Goal'].pos = from1dto2d(no_dup_df["Goal"][i])
    game.board.components['Wall'].pos = from1dto2d(no_dup_df["Wall"][i])
    game.board.components['Pit'].pos = from1dto2d(no_dup_df["Pit"][i])
    
    q_value = []
    reward_arr = []
    
    pi = from2dto1d((str(game.board.components['Player'].pos)))
    
    goal = int(no_dup_df["Goal"][i])
    pit = int(no_dup_df["Pit"][i])
    wall = int(no_dup_df["Wall"][i])
    
    matrix[counter][0] = pi
    matrix[counter][1] = no_dup_df["Goal"][i]
    matrix[counter][2] = no_dup_df["Wall"][i]
    matrix[counter][3] = no_dup_df["Pit"][i]
    matrix[counter][4] = no_dup_df["Q_value"][i]
    matrix[counter][5] = no_dup_df["Reward"][i]
    
    game.makeMove('u')
    p = from2dto1d((str(game.board.components['Player'].pos)))

    if p == pi:
        matrix[counter][6] = -10
        q_value.append(-10)
        matrix[counter][7] = -100
        reward_arr.append(-100)
    else:
        matrix[counter][6] = no_dup_df[(no_dup_df["Player"] == int(p)) & (no_dup_df["Goal"] == goal)  & (no_dup_df["Wall"] == wall) & (no_dup_df["Pit"] == pit)]['Q_value'].values[0]
        q_value.append(no_dup_df[(no_dup_df["Player"] == int(p)) & (no_dup_df["Goal"] == goal)  & (no_dup_df["Wall"] == wall) & (no_dup_df["Pit"] == pit)]['Q_value'].values[0])
        reward = game.reward()
        matrix[counter][7] = reward
        reward_arr.append(reward)
        game.makeMove('d')
        
        
    pi = from2dto1d((str(game.board.components['Player'].pos)))    
    game.makeMove('d')
    p = from2dto1d((str(game.board.components['Player'].pos)))
            
    if p == pi:
        matrix[counter][8] = -10
        q_value.append(-10)
        matrix[counter][9] = -100
        reward_arr.append(-100)
    else:
        matrix[counter][8] = no_dup_df[(no_dup_df["Player"] == p) & (no_dup_df["Goal"] == goal)  & (no_dup_df["Wall"] == wall) & (no_dup_df["Pit"] == pit)]['Q_value'].values[0]
        q_value.append(no_dup_df[(no_dup_df["Player"] == p) & (no_dup_df["Goal"] == goal)  & (no_dup_df["Wall"] == wall) & (no_dup_df["Pit"] == pit)]['Q_value'].values[0])
        reward = game.reward()
        matrix[counter][9] = reward
        reward_arr.append(reward)
        game.makeMove('u')
        
        
    pi = from2dto1d((str(game.board.components['Player'].pos)))    
    game.makeMove('l')
    p = from2dto1d((str(game.board.components['Player'].pos)))
            
    if p == pi:
        matrix[counter][10] = -10
        q_value.append(-10)
        matrix[counter][11] = -100
        reward_arr.append(-100)
    else:
        matrix[counter][10] = no_dup_df[(no_dup_df["Player"] == p) & (no_dup_df["Goal"] == goal)  & (no_dup_df["Wall"] == wall) & (no_dup_df["Pit"] == pit)]['Q_value'].values[0]
        q_value.append(no_dup_df[(no_dup_df["Player"] == p) & (no_dup_df["Goal"] == goal)  & (no_dup_df["Wall"] == wall) & (no_dup_df["Pit"] == pit)]['Q_value'].values[0])
        reward = game.reward()
        matrix[counter][11] = reward
        reward_arr.append(reward)
        game.makeMove('r')
        
        
    pi = from2dto1d((str(game.board.components['Player'].pos)))       
    game.makeMove('r')
    p = from2dto1d((str(game.board.components['Player'].pos)))
            
    if p == pi:
        matrix[counter][12] = -10
        q_value.append(-10)
        matrix[counter][13] = -100
        reward_arr.append(-100)
    else:
        matrix[counter][12] = no_dup_df[(no_dup_df["Player"] == p) & (no_dup_df["Goal"] == goal)  & (no_dup_df["Wall"] == wall) & (no_dup_df["Pit"] == pit)]['Q_value'].values[0]
        q_value.append(no_dup_df[(no_dup_df["Player"] == p) & (no_dup_df["Goal"] == goal)  & (no_dup_df["Wall"] == wall) & (no_dup_df["Pit"] == pit)]['Q_value'].values[0])
        reward = game.reward()
        matrix[counter][13] = reward
        reward_arr.append(reward)
        game.makeMove('l')
        
    
    max_qvalue = q_value[np.argmax(q_value)]
    matrix[counter][14] = max_qvalue
    reward_max = reward_arr[np.argmax(q_value)]
    matrix[counter][15] = reward_arr[np.argmax(q_value)]
    bellman = max_qvalue*0.9 + reward_max
    if pi == goal or bellman > 10:
        matrix[counter][16] = 10
    else:
        matrix[counter][16] = bellman
    counter += 1
    
    
    

In [63]:
import pandas as pd
column_names = ["Player","Goal", "Wall", "Pit", "Q_Current", "Reward_Current", "Q1_Next", "Reward_1_Next", 
                "Q2_Next", "Reward_2_Next", "Q3_Next", "Reward_3_Next", "Q4_Next", "Reward_4_Next", 
                "Max_Q_Next", "Reward_Max_Next", "Bellman_Operator"
                ]
bell_df = pd.DataFrame(matrix, columns = column_names)
bell_df = bell_df.loc[(bell_df != 0).any(axis=1)]

In [64]:
bell_df

Unnamed: 0,Player,Goal,Wall,Pit,Q_Current,Reward_Current,Q1_Next,Reward_1_Next,Q2_Next,Reward_2_Next,Q3_Next,Reward_3_Next,Q4_Next,Reward_4_Next,Max_Q_Next,Reward_Max_Next,Bellman_Operator
0,0,2,1,0,-0.356023,-10,-10.000000,-100,2.647438,-1,-10.000000,-100,-10.000000,-100,2.647438,-1,1.382694
1,2,2,1,0,9.709291,10,-10.000000,-100,7.422089,-1,-10.000000,-100,7.962252,-1,7.962252,-1,10.000000
2,3,2,1,0,7.962252,-1,-10.000000,-100,6.114790,-1,9.709291,10,-10.000000,-100,9.709291,10,10.000000
3,4,2,1,0,2.647438,-1,-0.356023,-10,2.080958,-1,-10.000000,-100,6.384776,-1,6.384776,-1,4.746299
4,5,2,1,0,6.384776,-1,-10.000000,-100,3.147437,-1,2.647438,-1,7.422089,-1,7.422089,-1,5.679880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50395,8,13,14,15,6.419645,-1,4.150536,-1,10.481168,-1,-10.000000,-100,6.489353,-1,10.481168,-1,8.433051
50396,13,13,14,15,9.970391,10,6.489353,-1,-10.000000,-100,10.481168,-1,-10.000000,-100,10.481168,-1,10.000000
50397,9,13,14,15,6.489353,-1,3.969529,-1,9.970391,10,6.419645,-1,6.954754,-1,9.970391,10,10.000000
50398,12,13,14,15,10.481168,-1,6.419645,-1,-10.000000,-100,-10.000000,-100,9.970391,10,9.970391,10,10.000000


In [65]:
case = bell_df[((bell_df["Goal"] == 2)  & (bell_df["Wall"] == 1) & (bell_df["Pit"] == 0))]

In [66]:
case.to_csv('Bellman_Operator_Case.csv')

In [67]:
bell_df.to_csv('Bellman_Operator.csv')

### 7.3 Bellman Operator Testing

In [68]:
mlp = MLPRegressor(hidden_layer_sizes=(200,120), max_iter=300, activation = 'relu', solver='adam', random_state=123)

In [69]:
X = bell_df[['Player','Pit', 'Goal', 'Wall']]
y = bell_df[['Bellman_Operator']]
mlp.fit(X.values,y.values.ravel())

In [70]:
h = 0
w, h = 13, 300000
experience_mlp = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [71]:
def mlp_test_model(model_b, experience, mode='static', display=True):
    global counter
    i = 0
    test_game = Gridworld(mode=mode)
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    j = 0
    while(status == 1): #A
        p_curr = from2dto1d((str(test_game.board.components['Player'].pos)))
        pi = from2dto1d((str(test_game.board.components['Pit'].pos)))
        g = from2dto1d((str(test_game.board.components['Goal'].pos)))
        w = from2dto1d((str(test_game.board.components['Wall'].pos)))

        q_value_next = []

        test_game.makeMove('u')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        
        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[p,pi,g,w]])[0])
            test_game.makeMove('d')
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('d')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        
        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[p,pi,g,w]])[0])
            test_game.makeMove('u')
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('l')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        

        if p_curr != p:
            test_game.makeMove('r')
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[p,pi,g,w]])[0])
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('r')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        

        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[p,pi,g,w]])[0])
            test_game.makeMove('l')
        else:
            q_value_next.append(-10)
        
#         print(q_value_next)
        
        
        action_ = np.argmax(q_value_next)
#         print(action_)
        action = action_set[action_]
        
        experience[counter][0] = p_curr
        experience[counter][1] = pi
        experience[counter][2] = g
        experience[counter][3] = w
        experience[counter][4] = action_
        
    
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        p_next = from2dto1d((str(test_game.board.components['Player'].pos)))
        experience[counter][5] = p_next
        
        j += 1
        if display:
            print(test_game.display())
        reward = test_game.reward()
        experience[counter][6] = reward
        if reward > -1:
            experience[counter][7] = True
        else:
            experience[counter][7] = False
        experience[counter][8] =  q_value_next[action_]
        experience[counter][9] = q_value_next[0]
        experience[counter][10] = q_value_next[1]
        experience[counter][11] = q_value_next[2]
        experience[counter][12] = q_value_next[3]
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        counter += 1
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [72]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = mlp_test_model(mlp, experience_mlp, 'random', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))

Games played: 1000, # of wins: 869
0.869
0.869
Games played: 1000, # of wins: 866
0.866
1.7349999999999999
Games played: 1000, # of wins: 866
0.866
2.601
Games played: 1000, # of wins: 872
0.872
3.473
Games played: 1000, # of wins: 882
0.882
4.3549999999999995
Games played: 1000, # of wins: 876
0.876
5.231
Games played: 1000, # of wins: 849
0.849
6.08
Games played: 1000, # of wins: 895
0.895
6.975
Games played: 1000, # of wins: 887
0.887
7.862
Games played: 1000, # of wins: 880
0.88
8.742
Win percentage: 87.42000000000002%


### 7.4 Bellman Operator Obtained from DQN Dataset

In [None]:
counter = 0

h = 0
w, h = 7, 3000000
matrix = [[0 for x in range(w)] for y in range(h)] 

for i in range (0, df.shape[0]):
    game = Gridworld(size=4, mode='random')
    game.board.components['Player'].pos = from1dto2d(df["Current_State"][i])
    game.board.components['Goal'].pos = from1dto2d(df["Goal_Position"][i])
    game.board.components['Wall'].pos = from1dto2d(df["Wall_Position"][i])
    game.board.components['Pit'].pos = from1dto2d(df["Pit_Position"][i])
    
    pi = from2dto1d((str(game.board.components['Player'].pos)))
    game.makeMove('u')
    p = from2dto1d((str(game.board.components['Player'].pos)))
    if p == pi:
        pass
    else:
        matrix[counter][0] = from2dto1d((str(game.board.components['Player'].pos)))
        matrix[counter][1] = from2dto1d((str(game.board.components['Goal'].pos)))
        matrix[counter][2] = from2dto1d((str(game.board.components['Wall'].pos)))
        matrix[counter][3] = from2dto1d((str(game.board.components['Pit'].pos)))
        matrix[counter][4] = df["Q1_Current"][i]
        matrix[counter][6] = df[(df["Current_State"] == p) & (df["Goal_Position"] == from2dto1d((str(game.board.components['Goal'].pos))))  & (df["Wall_Position"] == from2dto1d((str(game.board.components['Wall'].pos)))) & (df["Pit_Position"] == from2dto1d((str(game.board.components['Pit'].pos))))]['Y'].values[0]
        reward = game.reward()
        matrix[counter][5] = reward
        game.makeMove('d')
        counter += 1 
        
    pi = from2dto1d((str(game.board.components['Player'].pos)))    
    game.makeMove('d')
    p = from2dto1d((str(game.board.components['Player'].pos)))
    if p == pi:
        pass
    else:
        matrix[counter][0] = from2dto1d((str(game.board.components['Player'].pos)))
        matrix[counter][1] = from2dto1d((str(game.board.components['Goal'].pos)))
        matrix[counter][2] = from2dto1d((str(game.board.components['Wall'].pos)))
        matrix[counter][3] = from2dto1d((str(game.board.components['Pit'].pos)))
        matrix[counter][4] = df["Q2_Current"][i]
        matrix[counter][6] = df[(df["Current_State"] == p) & (df["Goal_Position"] == from2dto1d((str(game.board.components['Goal'].pos))))  & (df["Wall_Position"] == from2dto1d((str(game.board.components['Wall'].pos)))) & (df["Pit_Position"] == from2dto1d((str(game.board.components['Pit'].pos))))]['Y'].values[0]
        reward = game.reward()
        matrix[counter][5] = reward
        game.makeMove('u')
        counter += 1 
        
    pi = from2dto1d((str(game.board.components['Player'].pos)))    
    game.makeMove('l')
    p = from2dto1d((str(game.board.components['Player'].pos)))
    if p == pi:
        pass
    else:
        matrix[counter][0] = from2dto1d((str(game.board.components['Player'].pos)))
        matrix[counter][1] = from2dto1d((str(game.board.components['Goal'].pos)))
        matrix[counter][2] = from2dto1d((str(game.board.components['Wall'].pos)))
        matrix[counter][3] = from2dto1d((str(game.board.components['Pit'].pos)))
        matrix[counter][4] = df["Q3_Current"][i]
        matrix[counter][6] = df[(df["Current_State"] == p) & (df["Goal_Position"] == from2dto1d((str(game.board.components['Goal'].pos))))  & (df["Wall_Position"] == from2dto1d((str(game.board.components['Wall'].pos)))) & (df["Pit_Position"] == from2dto1d((str(game.board.components['Pit'].pos))))]['Y'].values[0]
        reward = game.reward()
        matrix[counter][5] = reward
        game.makeMove('r')
        counter += 1 
        
    pi = from2dto1d((str(game.board.components['Player'].pos)))   
    
    game.makeMove('r')
    p = from2dto1d((str(game.board.components['Player'].pos)))
    if p == pi:
        pass
    else:
        matrix[counter][0] = from2dto1d((str(game.board.components['Player'].pos)))
        matrix[counter][1] = from2dto1d((str(game.board.components['Goal'].pos)))
        matrix[counter][2] = from2dto1d((str(game.board.components['Wall'].pos)))
        matrix[counter][3] = from2dto1d((str(game.board.components['Pit'].pos)))
        matrix[counter][4] = df["Q4_Current"][i]
        matrix[counter][6] = df[(df["Current_State"] == p) & (df["Goal_Position"] == from2dto1d((str(game.board.components['Goal'].pos))))  & (df["Wall_Position"] == from2dto1d((str(game.board.components['Wall'].pos)))) & (df["Pit_Position"] == from2dto1d((str(game.board.components['Pit'].pos))))]['Y'].values[0]
        reward = game.reward()
        matrix[counter][5] = reward
        game.makeMove('l')
        counter += 1 

In [None]:
column_names = ["Player",  "Goal", "Wall", "Pit", "Q_value", "Reward", "Bellman_Opeartor"]
df_bell_dqn = pd.DataFrame(matrix, columns = column_names)
df_bell_dqn = df_bell_dqn.loc[(df_bell_dqn != 0).any(axis=1)]
df_bell_dqn

In [None]:
no_dup_bell_df = df_bell_dqn.drop_duplicates(keep='last',subset=[ 'Player', 'Goal', 'Wall', 'Pit'])

In [None]:
no_dup_bell_df = no_dup_bell_df.reset_index()
no_dup_bell_df

In [None]:
mlp = MLPRegressor(hidden_layer_sizes=(200,120), max_iter=300, activation = 'relu', solver='adam', random_state=123)

In [None]:
X = no_dup_bell_df[['Player','Pit', 'Goal', 'Wall', 'Reward']]
y = no_dup_bell_df[['Bellman_Opeartor']]
mlp.fit(X.values,y.values.ravel())

In [None]:
def mlp_test_model(model_b, experience, mode='static', display=True):
    global counter
    i = 0
    test_game = Gridworld(mode=mode)
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    j = 0
    while(status == 1): #A
        p_curr = from2dto1d((str(test_game.board.components['Player'].pos)))
        pi = from2dto1d((str(test_game.board.components['Pit'].pos)))
        g = from2dto1d((str(test_game.board.components['Goal'].pos)))
        w = from2dto1d((str(test_game.board.components['Wall'].pos)))

        q_value_next = []

        test_game.makeMove('u')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        
        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[p,pi,g,w,reward]])[0])
            test_game.makeMove('d')
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('d')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        
        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[p,pi,g,w,reward]])[0])
            test_game.makeMove('u')
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('l')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        

        if p_curr != p:
            test_game.makeMove('r')
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[p,pi,g,w,reward]])[0])
        else:
            q_value_next.append(-10)
        
        test_game.makeMove('r')
        p = from2dto1d((str(test_game.board.components['Player'].pos)))
        

        if p_curr != p:
            en0,en1,en2,en3,en4,en5,en6,en7,en8,en9,en10,en11,en12,en13,en14,en15 = from_num_to_one_hot_encode(p)
            reward = test_game.reward()
            q_value_next.append(model_b.predict([[p,pi,g,w,reward]])[0])
            test_game.makeMove('l')
        else:
            q_value_next.append(-10)
        
#         print(q_value_next)
        
        
        action_ = np.argmax(q_value_next)
#         print(action_)
        action = action_set[action_]
        
        experience[counter][0] = p_curr
        experience[counter][1] = pi
        experience[counter][2] = g
        experience[counter][3] = w
        experience[counter][4] = action_
        
    
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        p_next = from2dto1d((str(test_game.board.components['Player'].pos)))
        experience[counter][5] = p_next
        
        j += 1
        if display:
            print(test_game.display())
        reward = test_game.reward()
        experience[counter][6] = reward
        if reward > -1:
            experience[counter][7] = True
        else:
            experience[counter][7] = False
        experience[counter][8] =  q_value_next[action_]
        experience[counter][9] = q_value_next[0]
        experience[counter][10] = q_value_next[1]
        experience[counter][11] = q_value_next[2]
        experience[counter][12] = q_value_next[3]
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        counter += 1
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    
    win = True if status == 2 else False
    return win

In [None]:
h = 0
w, h = 13, 3000000
experience_mlp = [[0 for x in range(w)] for y in range(h)] 

global counter
counter = 0

In [None]:
win_num = 0
for i in range(0,10):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = mlp_test_model(mlp, experience_mlp, 'random', display = False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    win_num = win_num + win_perc
    print("Games played: {0}, # of wins: {1}".format(max_games,wins))
    print(win_perc)
    print(win_num)
win_num = win_num / 10
print("Win percentage: {}%".format(win_num*100))