In [1]:
import numpy as np
import random
import sys
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [2]:
import gym
import matplotlib.pyplot as plt
import numpy as np
from gym_minigrid.wrappers import RGBImgPartialObsWrapper
from gym_minigrid.minigrid import Grid, Wall, Goal, Lava
import random

class CustomMiniGridEnv(gym.Env):
    def __init__(self, grid_size=4):
        super(CustomMiniGridEnv, self).__init__()

        self.grid_size = grid_size
        self.action_space = gym.spaces.Discrete(4)
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(grid_size * 10, grid_size * 10, 3), dtype=np.uint8)
        self.colors = {
            Wall: [255, 255, 0],   # Wall color (Yellow)
            Lava: [255, 0, 0],     # Lava (Pit) color (Red)
            Goal: [0, 255, 0],     # Goal color (Green)
        }
        
        self.reset()

    def reset(self):
        self.grid = Grid(width=self.grid_size, height=self.grid_size)
        self.player_pos = self.generate_random_position([])
        self.wall_pos = self.generate_random_position([self.player_pos])
        self.goal_pos = self.generate_random_position([self.player_pos, self.wall_pos])
        self.pit_pos = self.generate_random_position([self.player_pos, self.wall_pos, self.goal_pos])

        self.grid.set(*self.wall_pos, Wall())
        self.grid.set(*self.pit_pos, Lava())
        self.grid.set(*self.goal_pos, Goal())

        self.reward_counter = 0
        self.reward = 0
        self.step_counter = 0

        return self.get_observation()

    def step(self, action):
        self.perform_action(action)
        self.step_counter += 1

        done = False
        if self.reward == 10 or self.step_counter > 15 or self.reward == -10:
            done = True

        return self.get_observation(), self.reward, done, {}

    def render(self, mode='human'):
        plt.imshow(self.grid_img)
        plt.axis('off')
        plt.show()

    def generate_random_position(self, existing_positions):
        while True:
            pos = (random.randint(0, self.grid_size - 1), random.randint(0, self.grid_size - 1))
            if pos not in existing_positions:
                return pos

    def perform_action(self, action):
        # Clear the previous player position
        self.grid_img[self.player_pos[0] * 10: (self.player_pos[0] + 1) * 10, self.player_pos[1] * 10: (self.player_pos[1] + 1) * 10] = [0, 0, 0]

        # Update the player position based on the action
        if action == 0:  # Up
            new_pos = (max(self.player_pos[0] - 1, 0), self.player_pos[1])
        elif action == 1:  # Down
            new_pos = (min(self.player_pos[0] + 1, self.grid_size - 1), self.player_pos[1])
        elif action == 2:  # Left
            new_pos = (self.player_pos[0], max(self.player_pos[1] - 1, 0))
        elif action == 3:  # Right
            new_pos = (self.player_pos[0], min(self.player_pos[1] + 1, self.grid_size - 1))

        # Check if the new position is a wall
        if isinstance(self.grid.get(*new_pos), Wall):
            new_pos = self.player_pos  # Stay in the same position if hitting a wall

        # Check if the new position is lava
        if isinstance(self.grid.get(*new_pos), Lava):
            self.reward = -10
        elif isinstance(self.grid.get(*new_pos), Goal):
            self.reward = 10
        else:
            self.reward = -1

        # Update the player position
        self.player_pos = new_pos

        # Highlight the new player position
        self.grid_img[self.player_pos[0] * 10: (self.player_pos[0] + 1) * 10, self.player_pos[1] * 10: (self.player_pos[1] + 1) * 10] = [0, 0, 255]

        # Update the reward counter
        self.reward_counter += self.reward

#         # Print the sum of rewards and the current player position's reward
#         print("Total Reward:", self.reward_counter)
#         print("Current Position's Reward:", self.reward)
#         print("Action Done:", action)

    def get_observation(self):
        self.grid_img = np.zeros((self.grid_size * 10, self.grid_size * 10, 3), dtype=np.uint8)

        # Update the grid image with colors
        for row in range(self.grid_size):
            for col in range(self.grid_size):
                cell_type = self.grid.get(row, col).__class__
                self.grid_img[row * 10: (row + 1) * 10, col * 10: (col + 1) * 10] = self.colors.get(cell_type, [0, 0, 0])

        # Highlight the wall, pit, goal, and player positions
        self.grid_img[self.wall_pos[0] * 10: (self.wall_pos[0] + 1) * 10, self.wall_pos[1] * 10: (self.wall_pos[1] + 1) * 10] = [255, 255, 0]
        self.grid_img[self.pit_pos[0] * 10: (self.pit_pos[0] + 1) * 10, self.pit_pos[1] * 10: (self.pit_pos[1] + 1) * 10] = [255, 0, 0]
        self.grid_img[self.goal_pos[0] * 10: (self.goal_pos[0] + 1) * 10, self.goal_pos[1] * 10: (self.goal_pos[1] + 1) * 10] = [0, 255, 0]
        self.grid_img[self.player_pos[0] * 10: (self.player_pos[0] + 1) * 10, self.player_pos[1] * 10: (self.player_pos[1] + 1) * 10] = [0, 0, 255]

        return self.grid_img
    
    def render_np(self):
        player_slice = np.zeros((self.grid_size, self.grid_size), dtype=np.uint8)
        player_slice[self.player_pos] = 1

        goal_slice = np.zeros((self.grid_size, self.grid_size), dtype=np.uint8)
        goal_slice[self.goal_pos] = 1

        pit_slice = np.zeros((self.grid_size, self.grid_size), dtype=np.uint8)
        pit_slice[self.pit_pos] = 1

        wall_slice = np.zeros((self.grid_size, self.grid_size), dtype=np.uint8)
        wall_slice[self.wall_pos] = 1

        return np.stack([player_slice, goal_slice, pit_slice, wall_slice], axis=0)
    
    def get_positions(self):
        player_pos = self.player_pos[0] * self.grid_size + self.player_pos[1]
        lava_pos = self.pit_pos[0] * self.grid_size + self.pit_pos[1]
        goal_pos = self.goal_pos[0] * self.grid_size + self.goal_pos[1]
        wall_pos = self.wall_pos[0] * self.grid_size + self.wall_pos[1]
        
        return player_pos, lava_pos, goal_pos, wall_pos
    
    def set_positions(self, player_pos, goal_pos, wall_pos, pit_pos):
        # Check if the provided positions are within the grid boundaries
        if not self.is_within_grid(player_pos) or not self.is_within_grid(goal_pos) or not self.is_within_grid(pit_pos) or not self.is_within_grid(wall_pos):
            raise ValueError("Invalid positions provided. Positions must be within the grid boundaries.")

        # Update the grid with the new positions
        self.player_pos = player_pos
        self.goal_pos = goal_pos
        self.pit_pos = pit_pos
        self.wall_pos = wall_pos

        # Clear the previous grid layout
        self.grid = Grid(width=self.grid_size, height=self.grid_size)

        # Set the new positions in the grid
        self.grid.set(*self.player_pos, None)  # Clear any previous entity at the player position
        self.grid.set(*self.goal_pos, Goal())
        self.grid.set(*self.pit_pos, Lava())
        self.grid.set(*self.wall_pos, Wall())

        # Reset other variables
        self.reward_counter = 0
        self.reward = 0
        self.step_counter = 0
        self.get_observation()  # Update the grid image

    def is_within_grid(self, position):
        row, col = position
        return 0 <= row < self.grid_size and 0 <= col < self.grid_size

  fn()


## Uniform Dataset Preprocessing

In [3]:
df = pd.read_csv("DatasetBeforeUniform_GYM.csv")
df.drop('Unnamed: 0', axis=1, inplace=True)   
df

Unnamed: 0,Current_State,Pit_Position,Goal_Position,Wall_Position,Current_Reward,Q1_UP,Q2_DOWN,Q3_LEFT,Q4_RIGHT
0,0,0,2,1,-10,4.946462,4.695930,4.974383,4.760919
1,2,0,2,1,-1,8.128312,8.142357,7.115776,7.350907
2,3,0,2,1,10,6.755848,5.773548,9.332119,7.258504
3,4,0,2,1,-1,3.076944,2.692700,4.710314,6.191459
4,5,0,2,1,-1,6.541164,4.304107,5.105873,8.097490
...,...,...,...,...,...,...,...,...,...
50395,10,15,13,14,-1,5.938396,5.431857,7.606215,4.054748
50396,11,15,13,14,-1,2.553046,2.540612,5.090322,4.031656
50397,12,15,13,14,10,4.916274,4.017373,3.690299,6.726671
50398,13,15,13,14,-1,8.272136,6.379996,4.600275,1.773542


### Rearrangement of dataset

In [7]:
def from1dto2d(pos):
    if pos == 0:
        return (0, 0)
    if pos == 1:
        return (0, 1)
    if pos == 2:
        return (0, 2)
    if pos == 3:
        return (0, 3)
    if pos == 4:
        return (1, 0)
    if pos == 5:
        return (1, 1)
    if pos == 6:
        return (1, 2)
    if pos == 7:
        return (1, 3)
    if pos == 8:
        return (2, 0)
    if pos == 9:
        return (2, 1)
    if pos == 10:
        return (2, 2)
    if pos == 11:
        return (2, 3)
    if pos == 12:
        return (3, 0)
    if pos == 13:
        return (3, 1)
    if pos == 14:
        return (3, 2)
    if pos == 15:
        return (3, 3)

In [9]:
counter = 0

h = 0
w, h = 6, 3000000
matrix = [[0 for x in range(w)] for y in range(h)] 

for i in range (0, df.shape[0]):
    game = CustomMiniGridEnv(grid_size=4)
    
    player_pos = from1dto2d(df["Current_State"][i])
    goal_pos = from1dto2d(df["Goal_Position"][i])
    wall_pos = from1dto2d(df["Wall_Position"][i])
    pit_pos = from1dto2d(df["Pit_Position"][i])
    
    game.set_positions(player_pos, goal_pos, wall_pos, pit_pos)
    
    pi = str(game.get_positions()[0])
    obs, reward, done, _ = game.step(0)
    p = str(game.get_positions()[0])
    if p == pi:
        pass
    else:
        matrix[counter][0] = str(game.get_positions()[0])
        matrix[counter][1] = str(game.get_positions()[2])
        matrix[counter][2] = str(game.get_positions()[3])
        matrix[counter][3] = str(game.get_positions()[1])
        matrix[counter][4] = df["Q1_UP"][i]
        matrix[counter][5] = reward
        obs, reward, done, _ = game.step(1)
        counter += 1 
        
    pi = str(game.get_positions()[0]) 
    obs, reward, done, _ = game.step(1)
    p = str(game.get_positions()[0])
    if p == pi:
        pass
    else:
        matrix[counter][0] = str(game.get_positions()[0])
        matrix[counter][1] = str(game.get_positions()[2])
        matrix[counter][2] = str(game.get_positions()[3])
        matrix[counter][3] = str(game.get_positions()[1])
        matrix[counter][4] = df["Q2_DOWN"][i]
        matrix[counter][5] = reward
        obs, reward, done, _ = game.step(0)
        counter += 1 
        
    pi = str(game.get_positions()[0]) 
    obs, reward, done, _ = game.step(2)
    p = str(game.get_positions()[0])
    if p == pi:
        pass
    else:
        matrix[counter][0] = str(game.get_positions()[0])
        matrix[counter][1] = str(game.get_positions()[2])
        matrix[counter][2] = str(game.get_positions()[3])
        matrix[counter][3] = str(game.get_positions()[1])
        matrix[counter][4] = df["Q3_LEFT"][i]
        matrix[counter][5] = reward
        obs, reward, done, _ = game.step(3)
        counter += 1 
        
    pi = str(game.get_positions()[0]) 
    
    obs, reward, done, _ = game.step(3)
    p = str(game.get_positions()[0]) 
    if p == pi:
        pass
    else:
        matrix[counter][0] = str(game.get_positions()[0])
        matrix[counter][1] = str(game.get_positions()[2])
        matrix[counter][2] = str(game.get_positions()[3])
        matrix[counter][3] = str(game.get_positions()[1])
        matrix[counter][4] = df["Q4_RIGHT"][i]
        matrix[counter][5] = reward
        obs, reward, done, _ = game.step(2)
        counter += 1 

In [10]:
column_names = ["Player",  "Goal", "Wall", "Pit", "Q_value", "Reward"]
df_matrix = pd.DataFrame(matrix, columns = column_names)
df_matrix = df_matrix.loc[(df_matrix != 0).any(axis=1)]
df_matrix

Unnamed: 0,Player,Goal,Wall,Pit,Q_value,Reward
0,4,2,1,0,4.695930,-1
1,6,2,1,0,8.142357,-1
2,3,2,1,0,7.350907,-1
3,7,2,1,0,5.773548,-1
4,2,2,1,0,9.332119,10
...,...,...,...,...,...,...
141115,8,13,14,15,4.916274,-1
141116,13,13,14,15,6.726671,10
141117,9,13,14,15,8.272136,-1
141118,12,13,14,15,4.600275,-1


In [11]:
# Drop duplicate positions
df_new = df_matrix.drop_duplicates(keep='last',subset=[ 'Player', 'Goal', 'Wall', 'Pit'])
df_new = df_new.reset_index()
df_new.drop('index', axis=1, inplace=True)   
df_new

Unnamed: 0,Player,Goal,Wall,Pit,Q_value,Reward
0,0,2,1,0,3.076944,-10
1,2,2,1,0,9.783095,10
2,3,2,1,0,8.355206,-1
3,4,2,1,0,3.889865,-1
4,5,2,1,0,6.721507,-1
...,...,...,...,...,...,...
50395,8,13,14,15,4.916274,-1
50396,13,13,14,15,6.726671,10
50397,9,13,14,15,8.272136,-1
50398,12,13,14,15,4.600275,-1


### Bellman Operator Dataset Creation

In [12]:
no_dup_df = df_new
no_dup_df

Unnamed: 0,Player,Goal,Wall,Pit,Q_value,Reward
0,0,2,1,0,3.076944,-10
1,2,2,1,0,9.783095,10
2,3,2,1,0,8.355206,-1
3,4,2,1,0,3.889865,-1
4,5,2,1,0,6.721507,-1
...,...,...,...,...,...,...
50395,8,13,14,15,4.916274,-1
50396,13,13,14,15,6.726671,10
50397,9,13,14,15,8.272136,-1
50398,12,13,14,15,4.600275,-1


In [13]:
counter = 0

h = 0
w, h = 7, 300000
matrix = [[0 for x in range(w)] for y in range(h)] 

for i in range (0, no_dup_df.shape[0]):
    game = CustomMiniGridEnv(grid_size=4)
    
    player_pos = from1dto2d(df["Current_State"][i])
    goal_pos = from1dto2d(df["Goal_Position"][i])
    wall_pos = from1dto2d(df["Wall_Position"][i])
    pit_pos = from1dto2d(df["Pit_Position"][i])
    
    game.set_positions(player_pos, goal_pos, wall_pos, pit_pos)
    
    q_value = []
    reward_arr = []
    
    pi = str(game.get_positions()[0])
    
    goal = int(no_dup_df["Goal"][i])
    pit = int(no_dup_df["Pit"][i])
    wall = int(no_dup_df["Wall"][i])
    
    matrix[counter][0] = pi
    matrix[counter][1] = no_dup_df["Goal"][i]
    matrix[counter][2] = no_dup_df["Wall"][i]
    matrix[counter][3] = no_dup_df["Pit"][i]
    matrix[counter][4] = no_dup_df["Q_value"][i]
    matrix[counter][5] = no_dup_df["Reward"][i]
    
    obs, reward, done, _ = game.step(0)
    
    p = str(game.get_positions()[0])

    if p == pi:
        q_value.append(-10)
        reward_arr.append(-100)
    else:
        q_value.append(no_dup_df[(no_dup_df["Player"] == int(p)) & (no_dup_df["Goal"] == goal)  & (no_dup_df["Wall"] == wall) & (no_dup_df["Pit"] == pit)]['Q_value'].values[0])
        reward_arr.append(reward)
        obs, reward, done, _ = game.step(1)
        
        
    pi = str(game.get_positions()[0])  
    obs, reward, done, _ = game.step(1)
    p = str(game.get_positions()[0])
            
    if p == pi:
        q_value.append(-10)
        reward_arr.append(-100)
    else:
        q_value.append(no_dup_df[(no_dup_df["Player"] == p) & (no_dup_df["Goal"] == goal)  & (no_dup_df["Wall"] == wall) & (no_dup_df["Pit"] == pit)]['Q_value'].values[0])
        reward_arr.append(reward)
        obs, reward, done, _ = game.step(0)
        
        
    pi = str(game.get_positions()[0])   
    obs, reward, done, _ = game.step(2)
    p = str(game.get_positions()[0])
            
    if p == pi:
        q_value.append(-10)
        reward_arr.append(-100)
    else:
        q_value.append(no_dup_df[(no_dup_df["Player"] == p) & (no_dup_df["Goal"] == goal)  & (no_dup_df["Wall"] == wall) & (no_dup_df["Pit"] == pit)]['Q_value'].values[0])
        reward_arr.append(reward)
        obs, reward, done, _ = game.step(3)
        
        
    pi = str(game.get_positions()[0])       
    obs, reward, done, _ = game.step(3)
    p = str(game.get_positions()[0])
            
    if p == pi:
        q_value.append(-10)
        reward_arr.append(-100)
    else:
        q_value.append(no_dup_df[(no_dup_df["Player"] == p) & (no_dup_df["Goal"] == goal)  & (no_dup_df["Wall"] == wall) & (no_dup_df["Pit"] == pit)]['Q_value'].values[0])
        reward_arr.append(reward)
        obs, reward, done, _ = game.step(2)
        
    
    max_qvalue = q_value[np.argmax(q_value)]
    reward_max = reward_arr[np.argmax(q_value)]
    bellman = max_qvalue*0.9 + reward_max
    if pi == goal or bellman > 10:
        matrix[counter][6] = 10
    else:
        matrix[counter][6] = bellman
    counter += 1

IndexError: index 0 is out of bounds for axis 0 with size 0

In [14]:
no_dup_df[(no_dup_df["Player"] == p) & (no_dup_df["Goal"] == goal)  & (no_dup_df["Wall"] == wall) & (no_dup_df["Pit"] == pit)]['Q_value'].values[0]

IndexError: index 0 is out of bounds for axis 0 with size 0