## 1. Import Dependencies

In [None]:
import gym
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np

## 2. Building an Environment

In [None]:
class Hot_or_Cold_Env(Env):
    """Grid-world "hot or cold" game: walk an agent onto a target cell.

    The agent and the target occupy distinct cells of a square grid. On each
    step the agent moves one cell in one of eight directions; moves that
    would leave the grid are clamped at the border.

    Reward per step:
        * -0.1 base cost on every step,
        * +100 when the agent lands on the target (the episode ends),
        * otherwise, from the second step of an episode onward, +1 if the
          Euclidean distance to the target shrank since the previous step
          and -0.5 if it did not.

    The episode also ends once ``max_steps`` steps have been taken.

    Args:
        grid_size: Side length of the grid; coordinates run from 0 to
            ``grid_size - 1``. Defaults to 100.
        max_steps: Step budget per episode. Defaults to 100.

    Raises:
        ValueError: If ``grid_size < 2`` (the agent and the target need
            distinct cells) or ``max_steps < 1``.
    """

    # (dx, dy) displacement for each discrete action index.
    _ACTION_DELTAS = (
        (0, -1),   # 0: down
        (1, 0),    # 1: right
        (0, 1),    # 2: up
        (-1, 0),   # 3: left
        (1, -1),   # 4: down-right
        (1, 1),    # 5: up-right
        (-1, -1),  # 6: down-left
        (-1, 1),   # 7: up-left
    )

    def __init__(self, grid_size=100, max_steps=100):
        if grid_size < 2:
            raise ValueError("grid_size must be at least 2")
        if max_steps < 1:
            raise ValueError("max_steps must be at least 1")

        self.grid_size = grid_size
        self.max_steps = max_steps

        # One action per entry of the displacement table.
        self.action_space = Discrete(len(self._ACTION_DELTAS))

        # Observation is the agent's (x, y) cell.
        self.observation_space = Box(
            low=0, high=grid_size - 1, shape=(2,), dtype=np.int32
        )

        # reset() creates state, target, period_duration, steps_taken and
        # previous_distance_to_target, so the start-of-episode logic lives
        # in exactly one place.
        self.reset()

    def _random_cell(self):
        """Return a uniformly random (x, y) cell as an int32 array."""
        # dtype matches observation_space so observations pass its checks.
        return np.random.randint(0, self.grid_size, size=2, dtype=np.int32)

    def step(self, action):
        """Advance the environment by one move.

        Args:
            action: Index into the eight movement directions (0-7). An
                integer outside that range leaves the agent in place but
                still consumes a step.

        Returns:
            A 4-tuple ``(observation, reward, done, info)`` where
            ``observation`` is a copy of the agent's (x, y) position,
            ``reward`` is a float, ``done`` is True when the target was
            reached or the step budget is exhausted, and ``info`` is an
            empty dict.
        """
        index = int(action)
        if 0 <= index < len(self._ACTION_DELTAS):
            delta = np.asarray(self._ACTION_DELTAS[index], dtype=np.int32)
            # Clamp to the grid so border moves slide along the wall.
            self.state = np.clip(
                self.state + delta, 0, self.grid_size - 1
            ).astype(np.int32)

        # period_duration counts remaining steps; steps_taken counts used ones.
        self.period_duration -= 1
        self.steps_taken += 1

        reward = -0.1  # small cost for every step taken
        distance_to_target = np.linalg.norm(self.state - self.target)

        reached = np.array_equal(self.state, self.target)
        if reached:
            reward += 100
        elif self.steps_taken > 1:
            # The first step has no previous distance to compare with
            # (it is np.inf after reset), so shaping starts on step two.
            if distance_to_target < self.previous_distance_to_target:
                reward += 1
            else:
                reward -= 0.5

        self.previous_distance_to_target = distance_to_target

        done = reached or self.period_duration <= 0

        # Return a copy so observations stored by the caller are never
        # altered by later steps.
        return self.state.copy(), reward, done, {}

    def reset(self):
        """Start a new episode with fresh random agent and target cells.

        Returns:
            A copy of the agent's initial (x, y) position.
        """
        self.state = self._random_cell()
        self.target = self._random_cell()

        # Re-draw until the target differs from the agent's cell.
        while np.array_equal(self.state, self.target):
            self.target = self._random_cell()

        self.period_duration = self.max_steps
        self.steps_taken = 0
        self.previous_distance_to_target = np.inf

        return self.state.copy()


## 3. Test Environment

In [None]:
# Uses the Hot_or_Cold_Env class defined in section 2 above.

# Test function for the Hot or Cold environment
def test_hot_or_cold_env():
    """Smoke-test the environment with a short random rollout.

    Creates a ``Hot_or_Cold_Env``, resets it, then takes up to 10 random
    actions, printing the action, new state, reward, running total and
    distance to the target after each one. Stops early if the environment
    reports the episode as done.
    """
    env = Hot_or_Cold_Env()
    state = env.reset()

    print("Initial State:", state)
    print("Target:", env.target)

    total_reward = 0
    for step in range(10):
        # Draw from the whole action space so the diagonal moves are
        # exercised as well, not only the first four directions.
        action = np.random.choice(env.action_space.n)
        next_state, reward, done, info = env.step(action)

        total_reward += reward

        print(f"Step {step + 1}:")
        print(f"  Action: {action}")
        print(f"  New State: {next_state}")
        print(f"  Reward: {reward}")
        print(f"  Total Reward: {total_reward}")
        print(f"  Distance to Target: {np.linalg.norm(next_state - env.target)}")

        if done:
            print("Agent reached the target or time has run out.")
            break

# Run the smoke test as soon as this cell/script executes
test_hot_or_cold_env()