## 1. Import Dependencies

In [3]:
import gym 
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random
import os

## 2. Building an Environment

In [4]:
class Hot_or_Cold_Env(Env):
    """Grid-world "hot or cold" search task using the 4-tuple Gym step API.

    An agent and a target are placed on distinct random cells of a square
    grid. Each step the agent moves one cell in one of eight directions
    (moves that would leave the grid are clamped at the border) and is
    rewarded according to whether it got closer to the target.

    Reward per step:
        * -0.1 base cost for every step;
        * +100 when the agent lands on the target (episode ends);
        * otherwise, from the second step of an episode onward, +1 if the
          Euclidean distance to the target strictly decreased compared with
          the previous step, else -0.5.

    The episode also ends once the step budget (``period_duration``) runs out.

    Args:
        grid_size: Side length of the grid; coordinates span
            ``0 .. grid_size - 1``. Must be at least 2 so that agent and
            target can occupy different cells.
        period_duration: Number of steps allowed per episode.

    Raises:
        ValueError: If ``grid_size`` is smaller than 2.
    """

    # Action id -> (dx, dy) displacement. "forward" is +y, "right" is +x.
    _MOVES = {
        0: (0, -1),   # backward
        1: (1, 0),    # right
        2: (0, 1),    # forward
        3: (-1, 0),   # left
        4: (1, 1),    # right-forward (diagonal)
        5: (1, -1),   # right-backward (diagonal)
        6: (-1, 1),   # left-forward (diagonal)
        7: (-1, -1),  # left-backward (diagonal)
    }

    def __init__(self, grid_size=50, period_duration=100):
        if grid_size < 2:
            # On a single-cell grid the "target != agent" placement loop in
            # reset() could never terminate.
            raise ValueError("grid_size must be at least 2")

        self.grid_size = grid_size
        self.max_period_duration = period_duration

        # Eight actions: four axis-aligned moves and four diagonal moves.
        self.action_space = Discrete(len(self._MOVES))

        # Observation is the agent's (x, y) cell on the grid.
        self.observation_space = Box(
            low=0, high=grid_size - 1, shape=(2,), dtype=np.int32
        )

        # Episode state (positions, step budget, counters) is set up in one
        # place so that construction and reset can never drift apart.
        self.reset()

    def _random_cell(self):
        """Return a random (x, y) grid cell as an int32 array.

        The explicit dtype keeps observations consistent with the dtype
        declared on ``observation_space``.
        """
        return np.array(
            [
                np.random.randint(0, self.grid_size),
                np.random.randint(0, self.grid_size),
            ],
            dtype=np.int32,
        )

    def step(self, action):
        """Advance the environment by one step.

        Args:
            action: Action id in ``0..7`` (see ``_MOVES``). Any other value
                leaves the agent in place but still consumes a step.

        Returns:
            Tuple ``(state, reward, done, info)`` where ``state`` is a copy
            of the agent's (x, y) position, ``reward`` is the step reward,
            ``done`` tells whether the episode ended (target reached or step
            budget exhausted) and ``info`` is an empty dict.
        """
        # .item() unwraps numpy scalars / single-element arrays into a plain
        # Python value so they can be used as a dictionary key.
        move = self._MOVES.get(np.asarray(action).item())
        if move is not None:
            upper = self.grid_size - 1
            for axis, delta in enumerate(move):
                # Clamp so the agent stays on the grid.
                self.state[axis] = min(upper, max(0, int(self.state[axis]) + delta))

        # One unit of the step budget is used up.
        self.period_duration -= 1
        self.steps_taken += 1

        # Small cost for every step taken.
        reward = -0.1

        distance_to_target = np.linalg.norm(self.state - self.target)

        if np.array_equal(self.state, self.target):
            reward += 100  # Large bonus for reaching the target
            done = True
        else:
            done = False
            # The first step has no previous distance to compare against
            # (it is initialised to infinity), so shaping starts at step 2.
            if self.steps_taken > 1:
                if distance_to_target < self.previous_distance_to_target:
                    reward += 1  # Got closer
                else:
                    reward -= 0.5  # Same distance or further away

        self.previous_distance_to_target = distance_to_target

        # Out of time: end the episode regardless of position.
        if self.period_duration <= 0:
            done = True

        info = {}

        # Hand out a copy: self.state is updated in place on later steps, so
        # returning it directly would retroactively change observations the
        # caller has stored.
        return self.state.copy(), reward, done, info

    def reset(self):
        """Start a new episode and return a copy of the initial agent position.

        Draws fresh random positions for agent and target (guaranteed to be
        different cells) and restores the step budget and counters.
        """
        self.state = self._random_cell()
        self.target = self._random_cell()

        # Ensure the target is not in the same position as the agent.
        while np.array_equal(self.state, self.target):
            self.target = self._random_cell()

        self.period_duration = self.max_period_duration
        self.steps_taken = 0
        self.previous_distance_to_target = np.inf
        return self.state.copy()


## 3. Test Environment

In [5]:
# Relies on the Hot_or_Cold_Env class defined in section 2 above.

# Test function for the Hot or Cold environment
def test_hot_or_cold_env(max_steps=10):
    """Smoke-test the environment with a short random rollout.

    Creates a fresh ``Hot_or_Cold_Env``, resets it, then takes up to
    ``max_steps`` uniformly random actions, printing the state, reward,
    running total and distance to the target after every step. Stops early
    when the environment reports the episode as done.

    Args:
        max_steps: Maximum number of random steps to perform.
    """
    env = Hot_or_Cold_Env()
    state = env.reset()

    print("Initial State:", state)
    print("Target:", env.target)

    total_reward = 0
    for step in range(max_steps):
        # Sample from the whole action space so the diagonal moves are
        # exercised as well, not only the four axis-aligned ones.
        action = np.random.choice(env.action_space.n)
        next_state, reward, done, info = env.step(action)

        total_reward += reward

        print(f"Step {step + 1}:")
        print(f"  Action: {action}")
        print(f"  New State: {next_state}")
        print(f"  Reward: {reward}")
        print(f"  Total Reward: {total_reward}")
        print(f"  Distance to Target: {np.linalg.norm(next_state - env.target)}")

        if done:
            print("Agent reached the target or time has run out.")
            break

# Execute the smoke test when this cell runs (it prints a per-step trace).
test_hot_or_cold_env()

Initial State: [22 34]
Target: [29 22]
Step 1:
  Action: 0
  New State: [22 33]
  Reward: -0.1
  Total Reward: -0.1
  Distance to Target: 13.038404810405298
Step 2:
  Action: 1
  New State: [23 33]
  Reward: 0.9
  Total Reward: 0.8
  Distance to Target: 12.529964086141668
Step 3:
  Action: 0
  New State: [23 32]
  Reward: 0.9
  Total Reward: 1.7000000000000002
  Distance to Target: 11.661903789690601
Step 4:
  Action: 3
  New State: [22 32]
  Reward: -0.6
  Total Reward: 1.1
  Distance to Target: 12.206555615733702
Step 5:
  Action: 1
  New State: [23 32]
  Reward: 0.9
  Total Reward: 2.0
  Distance to Target: 11.661903789690601
Step 6:
  Action: 1
  New State: [24 32]
  Reward: 0.9
  Total Reward: 2.9
  Distance to Target: 11.180339887498949
Step 7:
  Action: 2
  New State: [24 33]
  Reward: -0.6
  Total Reward: 2.3
  Distance to Target: 12.083045973594572
Step 8:
  Action: 3
  New State: [23 33]
  Reward: -0.6
  Total Reward: 1.6999999999999997
  Distance to Target: 12.52996408614166