In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
import gym
from gym import spaces
from collections import deque
import random

# --- 1. Custom IITK Campus Environment ---
# We simplify the campus into a 10x10 grid
# 0: Road, 1: Obstacle (Building/Grass), 2: Start, 3: Goal

class IITKCampusEnv(gym.Env):
    """
    Custom Gym Environment for IIT Kanpur Campus Navigation.
    Simplified as a 2D Grid World.

    Cell codes: 0 road, 1 obstacle, 2 start, 3 goal, 4 agent (the agent
    code only appears in observations and in ``render`` output; it is never
    written into ``self.grid``).

    The environment follows the classic Gym API used by the rest of this
    file: ``reset()`` returns an observation and ``step()`` returns the
    4-tuple ``(observation, reward, done, info)``.

    Observations are the grid with the agent's cell overwritten by 4,
    flattened to a 1-D array of length ``grid_size * grid_size``. Without
    the agent marker every observation would be identical and a value
    network could not tell positions apart.
    """

    # Code written into the agent's cell in observations and renders.
    AGENT_MARK = 4

    def __init__(self):
        super().__init__()

        self.grid_size = 10
        # Define the grid map
        self.grid = np.zeros((self.grid_size, self.grid_size))
        # Obstacles
        self.grid[1, 1:4] = 1
        self.grid[3, 4:8] = 1
        self.grid[5, 1:3] = 1
        self.grid[7, 7:9] = 1
        self.grid[8, 2:5] = 1

        # Start (Library) and Goal (LCH)
        self.start_pos = (0, 0)
        self.goal_pos = (9, 9)
        self.grid[self.start_pos] = 2
        self.grid[self.goal_pos] = 3

        self.agent_pos = self.start_pos

        # Action space: 0:Up, 1:Down, 2:Left, 3:Right
        self.action_space = spaces.Discrete(4)
        # Observation space: the grid itself plus the agent marker (4).
        # NOTE: the space is declared as a 2-D uint8 box, while reset()/step()
        # return the flattened float grid; the training script derives the
        # flat state size from this 2-D shape, so the shape is kept as is.
        self.observation_space = spaces.Box(
            low=0,
            high=self.AGENT_MARK,
            shape=(self.grid_size, self.grid_size),
            dtype=np.uint8,
        )

        self.max_steps = 100  # Max steps per episode
        self.current_step = 0

    def _get_obs(self):
        """Return the flattened grid with the agent's cell set to 4."""
        obs = self.grid.copy()
        obs[self.agent_pos] = self.AGENT_MARK
        return obs.flatten()

    def reset(self):
        """Move the agent back to the start and return the first observation."""
        self.agent_pos = self.start_pos
        self.current_step = 0
        # Return the observation as a flattened array for the NN
        return self._get_obs()

    def step(self, action):
        """
        Apply one action and return ``(observation, reward, done, info)``.

        Actions are 0:Up, 1:Down, 2:Left, 3:Right; any other value leaves
        the position unchanged. Moves are clamped at the grid border.

        Rewards: +1000 on reaching the goal (episode ends), -100 for
        bumping into an obstacle (agent stays put), -0.1 for any other
        step, and -10 when the step limit is hit without reaching the goal
        (episode ends).
        """
        self.current_step += 1
        row, col = self.agent_pos

        if action == 0:  # Up
            row = max(0, row - 1)
        elif action == 1:  # Down
            row = min(self.grid_size - 1, row + 1)
        elif action == 2:  # Left
            col = max(0, col - 1)
        elif action == 3:  # Right
            col = min(self.grid_size - 1, col + 1)

        new_pos = (row, col)

        # Check new position
        if new_pos == self.goal_pos:
            reward = 1000.0  # Large reward for reaching the goal
            done = True
            self.agent_pos = new_pos  # The agent actually arrives at the goal
        elif self.grid[new_pos] == 1:  # Hit obstacle
            reward = -100.0  # Large penalty for crashing (Rsafety)
            done = False     # Don't end episode, just penalize; agent stays put
        else:
            reward = -0.1    # Small penalty for each step (Rtime)
            done = False
            self.agent_pos = new_pos  # Update agent position

        # Timeout only applies if the episode has not already ended; otherwise
        # reaching the goal on the final allowed step would be reported as a
        # -10 timeout instead of the +1000 success.
        if not done and self.current_step >= self.max_steps:
            done = True
            reward = -10.0  # Penalty for timeout

        return self._get_obs(), reward, done, {}

    def render(self, mode='human'):
        """Print the grid to the console with the agent shown as 4."""
        # Simple console render
        render_grid = np.copy(self.grid)
        render_grid[self.agent_pos] = self.AGENT_MARK
        print(render_grid)
        print("-" * 20)

# --- 2. Deep Q-Network (DQN) Agent ---
class DQNAgent:
    """
    Double-DQN agent with an experience-replay buffer and a target network.

    ``state_size`` is the length of the flat state vector fed to the
    network and ``action_size`` is the number of discrete actions (the
    width of the Q-value output layer).
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # Experience Replay buffer

        # Hyperparameters
        self.gamma = 0.95    # Discount rate
        self.epsilon = 1.0   # Exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001

        self.model = self._build_model()
        self.target_model = self._build_model()  # Target network
        self.update_target_model()

    def _build_model(self):
        """Build and compile the Q-network (two hidden layers of 64 units)."""
        # An explicit Input layer replaces the `input_dim` argument on the
        # first Dense layer, which newer Keras versions warn about.
        model = Sequential([
            tf.keras.Input(shape=(self.state_size,)),
            Dense(64, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear'),  # Output Q-values
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def update_target_model(self):
        """Copy weights from the online model to the target model."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """
        Choose an action with an epsilon-greedy policy.

        ``state`` is passed straight to ``model.predict`` and must therefore
        already carry a batch dimension (the training script reshapes it to
        ``(1, state_size)``). Returns the action index as a plain ``int``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)  # Explore
        # verbose=0 suppresses the per-call progress bar.
        act_values = self.model.predict(state, verbose=0)
        return int(np.argmax(act_values[0]))  # Exploit

    def replay(self, batch_size):
        """
        Train the online model on one random minibatch from the buffer.

        Uses the Double-DQN target
        ``r + gamma * Q_target(s', argmax_a' Q_main(s', a'))`` for
        non-terminal transitions and ``r`` for terminal ones, then decays
        epsilon once. The whole minibatch is processed with three batched
        predictions and a single ``fit`` call instead of per-sample calls.
        Does nothing if the buffer holds fewer than ``batch_size``
        transitions.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)

        # Stack the stored (1, state_size) rows into (batch, state_size).
        states = np.vstack([transition[0] for transition in minibatch])
        actions = np.array([transition[1] for transition in minibatch], dtype=np.intp)
        rewards = np.array([transition[2] for transition in minibatch], dtype=np.float32)
        next_states = np.vstack([transition[3] for transition in minibatch])
        terminal = np.array([transition[4] for transition in minibatch], dtype=bool)

        # Start from the current predictions so that only the Q-value of the
        # action actually taken contributes to the loss.
        targets = np.array(self.model.predict(states, verbose=0))
        next_q_main = self.model.predict(next_states, verbose=0)
        next_q_target = self.target_model.predict(next_states, verbose=0)

        # DDQN: the online net selects the next action, the target net rates it.
        rows = np.arange(batch_size)
        best_next_actions = np.argmax(next_q_main, axis=1)
        bootstrap = next_q_target[rows, best_next_actions]
        # Terminal transitions get no bootstrap term.
        targets[rows, actions] = rewards + self.gamma * bootstrap * (~terminal)

        self.model.fit(states, targets, batch_size=batch_size, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# --- 3. Training Loop ---
if __name__ == "__main__":
    # Run configuration.
    EPISODES = 1000  # Reduced for quick demo, set to 50,000 for full training
    BATCH_SIZE = 32
    UPDATE_TARGET_EVERY = 5  # Episodes

    # Build the environment and size the agent from its spaces; the network
    # consumes the grid as one flat vector.
    env = IITKCampusEnv()
    grid_rows, grid_cols = env.observation_space.shape
    state_size = grid_rows * grid_cols
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)

    print(f"Starting Training for {EPISODES} episodes...")

    for episode_number in range(1, EPISODES + 1):
        # Start a fresh episode; states carry a leading batch axis.
        state = env.reset().reshape(1, state_size)
        total_reward = 0

        # Collect transitions until the episode ends or the step budget is spent.
        for _ in range(env.max_steps):
            # env.render() # Uncomment to see the grid (slows training)
            action = agent.act(state)
            observation, reward, done, _info = env.step(action)
            next_state = observation.reshape(1, state_size)
            total_reward += reward

            agent.remember(state, action, reward, next_state, done)
            state = next_state

            if done:
                break

        # One replay pass per episode, once the buffer holds more than a batch.
        if len(agent.memory) > BATCH_SIZE:
            agent.replay(BATCH_SIZE)

        # Sync the target network on the first episode and every
        # UPDATE_TARGET_EVERY episodes after it.
        if (episode_number - 1) % UPDATE_TARGET_EVERY == 0:
            agent.update_target_model()

        print(f"Episode: {episode_number}/{EPISODES}, Score: {total_reward}, Epsilon: {agent.epsilon:.2}")

    print("Training finished.")
    # Add code here to save the model weights
    # agent.model.save_weights("dqn_iitk_shuttle.h5")

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Starting Training for 1000 episodes...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step


  return datetime.utcnow().replace(tzinfo=utc)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39