<a href="https://colab.research.google.com/github/akulkarni14/AAI-praticals/blob/main/QLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import random

# Grid dimensions as (rows, columns)
grid_size = (5, 5)

# Each action name maps to the (row, column) displacement it applies
action_dict = {
    'up': (-1, 0),
    'down': (1, 0),
    'left': (0, -1),
    'right': (0, 1),
}

# Action names in the order used to index the last axis of the Q-table
actions = list(action_dict)

# Q-table: one value per (row, column, action), all starting at zero
Q = np.zeros(grid_size + (len(actions),))

# Hyperparameters: learning rate, discount factor, exploration rate
alpha, gamma, epsilon = 0.1, 0.9, 0.1

# Reward scheme
goal = (4, 4)  # cell that ends an episode when entered
penalty = -1  # reward for any step that does not land on the goal
reward = 10  # reward for the step that lands on the goal

def is_valid_state(state):
    """Return whether ``state`` lies inside the grid.

    ``state`` is indexed as ``(row, column)``; the row is checked against
    ``grid_size[0]`` and the column against ``grid_size[1]``.
    """
    row_limit, col_limit = grid_size[0], grid_size[1]
    if not (0 <= state[0] < row_limit):
        return False
    return 0 <= state[1] < col_limit

def select_action(state):
    """Pick an action name for ``state`` with an epsilon-greedy rule.

    A uniform draw from [0, 1] below ``epsilon`` triggers exploration (a
    uniformly random entry of ``actions``). Otherwise the action with the
    largest Q-value at ``state`` is returned; ``np.argmax`` resolves ties
    in favour of the earliest action in ``actions``.
    """
    exploring = random.uniform(0, 1) < epsilon
    if exploring:
        return random.choice(actions)

    # Greedy branch: look up this cell's Q-values and take the best index
    q_values = Q[state[0], state[1]]
    best_index = np.argmax(q_values)
    return actions[best_index]

def move(state, action):
    """Return the cell reached by taking ``action`` from ``state``.

    The displacement for ``action`` is looked up in ``action_dict``. If the
    resulting cell is outside the grid, the agent does not move and the
    original ``state`` is returned unchanged.
    """
    d_row, d_col = action_dict[action]
    candidate = (state[0] + d_row, state[1] + d_col)
    return candidate if is_valid_state(candidate) else state

def train_agent(episodes=1000):
    """Run tabular Q-learning and update the module-level ``Q`` in place.

    Every episode starts at cell (0, 0) and ends only when the agent steps
    onto ``goal``; there is no step limit. Each step earns ``reward`` if it
    lands on the goal and ``penalty`` otherwise. A progress line is printed
    for every 100th episode, starting with episode 0.

    Args:
        episodes: Number of episodes to run.
    """
    start = (0, 0)

    for episode in range(episodes):
        state = start

        while True:
            # Choose and apply an action
            action = select_action(state)
            next_state = move(state, action)

            # Reward depends only on whether this step reached the goal
            reached_goal = next_state == goal
            reward_received = reward if reached_goal else penalty

            # Q-learning update:
            # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max Q(s', .) - Q(s, a))
            a_idx = actions.index(action)
            row, col = state[0], state[1]
            current_q = Q[row, col, a_idx]
            best_next_q = np.max(Q[next_state[0], next_state[1]])
            Q[row, col, a_idx] = current_q + alpha * (reward_received + gamma * best_next_q - current_q)

            state = next_state
            if reached_goal:
                break

        if episode % 100 == 0:
            print(f"Episode {episode} complete")

# Train for 1000 episodes; train_agent updates the module-level Q-table in place
train_agent(1000)

# Show the learned Q-values (heading on its own line, then the table)
print("Q-table after training:", Q, sep="\n")


Episode 0 complete
Episode 100 complete
Episode 200 complete
Episode 300 complete
Episode 400 complete
Episode 500 complete
Episode 600 complete
Episode 700 complete
Episode 800 complete
Episode 900 complete
Q-table after training:
[[[-1.57523205 -0.434062   -1.54844577 -0.72974662]
  [-2.28617725  0.61694928 -2.21255559 -2.22949177]
  [-1.62976704 -0.54729355 -1.7278706  -1.61720273]
  [-1.04661746 -1.00889662 -1.23275464 -1.08733323]
  [-0.7478178  -0.5572507  -0.73069371 -0.7703044 ]]

 [[-1.49353246 -0.19404278 -0.66087498  0.62882   ]
  [-0.66394879  1.8098     -0.62228005  1.02981189]
  [-1.46610917  3.06825685 -1.2419634  -0.17741109]
  [-0.72151774  4.33616465 -0.76788621 -0.67407282]
  [-0.4833299   5.90608817  0.10846231 -0.13774671]]

 [[-1.55902976 -1.74632746 -1.36392049  1.72579665]
  [ 0.36051051  1.5850504  -0.04629683  3.122     ]
  [ 1.19530315  3.6605006   1.71416378  4.58      ]
  [ 2.01887695  5.79532474  2.66326948  6.2       ]
  [ 3.62172674  8.          4.301579