<a href="https://colab.research.google.com/github/aryanjha256/randomjupyternotebooks/blob/main/qLearningProgram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np

# Define the environment: a 3x4 grid world.
#   'S' = start cell, 'G' = goal cell, 'C' = cliff cell, '.' = empty cell.
# The top-right cell is a plain empty cell ('.'), like every other empty cell.
env = np.array([['S', '.', '.', '.'],
                ['.', 'C', '.', '.'],
                ['.', '.', '.', 'G']])

# Parameters
learning_rate = 0.1     # weight given to the new estimate in each Q update
discount_factor = 0.9   # weight of the best next-state value in the target
exploration_prob = 0.1  # probability of taking a random action
num_episodes = 1000     # number of training episodes

# Initialize the Q-table with zeroes: one row per grid cell (row-major
# order), one column per action.
num_states = np.prod(env.shape)
num_actions = 4  # up, down, left, right
q_table = np.zeros((num_states, num_actions))

def state_to_index(state, env_shape):
    """Flatten a ``(row, col)`` grid position into a single row-major index.

    Args:
        state: Indexable whose first two items are the row and the column.
        env_shape: Shape of the grid; only the column count
            (``env_shape[1]``) is used.

    Returns:
        ``row * number_of_columns + col``.
    """
    columns = env_shape[1]
    return state[0] * columns + state[1]

def choose_action(state):
    """Select an action for ``state`` using an epsilon-greedy rule.

    A uniform sample in [0, 1) is drawn first; if it falls below the
    module-level ``exploration_prob`` a random action index in
    ``range(num_actions)`` is returned. Otherwise the action with the
    largest value in this state's row of ``q_table`` is returned
    (``np.argmax`` yields the first index when several values tie).

    Args:
        state: ``(row, col)`` position on the grid.

    Returns:
        Integer action index.
    """
    explore = np.random.uniform(0, 1) < exploration_prob
    if explore:
        return np.random.choice(num_actions)  # Random action

    # Greedy choice: best known action for this state's Q-table row.
    row_index = state_to_index(state, env.shape)
    action_values = q_table[row_index, :]
    return np.argmax(action_values)

def get_next_state(state, action):
    """Return the position reached by taking ``action`` from ``state``.

    Action codes: 0 = up, 1 = down, 2 = left, 3 = right. A move that would
    leave the grid (bounds taken from the module-level ``env.shape``), or an
    unrecognised action code, leaves the agent where it is.

    Args:
        state: Current ``(row, col)`` position.
        action: Action code.

    Returns:
        A new ``(row, col)`` tuple when the move is allowed, otherwise the
        ``state`` object that was passed in.
    """
    if action == 0:  # Up
        return (state[0] - 1, state[1]) if state[0] > 0 else state
    if action == 1:  # Down
        return (state[0] + 1, state[1]) if state[0] < env.shape[0] - 1 else state
    if action == 2:  # Left
        return (state[0], state[1] - 1) if state[1] > 0 else state
    if action == 3:  # Right
        return (state[0], state[1] + 1) if state[1] < env.shape[1] - 1 else state
    return state


# Q-Learning algorithm: run `num_episodes` episodes, each starting at (0, 0)
# and ending as soon as the agent steps onto the cliff or the goal.
for episode in range(num_episodes):
    state = (0, 0)  # Start state
    done = False

    while not done:
        action = choose_action(state)
        next_state = get_next_state(state, action)

        # Reward (and termination) depends only on the cell being entered.
        cell = env[next_state]
        if cell == 'C':
            reward, done = -100, True  # Cliff
        elif cell == 'G':
            reward, done = 10, True  # Goal
        else:
            reward = -1  # Empty cell

        # Blend the old estimate with the one-step target:
        #   Q(s, a) <- (1 - lr) * Q(s, a) + lr * (r + gamma * max_a' Q(s', a'))
        state_idx = state_to_index(state, env.shape)
        next_idx = state_to_index(next_state, env.shape)
        old_value = q_table[state_idx, action]
        best_next = np.max(q_table[next_idx, :])
        q_table[state_idx, action] = (
            (1 - learning_rate) * old_value
            + learning_rate * (reward + discount_factor * best_next)
        )

        # Advance; the `while` condition ends the episode once `done` is set.
        state = next_state

# Print the Q-table
print("Learned Q-table: ")
print(q_table)

Learned Q-table: 
[[  1.48134446  -0.72183154   1.63802417   3.122     ]
 [  2.81004737 -94.76652367   1.64966105   4.58      ]
 [  3.9413895    5.49975842   3.00445177   6.2       ]
 [  5.38708801   8.           3.82282618   6.04621621]
 [ -1.00225932   1.48250957  -1.13213616 -27.1       ]
 [  0.           0.           0.           0.        ]
 [ -0.1          1.91529155 -10.           7.95327876]
 [  5.82827798  10.           4.9469574    7.44877101]
 [ -0.67640616  -0.67078761  -0.67934652   4.28607438]
 [-10.          -0.2962      -0.231319     7.22380292]
 [ -0.1          1.38442475  -0.21439      9.86697205]
 [  0.           0.           0.           0.        ]]
