<a href="https://colab.research.google.com/github/adityasengar/Chatbot/blob/main/Hoffman_RL_game.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# World dimensions
WORLD_SIZE = 5

# Rewards
REWARD_MEET = 100
REWARD_MOVE = -1
REWARD_MEET_AFTER_HUMAN = 50

# Hyperparameters for Q-Learning
EPSILON = 0.1
ALPHA = 0.5
GAMMA = 0.9
EPISODES = 5000

# Actions: Up, Down, Left, Right
ACTIONS = [(0, -1), (0, 1), (-1, 0), (1, 0)]

# Q-Tables for the Humans and the Robots
Q_human1 = np.zeros((WORLD_SIZE, WORLD_SIZE, len(ACTIONS)))
Q_human2 = np.zeros((WORLD_SIZE, WORLD_SIZE, len(ACTIONS)))
Q_robot1 = np.zeros((WORLD_SIZE, WORLD_SIZE, len(ACTIONS)))
Q_robot2 = np.zeros((WORLD_SIZE, WORLD_SIZE, len(ACTIONS)))

def step(state, action):
    next_state = (max(0, min(WORLD_SIZE - 1, state[0] + action[0])),
                  max(0, min(WORLD_SIZE - 1, state[1] + action[1])))
    return next_state

def get_action(state, Q_table):
    if np.random.rand() < EPSILON:
        return np.random.choice(len(ACTIONS))
    else:
        return np.argmax(Q_table[state[0], state[1], :])

def learn(state, action, reward, next_state, Q_table):
    target = reward + GAMMA * np.max(Q_table[next_state[0], next_state[1], :])
    Q_table[state[0], state[1], action] += ALPHA * (target - Q_table[state[0], state[1], action])

for _ in range(EPISODES):
    # Start positions for Humans and Robots
    human1 = (np.random.randint(WORLD_SIZE), np.random.randint(WORLD_SIZE))
    human2 = (np.random.randint(WORLD_SIZE), np.random.randint(WORLD_SIZE))
    robot1 = (np.random.randint(WORLD_SIZE), np.random.randint(WORLD_SIZE))
    robot2 = (np.random.randint(WORLD_SIZE), np.random.randint(WORLD_SIZE))

    met_human = False

    while True:
        action_human1 = get_action(human1, Q_human1)
        action_human2 = get_action(human2, Q_human2)
        action_robot1 = get_action(robot1, Q_robot1)
        action_robot2 = get_action(robot2, Q_robot2)

        next_human1 = step(human1, ACTIONS[action_human1])
        next_human2 = step(human2, ACTIONS[action_human2])
        next_robot1 = step(robot1, ACTIONS[action_robot1])
        next_robot2 = step(robot2, ACTIONS[action_robot2])

        # Check for AI creation
        if next_human1 == next_robot1 or next_human1 == next_robot2 or next_human2 == next_robot1 or next_human2 == next_robot2:
            reward = REWARD_MEET_AFTER_HUMAN if met_human else REWARD_MEET
            learn(human1, action_human1, reward, next_human1, Q_human1)
            learn(human2, action_human2, reward, next_human2, Q_human2)
            learn(robot1, action_robot1, reward, next_robot1, Q_robot1)
            learn(robot2, action_robot2, reward, next_robot2, Q_robot2)
            break
        # Check for human-human meeting
        elif next_human1 == next_human2:
            met_human = True

        reward = REWARD_MOVE
        learn(human1, action_human1, reward, next_human1, Q_human1)
        learn(human2, action_human2, reward, next_human2, Q_human2)
        learn(robot1, action_robot1, reward, next_robot1, Q_robot1)
        learn(robot2, action_robot2, reward, next_robot2, Q_robot2)

        human1 = next_human1
        human2 = next_human2
        robot1 = next_robot1
        robot2 = next_robot2


In [2]:
Q_human1

array([[[144.27289585, 251.38546754, 144.24176647, 141.93603096],
        [168.58546683, 304.50678394, 150.80801561, 196.67405767],
        [ 94.41199885, 306.31346668, 158.0686772 , 148.12656759],
        [153.0218114 , 133.8526206 , 144.36472339, 292.71247828],
        [138.11002271, 137.08701616, 135.48962515, 349.99989042]],

       [[150.10026008, 230.67978235, 147.66347469, 149.99837496],
        [150.66785305, 148.02628198, 154.57694166, 294.51078501],
        [226.84838977, 190.18182051, 151.11148699, 415.46770058],
        [333.08987045, 176.72313176, 170.72473479, 140.51948723],
        [204.88102746, 140.01308902, 142.3585863 , 422.02534047]],

       [[226.82511321, 324.90050374, 151.65191511, 177.20281841],
        [178.62653686, 376.93351989, 151.48003298, 239.0216206 ],
        [209.58662831, 306.54744829, 146.42950025, 379.70509984],
        [269.96507627, 274.06596853, 224.54950707, 357.71923881],
        [273.75776577, 173.41132489, 289.26405776, 441.34913294]],

    