# Simple Q-learning example

Environment setup:
- States: 0 (left), 1 (middle), 2 (right), 3 (goal)
- Actions: 0 (left), 1 (right)
- Rewards: 0 for all states except state 3 (goal), which gives a reward of 1
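- Transition: action 1 (right) moves from state s to s + 1; action 0 (left) moves to s - 1, except in state 0, where the agent stays in place. Each episode starts in state 0 and ends when state 3 is reached.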

In [2]:
import numpy as np

# Environment: a one-dimensional grid world with four states and two actions.
states = list(range(4))  # 0, 1, 2, 3
actions = [0, 1]  # 0 moves left, 1 moves right

# Reward table indexed by state: only state 3 pays out.
rewards = [0] * 3 + [1]

# Learning hyperparameters. The values are arbitrary choices for this example;
# in practice they are tuned for the specific problem and environment.
#   alpha   -- learning rate
#   gamma   -- discount factor
#   epsilon -- exploration rate
alpha, gamma, epsilon = 0.1, 0.9, 0.1

# Q-table with one row per state and one column per action, starting at zero.
Q = np.zeros(shape=(len(states), len(actions)))

print("Initial Q-values:")
print(Q)

def choose_action(state):
    """Pick an action for ``state`` using an epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random element of ``actions``
    is returned (exploration). Otherwise one of the actions with the highest
    Q-value in ``Q[state]`` is returned (exploitation); ties are broken
    uniformly at random rather than always favoring the lowest index.

    Args:
        state: Index of the current state, used to select a row of ``Q``.

    Returns:
        The chosen action.
    """
    if np.random.rand() < epsilon:
        return np.random.choice(actions)
    q_row = Q[state]
    # np.argmax returns the first maximum, so with tied values (e.g. the
    # all-zero table at the start of training) it would always pick action 0.
    best_actions = np.flatnonzero(q_row == q_row.max())
    return np.random.choice(best_actions)

# Run 1000 training episodes. Each episode starts in state 0 and keeps
# stepping until state 3 is reached.
for episode in range(1000):
    state = 0
    while state != 3:
        action = choose_action(state)

        # Action 1 steps right; any other action steps left, clamped at 0.
        if action == 1:
            next_state = state + 1
        else:
            next_state = max(0, state - 1)

        # The reward is looked up for the state being entered.
        reward = rewards[next_state]

        # Q-learning update: nudge Q[state, action] toward the reward plus
        # the discounted best Q-value available from the next state.
        Q[state, action] += alpha * (
            reward + gamma * Q[next_state].max() - Q[state, action]
        )

        state = next_state

print("Q-values after training:")
print(Q)

Initial Q-values:
[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]
Q-values after training:
[[0.72588583 0.81      ]
 [0.72777222 0.9       ]
 [0.78176577 1.        ]
 [0.         0.        ]]
