In [3]:
import random

# Environment: a five-cell corridor whose rightmost cell is the goal.
states = list(range(5))   # cell indices 0..4 of the 1D world
goal_state = 4


def step(state, action):
    """Advance the agent one move along the corridor.

    ``action`` is added to ``state`` (-1 moves left, +1 moves right) and the
    result is clamped so it never drops below 0 or exceeds ``goal_state``.

    Returns a ``(new_state, reward)`` tuple: the reward is 10 when the move
    lands on ``goal_state`` and -1 otherwise.
    """
    proposed = state + action
    if proposed <= 0:
        new_state = 0                 # left wall
    elif proposed >= goal_state:
        new_state = goal_state        # right wall / goal cell
    else:
        new_state = proposed

    reward = 10 if new_state == goal_state else -1
    return new_state, reward

# Example run (random moves)
# Plays a single episode of at most 10 uniformly random moves starting from
# state 0, printing every transition and the accumulated reward.
state = 0
total_reward = 0
print("Start at:", state)

for i in range(10):
    action = random.choice([-1, 1])  # move left or right randomly
    state, reward = step(state, action)
    total_reward += reward
    print(f"Move {i+1}: action={action}, state={state}, reward={reward}")
    if state == goal_state:
        # The goal is terminal: stop here. Without this, a further +1 move is
        # clamped back onto the goal and would collect the +10 reward again.
        break

print("Total Reward:", total_reward)


Start at: 0
Move 1: action=1, state=1, reward=-1
Move 2: action=-1, state=0, reward=-1
Move 3: action=1, state=1, reward=-1
Move 4: action=-1, state=0, reward=-1
Move 5: action=-1, state=0, reward=-1
Move 6: action=1, state=1, reward=-1
Move 7: action=-1, state=0, reward=-1
Move 8: action=1, state=1, reward=-1
Move 9: action=1, state=2, reward=-1
Move 10: action=-1, state=1, reward=-1
Total Reward: -10


In [None]:
import numpy as np
import random

# Environment: states are cells 0..4 of a 1D corridor, the agent can move one
# cell left or right, and the episode goal is the rightmost cell.
states = [0, 1, 2, 3, 4]
actions = [-1, 1]  # index 0 = left, index 1 = right
goal_state = 4


def step(state, action):
    """Apply ``action`` to ``state`` and return ``(new_state, reward)``.

    The new position is ``state + action`` clamped to the range
    ``0..goal_state``. Landing on ``goal_state`` yields a reward of 10; every
    other move yields -1.
    """
    new_state = min(goal_state, state + action)
    if new_state <= 0:
        new_state = 0
    reached_goal = new_state == goal_state
    return (new_state, 10) if reached_goal else (new_state, -1)

# Q-table initialization: one row per state, one column per action, all zeros.
# The goal row is never updated (episodes end on reaching it), so it stays 0
# and contributes nothing when bootstrapping from the terminal state.
Q = np.zeros((len(states), len(actions)))

# Parameters
alpha = 0.1      # learning rate
gamma = 0.9      # discount factor
epsilon = 0.2    # exploration (try random moves)

# Training: tabular Q-learning with an epsilon-greedy behaviour policy.
episodes = 1000
for _ in range(episodes):
    state = 0  # every episode starts at the left end
    while state != goal_state:
        # Choose action (explore or exploit)
        if random.uniform(0, 1) < epsilon:
            # Sample over however many actions are defined rather than a
            # hard-coded [0, 1], so extending `actions` keeps exploration valid.
            action_idx = random.randrange(len(actions))
        else:
            # np.argmax returns the first maximum, so ties (e.g. the all-zero
            # initial table) resolve to action index 0.
            action_idx = np.argmax(Q[state])
        action = actions[action_idx]

        # Take step
        new_state, reward = step(state, action)

        # Update Q-value towards the one-step TD target:
        # reward + gamma * (best value available from the next state).
        td_target = reward + gamma * np.max(Q[new_state])
        Q[state, action_idx] += alpha * (td_target - Q[state, action_idx])

        state = new_state

print("Learned Q-table:")
print(Q)

Learned Q-table:
[[ 3.12197926  4.58      ]
 [ 3.12196216  6.2       ]
 [ 4.5799762   8.        ]
 [ 6.19996741 10.        ]
 [ 0.          0.        ]]


# 1. What is this Q-table?

* Rows = states (0, 1, 2, 3, 4)
* Columns = actions:

  * Col 0 = action = **−1** (move left)
  * Col 1 = action = **+1** (move right)
* Each entry = the expected **discounted** future reward (with `gamma = 0.9`) if the agent takes that action from that state and then takes the best learned action in every later state.

**Example:**

* `Q[0, 0] = 3.12` → If at **state 0** and you move left (which keeps you at 0), the expected future reward ≈ 3.1.
* `Q[0, 1] = 4.58` → If at **state 0** and you move right, the expected future reward ≈ 4.6.

---

# 2. Why do values increase as you move closer to the goal?

Reading row by row:

* **State 0**: `[3.12, 4.58]` → Going right (4.58) is better than left (3.12).
* **State 1**: `[3.12, 6.20]` → Going right gives higher value.
* **State 2**: `[4.58, 8.00]` → Right is much better.
* **State 3**: `[6.20, 10.0]` → Going right reaches the goal (reward 10).
* **State 4**: `[0, 0]` → Goal state: no actions matter anymore (episode ends).

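The numbers grow as you get closer to the goal because each Q-value looks ahead at **future discounted rewards**: every extra step away from the goal costs −1 and discounts everything that follows by `gamma = 0.9`. For the right-moving column: `Q[3,1] = 10`, `Q[2,1] = −1 + 0.9·10 = 8`, `Q[1,1] = −1 + 0.9·8 = 6.2`, `Q[0,1] = −1 + 0.9·6.2 = 4.58`.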

---

# 3. What does it mean in terms of policy?

The best action per state is the column with the higher Q-value:

* State 0 → best action = **+1 (right)**
* State 1 → best action = **+1 (right)**
* State 2 → best action = **+1 (right)**
* State 3 → best action = **+1 (right)**

So the **optimal policy** is: always move **right** until you reach the goal.

---

# 4. Why aren’t left moves negative?

Notice that even moving left has a positive number (e.g., 3.12).
That’s because:

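* Even if you waste a move, the Q-value assumes you act optimally afterwards, so you still eventually reach the goal (+10). For example, `Q[0, 0] = −1 + 0.9 · 4.58 ≈ 3.12`: one wasted step, then the best path from state 0.
* But since each wasted step costs a −1 penalty and further discounts the goal reward, left is **worse** than right.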

So Q-values for left are lower than for right.

---

# 5. How to interpret numerically?

Imagine starting at state 0:

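* If you always go right → shortest path = 4 steps → undiscounted total reward = (−1 −1 −1 +10) = **7**. The learned value `Q[0, 1] = 4.58` is lower because later rewards are discounted by `gamma = 0.9`: −1 + 0.9·(−1) + 0.9²·(−1) + 0.9³·10 ≈ 4.58.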
* If you go left sometimes → you delay reaching the goal, so the total reward shrinks (extra −1 penalties).

That’s why **right Q-values are higher**.

---

# Summary

* The Q-table tells us **how good each action is in each state**.
* The agent has learned to **always move right** to maximize reward.
* The values reflect the **expected discounted total reward**, considering step penalties and the final goal.

---


In [None]:
# Roll out one episode by always taking the highest-valued action in Q_table,
# rendering the environment before the first move and after every step.
# NOTE(review): `env` and `Q_table` are not defined in this cell; they must
# already exist in the session — confirm which cell creates them.
state = env.reset()
# Some Gym versions return (observation, info) from reset(); others return
# only the observation. Accept both.
state = state[0] if isinstance(state, tuple) else state

env.render()
done = False

while not done:
    action = np.argmax(Q_table[state, :])  # greedy (optimal) action
    step_result = env.step(action)
    if len(step_result) == 5:
        # 5-tuple API: (obs, reward, terminated, truncated, info). The episode
        # is over when either flag is set; ignoring `truncated` would keep
        # stepping an environment that has already hit its time limit.
        next_state, reward, terminated, truncated, _ = step_result
        done = terminated or truncated
    else:
        # Older API: (obs, reward, done, info).
        next_state, reward, done, *_ = step_result
    env.render()
    state = next_state

print("Game finished with reward:", reward)


If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/
  deprecation(
  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_names

Game finished with reward: 1.0
