In [8]:
import numpy as np
import random

# ---------- Environment settings ----------
# Grid legend, as interpreted by the functions below:
#   'S' = start cell, 'G' = goal cell, '#' = blocked cell, '-' = free cell.
GRID = [
    ['S', '-', '-', '-'],  # row 0
    ['-', '#', '-', 'G'],  # row 1
]

# Action names in the order used for greedy tie-breaking.
ACTIONS = [
    'up',
    'down',
    'left',
    'right',
]

ALPHA = 0.1    # step size applied to each TD update
GAMMA = 0.99   # discount factor for the bootstrapped next value
LAMBDA = 0.8   # eligibility-trace decay (combined with GAMMA each step)
EPSILON = 0.1  # default probability of picking a random action

def get_start_state(grid=None):
    """Return the ``(row, col)`` position of the start cell ``'S'``.

    Args:
        grid: Optional 2-D list of cell markers to search. When omitted,
            the module-level ``GRID`` is used.

    Returns:
        The ``(row, col)`` tuple of the first ``'S'`` found when scanning
        rows top to bottom and cells left to right.

    Raises:
        ValueError: If no cell in the grid is marked ``'S'``.
    """
    if grid is None:
        grid = GRID
    for i, row in enumerate(grid):
        for j, cell in enumerate(row):
            if cell == 'S':
                return (i, j)
    # Without this, the function would fall through and return None, and the
    # mistake would only show up later as an opaque tuple-unpacking error.
    raise ValueError("grid has no start cell 'S'")

def is_terminal(state, grid=None):
    """Tell whether ``state`` is the goal cell.

    Args:
        state: ``(row, col)`` position to inspect.
        grid: Optional 2-D list of cell markers. When omitted, the
            module-level ``GRID`` is used.

    Returns:
        ``True`` if the cell at ``state`` is marked ``'G'``, else ``False``.
    """
    if grid is None:
        grid = GRID
    i, j = state
    return grid[i][j] == 'G'

def step(state, action, grid=None):
    """Apply ``action`` in ``state`` and return the resulting transition.

    Moves are clamped at the grid border, so walking off an edge leaves the
    position unchanged. An action name other than 'up', 'down', 'left' or
    'right' also leaves the position unchanged.

    Args:
        state: Current ``(row, col)`` position.
        action: One of ``'up'``, ``'down'``, ``'left'``, ``'right'``.
        grid: Optional 2-D list of cell markers. When omitted, the
            module-level ``GRID`` is used. The column bound is taken from
            the first row, so the grid is treated as rectangular.

    Returns:
        A ``(next_state, reward, done)`` tuple:
          * target cell is ``'#'``: stay in ``state``, reward ``-1``,
            ``done`` is ``False``;
          * target cell is ``'G'``: move there, reward ``+1``, ``done`` is
            ``True``;
          * otherwise: move there, reward ``-0.01``, ``done`` is ``False``.
    """
    if grid is None:
        grid = GRID

    i, j = state
    if action == 'up':
        i = max(0, i - 1)
    elif action == 'down':
        i = min(len(grid) - 1, i + 1)
    elif action == 'left':
        j = max(0, j - 1)
    elif action == 'right':
        j = min(len(grid[0]) - 1, j + 1)

    target = grid[i][j]
    if target == '#':
        # Blocked: the agent bounces back to where it was and is penalised.
        return state, -1, False
    if target == 'G':
        return (i, j), +1, True
    # Small per-step cost on ordinary cells.
    return (i, j), -0.01, False

def epsilon_greedy(Q, state, epsilon=EPSILON):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    Args:
        Q: Dict mapping ``(state, action)`` pairs to value estimates; pairs
            that are missing count as ``0.0``.
        state: The state to act in.
        epsilon: Probability of returning a uniformly random action.

    Returns:
        A member of ``ACTIONS``. On the greedy branch, ties go to the action
        that appears first in ``ACTIONS``.
    """
    explore = random.random() < epsilon
    if explore:
        return random.choice(ACTIONS)

    # Greedy branch: look up every action's estimate, then take the best.
    action_values = []
    for candidate in ACTIONS:
        action_values.append(Q.get((state, candidate), 0.0))
    best_index = np.argmax(action_values)
    return ACTIONS[best_index]

# ---------- Q(lambda) algorithm ----------
def q_lambda(num_episodes=1000):
    """Learn action values on the grid using TD updates with eligibility traces.

    Each episode starts at the start cell and follows an epsilon-greedy
    policy until ``step`` reports that the goal was reached. Traces are
    accumulating (incremented by 1 on each visit) and decay by
    ``GAMMA * LAMBDA`` per step; they are reset at the start of every episode.

    Note: the TD target bootstraps from the value of the action actually
    selected for the next state, i.e. the update is on-policy (SARSA-style),
    not a max over next actions.

    Args:
        num_episodes: Number of training episodes to run.

    Returns:
        Dict mapping ``(state, action)`` pairs to learned values. Pairs for
        the goal state are created too, but their values stay at ``0.0``.
    """
    Q = {}
    trace_decay = GAMMA * LAMBDA  # loop-invariant

    for _ in range(num_episodes):
        state = get_start_state()
        action = epsilon_greedy(Q, state)
        E = {}  # eligibility traces, only for pairs visited this episode

        done = is_terminal(state)
        while not done:
            next_state, reward, done = step(state, action)
            # Selected even when the episode just ended, so the random-number
            # stream and the set of keys in Q stay the same as before.
            next_action = epsilon_greedy(Q, next_state)

            sa = (state, action)
            next_sa = (next_state, next_action)
            Q.setdefault(sa, 0.0)
            Q.setdefault(next_sa, 0.0)

            # A terminal state has no future return; make that explicit
            # instead of relying on its Q entries never being updated.
            bootstrap = 0.0 if done else Q[next_sa]
            delta = reward + GAMMA * bootstrap - Q[sa]
            E[sa] = E.get(sa, 0.0) + 1

            # Only pairs with a trace can change, so sweep E rather than the
            # whole Q table. Every key in E was inserted into Q above.
            for key, trace in E.items():
                Q[key] += ALPHA * delta * trace
                E[key] = trace * trace_decay

            state, action = next_state, next_action
    return Q

# ---------- Performance evaluation ----------
def evaluate_policy(Q, episodes=100, max_steps=None):
    """Run the greedy policy derived from ``Q`` and report how often it wins.

    Args:
        Q: Dict mapping ``(state, action)`` pairs to value estimates.
        episodes: Number of evaluation episodes to run.
        max_steps: Maximum number of steps per episode before the episode is
            counted as a failure. Defaults to the number of cells in ``GRID``.

    Returns:
        The fraction of episodes that reached the goal with a positive
        reward (``0.0`` when ``episodes`` is 0). The same figure is printed
        as a percentage.
    """
    if max_steps is None:
        # With epsilon=0 the action choice never takes the random branch and
        # step() has no randomness, so a run that lasts longer than the
        # number of cells must have revisited a state and would cycle forever.
        max_steps = sum(len(row) for row in GRID)

    success = 0
    for _ in range(episodes):
        state = get_start_state()
        steps_taken = 0
        # The step cap stops a policy that never reaches the goal (for
        # example an untrained Q) from hanging the evaluation.
        while not is_terminal(state) and steps_taken < max_steps:
            action = epsilon_greedy(Q, state, epsilon=0)  # no exploration
            state, reward, done = step(state, action)
            steps_taken += 1
            if done and reward > 0:
                success += 1

    # Avoid dividing by zero when no episodes were requested.
    success_rate = success / episodes if episodes else 0.0
    print(f"\n✅ Success rate: {success_rate * 100:.2f}% in {episodes} test episodes.")
    return success_rate

# ---------- Run ----------
# Train with the default number of episodes, print the learned table in
# sorted key order, then evaluate the resulting greedy policy.
Q_result = q_lambda()
for key in sorted(Q_result):
    print(f"{key}: {Q_result[key]:.2f}")

evaluate_policy(Q_result)


((0, 0), 'down'): 0.67
((0, 0), 'left'): 0.70
((0, 0), 'right'): 0.93
((0, 0), 'up'): 0.66
((0, 1), 'down'): -0.08
((0, 1), 'left'): 0.85
((0, 1), 'right'): 0.96
((0, 1), 'up'): 0.87
((0, 2), 'down'): 0.74
((0, 2), 'left'): 0.76
((0, 2), 'right'): 0.98
((0, 2), 'up'): 0.90
((0, 3), 'down'): 1.00
((0, 3), 'left'): 0.89
((0, 3), 'right'): 0.92
((0, 3), 'up'): 0.96
((1, 0), 'down'): -0.08
((1, 0), 'left'): -0.23
((1, 0), 'right'): -0.21
((1, 0), 'up'): 0.79
((1, 2), 'down'): 0.08
((1, 2), 'up'): 0.82
((1, 3), 'down'): 0.00
((1, 3), 'left'): 0.00
((1, 3), 'right'): 0.00
((1, 3), 'up'): 0.00

✅ Success rate: 100.00% in 100 test episodes.
