In [1]:
import numpy as np
import random
from collections import defaultdict

# ---------- تنظیمات محیط ----------
GRID = [
    ['S', '-', '-', '-'],
    ['-', '#', '-', 'G']
]
ACTIONS = ['up', 'down', 'left', 'right']
GAMMA = 0.99
EPSILON = 0.1

def get_start_state():
    for i, row in enumerate(GRID):
        for j, cell in enumerate(row):
            if cell == 'S':
                return (i, j)

def is_terminal(state):
    i, j = state
    return GRID[i][j] == 'G'

def step(state, action):
    i, j = state
    if action == 'up':
        i = max(0, i - 1)
    elif action == 'down':
        i = min(len(GRID) - 1, i + 1)
    elif action == 'left':
        j = max(0, j - 1)
    elif action == 'right':
        j = min(len(GRID[0]) - 1, j + 1)

    if GRID[i][j] == '#':
        return state, -1, False
    elif GRID[i][j] == 'G':
        return (i, j), +1, True
    else:
        return (i, j), -0.01, False

def epsilon_greedy(Q, state):
    if random.random() < EPSILON:
        return random.choice(ACTIONS)
    values = [Q[(state, a)] for a in ACTIONS]
    return ACTIONS[np.argmax(values)]

# ---------- الگوریتم مونت‌کارلو ----------
def monte_carlo_control(num_episodes=1000):
    Q = defaultdict(float)
    returns = defaultdict(list)

    for episode in range(num_episodes):
        state = get_start_state()
        episode_data = []
        
        # Generate episode
        while not is_terminal(state):
            action = epsilon_greedy(Q, state)
            next_state, reward, done = step(state, action)
            episode_data.append((state, action, reward))
            state = next_state
            if done:
                break

        # Calculate returns and update Q
        G = 0
        visited = set()
        for t in reversed(range(len(episode_data))):
            s, a, r = episode_data[t]
            G = GAMMA * G + r
            if (s, a) not in visited:
                visited.add((s, a))
                returns[(s, a)].append(G)
                Q[(s, a)] = np.mean(returns[(s, a)])

    return Q

# ---------- ارزیابی ----------
def evaluate_policy(Q, episodes=100):
    success = 0
    for _ in range(episodes):
        state = get_start_state()
        while not is_terminal(state):
            values = [Q[(state, a)] for a in ACTIONS]
            best_action = ACTIONS[np.argmax(values)]
            next_state, reward, done = step(state, best_action)
            state = next_state
            if done and reward > 0:
                success += 1
    print(f"\n✅ Monte Carlo Success rate: {success / episodes * 100:.2f}% in {episodes} test episodes.")

# ---------- اجرا ----------
Q_mc = monte_carlo_control()
for key in sorted(Q_mc.keys()):
    print(f"{key}: {Q_mc[key]:.2f}")

evaluate_policy(Q_mc)


((0, 0), 'down'): -0.23
((0, 0), 'left'): 0.88
((0, 0), 'right'): 0.89
((0, 0), 'up'): 0.44
((0, 1), 'down'): -0.10
((0, 1), 'left'): 0.45
((0, 1), 'right'): 0.95
((0, 1), 'up'): 0.89
((0, 2), 'down'): 0.90
((0, 2), 'left'): 0.91
((0, 2), 'right'): 0.98
((0, 2), 'up'): 0.95
((0, 3), 'down'): 1.00
((0, 3), 'left'): 0.95
((0, 3), 'right'): 0.98
((0, 3), 'up'): 0.98
((1, 0), 'down'): -4.37
((1, 0), 'left'): -4.10
((1, 0), 'right'): -0.86
((1, 0), 'up'): 0.53
((1, 2), 'down'): 0.96
((1, 2), 'left'): -0.62
((1, 2), 'right'): 1.00
((1, 2), 'up'): 0.90

✅ Monte Carlo Success rate: 100.00% in 100 test episodes.
