In [None]:
import numpy as np
import random

# Initial setup: 4x4 grid world.
# Cell markers: 'S' start, 'F' regular cell, 'H' hole, 'G' goal.
gridworld = np.array([
    ['S', 'F', 'F', 'F'],
    ['F', 'H', 'F', 'H'],
    ['F', 'F', 'F', 'H'],
    ['H', 'F', 'F', 'G']
])

# Seed Python's global RNG so that exploration (and therefore the learned
# policy) is reproducible across "Restart & Run All".
SEED = 42
random.seed(SEED)

# Environment and learning hyperparameters
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration probability for the epsilon-greedy policy
episodes = 100  # Number of episodes to simulate
alpha = 0.1  # Learning rate (step size of the Q update)

# Reward received when entering a cell of each type
rewards = {
    'S': 0,  # Start
    'F': 0,  # Regular cell
    'H': -1,  # Hole
    'G': 1   # Goal
}

# Grid dimensions
n_rows, n_cols = gridworld.shape

# Available actions and their (row offset, column offset) effect
actions = ['up', 'down', 'left', 'right']
action_effects = {
    'up': (-1, 0),
    'down': (1, 0),
    'left': (0, -1),
    'right': (0, 1)
}

# Bounds check for a candidate grid position
def is_valid_move(row, col):
    """Tell whether (row, col) lies inside the grid.

    The limits are the module-level ``n_rows`` and ``n_cols``.

    Returns:
        A truthy value when both coordinates are within bounds, falsy otherwise.
    """
    row_inside = 0 <= row < n_rows
    if not row_inside:
        return row_inside
    return 0 <= col < n_cols

# Q-table Q(s, a): one entry per grid cell, mapping every action to an initial value of 0
Q = {
    (row, col): dict.fromkeys(actions, 0)
    for row in range(n_rows)
    for col in range(n_cols)
}

# Epsilon-greedy action selection
def epsilon_greedy(state):
    """Pick an action for ``state`` under the epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random action is returned (explore);
    otherwise the action with the highest Q-value for ``state`` is returned
    (exploit). Ties go to the action that comes first in ``Q[state]``.
    """
    explore = random.random() < epsilon
    if explore:
        return random.choice(actions)

    q_values = Q[state]
    return max(q_values, key=lambda name: q_values[name])

# Roll out one episode under the current epsilon-greedy policy
def generate_episode():
    """Simulate one episode from the start cell until a terminal cell is entered.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step, where
        ``reward`` is the reward of the cell reached by taking ``action``
        in ``state``.
    """
    # Locate the start cell ('S'); the first match is used.
    start_rows, start_cols = np.where(gridworld == 'S')
    state = (start_rows[0], start_cols[0])
    trajectory = []

    done = False
    while not done:
        action = epsilon_greedy(state)
        d_row, d_col = action_effects[action]
        target = (state[0] + d_row, state[1] + d_col)
        # A move that would leave the grid keeps the agent where it is.
        next_state = target if is_valid_move(*target) else state

        cell = gridworld[next_state]
        trajectory.append((state, action, rewards[cell]))

        # Entering the goal or a hole ends the episode.
        done = cell in ['G', 'H']
        state = next_state

    return trajectory

# First-visit Monte Carlo control with a constant step size
for _ in range(episodes):
    # Generate one episode under the current epsilon-greedy policy
    episode_data = generate_episode()

    # Earliest time step at which each (state, action) pair occurs.
    # Checking a "visited" set while walking the episode backwards would pick
    # the LAST occurrence of each pair; first-visit MC must use the return
    # that follows the FIRST occurrence.
    first_visit = {}
    for t, (state, action, _reward) in enumerate(episode_data):
        first_visit.setdefault((state, action), t)

    # Accumulate the discounted return G_t backwards and update Q on first visits
    G = 0
    for t in range(len(episode_data) - 1, -1, -1):
        state, action, reward = episode_data[t]
        G = reward + gamma * G
        if first_visit[(state, action)] == t:
            # Move Q(s, a) a fraction alpha towards the observed return
            Q[state][action] += alpha * (G - Q[state][action])

# Display the greedy policy derived from the learned Q-values.
# The string dtype is sized to the longest action name: a bare ``dtype=str``
# produces one-character cells, which silently truncates 'right' to 'r',
# 'down' to 'd', and so on.
label_width = max(len(name) for name in actions)
policy = np.empty_like(gridworld, dtype=f"<U{label_width}")
for row in range(n_rows):
    for col in range(n_cols):
        state = (row, col)
        if gridworld[state] in ['H', 'G']:
            # Terminal cells show their own marker instead of an action
            policy[state] = gridworld[state]
        else:
            # Greedy action; ties resolve to the first action in Q[state]
            policy[state] = max(Q[state], key=Q[state].get)

print("Política aprendida:")
print(policy)


Política aprendida:
[['r' 'r' 'd' 'r']
 ['u' 'H' 'd' 'H']
 ['r' 'r' 'd' 'H']
 ['H' 'r' 'r' 'G']]


Revisión en 1 dimensión

In [None]:
import numpy as np
import random

# Initial setup: a single-row world (hole, regular, start, regular, goal)
gridworld = np.array([list('HFSFG')])



In [None]:
# Environment and learning hyperparameters
episodes = 2   # Number of episodes to simulate
gamma = 0.9    # Discount factor
alpha = 0.1    # Learning rate (step size of the Q update)
epsilon = 0.1  # Exploration probability for the epsilon-greedy policy



In [None]:
# Reward received when entering a cell of each type:
# 'S' start, 'F' regular cell, 'H' hole, 'G' goal
rewards = dict(S=0, F=0, H=-1, G=1)

In [None]:
# Grid dimensions
n_rows, n_cols = gridworld.shape

# Available actions in the single-row world (left, right) and their
# (row offset, column offset) effect
action_effects = {'left': (0, -1), 'right': (0, 1)}
actions = list(action_effects)

In [None]:
# Bounds check for a candidate grid position
def is_valid_move(row, col):
    """Tell whether (row, col) lies inside the grid.

    The limits are the module-level ``n_rows`` and ``n_cols``.

    Returns:
        A truthy value when both coordinates are within bounds, falsy otherwise.
    """
    row_inside = 0 <= row < n_rows
    if not row_inside:
        return row_inside
    return 0 <= col < n_cols

In [None]:
# Q-table Q(s, a): one entry per grid cell, mapping every action to an initial value of 0
Q = {
    (row, col): dict.fromkeys(actions, 0)
    for row in range(n_rows)
    for col in range(n_cols)
}
print(Q)

{(0, 0): {'left': 0, 'right': 0}, (0, 1): {'left': 0, 'right': 0}, (0, 2): {'left': 0, 'right': 0}, (0, 3): {'left': 0, 'right': 0}, (0, 4): {'left': 0, 'right': 0}}


In [None]:
# Epsilon-greedy action selection
def epsilon_greedy(state):
    """Pick an action for ``state`` under the epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random action is returned (explore);
    otherwise the action with the highest Q-value for ``state`` is returned
    (exploit). Ties go to the action that comes first in ``Q[state]``.
    """
    explore = random.random() < epsilon
    if explore:
        return random.choice(actions)

    q_values = Q[state]
    return max(q_values, key=lambda name: q_values[name])


In [None]:
# Simulate one episode
def generate_episode():
    """Simulate one episode from the start cell until a terminal cell is entered.

    Prints the index arrays returned by ``np.where`` for the start cell, then
    follows the epsilon-greedy policy step by step.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step, where
        ``reward`` is the reward of the cell reached by taking ``action``
        in ``state``.
    """
    # Initial state: position of the 'S' cell (first match)
    row, col = np.where(gridworld == 'S')
    # Walkthrough trace: shows what np.where returns for the start cell
    print("np.where(gridworld == 'S') , row=",str(row)," col= ",str(col))
    state = (row[0], col[0])
    episode = []

    while True:
        action = epsilon_greedy(state)
        # Apply the action. The new row is derived from the current state and
        # the action's row offset rather than being pinned to 0, so the
        # transition stays correct on grids with more than one row.
        d_row, d_col = action_effects[action]
        new_row = state[0] + d_row
        new_col = state[1] + d_col
        if is_valid_move(new_row, new_col):
            next_state = (new_row, new_col)
        else:
            next_state = state  # Invalid move: stay in the same cell

        # Record the transition
        reward = rewards[gridworld[next_state]]
        episode.append((state, action, reward))

        # The episode ends when the goal or a hole is entered
        if gridworld[next_state] in ['G', 'H']:
            break
        state = next_state

    return episode



In [None]:
# First-visit Monte Carlo control with a constant step size
for _ in range(episodes):
    # Generate one episode under the current epsilon-greedy policy
    episode_data = generate_episode()

    # Earliest time step at which each (state, action) pair occurs.
    # Checking a "visited" set while walking the episode backwards would pick
    # the LAST occurrence of each pair; first-visit MC must use the return
    # that follows the FIRST occurrence.
    first_visit = {}
    for t, (state, action, _reward) in enumerate(episode_data):
        first_visit.setdefault((state, action), t)

    # Accumulate the discounted return G_t backwards and update Q on first visits
    G = 0
    for t in range(len(episode_data) - 1, -1, -1):
        state, action, reward = episode_data[t]
        G = reward + gamma * G
        if first_visit[(state, action)] == t:
            # Move Q(s, a) a fraction alpha towards the observed return
            Q[state][action] += alpha * (G - Q[state][action])

np.where(gridworld == 'S') , row= [0]  col=  [2]
np.where(gridworld == 'S') , row= [0]  col=  [2]


In [None]:


# Display the greedy policy derived from the learned Q-values.
# The string dtype is sized to the longest action name: a bare ``dtype=str``
# produces one-character cells, which silently truncates 'right' to 'r'
# and 'left' to 'l'.
label_width = max(len(name) for name in actions)
policy = np.empty_like(gridworld, dtype=f"<U{label_width}")
for row in range(n_rows):
    for col in range(n_cols):
        state = (row, col)
        if gridworld[state] in ['H', 'G']:
            # Terminal cells show their own marker instead of an action
            policy[state] = gridworld[state]
        else:
            # Greedy action; ties resolve to the first action in Q[state]
            policy[state] = max(Q[state], key=Q[state].get)

print("Política aprendida:")
print(policy)


Política aprendida:
[['H' 'r' 'r' 'r' 'G']]
