In [55]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np
from tensorboardX import SummaryWriter

In [56]:
device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else  
    "cpu"
)
print(f"Using device: {device}")

Using device: cuda


In [57]:
gamma = 0.98
n_rollout = 30
map_size = 8
env = gym.make('FrozenLake-v1',map_name = f'{map_size}x{map_size}', is_slippery=False)
state_dim = env.observation_space.n
action_dim = env.action_space.n
max_episodes = 25000

In [58]:
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorCritic, self).__init__()
        # Actor network (π)
        self.pi_fc1 = nn.Linear(state_dim, 128)
        self.pi_fc2 = nn.Linear(128, action_dim)
        
        # Critic network (V)
        self.v_fc1 = nn.Linear(state_dim, 128)
        self.v_fc2 = nn.Linear(128, 1)
        
        # Separate optimizers
        self.actor_optim = optim.Adam(self.actor_parameters(), lr=0.001)
        self.critic_optim = optim.Adam(self.critic_parameters(), lr=0.002)
        self.data = []
        
    def actor_parameters(self):
        return list(self.pi_fc1.parameters()) + list(self.pi_fc2.parameters())
    
    def critic_parameters(self):
        return list(self.v_fc1.parameters()) + list(self.v_fc2.parameters())
        
    def pi(self, x, softmax_dim=0):
        x = F.relu(self.pi_fc1(x))
        x = self.pi_fc2(x)
        return F.softmax(x, dim=softmax_dim)
    
    def v(self, x):
        x = F.relu(self.v_fc1(x))
        return self.v_fc2(x)
    
    def put_data(self, transition):
        self.data.append(transition)
        
    def make_batch(self):
        s_lst, a_lst, r_lst, s_prime_lst, done_lst = [], [], [], [], []
        for transition in self.data:
            s, a, r, s_prime, done = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r/100.0])
            s_prime_lst.append(s_prime)
            done_lst.append([0.0] if done else [1.0])
            
        return (torch.tensor(s_lst, dtype=torch.float, device=device),
                torch.tensor(a_lst, device=device),
                torch.tensor(r_lst, dtype=torch.float, device=device),
                torch.tensor(s_prime_lst, dtype=torch.float, device=device),
                torch.tensor(done_lst, dtype=torch.float, device=device))
  
    def train_net(self):
        s, a, r, s_prime, done = self.make_batch()
        td_target = r + gamma * self.v(s_prime) * done
        delta = td_target - self.v(s)
        
        pi = self.pi(s, softmax_dim=1)
        pi_a = pi.gather(1, a)
        policy_loss = -torch.log(pi_a) * delta.detach()
        value_loss = F.mse_loss(self.v(s), td_target.detach())
        loss = policy_loss + value_loss

        self.actor_optim.zero_grad()
        self.critic_optim.zero_grad()

        policy_loss.mean().backward()
        value_loss.backward()

        self.actor_optim.step()
        self.critic_optim.step()

        self.data = []
        return policy_loss.mean().detach(),value_loss.detach(),loss

In [59]:
def one_hot(state, state_dim):
    vec = np.zeros(state_dim)
    vec[state] = 1.0
    return torch.tensor(vec, dtype=torch.float, device=device)

In [60]:
def train_agent(env):
    state_dim = env.observation_space.n
    action_dim = env.action_space.n
    writer = SummaryWriter(log_dir=f'runs/A2C_{map_size}x{map_size}')

    model = ActorCritic(state_dim, action_dim)
    model.to(device)
    print_interval = 100
    score = 0.0
    steps = 0

    for n_epi in range(max_episodes):
        s, _ = env.reset()
        s = one_hot(s, state_dim)
        done = False

        while not done:
            for t in range(n_rollout):
                steps+=1
                with torch.no_grad():
                    prob = model.pi(s)
                m = Categorical(prob)
                a = m.sample().item()
                s_prime, r, terminated, truncated, _ = env.step(a)
                done = terminated or truncated
                s_prime_oh = one_hot(s_prime, state_dim)
                model.put_data((s.cpu().numpy(), a, r, s_prime_oh.cpu().numpy(), done))
                s = s_prime_oh
                score += r
                if done:
                    break
        actor_loss,critic_loss,loss = model.train_net()

        if n_epi % print_interval == 0 and n_epi != 0:
            avg_score = score / print_interval
            print(f"Episode: {n_epi}, Avg Score: {avg_score:.2f}")
            
            writer.add_scalar('Return', avg_score, steps)
            # writer.add_scalar('Actor Loss', actor_loss.mean(), steps)
            writer.add_scalar('Critic Loss', critic_loss.item(), steps)

            score = 0.0
    env.close()
    writer.close()

    return model


In [61]:
def test_agent(model):
    env = gym.make('FrozenLake-v1', is_slippery=True, map_name = f'{map_size}x{map_size}', render_mode='human')
    state_dim = env.observation_space.n
    s, _ = env.reset()
    s = one_hot(s, state_dim)
    done = False

    while not done:
        with torch.no_grad():
            prob = model.pi(s)
        a = torch.argmax(prob).item()
        s_prime, r, terminated, truncated, _ = env.step(a)
        done = terminated or truncated
        s = one_hot(s_prime, state_dim)
        env.render()
    env.close()


In [62]:
trained_model = train_agent(env)

Episode: 100, Avg Score: 0.00
Episode: 200, Avg Score: 0.01
Episode: 300, Avg Score: 0.00
Episode: 400, Avg Score: 0.00
Episode: 500, Avg Score: 0.00
Episode: 600, Avg Score: 0.00
Episode: 700, Avg Score: 0.00
Episode: 800, Avg Score: 0.00
Episode: 900, Avg Score: 0.01
Episode: 1000, Avg Score: 0.00
Episode: 1100, Avg Score: 0.01
Episode: 1200, Avg Score: 0.00
Episode: 1300, Avg Score: 0.00
Episode: 1400, Avg Score: 0.01
Episode: 1500, Avg Score: 0.00
Episode: 1600, Avg Score: 0.00
Episode: 1700, Avg Score: 0.01
Episode: 1800, Avg Score: 0.00
Episode: 1900, Avg Score: 0.00
Episode: 2000, Avg Score: 0.01
Episode: 2100, Avg Score: 0.02
Episode: 2200, Avg Score: 0.00
Episode: 2300, Avg Score: 0.02
Episode: 2400, Avg Score: 0.01
Episode: 2500, Avg Score: 0.01
Episode: 2600, Avg Score: 0.01
Episode: 2700, Avg Score: 0.01
Episode: 2800, Avg Score: 0.02
Episode: 2900, Avg Score: 0.02
Episode: 3000, Avg Score: 0.01
Episode: 3100, Avg Score: 0.03
Episode: 3200, Avg Score: 0.05
Episode: 3300, Av

KeyboardInterrupt: 

In [11]:
!ls runs/

ls: cannot access runs/: No such file or directory


In [54]:
!rm -rf runs/A2C_8x8/

In [None]:
torch.save(trained_model.state_dict(), "models/frozenlake_a2c.pth")
print("Model saved as frozenlake_a2c.pth")

In [8]:
trained_model = ActorCritic(state_dim, action_dim).to(device)

In [9]:
trained_model.load_state_dict(torch.load("models/frozenlake_a2c.pth", map_location=device))
trained_model.eval()  # Set to evaluation mode if only using for inference
print("Model loaded from frozenlake_a2c.pth")

Model loaded from frozenlake_actor_critic.pth


In [11]:
test_agent(trained_model)