In [10]:
import os

import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tensorboardX import SummaryWriter
from torch.distributions import Categorical

# Select the compute device: a CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Seed PyTorch's global RNG so weight initialisation and action sampling
# start from the same state on every Restart & Run All
seed = 0
torch.manual_seed(seed)

# Hyperparameters
learning_rate = 0.0001  # step size passed to the Adam optimizer
gamma = 0.99  # discount factor applied when accumulating returns
map_size = 4  # 4x4 or 8x8 grid



Using device: cuda


In [11]:
class Policy(nn.Module):
    """Softmax policy network trained with REINFORCE on a discrete state space.

    States are integer indices that are one-hot encoded, passed through one
    hidden layer of 128 ReLU units and mapped to a probability distribution
    over actions.

    Args:
        state_size: Number of discrete states (width of the one-hot encoding).
        action_size: Number of discrete actions (width of the output).
        lr: Adam learning rate. When None, the module-level ``learning_rate``
            is used.
        discount: Discount factor for returns. When None, the module-level
            ``gamma`` is read each time ``train_net`` runs.
    """

    def __init__(self, state_size, action_size, lr=None, discount=None):
        super().__init__()
        # Per-episode buffer of (reward, probability of the action taken) pairs
        self.data = []
        self.discount = discount

        # Cast to plain Python ints so sizes coming from NumPy integers are
        # accepted by nn.Linear and F.one_hot alike
        self.fc1 = nn.Linear(int(state_size), 128)
        self.fc2 = nn.Linear(128, int(action_size))

        self.optimizer = optim.Adam(
            self.parameters(), lr=learning_rate if lr is None else lr
        )

    def forward(self, x):
        """Return action probabilities for the given state index or indices.

        Args:
            x: State index as a Python int or an integer tensor; any leading
                shape is kept. Non-integer input is truncated to int64.

        Returns:
            Tensor of probabilities whose last dimension has ``action_size``
            entries summing to 1.
        """
        # F.one_hot only accepts int64 indices; also place them on the same
        # device as the weights
        x = torch.as_tensor(x, dtype=torch.long, device=self.fc1.weight.device)
        x = F.one_hot(x, num_classes=self.fc1.in_features).float()
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc2(x), dim=-1)

    def put_data(self, item):
        """Append one ``(reward, prob_of_taken_action)`` pair to the episode buffer."""
        self.data.append(item)

    def train_net(self):
        """Apply one REINFORCE update from the buffered episode, then clear it.

        The loss is ``-sum_t log(prob_t) * G_t`` where ``G_t`` is the discounted
        return from step ``t`` onward. Does nothing when the buffer is empty.
        """
        if not self.data:
            return

        discount = gamma if self.discount is None else self.discount

        # Accumulate discounted returns back-to-front, then restore step order
        returns = []
        running = 0.0
        for reward, _ in reversed(self.data):
            running = reward + discount * running
            returns.append(running)
        returns.reverse()

        # One backward pass over the summed loss yields the same gradients as
        # a separate backward() per step
        probs = torch.stack([prob.reshape(()) for _, prob in self.data])
        returns_t = torch.tensor(returns, dtype=probs.dtype, device=probs.device)
        loss = -(torch.log(probs) * returns_t).sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.data = []



In [12]:
# Deterministic (non-slippery) FrozenLake on the map_size x map_size grid
env = gym.make('FrozenLake-v1',
               map_name=f"{map_size}x{map_size}",
               is_slippery=False)

# Convert both space sizes to plain Python ints so the two are handled
# consistently (gymnasium reports them as NumPy integers)
state_size = int(env.observation_space.n)
action_size = int(env.action_space.n)
pi = Policy(state_size, action_size).to(device)  # place the policy on the selected device
print_interval = 100  # number of episodes averaged per progress report


In [13]:
score = 0.0  # sum of rewards since the last progress report
steps = 0  # total environment steps taken; used as the TensorBoard x-axis
writer = SummaryWriter(log_dir=f'runs/REINFORCE_{map_size}x{map_size}')

# try/finally so the writer is flushed and the env released even when the
# run is interrupted from the keyboard
try:
    for n_epi in range(100000):
        s, _ = env.reset()
        done = False

        while not done:
            steps += 1
            s_tensor = torch.tensor(s, device=device, dtype=torch.long)

            # Sample an action from the current policy distribution
            prob = pi(s_tensor)
            m = Categorical(prob)
            a = m.sample()

            s_prime, r, terminated, truncated, _ = env.step(a.item())
            done = terminated or truncated

            # Keep the probability of the chosen action (with its graph) for the update
            pi.put_data((r, prob[a]))
            s = s_prime
            score += r
        pi.train_net()

        # Report after every print_interval completed episodes, so each
        # average covers exactly print_interval episodes
        if (n_epi + 1) % print_interval == 0:
            avg_score = score / print_interval
            print(f"Ep {n_epi + 1:5d} | Avg Score: {avg_score:.2f}")
            writer.add_scalar('Return', avg_score, steps)
            score = 0.0
finally:
    writer.close()
    env.close()


Ep   100 | Avg Score: 0.00
Ep   200 | Avg Score: 0.02
Ep   300 | Avg Score: 0.00
Ep   400 | Avg Score: 0.01
Ep   500 | Avg Score: 0.00
Ep   600 | Avg Score: 0.02
Ep   700 | Avg Score: 0.00
Ep   800 | Avg Score: 0.01
Ep   900 | Avg Score: 0.03
Ep  1000 | Avg Score: 0.04
Ep  1100 | Avg Score: 0.01
Ep  1200 | Avg Score: 0.03
Ep  1300 | Avg Score: 0.02
Ep  1400 | Avg Score: 0.02
Ep  1500 | Avg Score: 0.03
Ep  1600 | Avg Score: 0.01
Ep  1700 | Avg Score: 0.04
Ep  1800 | Avg Score: 0.03
Ep  1900 | Avg Score: 0.06
Ep  2000 | Avg Score: 0.04
Ep  2100 | Avg Score: 0.05
Ep  2200 | Avg Score: 0.05
Ep  2300 | Avg Score: 0.02
Ep  2400 | Avg Score: 0.05
Ep  2500 | Avg Score: 0.07
Ep  2600 | Avg Score: 0.04
Ep  2700 | Avg Score: 0.04
Ep  2800 | Avg Score: 0.04
Ep  2900 | Avg Score: 0.06
Ep  3000 | Avg Score: 0.08
Ep  3100 | Avg Score: 0.11
Ep  3200 | Avg Score: 0.07
Ep  3300 | Avg Score: 0.11
Ep  3400 | Avg Score: 0.12
Ep  3500 | Avg Score: 0.08
Ep  3600 | Avg Score: 0.14
Ep  3700 | Avg Score: 0.15
E

KeyboardInterrupt: 

In [4]:
# Save to the same path the checkpoint is later loaded from
# ('models/policy_model.pth'), creating the directory on first use
os.makedirs('models', exist_ok=True)
torch.save(pi.state_dict(), 'models/policy_model.pth')


In [4]:
# Rebuild the network and restore its trained weights. state_size/action_size
# must match the map the checkpoint was trained on, otherwise load_state_dict
# raises a size-mismatch error.
pi = Policy(state_size, action_size).to(device)
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict holds
pi.load_state_dict(
    torch.load('models/policy_model.pth', map_location=device, weights_only=True)
)
pi.eval()


Policy(
  (fc1): Linear(in_features=64, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=4, bias=True)
)

In [5]:
# Same deterministic FrozenLake layout as in training, rendered in a window
env = gym.make('FrozenLake-v1',
               map_name=f"{map_size}x{map_size}",
               is_slippery=False,
               render_mode="human")
state_size = int(env.observation_space.n)
action_size = int(env.action_space.n)

pi.eval()
num_episodes = 5
successes = 0

# try/finally so the render window is closed even if an episode raises
try:
    for ep in range(num_episodes):
        s, _ = env.reset()
        done = False
        total_reward = 0
        steps = 0

        while not done:
            # Put the state index on the same device as the policy
            s_tensor = torch.tensor(s, device=device, dtype=torch.long)

            with torch.no_grad():
                probs = pi(s_tensor)

            action = torch.argmax(probs).item()  # Greedy action selection

            s, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            total_reward += reward
            steps += 1

        # The episode counts as a success when its final step paid a reward of 1
        print(f"Episode {ep+1}: Reward {total_reward}, Steps {steps}, {'Success' if reward == 1 else 'Fail'}")
        if reward == 1:
            successes += 1

    print(f"\nSuccess rate: {successes}/{num_episodes} ({100*successes/num_episodes:.1f}%)")
finally:
    env.close()


Episode 1: Reward 1.0, Steps 14, Success
Episode 2: Reward 1.0, Steps 14, Success
Episode 3: Reward 1.0, Steps 14, Success
Episode 4: Reward 1.0, Steps 14, Success
Episode 5: Reward 1.0, Steps 14, Success

Success rate: 5/5 (100.0%)


: 