In [160]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

from training import *
from models import *
from A2C_agent import *
from helpers import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [161]:
# --- Hyperparameters ---------------------------------------------------------
batch_size = 256               # an update is triggered once this many transitions are buffered
gamma_ = 0.99                  # discount factor handed to the agent
num_episodes = 1000
max_steps_per_episode = 1000   # upper bound on the inner loop; the env may end an episode sooner
random_seed = 0

# Seed the RNGs reachable from this notebook so a Restart & Run All is repeatable.
# NOTE(review): if the agent's eps-greedy policy draws from Python's `random`
# module, that generator needs seeding as well -- confirm in A2C_agent.
np.random.seed(random_seed)
torch.manual_seed(random_seed)

device = device_selection() # mps -> cuda -> cpu

# Initialize environment
env = gym.make('CartPole-v1')

# Initialize agent. Network input/output sizes are read from the environment's
# spaces instead of being hard-coded, and the discount reuses `gamma_` so the
# value configured above is the one actually used.
agent = Agent(
    input_size=int(env.observation_space.shape[0]),
    hidden_size=64,
    output_size_actor=int(env.action_space.n),
    output_size_critic=1,
    eps=0.1,
    gamma=gamma_,
    lr=1e-4,
    K=1,
    device=device,
)

# Buffer of (state, action, reward, next_state, terminated) transitions
batch = []

for episode in range(num_episodes):
    # Seed the environment on the first reset only; re-seeding every episode
    # would replay the same initial state each time.
    state, _ = env.reset(seed=random_seed if episode == 0 else None)
    state = torch.from_numpy(state).float().to(device)  # Convert state to a tensor

    episode_reward = 0

    for t in range(max_steps_per_episode):
        action = agent.select_action(state, worker_id=0, policy="eps-greedy")
        next_state, reward, terminated, truncated, _ = env.step(action.item())

        next_state = torch.from_numpy(next_state).float().to(device)  # Convert next_state to a tensor
        done = terminated or truncated  # the episode is over for either reason
        episode_reward += reward

        # Store `terminated`, not `done`: a time-limit truncation is not a true
        # terminal state, so the value target should still bootstrap from
        # `next_state`. The batch is flushed at every episode end below, so the
        # flag is not needed to mark episode boundaries inside a batch.
        # NOTE(review): assumes agent.train uses this flag only to mask the
        # bootstrap term -- confirm against A2C_agent.
        batch.append((state, action, reward, next_state, terminated))

        if len(batch) >= batch_size or done:
            # Train on the buffered transitions, then start a fresh buffer
            worker_losses = agent.train(batch, agent.gamma, agent.lr, agent.device)
            batch.clear()

        state = next_state
        if done:
            break

    if episode % 100 == 0:
        print(f"Episode {episode} finished after {t+1} steps with reward {episode_reward:.2f}")



Episode 0 finished after 26 steps with reward 26.00
Episode 100 finished after 11 steps with reward 11.00
Episode 200 finished after 15 steps with reward 15.00
Episode 300 finished after 16 steps with reward 16.00
Episode 400 finished after 14 steps with reward 14.00
Episode 500 finished after 17 steps with reward 17.00
Episode 600 finished after 19 steps with reward 19.00
Episode 700 finished after 17 steps with reward 17.00
Episode 800 finished after 12 steps with reward 12.00
Episode 900 finished after 10 steps with reward 10.00
