In [1]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

from training import *
from models import *
from A2C_agent import *
from helpers import *

%load_ext autoreload
%autoreload 2

In [168]:
device = device_selection() # mps -> cuda -> cpu

# Initialize environment
env = gym.make('CartPole-v1') 

# hyperparameters
batch_size = 256
gamma_ = 0.99
lr_actor = 1e-5
lr_critic = 1e-3
eps = 0.1
num_workers=1
num_episodes = 10000
total_steps_budget = 500000
max_steps_per_episode = 1000

# neural network structure
input_size = env.observation_space.shape[0] # 4
hidden_size = 64
output_size_actor = env.action_space.n # 2
output_size_critic = 1


# Initialize agent 
agent = Agent(input_size, hidden_size, \
                output_size_actor, output_size_critic, \
                eps, gamma_, lr_actor, lr_critic, num_workers, \
                device=device)

# Initialize batch
batch = []

for episode in range(num_episodes):
    state, _ = env.reset()
    state = torch.from_numpy(state).float().to(device)  # Convert state to a tensor

    episode_reward = 0

    for t in range(max_steps_per_episode):
        action = agent.select_action(state, worker_id=0, policy="eps-greedy")
        next_state, reward, terminated, truncated, _ = env.step(action.item())
        agent.num_steps += 1
        
        next_state = torch.from_numpy(next_state).float().to(device)  # Convert next_state to a tensor
        done = terminated or truncated
        episode_reward += reward

        # Add the experience to the batch
        batch.append((state, action, reward, next_state, done))

        if len(batch) >= batch_size or done:
            # Train the agent
            worker_losses = agent.train(batch, agent.gamma, agent.lr_actor, agent.lr_critic, agent.device)

            # Clear the batch
            batch.clear()

        state = next_state
        if done: break

    if episode % 100 == 0:
        print(f"Episode {episode} finished after {t+1} steps with reward {episode_reward:.2f}")
    
    if (agent.num_steps >= total_steps_budget): 
        print(f"Reached total training budget of {total_steps_budget} steps ----> Stopping training at episode {episode}")
        break



Episode 0 finished after 12 steps with reward 12.00
Episode 100 finished after 12 steps with reward 12.00
Episode 200 finished after 12 steps with reward 12.00
Episode 300 finished after 10 steps with reward 10.00
Episode 400 finished after 85 steps with reward 85.00
Episode 500 finished after 37 steps with reward 37.00
Episode 600 finished after 10 steps with reward 10.00
Episode 700 finished after 373 steps with reward 373.00
Episode 800 finished after 163 steps with reward 163.00
Episode 900 finished after 274 steps with reward 274.00
Episode 1000 finished after 137 steps with reward 137.00
Episode 1100 finished after 21 steps with reward 21.00
Episode 1200 finished after 73 steps with reward 73.00
Episode 1300 finished after 52 steps with reward 52.00
Episode 1400 finished after 77 steps with reward 77.00
Episode 1500 finished after 181 steps with reward 181.00
Episode 1600 finished after 139 steps with reward 139.00
Episode 1700 finished after 165 steps with reward 165.00
Episode 

KeyboardInterrupt: 

## TO IMPLEMENT:

- fix training for multiple workers 
    - wrong computations for target and advantage probably!
    - wrong way to accumulate batches for K>1 right now (right now it's a replay buffer for current agent, should be transitions from K-workers, each worker on different seed of environment in parallel, all have the same networks --> no need to initialize K actor-critic networks?)
    - mini-batch of 256 consequent transitions? wrong
- methods for plotting and training observation (cf 2.3 of project PDF)
- agent class needs method for saving trained models 
- add counting steps to agent class
- output size of policy make dependent on env (not =2 constant)

workers all use the same parameters and get updated with the same gradient at the same time (if not at the same time --> updates with different policies by definition, but A2C is on-policy). Doesn't matter if they (inevitably) end episodes at different time-steps. What matters is they have the same policy when accumulating a batch (same actor-critic params).

batches vanilla are (1x1), in general (num_time_steps x num_workers)