In [1]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

In [None]:
#IF ON COLAB
############àSEARCH HOW TO SET ON COLAB PYTHON 3.5, IF NOT IT DOES NOT WORK
!pip install gymnasium "gymnasium[classic-control,mujoco]==0.29.1"

In [2]:
#CPU set-up with GPU available: run this:
device=torch.device("cpu")
print(device)

cpu


In [None]:
#Else:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(torch.cuda.get_device_name(device))
print(device)

In [3]:
class Actor(nn.Module):
    def __init__(self, state_dim, n_actions):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64,64),
            nn.Tanh(),
            nn.Linear(64,64),
            nn.Tanh(),
            nn.Linear(64,n_actions)
        )

    def forward(self,x):
        x = torch.Tensor(x)
        return self.model(x)

In [4]:
class Critic(nn.Module):
    def __init__(self, state_dim):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64,64),
            nn.Tanh(),
            nn.Linear(64,64),
            nn.Tanh(),
            nn.Linear(64,1)
        )

    def forward(self,x):
        x = torch.Tensor(x)
        return self.model(x)

In [21]:
class A2C_Agent(): #this should work with more parallel environment already, not the evaluate functions

    def __init__(self, n_steps, actor, critic, lr_actor, lr_critic, device):
        self.n = n_steps
        self.device = device
        self.actor = actor.to(self.device) 
        self.critic = critic.to(self.device)
        self.actor_optimizer = torch.optim.Adam(actor.parameters(), lr=lr_actor)
        self.critic_optimizer = torch.optim.Adam(critic.parameters(), lr=lr_critic)
    

    def choose_action(self, states, action_type = "training"):        
        state_values = self.critic.forward(states)
        action_logits = self.actor.forward(states)
        

        if action_type == "training":
            actions_pd = torch.distributions.Categorical(logits=action_logits)
            actions = actions_pd.sample()
            actions_log_prog = actions_pd.log_prob(actions)
            return actions, actions_log_prog, state_values
        
        elif action_type == "greedy":
            actions = torch.argmax(action_logits, dim=1) #greedy policy
            return actions, state_values
    
    def get_losses(self, rewards, states, next_states, log_prob, gamma, terminated):
        
        delta = torch.Tensor(rewards).reshape_as(self.critic(next_states)) + (1-torch.Tensor(terminated).reshape_as(self.critic(next_states))) * gamma * self.critic(next_states)
        advantage = delta - self.critic(states)        

        critic_loss = advantage.pow(2).mean() 
        

        
        actor_loss = - (advantage.detach() * log_prob.reshape_as(advantage)).mean() #minus sign ? not in slides
        # actor_loss = -log_prob*advantage.detach()

        return actor_loss, critic_loss
    
    def update_params(self, critic_loss, actor_loss):
        
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def evaluate_performance(self, env, total_steps):
        
        episodic_rewards_10 = []
        v_values = []
        for j in range(10):
            state, _ = env.reset()
            total_reward = 0
            done = False
            while not done:
                action, state_values = self.choose_action(state, action_type= "greedy")
                #get the current value function fro one trajectory (j==0)
                if j ==0 : v_values.append( state_values.detach().data.numpy() ) #shitty with float
                
                #go to the next state
                next_state, reward, terminated, truncated, _  = env.step(action.detach().data.numpy())
                total_reward += reward
                state = next_state
                
                done = terminated or truncated
            episodic_rewards_10.append(total_reward)
        print("episodic return of the 20000:", np.mean(episodic_rewards_10))
        #comment this if you are trying with more envs
        #plt.plot(v_values)
        #plt.xlabel("trajectory")
        #plt.xlabel("V lalue")
        #plt.title("V values for the first evaluation trajectory")
        #plt.savefig(f'figures/v_values_{total_steps}.png')
        #plt.close()
        print("\n")
    
    def evaluate_training(self, total_reward, actor_loss, critic_loss):
        print("current episodic return:", np.log(total_reward))
        print("critic loss:", np.log(critic_loss.detach().data.numpy()))
        print("actor loss:", np.log(actor_loss.detach().data.numpy()))
        print("\n")
    


In [22]:
episode_rewards = []
actor_losses = []
critic_losses = []
total_steps = 0
max_steps = 500000
env = gym.vector.AsyncVectorEnv( [lambda:gym.make("CartPole-v1") for i in range(1)] )
state_dim  = env.single_observation_space.shape[0]
n_actions = env.single_action_space.n
actor = Actor(state_dim, n_actions)
critic = Critic(state_dim)
agent = A2C_Agent(n_steps=1, actor= actor, critic= critic, lr_actor=1e-5, lr_critic=1e-3, device= device)
state = env.reset()[0]

In [23]:
while total_steps < max_steps:  
    
    terminated = False
    total_reward = 0
    state, _ = env.reset() 
    while not terminated:
        actions, actions_log_prog, state_values = agent.choose_action(state)
        
        next_state, reward, terminated, truncated, _ = env.step(actions.numpy())
        actor_loss, critic_loss = agent.get_losses(reward, state, next_state, actions_log_prog, 0.99, terminated)
        total_reward += reward
        state = next_state

        agent.update_params(critic_loss, actor_loss)

        

        critic_losses.append(critic_loss)
        actor_losses.append(actor_loss)


        if (total_steps % 1000) == 0: agent.evaluate_training(total_reward, actor_loss, critic_loss)
        if (total_steps % 20000 == 0):agent.evaluate_performance(env, total_steps)


        terminated = terminated.all()
        if total_steps >= max_steps:
            break  
        else: total_steps += 1
        
            
    episode_rewards.append(total_reward)

current episodic return: [0.]
critic loss: 0.0154156815
actor loss: -0.41033


episodic return of the 20000: 9.2


current episodic return: [2.77258872]
critic loss: 0.23437867
actor loss: -0.229189


current episodic return: [3.40119738]
critic loss: -0.30066654
actor loss: -0.53412783


current episodic return: [0.]
critic loss: -0.30520827
actor loss: -0.5280647


current episodic return: [1.79175947]
critic loss: -0.28957137
actor loss: -0.314291


current episodic return: [2.48490665]
critic loss: -0.03052327
actor loss: -0.32767397


current episodic return: [3.29583687]
critic loss: -0.39003402
actor loss: -0.3645774


current episodic return: [2.19722458]
critic loss: -0.72488904
actor loss: nan




  print("actor loss:", np.log(actor_loss.detach().data.numpy()))


current episodic return: [2.19722458]
critic loss: -0.99219286
actor loss: -1.2008941


current episodic return: [3.17805383]
critic loss: -0.38500783
actor loss: -0.81013316




KeyboardInterrupt: 