In [101]:
# from https://gymnasium.farama.org/tutorials/training_agents/reinforce_invpend_gym_v26/
import gymnasium as gym
import torch 
import torch.nn as nn
import numpy as np

from torch.utils.tensorboard import SummaryWriter

In [102]:
# Module-level CartPole environment; evaluate_agent and train_reinforce both
# read this global. The commented-out variant opens a render window instead.
# env = gym.make("CartPole-v1", render_mode='human')
env = gym.make("CartPole-v1")
#%load_ext tensorboard
#wandb.init(project="cartpole-v1", entity="bpanthi977")

In [103]:
def evaluate_agent(agent, steps=100, environment=None):
    """Roll out ``agent`` for a fixed number of environment steps and score it.

    Args:
        agent: object exposing ``action(observation)`` whose return value is
            passed straight to the environment's ``step``.
        steps: total number of environment steps to run, across episodes.
        environment: environment to evaluate in. It must provide
            ``reset() -> (observation, info)`` and the 5-tuple ``step`` API
            used below. Defaults to the notebook-level ``env``.

    Returns:
        Mean reward per *completed* episode. If no episode finished within
        ``steps``, the reward collected so far in the unfinished episode is
        returned instead (0 when ``steps`` is 0), rather than dividing by zero.
    """
    if environment is None:
        environment = env

    observation, info = environment.reset()
    completed_reward = 0
    completed_episodes = 0
    episode_reward = 0

    # Evaluation never back-propagates, so skip building autograd graphs.
    with torch.no_grad():
        for _ in range(steps):
            action = agent.action(observation)
            observation, reward, terminated, truncated, info = environment.step(action)
            episode_reward += reward

            if terminated or truncated:
                # Only finished episodes enter the average; a trailing partial
                # episode would otherwise inflate the numerator.
                completed_reward += episode_reward
                completed_episodes += 1
                episode_reward = 0
                observation, info = environment.reset()

    if completed_episodes == 0:
        return episode_reward
    return completed_reward / completed_episodes

In [178]:
class CartPoleAgent(nn.Module):
    """Softmax policy network: 4 observation features -> 2 action probabilities.

    After each call to ``action`` the probabilities that produced the sampled
    action are kept (still attached to the autograd graph) in
    ``self.action_probs`` so a training loop can build a loss from them.
    """

    def __init__(self):
        super().__init__()
        input_dim = 4
        out_dim = 2

        self.net = nn.Sequential(
            nn.Linear(in_features=input_dim, out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=out_dim),
            # Normalise over the action axis (the last one). dim=0 is only
            # correct for an unbatched input; for a batch of shape (N, 4) it
            # would normalise across samples instead of across actions.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return action probabilities for ``x`` (last axis = observation features)."""
        return self.net(x)

    def action(self, state):
        """Sample an action (0 or 1) for a single, unbatched observation.

        Args:
            state: array-like of 4 observation features.

        Returns:
            The sampled action, drawn with ``np.random.choice`` from the
            network's output distribution.
        """
        # Follow the dtype/device of the weights instead of a notebook global,
        # so the agent works wherever it has been moved with ``.to(...)``.
        # Cast on the CPU first: some backends cannot hold float64 tensors.
        reference = next(self.parameters())
        state_tensor = torch.as_tensor(state, dtype=reference.dtype).to(reference.device)

        probs = self.forward(state_tensor)
        action = np.random.choice([0, 1], p=probs.detach().cpu().numpy())
        self.action_probs = probs

        return action

# Prefer Apple's Metal backend, then CUDA, and fall back to the CPU so the
# notebook also runs on machines where 'mps' is not available.
if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
def reset_network():
    """Start a fresh experiment by rebinding the notebook-level training state.

    Rebinds four module globals:
        network: a newly initialised ``CartPoleAgent`` moved to ``device``.
        optim: an SGD optimizer (lr=0.0001) over the new network's parameters.
        total_episodes: reset to 0.
        writer: a new ``SummaryWriter``; the previous one, if any, is closed.
    """
    global network, optim, total_episodes, writer

    # Close the previous run's writer before replacing it; otherwise every
    # reset leaves an open SummaryWriter (and its event file) behind.
    previous_writer = globals().get("writer")
    if previous_writer is not None:
        previous_writer.close()

    network = CartPoleAgent().to(device)
    optim = torch.optim.SGD(network.parameters(), lr=0.0001)
    total_episodes = 0

    writer = SummaryWriter()

reset_network()

In [164]:
class EWMA():
    """Exponentially Weighted Moving Average.

    ``value`` holds the current average. It is 0.0 until the first sample is
    added; the first sample then seeds the average and every later sample is
    blended in with weight ``alpha``.
    """
    def __init__(self, alpha = 1/50):
        self.alpha = alpha
        self.value = 0.0
        # Track initialisation separately from ``value``: using the value's
        # truthiness would re-seed the average whenever it is exactly 0.
        self._seeded = False

    def add(self, value):
        """Fold ``value`` into the average and return ``self`` for chaining."""
        if not self._seeded:
            self.value = value
            self._seeded = True
        else:
            self.value = (1 - self.alpha) * self.value + self.alpha * value
        return self

In [179]:
# Discount factor used when accumulating returns backwards through an episode.
GAMMA = 0.99
# Weight of the entropy term in the training loss.
ENTROPY_BETA = 0.1

def train_reinforce(steps):
    """Train the global ``network`` with REINFORCE for ``steps`` environment steps.

    The policy is updated once per finished episode (and once more on whatever
    is left of the last, unfinished episode). The loss for a trajectory is

        sum_t [ -(G_t - baseline) * log pi(a_t) ]  -  ENTROPY_BETA * sum_t H(pi_t)

    where ``G_t`` is the return discounted by ``GAMMA`` and ``baseline`` is an
    exponentially weighted moving average of past episode lengths. ``GAMMA``
    and ``ENTROPY_BETA`` are read from the module globals at call time.

    Side effects: steps the global ``env``, updates ``network`` through
    ``optim``, increments the global ``total_episodes`` and logs to ``writer``.

    Args:
        steps: number of environment steps to run.

    Returns:
        Total reward collected in this call divided by the number of episodes
        completed in this call (the undivided total if none completed).
    """
    global total_episodes
    observation, info = env.reset(seed=42)

    total_reward = 0
    episode_steps = 0
    episodes_this_call = 0
    baseline = EWMA()
    # One (action probabilities, action, reward) entry per step of the
    # episode currently being collected.
    par = []

    def train(par):
        """Apply one REINFORCE update from the trajectory ``par``."""
        # Nothing to learn from an empty trajectory (e.g. the final call right
        # after an episode has just ended).
        if not par:
            return

        # Discounted returns, accumulated from the last step backwards.
        returns = []
        g_t = 0.0
        for _, _, reward in reversed(par):
            g_t = float(reward) + GAMMA * g_t
            returns.append(g_t)
        returns.reverse()

        probs = torch.stack([step_probs for step_probs, _, _ in par])
        actions = torch.tensor(
            [int(step_action) for _, step_action, _ in par], dtype=torch.long
        ).to(probs.device)
        # Build on the CPU in the network's dtype, then move: some backends
        # cannot hold float64 tensors.
        advantages = torch.tensor(returns, dtype=probs.dtype).to(probs.device)
        advantages = advantages - float(baseline.value)

        # Clamp before the log so a probability that underflowed to 0 cannot
        # produce -inf (and a NaN entropy via 0 * -inf).
        log_probs = torch.log(probs.clamp_min(1e-8))
        chosen_log_probs = log_probs.gather(1, actions.unsqueeze(1)).squeeze(1)

        # L = - (G - baseline) ln \pi(a)
        policy_loss = -(advantages * chosen_log_probs).sum()
        # H = - \sum p_i log p_i ; subtracting it from the loss rewards entropy
        entropy = -(probs * log_probs).sum()
        loss = policy_loss - ENTROPY_BETA * entropy

        optim.zero_grad()
        loss.backward()
        optim.step()

    for step in range(steps):
        action = network.action(observation)
        observation, reward, terminated, truncated, info = env.step(action)

        par.append([network.action_probs, action, reward])

        total_reward += reward
        episode_steps += 1
        if terminated or truncated:
            train(par)
            par = []
            observation, info = env.reset()
            total_episodes += 1
            episodes_this_call += 1
            baseline.add(episode_steps)
            writer.add_scalar("Episode Length / Steps ", episode_steps, step)
            writer.add_scalar("MA Reward/Episode", baseline.value, total_episodes)
            episode_steps = 0

            if (total_episodes % 100 == 0):
                print('Av. steps/ episode', baseline.value)

    # Update on the remainder of the unfinished final episode, if any.
    train(par)

    # Average over the episodes of *this* call: ``total_episodes`` is a global
    # that keeps growing across calls, and may be 0 if no episode finished.
    if episodes_this_call:
        avg_reward = total_reward / episodes_this_call
    else:
        avg_reward = total_reward
    writer.add_hparams({'entropy': ENTROPY_BETA, 'gamma': GAMMA, 'baseline': True, 'total_steps': steps}, {'avg_reward': avg_reward})
    return avg_reward

In [181]:
# Start from a freshly initialised network/optimizer/writer, raise the entropy
# weight to 0.2 (train_reinforce reads the global at call time) and train for
# 100k environment steps. Order matters: all three statements touch globals.
reset_network()
ENTROPY_BETA = 0.2
train_reinforce(100_000)

Av. steps/ episode 21.422211589952298


Av. steps/ episode 25.723585123065973


Av. steps/ episode 41.164534256292896


Av. steps/ episode 55.03712726831607


Av. steps/ episode 27.758539751422678


Av. steps/ episode 11.778087004417475


Av. steps/ episode 9.767475182027194


Av. steps/ episode 9.503231438290184


Av. steps/ episode 9.389021295950377


Av. steps/ episode 9.3269047301868


Av. steps/ episode 9.48262160673763


Av. steps/ episode 9.414248087114764


Av. steps/ episode 9.604437097812013


Av. steps/ episode 9.554861287193866


Av. steps/ episode 9.418739545345666


Av. steps/ episode 9.413979195297218


Av. steps/ episode 9.658716451837256


Av. steps/ episode 9.413804322868359


Av. steps/ episode 9.597936635174708


Av. steps/ episode 9.429895992016784


Av. steps/ episode 9.737229458112305


Av. steps/ episode 10.50744486769525


Av. steps/ episode 44.417686509719694


Av. steps/ episode 38.68396447797985


Av. steps/ episode 74.3596889048958


Av. steps/ episode 55.43116737178317


Av. steps/ episode 57.73126043822206


Av. steps/ episode 31.143399952732878


Av. steps/ episode 62.84849757066339


Av. steps/ episode 39.018055940284306


Av. steps/ episode 61.60243274504789


Av. steps/ episode 29.209824588508692


Av. steps/ episode 12.532981222418352


Av. steps/ episode 10.009893580818218


Av. steps/ episode 9.584829187254952


Av. steps/ episode 9.61403473068894


Av. steps/ episode 10.564195013611466


Av. steps/ episode 12.815117295262002


Av. steps/ episode 15.077055259568121


Av. steps/ episode 18.716096489007597


Av. steps/ episode 24.445744663772615


Av. steps/ episode 41.2170636003365


23.646252069047055

In [183]:
# Sweep over entropy weights (currently only 0, i.e. no entropy term):
# each value gets a freshly reset network and a 20k-step training run.
for beta in [0]:
    reset_network()
    ENTROPY_BETA = beta 
    train_reinforce(20_000)

Av. steps/ episode 20.39287883831354


Av. steps/ episode 25.849196216984986


Av. steps/ episode 36.109600074203435


Av. steps/ episode 52.44949145905201


Av. steps/ episode 34.69604690526616


In [170]:
# Score the current global `network` over 1,000 environment steps.
evaluate_agent(network, 1_000)

34.48275862068966