# ‘RE’ward ‘I’ncrement ‘N’on-negative ‘F’actor times ‘O’ffset ‘R’einforcement times ‘C’haracteristic ‘E’ligibility (REINFORCE)

## Defining Policy Network

In [42]:
from __future__ import annotations

import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from torch.distributions.normal import Normal
import gymnasium as gym
import flappy_bird_gymnasium
from tqdm import tqdm

plt.rcParams["figure.figsize"] = (10, 5)

In [24]:
class Policy_Network(nn.Module):
    """Parametrized Policy Network."""

    def __init__(self, obs_space_dims: int, action_space_dims: int):
        """Initializes a neural network that estimates the mean and standard deviation
         of a normal distribution from which an action is sampled from.

        Args:
            obs_space_dims: Dimension of the observation space
            action_space_dims: Dimension of the action space
        """
        super().__init__()

        hidden_space1 = 16  # Nothing special with 16, feel free to change
        hidden_space2 = 32  # Nothing special with 32, feel free to change

        # Shared Network
        self.shared_net = nn.Sequential(
            nn.Linear(obs_space_dims, hidden_space1),
            nn.Tanh(),
            nn.Linear(hidden_space1, hidden_space2),
            nn.Tanh(),
        )

        # Policy Mean specific Linear Layer
        self.policy_mean_net = nn.Sequential(
            nn.Linear(hidden_space2, action_space_dims)
        )

        # Policy Std Dev specific Linear Layer
        self.policy_stddev_net = nn.Sequential(
            nn.Linear(hidden_space2, action_space_dims)
        )

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Conditioned on the observation, returns the mean and standard deviation
         of a normal distribution from which an action is sampled from.

        Args:
            x: Observation from the environment

        Returns:
            action_means: predicted mean of the normal distribution
            action_stddevs: predicted standard deviation of the normal distribution
        """
        shared_features = self.shared_net(x.float())

        action_means = self.policy_mean_net(shared_features)
        action_stddevs = torch.log(
            1 + torch.exp(self.policy_stddev_net(shared_features))
        )

        return action_means, action_stddevs

## Defining Agent

In [38]:
class REINFORCE:
    """REINFORCE algorithm."""

    def __init__(self, obs_space_dims: int, action_space_dims: int):
        """Initializes an agent that learns a policy via REINFORCE algorithm [1]
        to solve the task at hand (Inverted Pendulum v4).

        Args:
            obs_space_dims: Dimension of the observation space
            action_space_dims: Dimension of the action space
        """

        # Hyperparameters
        self.learning_rate = 1e-4  # Learning rate for policy optimization
        self.gamma = 0.99  # Discount factor
        self.eps = 1e-6  # small number for mathematical stability

        self.probs = []  # Stores probability values of the sampled action
        self.rewards = []  # Stores the corresponding rewards

        self.net = Policy_Network(obs_space_dims, action_space_dims)
        self.optimizer = torch.optim.AdamW(self.net.parameters(), lr=self.learning_rate)

    def sample_action(self, state: np.ndarray) -> float:
        """Returns an action, conditioned on the policy and observation.

        Args:
            state: Observation from the environment

        Returns:
            action: Action to be performed
        """
        state = torch.tensor(np.array([state]))
        action_means, action_stddevs = self.net(state)

        # create a normal distribution from the predicted
        #   mean and standard deviation and sample an action
        distrib = Normal(action_means[0] + self.eps, action_stddevs[0] + self.eps)
        action = distrib.sample()
        prob = distrib.log_prob(action)

        action = action.numpy()

        self.probs.append(prob)

        return 1 if prob.item() > 0 else 0

    def update(self):
        """Updates the policy network's weights."""
        running_g = 0
        gs = []

        # Discounted return (backwards) - [::-1] will return an array in reverse
        for R in self.rewards[::-1]:
            running_g = R + self.gamma * running_g
            gs.insert(0, running_g)

        deltas = torch.tensor(gs)

        log_probs = torch.stack(self.probs)

        # Calculate the mean of log probabilities for all actions in the episode
        log_prob_mean = log_probs.mean()

        # Update the loss with the mean log probability and deltas
        # Now, we compute the correct total loss by taking the sum of the element-wise products.
        loss = -torch.sum(log_prob_mean * deltas)

        # Update the policy network
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Empty / zero out all episode-centric/related variables
        self.probs = []
        self.rewards = []

## Training Agent

### Prepare for training

In [45]:
# hyperparameters
learning_rate = 0.01
total_num_episodes = int(5e3)  # Total number of episodes
obs_space_dims = 12
action_space_dims = 1


env = gym.make("FlappyBird-v0", render_mode=None, use_lidar=False)
env = gym.wrappers.RecordEpisodeStatistics(env, buffer_length=total_num_episodes)


rewards_over_seeds = []

for seed in tqdm([1, 2, 3, 5, 8]):  # Fibonacci seeds
    # set seed
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    # Reinitialize agent every seed
    agent = REINFORCE(obs_space_dims, action_space_dims)
    reward_over_episodes = []

    for episode in range(total_num_episodes):
        # gymnasium v26 requires users to set seed while resetting the environment
        obs, info = env.reset(seed=seed)

        done = False
        while not done:
            action = agent.sample_action(obs)
            # print("SAMPLED ", action)

            obs, reward, terminated, truncated, info = env.step(action)
            agent.rewards.append(reward)

            done = terminated or truncated

        reward_over_episodes.append(env.return_queue[-1])
        agent.update()

        if episode % 1000 == 0:
            avg_reward = int(np.mean(env.return_queue))
            print("Episode:", episode, "Average Reward:", avg_reward)

    rewards_over_seeds.append(reward_over_episodes)

  0%|          | 0/5 [00:00<?, ?it/s]

Episode: 0 Average Reward: 2
Episode: 1000 Average Reward: 2
Episode: 2000 Average Reward: 2
Episode: 3000 Average Reward: 2
Episode: 4000 Average Reward: 2


 20%|██        | 1/5 [02:25<09:43, 145.82s/it]

Episode: 0 Average Reward: 2
Episode: 1000 Average Reward: 2
Episode: 2000 Average Reward: 2
Episode: 3000 Average Reward: 2
Episode: 4000 Average Reward: 2


 40%|████      | 2/5 [04:40<06:57, 139.11s/it]

Episode: 0 Average Reward: 2
Episode: 1000 Average Reward: 2
Episode: 2000 Average Reward: 2
Episode: 3000 Average Reward: 2
Episode: 4000 Average Reward: 2


 60%|██████    | 3/5 [07:05<04:43, 141.93s/it]

Episode: 0 Average Reward: 2
Episode: 1000 Average Reward: 2
Episode: 2000 Average Reward: 2
Episode: 3000 Average Reward: 2
Episode: 4000 Average Reward: 2


 80%|████████  | 4/5 [10:09<02:38, 158.63s/it]

Episode: 0 Average Reward: 2
Episode: 1000 Average Reward: 2
Episode: 2000 Average Reward: 2
Episode: 3000 Average Reward: 2
Episode: 4000 Average Reward: 2


100%|██████████| 5/5 [13:20<00:00, 160.09s/it]


### Training

### Visualize training