# Advantage Actor-Critic (A2C)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
import numpy as np
import pandas as pd
import flappy_bird_gymnasium
import matplotlib.pyplot as plt
import random
import gymnasium as gym

## Defining Policy Network

In [39]:
class ActorCritic(nn.Module):
    def __init__(self, input_dim=12, hidden_dim=128*2, action_dim=2):
        super(ActorCritic, self).__init__()
        # Shared layers
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, hidden_dim)
        
        # Policy head (actor)
        self.policy = nn.Linear(hidden_dim, action_dim)
        
        # Value head (critic)
        self.value = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        policy_logits = self.policy(x)
        value = self.value(x)
        return policy_logits, value


## Select Action & Save Log Prob

In [40]:
from torch.distributions import Categorical

def select_action(model, state):
    state = torch.FloatTensor(state).unsqueeze(0)  # shape (1, 12)
    logits, value = model(state)
    probs = F.softmax(logits, dim=-1)
    dist = Categorical(probs)
    action = dist.sample()
    return action.item(), dist.log_prob(action), value


## Training Loop

In [41]:
def train_a2c(env, model, optimizer, gamma=0.99, max_episodes=1000):
    total_reward = 0

    for episode in range(max_episodes):
        state = env.reset()[0]
        log_probs = []
        values = []
        rewards = []
        done = False

        while not done:
            action, log_prob, value = select_action(model, state)
            next_state, reward, done, _, info = env.step(action)
            
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(reward)
            
            state = next_state
            total_reward += reward

        # Compute returns and advantages
        returns = []
        R = 0 if done else model(torch.FloatTensor(state).unsqueeze(0))[1].item()
        for r in reversed(rewards):
            R = r + gamma * R
            returns.insert(0, R)

        returns = torch.tensor(returns)
        values = torch.cat(values)
        log_probs = torch.stack(log_probs)

        advantage = returns - values.squeeze()

        # Losses
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()
        loss = actor_loss + 0.5 * critic_loss

        # Optimization step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if episode % 100 == 0:
            print(f"Episode {episode} and past 100 episodes, Avg Reward: {total_reward/100:.2f}")
            total_reward = 0


## Training

In [42]:
env = gym.make("FlappyBird-v0", render_mode=None, use_lidar=False)

model = ActorCritic()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
train_a2c(env, model, optimizer, max_episodes=3000)

# Save model after training
torch.save(model.state_dict(), "saved_policies/AC2_model.pth")

Episode 0 and past 100 episodes, Avg Reward: -0.08
Episode 100 and past 100 episodes, Avg Reward: -0.52
Episode 200 and past 100 episodes, Avg Reward: 2.63
Episode 300 and past 100 episodes, Avg Reward: 4.43
Episode 400 and past 100 episodes, Avg Reward: 4.31
Episode 500 and past 100 episodes, Avg Reward: 4.07
Episode 600 and past 100 episodes, Avg Reward: 4.13
Episode 700 and past 100 episodes, Avg Reward: 4.67
Episode 800 and past 100 episodes, Avg Reward: 4.82
Episode 900 and past 100 episodes, Avg Reward: 4.35
Episode 1000 and past 100 episodes, Avg Reward: 4.43
Episode 1100 and past 100 episodes, Avg Reward: 4.47
Episode 1200 and past 100 episodes, Avg Reward: 4.99
Episode 1300 and past 100 episodes, Avg Reward: 5.09
Episode 1400 and past 100 episodes, Avg Reward: 4.97
Episode 1500 and past 100 episodes, Avg Reward: 5.06
Episode 1600 and past 100 episodes, Avg Reward: 5.23
Episode 1700 and past 100 episodes, Avg Reward: 5.03
Episode 1800 and past 100 episodes, Avg Reward: 4.93
Epi

## Let bot play the game

In [56]:
env = gym.make("FlappyBird-v0", render_mode="human", use_lidar=False)

# Load model
model = ActorCritic()
model.load_state_dict(torch.load("saved_policies/AC2_model.pth"))
model.eval()


# Use the trained model
obs, _ = env.reset()

done = False
while not done:

    action, log_prob, value = select_action(model, obs)
    print(action)

    obs, reward, done, truncated, info = env.step(action)
    # print(f"Action {action} gave {reward} reward...")
    # print(obs, reward, done, truncated, info)

env.close()


0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
1
1
0
0
0
0
0
0
1
0
0
0
1
1
1
1
1
1
1
