Tutorial: https://www.youtube.com/playlist?list=PL58zEckBH8fCMIVzQCRSZVPUp3ZAVagWi

In [3]:
# just running the game
import gymnasium as gym
import flappy_bird_gymnasium

env = gym.make('FlappyBird-v0', render_mode='human', use_lidar=False)

try:
    obs, _ = env.reset()

    # Play a single episode, choosing every action uniformly at random.
    while True:
        action = env.action_space.sample()
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        obs, reward, terminated, truncated, info = env.step(action)
        # The episode is over on either flag; checking only `terminated` would
        # keep stepping an environment that was cut off by a time limit.
        done = terminated or truncated
        if done:
            break
finally:
    # Always release the render window, even if the loop raises or is interrupted.
    env.close()

Video 2 notes
A Deep Q-Network (DQN) is a regular deep neural network - it sounds like the experience-replay logic lives outside the network itself.

For Flappy Bird, the inputs are the position information for the pipes and the bird; the outputs are the Q-values for the two actions (flap or don't flap), i.e. the expected reward for each action.


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DQN(nn.Module):
    """Fully connected Q-network with a single hidden layer.

    Data flow: ``state -> Linear -> ReLU -> Linear -> one value per action``.

    Args:
        state_dim: Number of input features per state.
        action_dim: Number of outputs (one per action).
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        # Input layer: state features -> hidden units.
        self.fc1 = nn.Linear(in_features=state_dim, out_features=hidden_dim)
        # Output layer: hidden units -> one value per action.
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=action_dim)

    def forward(self, x):
        """Return the network output for ``x`` (last dimension must be ``state_dim``)."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

In [8]:
# Smoke test: push one batch of random states through a fresh network.
state_dim = 12   # number of inputs
action_dim = 2   # number of outputs
net = DQN(state_dim=state_dim, action_dim=action_dim)

# Random states; the leading dimension (10) is the batch size.
state = torch.randn((10, state_dim))

# Forward pass through the network.
output = net(state)
print(state)

tensor([[-0.7432,  1.1921, -0.3785,  0.9334,  1.3129, -1.2886, -0.4679,  0.3091,
         -0.1712,  0.1940,  0.2843, -0.9781],
        [-0.6122, -0.5828,  0.1675, -0.3430,  0.8718, -0.8395,  0.3988, -0.7742,
         -0.4155, -0.0101, -0.6008, -2.0459],
        [-0.6817, -0.8177,  1.0145,  1.8057, -1.3016, -0.1157, -1.5302,  1.6468,
          1.5894, -1.5910, -1.2085,  0.1495],
        [ 0.1929, -0.0856,  2.2308, -0.7415,  0.9316,  0.2523, -0.8879,  0.5740,
         -0.9224,  0.0234,  0.9386,  0.8440],
        [ 0.0718,  1.6999,  0.3295,  0.4061,  0.5953,  0.0145, -0.1113,  1.0412,
         -0.2331, -0.6426,  0.8588, -0.4182],
        [ 1.1015,  0.5450,  0.5400,  0.6448,  0.8865,  1.8591, -0.0749,  1.1033,
          0.9859, -0.3608,  1.4454,  1.0393],
        [ 0.0147, -0.8066, -0.6363, -1.4995,  0.7986, -0.3894, -0.4158, -1.2806,
         -0.2191, -0.2951, -0.0492, -0.6229],
        [ 0.2289, -1.1060, -0.3838, -1.2434, -1.7453, -1.1995, -0.0112,  0.0153,
          0.8604, -0.2299, -1.

In [None]:
# make a class for the agent

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Agent:
    """Minimal agent: builds the policy network and plays one random episode."""

    def run(self, is_train, render=False):
        """Play a single FlappyBird-v0 episode using random actions.

        Args:
            is_train: Not used by this implementation.
            render: When True the environment is created with
                ``render_mode='human'``; otherwise no rendering is requested.
        """
        env = gym.make('FlappyBird-v0', render_mode='human' if render else None, use_lidar=False)

        try:
            state_dim = env.observation_space.shape[0]
            action_dim = env.action_space.n

            # nn.Module has no `to_device` method; `.to(device)` is the call that
            # moves the parameters to the selected device.
            policy_net = DQN(state_dim, action_dim).to(device)

            obs, _ = env.reset()
            while True:
                action = env.action_space.sample()
                # step() returns (obs, reward, terminated, truncated, info);
                # the episode is over when either flag is set.
                obs, reward, terminated, truncated, info = env.step(action)
                if terminated or truncated:
                    break
        finally:
            # Close the environment even if the episode loop raises.
            env.close()

Video 3 notes

Experience replay:
  - an experience is defined as a tuple of (state, action, reward, next_state, terminated)
  - save these experiences in a replay buffer (first in first out)
  - the epsilon ( $\epsilon$ ) greedy policy is used to select the action
    - $\epsilon$ is the probability of selecting a random action, else the best action is selected
    - kind of a stochastic-annealing type of thing: start with mostly random actions, then reduce the randomness over time
  

In [10]:
# replay memory
from collections import deque
import random

class ReplayMemory:
    """Bounded first-in-first-out store of transitions.

    Args:
        capacity: Maximum number of transitions kept; once full, appending
            drops the oldest entry.
        seed: Optional seed. When given, the module-level ``random``
            generator is seeded with it.
    """

    def __init__(self, capacity, seed=None):
        self.memory = deque([], maxlen=capacity)
        if seed is None:
            return
        random.seed(seed)

    def append(self, transition):
        """Store one transition, e.g. ``(state, action, next_state, reward, done)``."""
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn without replacement."""
        stored = self.memory
        return random.sample(stored, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        stored = self.memory
        return len(stored)

In [11]:
import itertools
import yaml

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Agent:
    """Epsilon-greedy agent for FlappyBird-v0 that collects transitions in a replay memory.

    Hyperparameters are read from ``hyperparameters.yml`` under the key
    ``hyperparam_option``.
    """

    def __init__(self, hyperparam_option):
        """Load the hyperparameter set named ``hyperparam_option``.

        Raises:
            FileNotFoundError: if ``hyperparameters.yml`` does not exist.
            KeyError: if the option or one of the expected keys is missing.
        """
        with open('hyperparameters.yml', 'r') as f:
            all_hyperparams = yaml.safe_load(f)

        self.hyperparams = all_hyperparams[hyperparam_option]
        self.replay_memory_size = self.hyperparams['replay_memory_size']  # size of the replay memory
        self.mini_batch_size = self.hyperparams['mini_batch_size']  # size of the training data set sampled from the replay memory
        self.epsilon_init = self.hyperparams['epsilon_init']  # proportion of actions that are random
        self.epsilon_decay = self.hyperparams['epsilon_decay']  # decay rate of epsilon
        self.epsilon_min = self.hyperparams['epsilon_min']  # minimum value of epsilon

    def run(self, is_train, render=False):
        """Play episodes indefinitely, until an exception or interrupt stops the loop.

        Args:
            is_train: When True, actions are chosen epsilon-greedily, every
                transition is appended to a replay memory, and epsilon is
                decayed after each episode. When False, the action with the
                highest network output is always taken and nothing is stored.
            render: When True the environment is created with
                ``render_mode='human'``; otherwise no rendering is requested.

        Note:
            No optimisation step is performed here; transitions are only collected.
        """
        env = gym.make('FlappyBird-v0', render_mode='human' if render else None, use_lidar=False)

        try:
            state_dim = env.observation_space.shape[0]
            action_dim = env.action_space.n

            rewards_per_episode = []
            epsilon_history = []

            # nn.Module has no `to_device` method; `.to(device)` moves the parameters.
            policy_net = DQN(state_dim, action_dim).to(device)

            if is_train:
                # Use the configured size instead of a hard-coded capacity.
                memory = ReplayMemory(capacity=self.replay_memory_size)
                epsilon = self.epsilon_init

            for episode in itertools.count():
                # Every episode must start from a fresh reset; stepping an
                # environment that has already finished is invalid.
                state, _ = env.reset()
                episode_reward = 0.0
                done = False

                while not done:
                    if is_train and random.random() < epsilon:
                        # Explore: random action.
                        action = int(env.action_space.sample())
                    else:
                        # Exploit: the network needs a float tensor on its own
                        # device, and env.step() needs a plain integer action.
                        with torch.no_grad():
                            state_tensor = torch.as_tensor(state, dtype=torch.float32, device=device)
                            action = policy_net(state_tensor).argmax().item()

                    # step() returns (obs, reward, terminated, truncated, info).
                    new_state, reward, terminated, truncated, info = env.step(action)
                    done = terminated or truncated
                    episode_reward += reward

                    if is_train:
                        # Store `terminated` rather than `done`: a truncated episode
                        # was cut off, it did not reach a terminal state.
                        memory.append((state, action, new_state, reward, terminated))

                    state = new_state

                rewards_per_episode.append(episode_reward)

                if is_train:
                    # epsilon only exists in training mode.
                    # in this implementation we're using a geometric decay for epsilon (taking the product of epsilon_decay and current epsilon)
                    # a linear decay is another option, decreasing epsilon by a fixed amount each episode (adjust epsilon_decay hyperparameter accordingly)
                    epsilon = max(epsilon * self.epsilon_decay, self.epsilon_min)
                    epsilon_history.append(epsilon)
        finally:
            # The episode loop never ends on its own, so make sure the
            # environment is released when it is interrupted or fails.
            env.close()
        