In [39]:
import os
import torch
import argparse

def loadNetwork(fileName, **kwargs):
    """
    Loads the previous training details from a file. The file should be 
    a dictionary, with all the details necessary to pickup where you left.
    The necessary data are explained below.
    
    Args:
        fileName (str): The name of the file
        kwargs (dict): A dictionary with the following keys:
            qNetwork_model (torch.nn): The qNetwork_model
            optimizer_main (torch.optim): The qNetwork_model's optimizer 
                object
            targetQNetwork_model (torch.nn): The targetQNetwork_model
            trainingParams (list): A list containing following parameters
                in order. We chose this approach to be able to change training 
                parameters in-place:
                startEpisode (int): The episode number to start from
                startEbsilon (int): The starting ebsilon number (The ebsilon 
                    prior to latest run's termination)
                lstHistory (list): The list holding the training history
                eDecay (float): The decay of ebsilon
                mem (ReplayMemory): An instance of replay memory object
    """
    # Check if all necessary data has been given so it can be overwritten 
    # when loaded and passed back to the user
    assert "qNetwork_model" in kwargs.keys(), "Please pass the qNetwork_model object"
    assert "optimizer_main" in kwargs.keys(), "Please pass the optimizer_main object"
    assert "targetQNetwork_model" in kwargs.keys(), "Please pass the targetQNetwork_model object"
    assert "trainingParams" in kwargs.keys(), "Please pass the trainingParams object"
    assert len(kwargs["trainingParams"]) == 5, "You should enter the following parameters in the order:\nstartEpisode, startEbsilon, lstHistory, eDecay, mem"
    
    if os.path.isfile(fileName):
        try:
            # Try to read the main file
            try:
                __data = torch.load(fileName, weights_only = False)
            except:
                print("Couldn't load the main file, trying to load the backup file")
                try:
                    # Try to read the backup file
                    __data = torch.load(os.path.join(os.path.dirname(fileName), "Backups", os.path.basename(fileName)), weights_only = False)
                except Exception as e:
                    raise Exception(f"Couldn't load the backup file")
            
            # Check if the file is a dictionary
            if not isinstance(__data, dict): raise Exception(f"Couldn't load the file. File {fileName} is not a dictionary")

            # Load Q-Network
            kwargs["qNetwork_model"].load_state_dict(__data["qNetwork_state_dict"]) # Model weights
            kwargs["optimizer_main"].load_state_dict(__data["qNetwork_optimizer_state_dict"]) # Optimizer

            # Load target Q-Network
            kwargs["targetQNetwork_model"].load_state_dict(__data["targetQNetwork_state_dict"]) # Model weights

            # Load process parameters
            kwargs["trainingParams"][0] = __data["episode"] # Starting episode number
            kwargs["trainingParams"][1] = __data["hyperparameters"]["ebsilon"] # Starting ebsilon
            kwargs["trainingParams"][2] = __data["train_history"]
            kwargs["trainingParams"][3] = __data["hyperparameters"]["eDecay"]

            kwargs["trainingParams"][4].loadExperiences(
                __data["experiences"]["state"],
                __data["experiences"]["action"],
                __data["experiences"]["reward"],
                __data["experiences"]["nextState"],
                __data["experiences"]["done"],
            )
            
            # All changes are in-place, however, we return the changed objects for convenience
            return (
                kwargs["qNetwork_model"],
                kwargs["optimizer_main"],
                kwargs["targetQNetwork_model"],
                kwargs["trainingParams"][0],  # startEpisode
                kwargs["trainingParams"][1],  # startEbsilon
                kwargs["trainingParams"][2],  # lstHistory
                kwargs["trainingParams"][3],  # eDecay
                kwargs["trainingParams"][4]   # mem
            )
            
        except Exception as e:
            print("ERROR: ", e)
            return None
    else:
        raise Exception(f"Couldn't load the file. File {fileName} does not exist")

def modelParamParser():
    """
    Gets the arguments from the command line for the model to run
    """
    parser = argparse.ArgumentParser(description = "The neural network with specified parameters")
    parser.add_argument("--name", "-n", type = str, default = "parallelDQN", help = "The name of the model")
    parser.add_argument("--continue_run", "-c", type = bool, default = False, help = "Continue the last run")
    parser.add_argument("--hidden_layers", "-hl", type = int, nargs = "+", default = [64, 64], help = "Hidden layer size")
    parser.add_argument("--learning_rate", "-lr", type = float, default = 0.0001, help = "Learning rate")
    parser.add_argument("--decay", "-d", type = float, default = 0.999, help = "Ebsilon decay rate")
    parser.add_argument("--batch", "-b", type = int, default = 1000, help = "The mini-batch size")
    parser.add_argument("--gamma", "-g", type = float, default = 0.995, help = "Discount factor")
    parser.add_argument("--extra_info", "-extra", type = str, default = "", help = "Extra information")
    
    return parser

In [40]:
from models import qNetwork_ANN
from collections import deque, namedtuple
import os
import argparse
from utils import *
from IPython.display import clear_output

import sys
from tqdm import tqdm
import pandas as pd
import random, imageio, time, copy
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt

import torch.nn as nn
import torch

# Define the super parameters
projectName = "parallelDQN"

# Save/Get weights from persistent storage. Pass empty string for not saving. 
# Pass drive for using google derive (If code is running in colab). If local, 
# pass the location of your desire
savePath = "./Data"
continueLastRun = False
backUpData = {}

# Make the save directory if it does not exist
os.makedirs(savePath, exist_ok = True)

# Making the environments
NUM_ENVS = 2
env = gym.make("LunarLander-v3") # Use render_mode = "human" to render each episode
envs = gym.make_vec(
    "LunarLander-v3", 
    num_envs = NUM_ENVS, # Number of environments to create
    vectorization_mode = "async",
    wrappers = (gym.wrappers.TimeAwareObservation,),
)
states, info = env.reset() # Get a sample state of the environment
stateSize = env.observation_space.shape # Number of variables to define current step
nActions = env.action_space.n # Number of actions
actionSpace = np.arange(nActions).tolist()

# Set pytorch parameters: The device (CPU or GPU) and data types
__device = torch.device("cuda") if torch.cuda.is_available() else torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
__dtype = torch.float

# Model parameters
parser = modelParamParser()
args, unknown = parser.parse_known_args()
hiddenNodes = args.hidden_layers
learningRate = args.learning_rate
eDecay = args.decay
miniBatchSize = args.batch # The length of mini-batch that is used for training
gamma = args.gamma # The discount factor
extraInfo = args.extra_info
continueLastRun = args.continue_run

# handle the save location
modelDetails = f"{'_'.join([str(l) for l in hiddenNodes])}_{learningRate}_{eDecay}_{miniBatchSize}_{gamma}_{extraInfo}"
savePath = os.path.join(savePath, f"{projectName}_{modelDetails}")
os.makedirs(savePath, exist_ok=True)

# Get how many times the model has been trained and add it to the file name
runNumber =  len([f for f in os.listdir(savePath) if f"{modelDetails}" in f and os.path.isfile(os.path.join(savePath, f))]) if savePath != None else ""
modelDetails += f"_{runNumber}" if not continueLastRun else f"_{runNumber -1 }"
saveFileName = f"{projectName}_{modelDetails}.pth"

# Make the model objects
qNetwork_model = qNetwork_ANN([stateSize[0], *hiddenNodes, nActions]).to(__device, dtype = __dtype)
targetQNetwork_model = qNetwork_ANN([stateSize[0], *hiddenNodes, nActions]).to(__device, dtype = __dtype)

# Two models should have identical weights initially
targetQNetwork_model.load_state_dict(qNetwork_model.state_dict())

# TODO: Add gradient clipping to the optimizer for avoiding exploding gradients
# Suitable optimizer for gradient descent
optimizer_main = torch.optim.Adam(qNetwork_model.parameters(), lr=learningRate)
optimizer_target = torch.optim.Adam(targetQNetwork_model.parameters(), lr=learningRate)

# Starting episode and ebsilon
startEpisode = 0
startEbsilon = None
lstHistory = None

# Making the memory buffer object
memorySize = 100_000 # The length of the entire memory
mem = ReplayMemory(memorySize, __dtype, __device)

if continueLastRun and os.path.isfile(os.path.join(savePath, saveFileName)):
    # Load necessary parameters to resume the training from most recent run 
    saveLen = 1
    load_params = {
        "qNetwork_model": qNetwork_model,
        "optimizer_main": optimizer_main,
        "targetQNetwork_model": targetQNetwork_model,
        "trainingParams": [startEpisode, startEbsilon, lstHistory, eDecay, mem]
    }
    qNetwork_model, optimizer_main, targetQNetwork_model, startEpisode, startEbsilon, lstHistory, eDecay, mem =loadNetwork(os.path.join(savePath, saveFileName), **load_params)
    print("Continuing from episode:", startEpisode)

In [41]:
print(f"Device is: {__device}")

# Start the timer
tstart = time.time()

# The experience of the agent is saved as a named tuple containing various variables
agentExp = namedtuple("exp", ["state", "action", "reward", "nextState", "done"])

# Parameters
nEpisodes = 6000 # Number of learning episodes
maxNumTimeSteps = 1000 # The number of time step in each episode
ebsilon = 1 if startEbsilon == None else startEbsilon # The starting  value of ebsilon
ebsilonEnd   = .1 # The finishing value of ebsilon
eDecay = eDecay # The rate at which ebsilon decays
numUpdateTS = 4 # Frequency of time steps to update the NNs
numP_Average = 100 # The number of previous episodes for calculating the average episode reward

# Variables for saving the required data for later analysis
episodePointHist = [] # For saving each episode's point for later demonstration
episodeHistDf = None
lstHistory = [] if lstHistory == None else lstHistory
initialCond = None # Initial condition (state) of the episode
epPointAvg = -999999 if len(lstHistory) == 0 else pd.DataFrame(lstHistory).iloc[-numP_Average:]["points"].mean()
latestCheckpoint = 0
_lastPrintTime = 0


initialSeed = random.randint(1,1_000_000_000) # The random seed that determines the episode's I.C.
states, info = envs.reset(seed = initialSeed)
points = np.zeros((envs.num_envs, 1))
initialCond = states
tempTime = time.time()
t = 0
episode = 0

while True:
    # The last element of each state is the time step, so we slice the tensor
    qValueForActions = qNetwork_model(torch.tensor(states[:,:-1], device = __device, dtype = __dtype))

    # use ebsilon-Greedy algorithm to take the new step
    action = getAction(qValueForActions, ebsilon, actionSpace, __device).cpu().numpy()

    # Take a step
    observation, reward, terminated, truncated, _ = envs.step(action)

    batchExperiences = [agentExp(s, a, r, o, d) for s, a, r, o, d in zip(states[:,:-1], action, reward, observation[:,:-1], terminated | truncated) ]

    # Store the experience of the current step in an experience deque.
    mem.addMultiple(batchExperiences)

    # Check to see if we have to update the networks in the current step
    update = updateNetworks(t, mem, miniBatchSize, numUpdateTS)

    if update:
        # Update the NNs
        experience = mem.sample(miniBatchSize)

        # Update the Q-Network and the target Q-Network
        # Bear in mind that we do not update the target Q-network with direct gradient descent.
        # so there is no optimizer needed for it
        fitQNetworks(experience, gamma, [qNetwork_model, optimizer_main], [targetQNetwork_model, None])

    # Save the necessary data
    points += reward.reshape(-1, 1)
    states = observation.copy()

    # Print the training status. Print only once each second to avoid jitters.
    if 1 < (time.time() - _lastPrintTime):
        os.system('clear')
        _lastPrintTime = time.time()
        print(f"ElapsedTime: {int(time.time() - tstart): <5}s | Episode: {episode: <5} | Time step: {t: <5} | The average of the {numP_Average: <5} episodes is: {int(epPointAvg): <5}")
        print(f"Latest checkpoint: {latestCheckpoint} | Speed {t/(time.time()-tempTime+1e-9):.1f} tps | ebsilon: {ebsilon:.3f}")

    # Handle episode ending
    if (terminated | truncated).any():
        mask = terminated | truncated
        finalPoint = points[mask]
        
        for k in range(finalPoint.shape[0]):
            # Decay ebsilon
            ebsilon = decayEbsilon(ebsilon, eDecay, ebsilonEnd)
            episode += 1
            
            # Save the episode history in dataframe
            if (episode+1) % 3 == 0:
                # only save every 10 episodes
                lstHistory.append({
                    "episode": episode,
                    "seed": initialSeed,
                    "points": finalPoint[k]
                })
            
            # Save model weights and parameters periodically (For later use)
            if (episode + 1) % (20) == 0:
                _exp = mem.exportExperience()
                backUpData = {
                    "episode": episode,
                    'qNetwork_state_dict': qNetwork_model.state_dict(),
                    'qNetwork_optimizer_state_dict': optimizer_main.state_dict(),
                    'targetQNetwork_state_dict': targetQNetwork_model.state_dict(),
                    'targetQNetwork_optimizer_state_dict': optimizer_target.state_dict(),
                    'hyperparameters': {"ebsilon": ebsilon, "eDecay":eDecay},
                    "train_history": lstHistory,
                    "experiences": {
                        "state": _exp["state"],
                        "action": _exp["action"],
                        "reward": _exp["reward"],
                        "nextState": _exp["nextState"],
                        "done": _exp["done"]
                    }
                }
                print("saving in ", os.path.join(savePath, saveFileName), episode)
                saveModel(backUpData, os.path.join(savePath, saveFileName))

                # Save the episode number
                latestCheckpoint = episode
        
        # Add the points to the history
        episodePointHist.extend(finalPoint.tolist())
        
        # Getting the average of {numP_Average} episodes
        epPointAvg = np.mean(episodePointHist[-numP_Average:])

        # Reset the points of terminated episodes
        points[mask] = 0
    
    
    
    if episode == 25: break


    # Stop the learning process if suitable average point is reached
    if 200 < epPointAvg:
        Tend = time.time()
        print(f"\nThe learning ended. Elapsed time for learning: {Tend-tstart:.2f}s. \nAVG of latest 100 episodes: {epPointAvg}")
        
        _exp = mem.exportExperience()
        backUpData = {
            "episode": episode,
            'qNetwork_state_dict': qNetwork_model.state_dict(),
            'qNetwork_optimizer_state_dict': optimizer_main.state_dict(),
            'targetQNetwork_state_dict': targetQNetwork_model.state_dict(),
            'targetQNetwork_optimizer_state_dict': optimizer_target.state_dict(),
            'hyperparameters': {"ebsilon": ebsilon, "eDecay":eDecay},
            "train_history": lstHistory,
            "experiences": {
                "state": _exp["state"],
                "action": _exp["action"],
                "reward": _exp["reward"],
                "nextState": _exp["nextState"],
                "done": _exp["done"]
            }
        }
        
        saveModel(backUpData, os.path.join(savePath, saveFileName))

        # Save the episode number
        latestCheckpoint = episode
        
        break
    t += 1

# Reset the index
episodeHistDf = pd.DataFrame(lstHistory)
episodeHistDf.reset_index(drop=True, inplace=True)

Device is: cuda
ElapsedTime: 0    s | Episode: 0     | Time step: 0     | The average of the 100   episodes is: -999999
Latest checkpoint: 0 | Speed 0.0 tps | ebsilon: 1.000
ElapsedTime: 1    s | Episode: 8     | Time step: 453   | The average of the 100   episodes is: -204 
Latest checkpoint: 0 | Speed 418.2 tps | ebsilon: 0.992
ElapsedTime: 2    s | Episode: 16    | Time step: 747   | The average of the 100   episodes is: -197 
Latest checkpoint: 0 | Speed 352.1 tps | ebsilon: 0.984
saving in  ./Data\parallelDQN_64_64_0.0001_0.999_1000_0.995_\parallelDQN_64_64_0.0001_0.999_1000_0.995__1.pth 19
ElapsedTime: 3    s | Episode: 21    | Time step: 979   | The average of the 100   episodes is: -194 
Latest checkpoint: 19 | Speed 307.9 tps | ebsilon: 0.979
