# Deep SARSA for different environments

## 1. Import the libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt #plotting library
from matplotlib import animation #animated visualizations library
from collections import namedtuple, deque
#nametuple creates tuple subclasses with name fields, access elements by names instead of index
#deque (double-ended queue) for adding and removing elements from both ends
from tqdm import tqdm
#add progress bars to Python code for easy monitoring progress of loops and tasks
# %matplotlib inline
import gym #environments for agents
from datetime import datetime #manipulating dates and times
import pandas as pd #work with structured data
import torch #Pytorch supports tensor computations and neural networks
import torch.nn as nn #Pytorch supports building neural networks
import torch.nn.functional as Function
#common functions in neural network operations
    # Activation functions (ReLU, sigmoid, tanh)
    # Loss functions (cross_entropy, mse_loss)
    #Utility functions for tensor manipulation (softmax, dropout, batch_norm, etc.)
import torch.optim as optim #optimization algorithms for training neural networks
import random #generate random numbers/selections
from collections import namedtuple, deque
import itertools
# provides various functions for creating iterators and combining them for complex interators
# includes cycle, chain, zip, etc.

## 2. Neural network model for approximating Q-values

In [8]:
class QNetwork(nn.Module):
    #Actor Model

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        #Parameters
#             state_size (int): Dimensionality of input state space
#             action_size (int): Dimensionality of output action space
#             seed (int): Random seed for reproducibility
#             fc1_units (int): Number of neurons (units) in first fully connected hidden layer
#             fc2_units (int): Number of neurons (unit) in second fully connected hidden layer

        #Initialization
        super(QNetwork, self).__init__()
        self.seed = torch.manual_seed(seed) # Set the random seed
        state_size = env.observation_space.shape[0]
        #Ensure random numbers generate are reproducible
        #Running same code with the same seed will produce the same sequence of random numbers

        # nn.Linear creates fully connected layers (input units, output units)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        # Network to map state with action values (Q-values)
        #inside method, input state pass through each layer
        #ReLU activation functions are applied to the outputs of hidden layers
#         print('state:', state.shape)
        #x output of 1 layer is input to the next
        x = Function.relu(self.fc1(state))
        x = Function.relu(self.fc2(x))
        return self.fc3(x)

## 3. Set the parameters

In [9]:
gamma = 1 #Discount factor
tau = 1 #Soft update target parameters, tau = 1 = copy completely
alpha = 0.005 #learning rate
update_every = 4 #How often to update the network
device = torch.device("cpu") #Device Initialization

## 4. Agent learning implementation

In [10]:
class Agent():
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        #self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=alpha)
        self.t_step = 0

    def step(self, state, action, reward, next_state, next_action, done, state_size):
        self.t_step += 1
        self.learn(state, action, reward, next_state, next_action, done, state_size)

    def act(self, state, epsilon):
        if isinstance(state, np.ndarray):
            if state.dtype == np.object_:
                # Convert elements of the NumPy array to compatible types
                state = np.array([self.convert_to_compatible_type(elem) for elem in state])
            # Convert the modified NumPy array to a PyTorch tensor for neural network input
            state = torch.as_tensor(state, dtype=torch.float32).to(device)
        elif isinstance(state, list) or isinstance(state, tuple):
            # Convert the state to a NumPy array and handle ragged nested sequences
            state = self.handle_ragged_nested_sequence(state)
            # Convert the modified NumPy array to a PyTorch tensor for neural network input
            state = torch.as_tensor(state, dtype=torch.float32).to(device)
        elif isinstance(state, torch.Tensor):
            # Ensure the tensor is in the correct device and data type
            state = state.to(device)
        else:
            raise TypeError("Input state should be a NumPy array, list, tuple, or torch.Tensor.")

        # Ensure the state has the correct shape for the neural network input
        state = state.unsqueeze(0)  # Add a batch dimension

        self.qnetwork_local.eval()  # Evaluation mode
        with torch.no_grad():  # Disable gradient calculation when choosing action
            action_values = self.qnetwork_local(state)  # Pass preprocessed states into local Q-network
        self.qnetwork_local.train()  # Set local Q network back to training mode after inference is complete

        # Epsilon-greedy action selection
        if random.random() > epsilon:
            # Return action with highest Q-value
            action = np.argmax(action_values.cpu().data.numpy())
        else:
            # Select a random action
            action = random.choice(range(self.action_size))
        action = min(max(action, 0), self.action_size - 1) # Ensure action is within valid range

        return action

    def convert_to_compatible_type(self, elem):
        # Convert the element to a compatible type if it is a numpy.object_
        if isinstance(elem, str):
            return float(elem)
        elif isinstance(elem, np.ndarray):
            return elem.astype(np.float32)
        else:
            return float(elem)

    def handle_ragged_nested_sequence(self, state):
        if not state:
            return np.array([])  # Return an empty NumPy array if the state is empty
        else:
            # Convert dictionaries to lists of values
            state = [list(d.values()) if isinstance(d, dict) else d for d in state]
            # Determine the maximum length among all nested sequences
            max_length = max(len(sublist) for sublist in state)
            # Pad shorter sequences with zeros to make them of equal length
            state = [sublist + [0] * (max_length - len(sublist)) if len(sublist) < max_length else sublist for sublist in state]
            # Convert the nested sequence to a NumPy array
            state = np.array(state)
            return state

    def learn(self, state, action, reward, next_state, next_action, done, state_size):

        state_size = env.observation_space.shape[0]

        # Function to flatten ragged nested sequences into a flat array of compatible type
        def flatten_nested_sequence(elem):
            flattened = []
            for e in elem:
                if isinstance(e, (list, tuple)):
                    flattened.extend(flatten_nested_sequence(e))
                elif isinstance(e, np.ndarray):
                    if e.dtype == np.object_:
                        raise ValueError("Cannot handle np.ndarray of type numpy.object_")
                    else:
                        flattened.append(e)
                else:
                    flattened.append(e)
            return flattened

        # Convert state and next_state to appropriate types
        if isinstance(state, np.ndarray):
            state = flatten_nested_sequence(state)
            state = torch.as_tensor(state, dtype=torch.float32).to(device)
        elif isinstance(state, torch.Tensor):
            state = state.to(device)
        else:
            state = torch.zeros((1, state_size), dtype=torch.float32).to(device)

        if isinstance(next_state, np.ndarray):
            next_state = flatten_nested_sequence(next_state)
            next_state = torch.as_tensor(next_state, dtype=torch.float32).to(device)
        elif isinstance(next_state, torch.Tensor):
            next_state = next_state.to(device)
        else:
            next_state = torch.zeros((1, state_size), dtype=torch.float32).to(device)
        # Convert action, reward, and done flag to tensors
        #action = torch.as_tensor(action, dtype=torch.float32).to(device)
        reward = torch.as_tensor(reward, dtype=torch.float32).to(device)
        #done = torch.as_tensor(done, dtype=torch.bool).to(device)

        if state.size() == torch.Size([1, 8]):
            # Convert the second type of state to the first type
            state = state.view(-1)

        self.optimizer.zero_grad()
        if i_episode < 3 or i_episode > 1998:
            print("State: ", state)
            print("State size: ", state.size())
            print("Action: ", action)
            print("All Q_current): ", self.qnetwork_local(state))
        Q_current = self.qnetwork_local(state)[action]
        
        if i_episode < 3 or i_episode > 1998:    
            print("Q(current_action): ", Q_current)

        #print(Q_current.shape)
        
        if i_episode < 3 or i_episode > 1998:
            print("Next state: ", next_state)
            print("Next action:  ", next_action)
            print("All Q_next: ",self.qnetwork_local(next_state))
        
        Q_next = self.qnetwork_local(next_state)[next_action]
        
        if i_episode < 3 or i_episode > 1998:
            print("Q(next_state): ", Q_next)
            print("Reward", reward)
        
        Q_target = reward + gamma * Q_next * (1 - done)
        
        if i_episode < 3 or i_episode > 1998:
            print("Q(target_action): ", Q_target)
            print("Error (Q-target_action - Q_current_action): ", Q_target - Q_current)
            print(" ")

        loss = Function.mse_loss(Q_current, Q_target)
        loss.backward()
        self.optimizer.step()

        # #Update the target network
        # self.soft_update(self.qnetwork_local, self.qnetwork_target, tau)


    def soft_update(self, local_model, target_model, tau):
        #local_model: online model, actively being trained. Weights will be copied from here
        #target_model: use to generate target Q-values during training. Weights will be copied to here
        #tau: interpolation parameters determine rate at which parameters of target models are updated
        #small tau slower update, big tau faster update, less stable

        #function iterates over parameters of both target model and local model using zip
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            #for each target param - local param pair, update target param by the formula
            # target_param = tau*local_param + (1-tau)*target_local
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)


    # Function to simulate a model in an environment
    def simulate_model(env_name, model_path):
        # Load the environment
        env = gym.make(env_name)

        # Get environment parameters
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        # Initialize the agent
        agent = Agent(state_size, action_size, seed)

        # Load the model weights
        agent.qnetwork_local.load_state_dict(torch.load(model_path))
        agent.qnetwork_local.eval()

        # Simulate the model in the environment
        scores = []
        n_episodes = 100  # Number of episodes for simulation
        max_t = 1000  # Maximum number of timesteps per episode

        for i_episode in tqdm(range(1, n_episodes+1)):
            state = env.reset()
            score = 0

            for t in range(max_t):
                action = agent.act(state, epsilon = 0)
                step_result = env.step(action)
                next_state, reward, done, _ = step_result[:4]
                next_action = agent.act(next_state, epsilon)
                agent.step(state,action,reward,next_state,next_action, done, state_size)
                state = next_state
                action = next_action
                score = score + reward
                if done:
                    break

            scores.append(score)

        # Close the environment
        env.close()

        # Print average score
        print("Average score:", np.mean(scores))

## 5. Training parameters and environments

In [11]:
envs = ['LunarLander-v2', 'MountainCar-v0'] #list of environments ,
seeds = [1,37,42] #list of seeds , 37, 42
n_episodes = 2000 #number of training episodes
max_t = 1000 #maximum number of timesteps
epsilon_start = 1 #starting value of epsilon greedy
epsilon_end = 0.01 #minimum value of epsilon
epsilon_decay = 0.995 #rate at which epsilon decays

## 6. Training implementation

In [12]:
print("Algorithm: Deep SARSA" )
for i in envs:
    print("ENVIRONMENT:-----------", i)
    env = gym.make(i)
    res=[]

    for j in seeds:
        print("Seed = ", j)
        rewards = []
        aver_reward = []
        aver = deque(maxlen = 100)
        state_size = env.observation_space.shape[0]
        action_size=env.action_space.n
        agent = Agent(state_size, action_size, j)
        epsilon = epsilon_start

        for i_episode in tqdm(range(1, n_episodes+1)):
            if i_episode < 3 or i_episode > 1998:
                print("Iteration number: ", i_episode-1)
            state = env.reset()
            score = 0
            action = agent.act(state, epsilon)
            for t in range(max_t):

                step_result = env.step(action)
                next_state, reward, done, _ = step_result[:4]
                next_action = agent.act(next_state, epsilon)
                agent.step(state,action,reward,next_state,next_action, done, state_size)
                state = next_state
                action = next_action
                score = score + reward
                if done:
                    break

            aver.append(score)
            aver_reward.append(np.mean(aver))
            rewards.append(score)
            epsilon = max(epsilon_end, epsilon_decay*epsilon) # decrease epsilon

        # Save the model
        reward = "model/"+ "Seed" + str(j)  + i + "_" + "_" + str(n_episodes) + "_" + str(datetime.now().strftime("%Y%m%d%H%M%S"))
        torch.save(agent.qnetwork_local.state_dict(), reward + '.pt')

        # Append the average reward to the results
        res.append(aver_reward)
        print("----------------End Algorithm--------------------")

    fig = plt.figure()

    reward = 'plots/' + i + '_result' + str(datetime.now().strftime("%Y%m%d%H%M%S"))
    df = pd.DataFrame({str(seeds[0]): res[0], str(seeds[1]): res[1], str(seeds[2]): res[2]})  # Use seed as column labels
    df.to_csv(reward + '.csv')
    print("------------------------End Environment-------------------")

    plt.xlabel("Episode")
    plt.ylabel("Reward")

    # Plot rewards for each seed
    for seed in seeds:
        plt.plot(df[str(seed)], label='Seed ' + str(seed))

    plt.title('Learning Curve ' + i)

    # Insert the legends in the plot
    fig.legend(loc='lower right')
    fig.savefig(reward + '.png', dpi=100)





Algorithm: Deep SARSA
ENVIRONMENT:----------- LunarLander-v2
Seed =  1


  0%|                                               | 0/2000 [00:00<?, ?it/s]

Iteration number:  0
State:  tensor([0., 0., 0., 0., 0., 0., 0., 0.])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([-0.0122,  0.0065, -0.0236, -0.0899], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-0.0122, grad_fn=<SelectBackward0>)
Next state:  tensor([-6.2571e-04,  1.4014e+00, -3.1653e-02, -2.2485e-01,  7.2394e-04,
         7.0954e-03,  0.0000e+00,  0.0000e+00])
Next action:   3
All Q_next:  tensor([-0.0058,  0.0767, -0.0193, -0.1091], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-0.1091, grad_fn=<SelectBackward0>)
Reward tensor(-2.0661)
Q(target_action):  tensor(-2.1753, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-2.1631, grad_fn=<SubBackward0>)
 
State:  tensor([-6.2571e-04,  1.4014e+00, -3.1653e-02, -2.2485e-01,  7.2394e-04,
         7.0954e-03,  0.0000e+00,  0.0000e+00])
State size:  torch.Size([8])
Action:  3
All Q_current):  tensor([-0.0163,  0.0577, -0.0056, -0.0178], grad_fn=<AddBackward0>)
Q(current_action):  tenso

State:  tensor([-0.0040,  1.2992, -0.0266, -0.5054, -0.0268, -0.0745,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  3
All Q_current):  tensor([-0.0207, -0.0385,  0.1218,  0.0333], grad_fn=<AddBackward0>)
Q(current_action):  tensor(0.0333, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.0042,  1.2872, -0.0145, -0.5328, -0.0329, -0.1231,  0.0000,  0.0000])
Next action:   1
All Q_next:  tensor([-0.0206, -0.0374,  0.1210,  0.0345], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-0.0374, grad_fn=<SelectBackward0>)
Reward tensor(-2.1364)
Q(target_action):  tensor(-2.1737, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-2.2071, grad_fn=<SubBackward0>)
 
State:  tensor([-0.0042,  1.2872, -0.0145, -0.5328, -0.0329, -0.1231,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  1
All Q_current):  tensor([-0.0264, -0.0385,  0.1312,  0.0257], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-0.0385, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.

All Q_next:  tensor([-0.0872, -0.2254,  0.7193, -0.2418], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-0.2254, grad_fn=<SelectBackward0>)
Reward tensor(-2.3013)
Q(target_action):  tensor(-2.5266, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-2.2864, grad_fn=<SubBackward0>)
 
State:  tensor([-0.0041,  1.0783,  0.0315, -0.7389, -0.1284, -0.2283,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  1
All Q_current):  tensor([-0.0939, -0.2173,  0.7605, -0.2703], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-0.2173, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.0039,  1.0611,  0.0214, -0.7651, -0.1378, -0.1880,  0.0000,  0.0000])
Next action:   2
All Q_next:  tensor([-0.0945, -0.2164,  0.7597, -0.2706], grad_fn=<AddBackward0>)
Q(next_state):  tensor(0.7597, grad_fn=<SelectBackward0>)
Reward tensor(-1.8356)
Q(target_action):  tensor(-1.0759, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-0.8586, grad_fn=<SubBackwa

All Q_next:  tensor([-0.3103, -0.0411,  1.5333, -0.7203], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-0.3103, grad_fn=<SelectBackward0>)
Reward tensor(-0.7243)
Q(target_action):  tensor(-1.0346, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-0.9928, grad_fn=<SubBackward0>)
 
State:  tensor([ 0.0033,  0.7664,  0.0611, -0.9853, -0.2745, -0.0956,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([-0.3099, -0.0278,  1.5282, -0.7322], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-0.3099, grad_fn=<SelectBackward0>)
Next state:  tensor([ 0.0038,  0.7436,  0.0611, -1.0120, -0.2793, -0.0956,  0.0000,  0.0000])
Next action:   2
All Q_next:  tensor([-0.3120, -0.0266,  1.5322, -0.7348], grad_fn=<AddBackward0>)
Q(next_state):  tensor(1.5322, grad_fn=<SelectBackward0>)
Reward tensor(-0.8653)
Q(target_action):  tensor(0.6669, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(0.9769, grad_fn=<SubBackward

  0%|                                       | 1/2000 [00:00<32:32,  1.02it/s]

tensor(0.4915)
Q(target_action):  tensor(0.1783, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(0.1846, grad_fn=<SubBackward0>)
 
State:  tensor([ 0.0209,  0.3697,  0.1217, -1.2209, -0.3234,  0.0091,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([-3.2258e-01,  2.1012e-04,  1.1780e+00, -5.3861e-01],
       grad_fn=<AddBackward0>)
Q(current_action):  tensor(-0.3226, grad_fn=<SelectBackward0>)
Next state:  tensor([ 0.0222,  0.3416,  0.1217, -1.2475, -0.3229,  0.0091,  0.0000,  0.0000])
Next action:   2
All Q_next:  tensor([-3.2445e-01,  1.1576e-03,  1.1769e+00, -5.3876e-01],
       grad_fn=<AddBackward0>)
Q(next_state):  tensor(1.1769, grad_fn=<SelectBackward0>)
Reward tensor(0.1861)
Q(target_action):  tensor(1.3631, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(1.6857, grad_fn=<SubBackward0>)
 
State:  tensor([ 0.0222,  0.3416,  0.1217, -1.2475, -0.3229,  0.0091,  0.0000,  0.0000])
State size:

State:  tensor([-0.0204,  1.3896, -0.5143, -0.2765,  0.0260,  0.1216,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  2
All Q_current):  tensor([-0.2222, -0.3837,  1.5964, -0.5599], grad_fn=<AddBackward0>)
Q(current_action):  tensor(1.5964, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.0256,  1.3838, -0.5243, -0.2596,  0.0317,  0.1132,  0.0000,  0.0000])
Next action:   3
All Q_next:  tensor([-0.2211, -0.3821,  1.5876, -0.5567], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-0.5567, grad_fn=<SelectBackward0>)
Reward tensor(-0.3997)
Q(target_action):  tensor(-0.9564, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-2.5529, grad_fn=<SubBackward0>)
 
State:  tensor([-0.0256,  1.3838, -0.5243, -0.2596,  0.0317,  0.1132,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  3
All Q_current):  tensor([-0.1918, -0.4190,  1.5082, -0.5083], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-0.5083, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.

Q(target_action):  tensor(0.0302, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(0.0695, grad_fn=<SubBackward0>)
 
State:  tensor([-0.1148,  1.2297, -0.5017, -0.5675,  0.1193, -0.0206,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  3
All Q_current):  tensor([ 0.0263, -0.6012,  0.5893, -0.0311], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-0.0311, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.1198,  1.2164, -0.4933, -0.5934,  0.1166, -0.0550,  0.0000,  0.0000])
Next action:   0
All Q_next:  tensor([ 0.0267, -0.6115,  0.6077, -0.0366], grad_fn=<AddBackward0>)
Q(next_state):  tensor(0.0267, grad_fn=<SelectBackward0>)
Reward tensor(0.1163)
Q(target_action):  tensor(0.1431, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(0.1741, grad_fn=<SubBackward0>)
 
State:  tensor([-0.1198,  1.2164, -0.4933, -0.5934,  0.1166, -0.0550,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([ 0.0283

All Q_current):  tensor([ 0.0527, -0.7019,  0.7414, -0.1248], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-0.1248, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.2057,  0.9591, -0.5790, -0.7856,  0.1522,  0.0932,  0.0000,  0.0000])
Next action:   0
All Q_next:  tensor([ 0.0539, -0.7135,  0.7617, -0.1314], grad_fn=<AddBackward0>)
Q(next_state):  tensor(0.0539, grad_fn=<SelectBackward0>)
Reward tensor(-0.1883)
Q(target_action):  tensor(-0.1344, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-0.0096, grad_fn=<SubBackward0>)
 
State:  tensor([-0.2057,  0.9591, -0.5790, -0.7856,  0.1522,  0.0932,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([ 0.0511, -0.7238,  0.7779, -0.1332], grad_fn=<AddBackward0>)
Q(current_action):  tensor(0.0511, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.2115,  0.9408, -0.5790, -0.8123,  0.1569,  0.0932,  0.0000,  0.0000])
Next action:   3
All Q_next:  tensor([ 0.0527, -0.7324,  0.7

Q(current_action):  tensor(-0.1403, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.3241,  0.5602, -0.6471, -1.1146,  0.2051,  0.0367,  0.0000,  0.0000])
Next action:   3
All Q_next:  tensor([-0.1417, -0.8201,  1.0168, -0.1557], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-0.1557, grad_fn=<SelectBackward0>)
Reward tensor(-0.6104)
Q(target_action):  tensor(-0.7662, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-0.6259, grad_fn=<SubBackward0>)
 
State:  tensor([-0.3241,  0.5602, -0.6471, -1.1146,  0.2051,  0.0367,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  3
All Q_current):  tensor([-0.1522, -0.8215,  1.0185, -0.1495], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-0.1495, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.3305,  0.5346, -0.6398, -1.1402,  0.2055,  0.0062,  0.0000,  0.0000])
Next action:   2
All Q_next:  tensor([-0.1544, -0.8319,  1.0363, -0.1536], grad_fn=<AddBackward0>)
Q(next_state):  tensor(1.0363, grad_fn=<Selec

  0%|                                       | 2/2000 [00:01<31:49,  1.05it/s]

All Q_current):  tensor([-0.2595, -0.8560,  0.9217,  0.0107], grad_fn=<AddBackward0>)
Q(current_action):  tensor(0.9217, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.4484,  0.0727, -0.7216, -1.2741,  0.2139,  0.0499,  0.0000,  0.0000])
Next action:   3
All Q_next:  tensor([-0.2591, -0.8508,  0.9136,  0.0105], grad_fn=<AddBackward0>)
Q(next_state):  tensor(0.0105, grad_fn=<SelectBackward0>)
Reward tensor(0.2904)
Q(target_action):  tensor(0.3009, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-0.6208, grad_fn=<SubBackward0>)
 
State:  tensor([-0.4484,  0.0727, -0.7216, -1.2741,  0.2139,  0.0499,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  3
All Q_current):  tensor([-0.2623, -0.8662,  0.9382,  0.0117], grad_fn=<AddBackward0>)
Q(current_action):  tensor(0.0117, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.4555,  0.0435, -0.7119, -1.2992,  0.2144,  0.0095,  0.0000,  0.0000])
Next action:   0
All Q_next:  tensor([-0.2652, -0.8789,  0.9531

100%|███████████████████████████████████▉| 1998/2000 [24:11<00:01,  1.35it/s]

Iteration number:  1998
State:  tensor([0., 0., 0., 0., 0., 0., 0., 0.])
State size:  torch.Size([8])
Action:  3
All Q_current):  tensor([ -96.5311, -106.7595, -127.2639, -122.9472], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-122.9472, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.0045,  1.4037, -0.2201, -0.1723,  0.0034,  0.0153,  0.0000,  0.0000])
Next action:   0
All Q_next:  tensor([-114.6745, -128.2934, -135.9356, -142.5062], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-114.6745, grad_fn=<SelectBackward0>)
Reward tensor(-0.4479)
Q(target_action):  tensor(-115.1224, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(7.8249, grad_fn=<SubBackward0>)
 
State:  tensor([-0.0045,  1.4037, -0.2201, -0.1723,  0.0034,  0.0153,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([-115.7107, -128.5629, -136.2163, -142.4072], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-115.7107, grad_fn=<SelectBackward0>)
Next sta

State:  tensor([-0.0286,  1.3215, -0.2201, -0.4657,  0.0118,  0.0153,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([-116.5879, -123.9833, -124.6530, -134.0302], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-116.5879, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.0308,  1.3104, -0.2201, -0.4923,  0.0126,  0.0153,  0.0000,  0.0000])
Next action:   0
All Q_next:  tensor([-115.9801, -123.3239, -123.4117, -133.2149], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-115.9801, grad_fn=<SelectBackward0>)
Reward tensor(-1.3971)
Q(target_action):  tensor(-117.3772, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-0.7893, grad_fn=<SubBackward0>)
 
State:  tensor([-0.0308,  1.3104, -0.2201, -0.4923,  0.0126,  0.0153,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([-116.2663, -123.3935, -123.4974, -133.1824], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-116.2663, grad_fn=<SelectBac

Q(next_state):  tensor(-111.8359, grad_fn=<SelectBackward0>)
Reward tensor(5.3481)
Q(target_action):  tensor(-106.4879, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(4.5764, grad_fn=<SubBackward0>)
 
State:  tensor([-0.0547,  1.1505, -0.2048, -0.7197,  0.0218,  0.0321,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([-112.1656, -117.2453, -112.3602, -125.0603], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-112.1656, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.0568,  1.1336, -0.2048, -0.7473,  0.0234,  0.0321,  0.0000,  0.0000])
Next action:   2
All Q_next:  tensor([-111.4606, -116.4872, -110.9940, -124.1465], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-110.9940, grad_fn=<SelectBackward0>)
Reward tensor(-1.1513)
Q(target_action):  tensor(-112.1453, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(0.0203, grad_fn=<SubBackward0>)
 
State:  tensor([-0.0568,  1.1336, -0.2048, -0.7473

All Q_current):  tensor([-116.2487, -118.6026, -115.3514, -126.9549], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-115.3514, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.0804,  0.9794, -0.2127, -0.5226,  0.0412,  0.0469,  0.0000,  0.0000])
Next action:   2
All Q_next:  tensor([-116.7946, -119.1621, -116.4837, -127.6929], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-116.4837, grad_fn=<SelectBackward0>)
Reward tensor(3.1418)
Q(target_action):  tensor(-113.3420, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(2.0094, grad_fn=<SubBackward0>)
 
State:  tensor([-0.0804,  0.9794, -0.2127, -0.5226,  0.0412,  0.0469,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  2
All Q_current):  tensor([-116.9745, -119.1585, -116.3488, -127.6793], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-116.3488, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.0824,  0.9678, -0.2039, -0.5131,  0.0441,  0.0576,  0.0000,  0.0000])
Next action:   2
All Q_next:

Q(current_action):  tensor(-119.9681, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.1200,  0.8423, -0.3607, -0.3277,  0.0404, -0.0573,  0.0000,  0.0000])
Next action:   0
All Q_next:  tensor([-120.7318, -121.8434, -121.7508, -131.1252], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-120.7318, grad_fn=<SelectBackward0>)
Reward tensor(1.8515)
Q(target_action):  tensor(-118.8804, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(1.0878, grad_fn=<SubBackward0>)
 
State:  tensor([-0.1200,  0.8423, -0.3607, -0.3277,  0.0404, -0.0573,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([-120.8430, -121.8465, -121.6653, -131.1285], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-120.8430, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.1236,  0.8343, -0.3607, -0.3544,  0.0376, -0.0573,  0.0000,  0.0000])
Next action:   3
All Q_next:  tensor([-120.5092, -121.3706, -120.2361, -130.6593], grad_fn=<AddBackward0>)
Q(next_state): 

Error (Q-target_action - Q_current_action):  tensor(-0.2947, grad_fn=<SubBackward0>)
 
State:  tensor([-0.1701,  0.7323, -0.4025, -0.4116, -0.0035, -0.0502,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  2
All Q_current):  tensor([-122.4382, -119.5718, -118.0641, -118.3630], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-118.0641, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.1743,  0.7236, -0.4108, -0.3880, -0.0065, -0.0585,  0.0000,  0.0000])
Next action:   3
All Q_next:  tensor([-122.5501, -119.8281, -119.2071, -118.5822], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-118.5822, grad_fn=<SelectBackward0>)
Reward tensor(1.2203)
Q(target_action):  tensor(-117.3618, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(0.7023, grad_fn=<SubBackward0>)
 
State:  tensor([-0.1743,  0.7236, -0.4108, -0.3880, -0.0065, -0.0585,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  3
All Q_current):  tensor([-122.7135, -119.9620, -119.0826, -118.36

All Q_current):  tensor([-126.0381, -122.1951, -120.7084, -120.7627], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-120.7627, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.2284,  0.6148, -0.3711, -0.3229, -0.1354, -0.2920,  0.0000,  0.0000])
Next action:   2
All Q_next:  tensor([-126.2337, -121.9853, -119.2609, -120.9511], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-119.2609, grad_fn=<SelectBackward0>)
Reward tensor(-1.9142)
Q(target_action):  tensor(-121.1752, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-0.4125, grad_fn=<SubBackward0>)
 
State:  tensor([-0.2284,  0.6148, -0.3711, -0.3229, -0.1354, -0.2920,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  2
All Q_current):  tensor([-126.2821, -122.0208, -119.1257, -121.0339], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-119.1257, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.2323,  0.6078, -0.3777, -0.3114, -0.1506, -0.3058,  0.0000,  0.0000])
Next action:   2
All Q_nex

Q(next_state):  tensor(-124.8769, grad_fn=<SelectBackward0>)
Reward tensor(3.7636)
Q(target_action):  tensor(-121.1133, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(2.7269, grad_fn=<SubBackward0>)
 
State:  tensor([-0.2713,  0.5566, -0.2289, -0.1165, -0.3232, -0.2320,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  1
All Q_current):  tensor([-128.7926, -125.1189, -125.2972, -125.0208], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-125.1189, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.2738,  0.5535, -0.2381, -0.1406, -0.3327, -0.1910,  0.0000,  0.0000])
Next action:   3
All Q_next:  tensor([-127.8309, -124.3091, -124.0021, -123.8456], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-123.8456, grad_fn=<SelectBackward0>)
Reward tensor(-2.7851)
Q(target_action):  tensor(-126.6306, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-1.5117, grad_fn=<SubBackward0>)
 
State:  tensor([-0.2738,  0.5535, -0.2381, -0.140

Error (Q-target_action - Q_current_action):  tensor(-1.0323, grad_fn=<SubBackward0>)
 
State:  tensor([-0.2879,  0.5207,  0.0229, -0.1141, -0.4569, -0.2409,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  2
All Q_current):  tensor([-128.2960, -126.0521, -123.8105, -125.5440], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-123.8105, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.2875,  0.5184,  0.0518, -0.1038, -0.4689, -0.2395,  0.0000,  0.0000])
Next action:   2
All Q_next:  tensor([-128.3804, -126.0026, -124.2215, -125.8461], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-124.2215, grad_fn=<SelectBackward0>)
Reward tensor(-1.2372)
Q(target_action):  tensor(-125.4588, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-1.6483, grad_fn=<SubBackward0>)
 
State:  tensor([-0.2875,  0.5184,  0.0518, -0.1038, -0.4689, -0.2395,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  2
All Q_current):  tensor([-128.3995, -126.0900, -124.0024, -126.

Q(current_action):  tensor(-117.5358, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.2739,  0.4747,  0.1466, -0.2253, -0.5104,  0.0941,  0.0000,  0.0000])
Next action:   1
All Q_next:  tensor([-120.0904, -117.1547, -119.0045, -119.4182], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-117.1547, grad_fn=<SelectBackward0>)
Reward tensor(0.4093)
Q(target_action):  tensor(-116.7454, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(0.7904, grad_fn=<SubBackward0>)
 
State:  tensor([-0.2739,  0.4747,  0.1466, -0.2253, -0.5104,  0.0941,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  1
All Q_current):  tensor([-120.0841, -117.0791, -119.0311, -119.4664], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-117.0791, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.2724,  0.4690,  0.1370, -0.2484, -0.5034,  0.1406,  0.0000,  0.0000])
Next action:   1
All Q_next:  tensor([-119.2845, -116.5061, -117.7495, -118.3004], grad_fn=<AddBackward0>)
Q(next_state): 

Q(next_state):  tensor(-116.5196, grad_fn=<SelectBackward0>)
Reward tensor(1.7606)
Q(target_action):  tensor(-114.7590, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(1.7315, grad_fn=<SubBackward0>)
 
State:  tensor([-0.2530,  0.3862,  0.1795, -0.3900, -0.3148,  0.5152,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  2
All Q_current):  tensor([-119.1653, -117.1411, -116.5292, -117.3466], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-116.5292, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.2505,  0.3779,  0.2216, -0.3613, -0.2883,  0.5291,  0.0000,  0.0000])
Next action:   1
All Q_next:  tensor([-119.5434, -117.4098, -118.9612, -118.4275], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-117.4098, grad_fn=<SelectBackward0>)
Reward tensor(3.7086)
Q(target_action):  tensor(-113.7012, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(2.8280, grad_fn=<SubBackward0>)
 
State:  tensor([-0.2505,  0.3779,  0.2216, -0.3613,

State:  tensor([-0.2230,  0.2859,  0.2152, -0.2702,  0.0763,  0.6845,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  1
All Q_current):  tensor([-127.1641, -125.2075, -126.6616, -125.8162], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-125.2075, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.2206,  0.2792,  0.2040, -0.2974,  0.1128,  0.7292,  0.0000,  0.0000])
Next action:   1
All Q_next:  tensor([-127.7621, -125.8607, -126.0683, -125.9913], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-125.8607, grad_fn=<SelectBackward0>)
Reward tensor(-4.5269)
Q(target_action):  tensor(-130.3877, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-5.1802, grad_fn=<SubBackward0>)
 
State:  tensor([-0.2206,  0.2792,  0.2040, -0.2974,  0.1128,  0.7292,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  1
All Q_current):  tensor([-127.7441, -125.7334, -126.1738, -125.9871], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-125.7334, grad_fn=<SelectBac

Q(target_action):  tensor(-126.3229, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-0.8612, grad_fn=<SubBackward0>)
 
State:  tensor([-0.1988,  0.2055,  0.0818, -0.2696,  0.5426,  0.6031,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  2
All Q_current):  tensor([-127.5942, -127.5853, -126.1921, -126.8944], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-126.1921, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.1982,  0.1997,  0.0373, -0.2671,  0.5720,  0.5880,  0.0000,  0.0000])
Next action:   3
All Q_next:  tensor([-126.8866, -127.2611, -126.6319, -126.0740], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-126.0740, grad_fn=<SelectBackward0>)
Reward tensor(-1.5852)
Q(target_action):  tensor(-127.6591, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-1.4670, grad_fn=<SubBackward0>)
 
State:  tensor([-0.1982,  0.1997,  0.0373, -0.2671,  0.5720,  0.5880,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  3
Al

State:  tensor([-0.2063,  0.1162, -0.1962, -0.3651,  0.8372,  0.3758,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  3
All Q_current):  tensor([-119.1147, -121.5219, -119.4711, -118.4893], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-118.4893, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.2081,  0.1076, -0.1905, -0.3875,  0.8540,  0.3366,  0.0000,  0.0000])
Next action:   3
All Q_next:  tensor([-118.1654, -120.4722, -117.5182, -117.3653], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-117.3653, grad_fn=<SelectBackward0>)
Reward tensor(-3.1978)
Q(target_action):  tensor(-120.5630, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-2.0737, grad_fn=<SubBackward0>)
 
State:  tensor([-0.2081,  0.1076, -0.1905, -0.3875,  0.8540,  0.3366,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  3
All Q_current):  tensor([-118.1395, -120.4672, -117.6186, -117.2163], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-117.2163, grad_fn=<SelectBac

All Q_current):  tensor([-129.2393, -128.9631, -130.3362, -130.2746], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-128.9631, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.2413,  0.0064, -0.2714, -0.3268,  0.5454, -1.2253,  0.0000,  0.0000])
Next action:   2
All Q_next:  tensor([-129.1005, -128.3957, -127.4591, -129.9191], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-127.4591, grad_fn=<SelectBackward0>)
Reward tensor(3.1528)
Q(target_action):  tensor(-124.3064, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(4.6567, grad_fn=<SubBackward0>)
 
State:  tensor([-0.2413,  0.0064, -0.2714, -0.3268,  0.5454, -1.2253,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  2
All Q_current):  tensor([-128.8032, -128.1743, -127.7854, -129.6561], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-127.7854, grad_fn=<SelectBackward0>)
Next state:  tensor([-2.4495e-01, -1.1802e-03, -3.0862e-01, -3.1604e-01,  4.8381e-01,
        -1.2310e+00,  0.0000e+00,  1

100%|███████████████████████████████████▉| 1999/2000 [24:14<00:01,  1.48s/it]

Q(target_action):  tensor(-131.7698, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-0.8012, grad_fn=<SubBackward0>)
 
State:  tensor([-0.2902, -0.0662, -0.3782, -0.1839,  0.1665,  0.2497,  1.0000,  1.0000])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([-131.7228, -137.2750, -138.5221, -132.2123], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-131.7228, grad_fn=<SelectBackward0>)
Next state:  tensor([-0.2938, -0.0699, -0.3768, -0.1660,  0.1798,  0.2652,  1.0000,  1.0000])
Next action:   0
All Q_next:  tensor([-132.2386, -137.7673, -139.2965, -132.8726], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-132.2386, grad_fn=<SelectBackward0>)
Reward tensor(-0.8846)
Q(target_action):  tensor(-133.1232, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-1.4004, grad_fn=<SubBackward0>)
 
State:  tensor([-0.2938, -0.0699, -0.3768, -0.1660,  0.1798,  0.2652,  1.0000,  1.0000])
State size:  torch.Size([8])
Action:  0
Al

State:  tensor([ 0.1784,  1.3416,  0.7818, -0.4278, -0.1704, -0.1417,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([ -90.1432,  -95.0742, -109.0886, -114.6075], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-90.1432, grad_fn=<SelectBackward0>)
Next state:  tensor([ 0.1862,  1.3314,  0.7818, -0.4544, -0.1774, -0.1416,  0.0000,  0.0000])
Next action:   0
All Q_next:  tensor([ -89.5317,  -94.2182, -107.6338, -113.7822], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-89.5317, grad_fn=<SelectBackward0>)
Reward tensor(-1.1139)
Q(target_action):  tensor(-90.6456, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-0.5023, grad_fn=<SubBackward0>)
 
State:  tensor([ 0.1862,  1.3314,  0.7818, -0.4544, -0.1774, -0.1416,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([ -89.3976,  -94.1682, -107.5832, -113.7712], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-89.3976, grad_fn=<SelectBackwar

All Q_current):  tensor([ -81.5056,  -83.8755,  -90.0757, -104.0001], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-81.5056, grad_fn=<SelectBackward0>)
Next state:  tensor([ 0.2868,  1.1439,  0.7816, -0.8021, -0.2699, -0.1416,  0.0000,  0.0000])
Next action:   0
All Q_next:  tensor([ -81.0094,  -83.1617,  -88.8035, -103.3046], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-81.0094, grad_fn=<SelectBackward0>)
Reward tensor(-1.0370)
Q(target_action):  tensor(-82.0463, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-0.5408, grad_fn=<SubBackward0>)
 
State:  tensor([ 0.2868,  1.1439,  0.7816, -0.8021, -0.2699, -0.1416,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([ -80.9804,  -83.1513,  -88.7930, -103.3029], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-80.9804, grad_fn=<SelectBackward0>)
Next state:  tensor([ 0.2946,  1.1253,  0.7816, -0.8288, -0.2770, -0.1416,  0.0000,  0.0000])
Next action:   0
All Q_next:  

State:  tensor([ 0.3726,  0.9085,  0.8072, -1.0335, -0.3477, -0.1477,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  0
All Q_current):  tensor([-77.3721, -77.9881, -77.3688, -97.3837], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-77.3721, grad_fn=<SelectBackward0>)
Next state:  tensor([ 0.3806,  0.8847,  0.8072, -1.0602, -0.3551, -0.1477,  0.0000,  0.0000])
Next action:   2
All Q_next:  tensor([-77.2298, -77.7719, -76.1094, -96.8697], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-76.1094, grad_fn=<SelectBackward0>)
Reward tensor(-0.9648)
Q(target_action):  tensor(-77.0742, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(0.2979, grad_fn=<SubBackward0>)
 
State:  tensor([ 0.3806,  0.8847,  0.8072, -1.0602, -0.3551, -0.1477,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  2
All Q_current):  tensor([-77.2972, -77.8022, -76.0526, -96.8929], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-76.0526, grad_fn=<SelectBackward0>)
Next sta

State:  tensor([ 0.4836,  0.5906,  0.9051, -1.1146, -0.3913,  0.0426,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  1
All Q_current):  tensor([-72.8706, -72.3802, -73.2808, -92.5404], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-72.3802, grad_fn=<SelectBackward0>)
Next state:  tensor([ 0.4925,  0.5650,  0.8938, -1.1383, -0.3867,  0.0930,  0.0000,  0.0000])
Next action:   1
All Q_next:  tensor([-72.5681, -72.0677, -72.2626, -91.9565], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-72.0677, grad_fn=<SelectBackward0>)
Reward tensor(0.6657)
Q(target_action):  tensor(-71.4020, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(0.9782, grad_fn=<SubBackward0>)
 
State:  tensor([ 0.4925,  0.5650,  0.8938, -1.1383, -0.3867,  0.0930,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  1
All Q_current):  tensor([-72.5804, -72.0676, -72.2551, -91.9615], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-72.0676, grad_fn=<SelectBackward0>)
Next stat

All Q_current):  tensor([-74.9464, -74.0462, -74.6336, -94.6804], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-74.0462, grad_fn=<SelectBackward0>)
Next state:  tensor([ 0.6061,  0.2440,  0.9594, -1.2125, -0.2248,  0.3940,  0.0000,  0.0000])
Next action:   2
All Q_next:  tensor([-74.8569, -73.8788, -73.5812, -94.3476], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-73.5812, grad_fn=<SelectBackward0>)
Reward tensor(0.5732)
Q(target_action):  tensor(-73.0079, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(1.0382, grad_fn=<SubBackward0>)
 
State:  tensor([ 0.6061,  0.2440,  0.9594, -1.2125, -0.2248,  0.3940,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  2
All Q_current):  tensor([-74.8444, -73.8013, -73.6632, -94.3348], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-73.6632, grad_fn=<SelectBackward0>)
Next state:  tensor([ 0.6161,  0.2175,  0.9884, -1.1757, -0.2048,  0.4008,  0.0000,  0.0000])
Next action:   1
All Q_next:  tensor([-75.82

100%|████████████████████████████████████| 2000/2000 [24:15<00:00,  1.37it/s]

tensor(-83.5211, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-2.2712, grad_fn=<SubBackward0>)
 
State:  tensor([ 0.7375, -0.1023,  0.9855, -1.1106,  0.0956,  0.5502,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  2
All Q_current):  tensor([ -83.2054,  -82.9346,  -82.1914, -104.4304], grad_fn=<AddBackward0>)
Q(current_action):  tensor(-82.1914, grad_fn=<SelectBackward0>)
Next state:  tensor([ 0.7476, -0.1263,  0.9817, -1.0676,  0.1235,  0.5562,  0.0000,  0.0000])
Next action:   2
All Q_next:  tensor([ -84.6341,  -84.6352,  -84.4381, -106.3003], grad_fn=<AddBackward0>)
Q(next_state):  tensor(-84.4381, grad_fn=<SelectBackward0>)
Reward tensor(-1.0005)
Q(target_action):  tensor(-85.4387, grad_fn=<AddBackward0>)
Error (Q-target_action - Q_current_action):  tensor(-3.2473, grad_fn=<SubBackward0>)
 
State:  tensor([ 0.7476, -0.1263,  0.9817, -1.0676,  0.1235,  0.5562,  0.0000,  0.0000])
State size:  torch.Size([8])
Action:  2
All Q_current):  tensor([




IndexError: list index out of range

<Figure size 640x480 with 0 Axes>

## 7. Demonstration with learned policy

In [None]:
import gym
from tqdm import tqdm
import torch
import numpy as np

# Function to simulate a model in an environment
def simulate_model(env, agent, n_episodes=100, max_t=1000):
    scores = []

    for i_episode in tqdm(range(1, n_episodes+1)):
        state = env.reset()
        score = 0
        epsilon = 0

        for t in range(max_t):
            env.render()  # Move rendering here
            action = agent.act(state, epsilon = 0)
            step_result = env.step(action)
            next_state, reward, done, _ = step_result[:4]
            next_action = agent.act(next_state, epsilon = 0)
            agent.step(state,action,reward,next_state,next_action, done, state_size)
            state = next_state
            action = next_action
            score = score + reward
            if done:
                break
        print(score)
        scores.append(score)

    # Close the environment
    env.close()

    # Print average score
    print("Average score:", np.mean(scores))


env_name = 'LunarLander-v2'
model_path = 'model/Seed37_LunarLander-v2__DeepSARSA_4000_20240407085617.pt'
env = gym.make(env_name,  render_mode = 'human')

# Get environment parameters
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Initialize the agent
agent = Agent(state_size, action_size, seed=37)

# Load the model weights
agent.qnetwork_local.load_state_dict(torch.load(model_path))
agent.qnetwork_local.eval()

# Simulate the model in the environment
simulate_model(env, agent)


## 8. Plot the shaded graph

In [None]:
# Step 1: Read Data from CSV
df = pd.read_csv('plots/PriorityDDQN_MountainCar.csv')

# Step 2: Remove the first row (Episodes, Run 1, Run 2, Run 3) to keep only the rewards data
# df = df.drop(0)

# Step 3: Convert the remaining DataFrame to numeric values
df = df.apply(pd.to_numeric)

# Step 4: Select only the three late columns (Run 1, Run 2, Run 3)
later_columns = df.iloc[:, 1:4]

# Step 5: Calculate Mean and Standard Deviation for the three later columns
mean_values = later_columns.mean(axis=1)
std_values = later_columns.std(axis=1)
print("Standard deviation: ", std_values)
print("Mean: ", mean_values)

# Step 6: Plot the Data
plt.figure(figsize=(12, 6))
plt.plot(mean_values, label='Mean Sum of Rewards', color = "red")
plt.fill_between(mean_values.index, mean_values - std_values, mean_values + std_values, alpha=0.5, label='Standard Deviation', color = "orange")
plt.xlabel('Episode')
plt.ylabel('Sum of Rewards')
plt.title('Shaded Plot of Sum of Rewards with Mean and Standard Deviation for Priority Double Deep Q-Learning Algorithm, Mountain Car')
plt.savefig('plots/shaded_plot_PriorityDDQN_MountainCar.png')
plt.legend()
plt.show()

In [2]:
pip install ufal.pybox2d

