In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np


# Define the LSTM-based architecture for the Advantage Actor-Critic (A2C) agent
class A2CNetwork(nn.Module):
    """Recurrent Advantage Actor-Critic network with separate policy and value LSTMs.

    At every call the observation, the previous reward and the previous
    (one-hot) action are concatenated along the last dimension and fed to two
    independent LSTMs. A linear head on the last time step of each LSTM yields
    the action distribution (actor) and the state-value estimate (critic).

    Args:
        input_size: Feature width of the *concatenated* LSTM input, i.e.
            observation width + 1 (reward) + width of the action encoding.
        hidden_size_policy: Hidden size of the policy LSTM.
        hidden_size_value: Hidden size of the value LSTM.
        num_actions: Number of outputs of the policy head.
        entropy_weight: Coefficient applied to the entropy term in
            ``compute_loss``.
    """

    def __init__(
        self,
        input_size,
        hidden_size_policy=48,
        hidden_size_value=48,
        num_actions=4,
        entropy_weight=0.01,
    ):
        super().__init__()

        # One LSTM per head; both consume the same concatenated input.
        self.lstm_policy = nn.LSTM(input_size, hidden_size_policy, batch_first=True)
        self.lstm_value = nn.LSTM(input_size, hidden_size_value, batch_first=True)

        # Actor head: LSTM features -> one logit per action.
        self.fc_policy = nn.Linear(hidden_size_policy, num_actions)

        # Critic head: LSTM features -> scalar value.
        self.fc_value = nn.Linear(hidden_size_value, 1)

        # Weight of the entropy regularisation term.
        self.entropy_weight = entropy_weight

    def forward(
        self,
        observation,
        prev_reward,
        prev_action_onehot,
        hidden_state_policy,
        hidden_state_value,
    ):
        """Run one forward pass of actor and critic.

        Inputs may be batched sequences (``observation``: (batch, seq, obs),
        ``prev_reward``: (batch, seq), ``prev_action_onehot``:
        (batch, seq, actions)) or single steps without the sequence axis
        (``(batch, obs)``, ``(batch,)``, ``(batch, actions)``); in the latter
        case a length-1 sequence axis is inserted.

        Args:
            observation: Observation tensor.
            prev_reward: Previous reward; one dimension fewer than the others.
            prev_action_onehot: Encoding of the previous action.
            hidden_state_policy: ``(h, c)`` tuple for the policy LSTM, each of
                shape (1, batch, hidden_size_policy), or ``None`` for zeros.
            hidden_state_value: Same, for the value LSTM.

        Returns:
            Tuple ``(action_probs, state_value, hidden_state_policy,
            hidden_state_value)`` where ``action_probs`` is (batch, actions),
            ``state_value`` is (batch, 1) and the hidden states are the
            updated ``(h, c)`` tuples.

        Raises:
            ValueError: If the concatenated input is neither 2-D nor 3-D, or
                its feature width differs from the LSTM input size.
        """
        # Concatenate observation, reward and action along the feature axis.
        x = torch.cat(
            [observation, prev_reward.unsqueeze(-1), prev_action_onehot], dim=-1
        )

        # nn.LSTM would treat a 2-D tensor as an *unbatched* sequence, which
        # clashes with 3-D hidden states and with the [:, -1, :] indexing
        # below, so interpret 2-D input as (batch, features) with seq_len 1.
        if x.dim() == 2:
            x = x.unsqueeze(1)
        elif x.dim() != 3:
            raise ValueError(
                f"expected 2-D or 3-D concatenated input, got {x.dim()}-D"
            )

        expected_width = self.lstm_policy.input_size
        if x.size(-1) != expected_width:
            raise ValueError(
                f"concatenated input has {x.size(-1)} features "
                f"(observation + 1 reward + action encoding) but the network "
                f"was built with input_size={expected_width}"
            )

        # LSTM forward pass
        policy_out, hidden_state_policy = self.lstm_policy(x, hidden_state_policy)
        value_out, hidden_state_value = self.lstm_value(x, hidden_state_value)

        # Actor (Policy): action probabilities from the last time step
        action_probs = F.softmax(self.fc_policy(policy_out[:, -1, :]), dim=-1)

        # Critic (Value): state value estimate from the last time step
        state_value = self.fc_value(value_out[:, -1, :])

        return action_probs, state_value, hidden_state_policy, hidden_state_value

    def compute_loss(
        self,
        action_probs,
        state_value,
        reward,
        done,
        prev_state_value,
        entropy,
        action=None,
    ):
        """Combine actor, critic and entropy terms into one scalar loss.

        Args:
            action_probs: (batch, actions) probabilities from ``forward``.
            state_value: Critic output, (batch, 1) or (batch,).
            reward: Regression target for the critic, (batch,) or (batch, 1)
                (a single element is broadcast over the batch).
            done: Accepted for interface compatibility; not used.
            prev_state_value: Baseline subtracted from ``reward`` to form the
                advantage. It is detached, so the actor term never sends
                gradients into the critic.
            entropy: Per-sample policy entropy.
            action: Optional integer tensor with the index of the action
                taken per sample. When given, only that action's
                log-probability enters the actor term; when omitted the
                log-probabilities of all actions are averaged.

        Returns:
            Scalar tensor ``actor + critic + entropy_weight * (-mean entropy)``.
        """
        eps = 1e-8  # keeps log() finite when a probability underflows to 0

        # Flatten to (batch,) so a (batch,) reward and a (batch, 1) value are
        # compared element-wise instead of broadcasting to (batch, batch).
        value = state_value.reshape(-1)
        target = reward.reshape(-1).to(value.dtype).expand_as(value)
        baseline = prev_state_value.detach().reshape(-1)

        # Advantage, shaped (batch, 1) to broadcast over the action axis.
        advantage = (target - baseline).unsqueeze(-1)

        # Actor loss (policy gradient)
        log_probs = torch.log(action_probs.clamp_min(eps))
        if action is not None:
            log_probs = log_probs.gather(-1, action.reshape(-1, 1).long())
        actor_loss = -torch.mean(log_probs * advantage)

        # Critic loss (value function)
        critic_loss = F.mse_loss(value, target)

        # Entropy regularization: higher entropy lowers the loss
        entropy_loss = -torch.mean(entropy)

        # Total loss
        return actor_loss + critic_loss + self.entropy_weight * entropy_loss


# Hyperparameters
# input_size is handed to both LSTMs as their feature width, i.e. the width of
# the concatenated (observation, previous reward, previous action) vector.
input_size = 10
num_actions = 4  # size of the policy head output
hidden_size = 48  # shared by the policy LSTM and the value LSTM

# Build the A2C network; arguments follow the constructor order
# (input_size, hidden_size_policy, hidden_size_value, num_actions).
net = A2CNetwork(input_size, hidden_size, hidden_size, num_actions)

# Adam over every parameter of the network
optimizer = optim.Adam(params=net.parameters(), lr=1e-4)


# Function to train the agent
def train_agent(num_episodes=1000):
    """Train the module-level ``net`` on synthetic data and print the loss.

    Each iteration feeds one randomly generated step (observation, previous
    reward, previous action) through the network, computes the A2C loss and
    takes one optimizer step. The LSTM state is carried from one iteration
    to the next.

    Args:
        num_episodes: Number of training iterations to run.

    Raises:
        ValueError: If ``input_size`` leaves no room for an observation once
            the reward and the one-hot action are accounted for.
    """
    # The network concatenates observation + 1 reward + one-hot action, and
    # that concatenation must be exactly `input_size` wide.
    obs_size = input_size - num_actions - 1
    if obs_size <= 0:
        raise ValueError(
            f"input_size={input_size} is too small for a reward and "
            f"{num_actions} one-hot action features"
        )

    # (h, c) per LSTM, each shaped (num_layers=1, batch=1, hidden_size)
    hidden_state_policy = (
        torch.zeros(1, 1, hidden_size),
        torch.zeros(1, 1, hidden_size),
    )
    hidden_state_value = (
        torch.zeros(1, 1, hidden_size),
        torch.zeros(1, 1, hidden_size),
    )

    for episode in range(num_episodes):
        # All inputs are (batch=1, seq_len=1, features) to match the
        # batch_first LSTMs and the 3-D hidden states above.
        observation = torch.randn(1, 1, obs_size)  # Example observation
        prev_reward = torch.ones(1, 1)  # Example previous reward, (batch, seq)
        # Example previous action: a genuine one-hot vector of a random index
        prev_action = torch.randint(num_actions, (1, 1))
        prev_action_onehot = F.one_hot(prev_action, num_classes=num_actions).float()

        # Forward pass through the network
        action_probs, state_value, hidden_state_policy, hidden_state_value = net(
            observation,
            prev_reward,
            prev_action_onehot,
            hidden_state_policy,
            hidden_state_value,
        )

        # Policy entropy (for regularization); clamp keeps log() finite
        entropy = -torch.sum(
            action_probs * torch.log(action_probs.clamp_min(1e-8)), dim=-1
        )

        # Get the reward (just for illustration purposes)
        reward = torch.ones(1, 1)  # Example reward, same shape as state_value
        done = torch.tensor([False])  # Example terminal state (for simplicity)

        # The baseline is detached so the actor term does not train the critic
        total_loss = net.compute_loss(
            action_probs, state_value, reward, done, state_value.detach(), entropy
        )

        # Backpropagation
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Cut the autograd history of the recurrent state: without this the
        # next backward() would try to traverse this iteration's freed graph.
        hidden_state_policy = tuple(h.detach() for h in hidden_state_policy)
        hidden_state_value = tuple(h.detach() for h in hidden_state_value)

        print(f"Episode {episode+1}, Loss: {total_loss.item()}")


# Run training
# Executed when the cell runs; train_agent() prints one loss line per episode.
train_agent()

ImportError: DLL load failed while importing _C: The specified procedure could not be found.

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Read the CSV: the first eight columns are the inputs (X), the ninth is the target (y)
dataset = np.loadtxt("pima-indians-diabetes.csv", delimiter=",")
X = torch.tensor(dataset[:, :8], dtype=torch.float32)
y = torch.tensor(dataset[:, 8], dtype=torch.float32).reshape(-1, 1)  # column vector

# Feed-forward classifier 8 -> 12 -> 8 -> 1 with a sigmoid output
layers = [
    nn.Linear(8, 12),
    nn.ReLU(),
    nn.Linear(12, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
    nn.Sigmoid(),
]
model = nn.Sequential(*layers)
print(model)

# Training setup
n_epochs = 100
batch_size = 10
loss_fn = nn.BCELoss()  # binary cross entropy on the sigmoid output
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Mini-batch training over consecutive row slices, in file order
for epoch in range(n_epochs):
    for i in range(0, len(X), batch_size):
        rows = slice(i, i + batch_size)
        Xbatch, ybatch = X[rows], y[rows]
        y_pred = model(Xbatch)
        loss = loss_fn(y_pred, ybatch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Reports the loss of the last mini-batch of the epoch only
    print(f"Finished epoch {epoch}, latest loss {loss}")

# Accuracy on the same data the model was trained on (no_grad is optional)
with torch.no_grad():
    y_pred = model(X)
accuracy = y_pred.round().eq(y).float().mean()
print(f"Accuracy {accuracy}")

Sequential(
  (0): Linear(in_features=8, out_features=12, bias=True)
  (1): ReLU()
  (2): Linear(in_features=12, out_features=8, bias=True)
  (3): ReLU()
  (4): Linear(in_features=8, out_features=1, bias=True)
  (5): Sigmoid()
)
Finished epoch 0, latest loss 0.5336266756057739
Finished epoch 1, latest loss 0.5653856992721558
Finished epoch 2, latest loss 0.5651547908782959
Finished epoch 3, latest loss 0.5410540103912354
Finished epoch 4, latest loss 0.5350542068481445
Finished epoch 5, latest loss 0.5325108766555786
Finished epoch 6, latest loss 0.5291311144828796
Finished epoch 7, latest loss 0.5220627188682556
Finished epoch 8, latest loss 0.5184709429740906
Finished epoch 9, latest loss 0.5112599730491638
Finished epoch 10, latest loss 0.5053502321243286
Finished epoch 11, latest loss 0.5031177997589111
Finished epoch 12, latest loss 0.49540233612060547
Finished epoch 13, latest loss 0.4966287612915039
Finished epoch 14, latest loss 0.49399465322494507
Finished epoch 15, latest los