In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super().__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x


class Agent:
    def __init__(
        self,
        state_size,
        action_size,
        lr=0.001,
        gamma=0.99,
        epsilon_start=1.0,
        epsilon_end=0.01,
        epsilon_decay=0.995,
    ):
        self.state_size = state_size
        self.action_size = action_size
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.gamma = gamma

        self.q_network = QNetwork(state_size, action_size)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)

    def select_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)
        else:
            with torch.no_grad():
                state = torch.tensor(state, dtype=torch.float32)
                q_values = self.q_network(state)
                return torch.argmax(q_values).item()

    def update_model(self, state, action, reward, next_state, done):
        state = torch.tensor(state, dtype=torch.float32)
        next_state = torch.tensor(next_state, dtype=torch.float32)
        action = torch.tensor(action).unsqueeze(0)
        reward = torch.tensor(reward, dtype=torch.float32).unsqueeze(0)
        done = torch.tensor(done).unsqueeze(0)

        q_value = self.q_network(state).gather(1, action)

        next_q_value = torch.max(self.q_network(next_state), dim=1)[0].unsqueeze(1)
        target = reward + self.gamma * next_q_value * (1 - done)

        loss = nn.MSELoss()(q_value, target.detach())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_end:
            self.epsilon *= self.epsilon_decay
        print(f"{self.epsilon}")


# Define environment and training loop
class Environment:
    def __init__(self):
        self.state_size = 2  # For simplicity, let's assume a 2D grid
        self.action_size = 4  # Up, Down, Left, Right
        self.reset()

    def reset(self):
        self.state = [0, 0]  # Starting position
        self.goal = [5, 5]  # Goal position
        self.done = False

    def step(self, action):
        if action == 0:  # Up
            self.state[1] += 1
        elif action == 1:  # Down
            self.state[1] -= 1
        elif action == 2:  # Left
            self.state[0] -= 1
        elif action == 3:  # Right
            self.state[0] += 1

        # Check if the agent reached the goal
        if self.state == self.goal:
            self.done = True
            reward = 1.0
        else:
            reward = 0.0

        # Check if the agent hit a wall (boundary)
        if any(s < 0 or s > 5 for s in self.state):
            self.done = True
            reward = -1.0

        return self.state, reward, self.done

In [8]:
# Training loop
env = Environment()
agent = Agent(state_size=env.state_size, action_size=env.action_size)
for episode in range(1000):
    env.reset()
    state = env.state
    total_reward = 0
    done = False
    while not done:
        action = agent.select_action(state)
        next_state, reward, done = env.step(action)
        agent.update_model(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

    if episode % 100 == 0:
        print(f"Episode: {episode}, Total Reward: {total_reward}")

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [5]:
state