In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import random
import numpy as np

# Actor Network
class Actor(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.tanh(self.fc3(x))
        return x

# Initialize the environment
env = gym.make('Pendulum-v1')  # Pendulum environment for continuous action space

# Define dimensions
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# Initialize actor network
actor = Actor(state_dim, action_dim)

# Initialize optimizer
actor_optimizer = optim.Adam(actor.parameters(), lr=0.0001)

# Initialize replay buffer
replay_buffer = []
replay_buffer_size = 100000
batch_size = 64

# Discount factor
gamma = 0.99

# Training loop
for episode in range(500):
    state = env.reset()
    episode_reward = 0

    while True:
        # Render the environment
        env.render()

        # Select action
        with torch.no_grad():
            action = actor(torch.tensor(state, dtype=torch.float32)).numpy()

        # Step through environment
        next_state, reward, done, _ = env.step(action)

        # Store transition in replay buffer
        replay_buffer.append((state, action, reward, next_state, done))
        if len(replay_buffer) > replay_buffer_size:
            replay_buffer.pop(0)

        # Sample random minibatch from replay buffer
        if len(replay_buffer) >= batch_size:
            minibatch = random.sample(replay_buffer, batch_size)
            states, actions, rewards, next_states, dones = zip(*minibatch)
            states = torch.tensor(states, dtype=torch.float32)
            actions = torch.tensor(actions, dtype=torch.float32)
            rewards = torch.tensor(rewards, dtype=torch.float32)
            next_states = torch.tensor(next_states, dtype=torch.float32)
            dones = torch.tensor(dones, dtype=torch.float32)

            # Update actor
            predicted_actions = actor(states)
            actor_loss = -torch.mean(predicted_actions)
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

        episode_reward += reward
        state = next_state

        if done:
            print("Episode: {}, Reward: {:.2f}".format(episode, episode_reward))
            break

env.close()


  import distutils.spawn
  states = torch.tensor(states, dtype=torch.float32)


Episode: 0, Reward: -1195.57


KeyboardInterrupt: 