In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import random
import numpy as np

# Ornstein-Uhlenbeck Noise for exploration
class OUNoise:
    def __init__(self, action_dim, mu=0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = self.mu.copy()

    def sample(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state

# Actor Network
class Actor(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.tanh(self.fc3(x))
        return x

# Critic Network
class Critic(nn.Module):
    def __init__(self, input_dim, action_dim):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(input_dim + action_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, state, action):
        x = torch.cat([state, action], dim=1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize the environment
env = gym.make('Pendulum-v1')  # Pendulum environment for continuous action space

# Define dimensions
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# Initialize actor and critic networks
actor = Actor(state_dim, action_dim)
critic = Critic(state_dim, action_dim)

# Initialize target networks
target_actor = Actor(state_dim, action_dim)
target_actor.load_state_dict(actor.state_dict())
target_actor.eval()

target_critic = Critic(state_dim, action_dim)
target_critic.load_state_dict(critic.state_dict())
target_critic.eval()

# Initialize optimizers
actor_optimizer = optim.Adam(actor.parameters(), lr=0.0001)
critic_optimizer = optim.Adam(critic.parameters(), lr=0.001)

# Initialize replay buffer
replay_buffer = []
replay_buffer_size = 100000
batch_size = 64

# Initialize Ornstein-Uhlenbeck noise process
noise = OUNoise(action_dim)

# Discount factor
gamma = 0.99

# Training loop
for episode in range(500):
    state = env.reset()
    episode_reward = 0

    while True:
        # Render the environment
        env.render()

        # Select action with exploration (add noise)
        with torch.no_grad():
            action = actor(torch.tensor(state, dtype=torch.float32)).numpy() + noise.sample()

        # Clip action to ensure it's within action bounds
        action = np.clip(action, env.action_space.low, env.action_space.high)

        # Step through environment
        next_state, reward, done, _ = env.step(action)

        # Store transition in replay buffer
        replay_buffer.append((state, action, reward, next_state, done))
        if len(replay_buffer) > replay_buffer_size:
            replay_buffer.pop(0)

        # Sample random minibatch from replay buffer
        if len(replay_buffer) >= batch_size:
            minibatch = random.sample(replay_buffer, batch_size)
            states, actions, rewards, next_states, dones = zip(*minibatch)
            states = torch.tensor(states, dtype=torch.float32)
            actions = torch.tensor(actions, dtype=torch.float32)
            rewards = torch.tensor(rewards, dtype=torch.float32)
            next_states = torch.tensor(next_states, dtype=torch.float32)
            dones = torch.tensor(dones, dtype=torch.float32)

            # Compute Q targets
            with torch.no_grad():
                next_actions = target_actor(next_states)
                q_targets = rewards.view(-1, 1) + gamma * (1 - dones.view(-1, 1)) * target_critic(next_states, next_actions)

            # Update critic
            q_values = critic(states, actions)
            critic_loss = nn.MSELoss()(q_values, q_targets)
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

            # Update actor
            predicted_actions = actor(states)
            actor_loss = -critic(states, predicted_actions).mean()
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            # Update target networks
            for target_param, param in zip(target_actor.parameters(), actor.parameters()):
                target_param.data.copy_(param.data * 0.001 + target_param.data * (1.0 - 0.001))

            for target_param, param in zip(target_critic.parameters(), critic.parameters()):
                target_param.data.copy_(param.data * 0.001 + target_param.data * (1.0 - 0.001))

        episode_reward += reward
        state = next_state

        if done:
            print("Episode: {}, Reward: {:.2f}".format(episode, episode_reward))
            break

env.close()


  states = torch.tensor(states, dtype=torch.float32)


Episode: 0, Reward: -1149.30
Episode: 1, Reward: -1195.62
Episode: 2, Reward: -1308.27
Episode: 3, Reward: -1245.50
Episode: 4, Reward: -1061.01
Episode: 5, Reward: -1456.54
Episode: 6, Reward: -1172.99
Episode: 7, Reward: -1363.89
Episode: 8, Reward: -1496.50
Episode: 9, Reward: -1685.64
Episode: 10, Reward: -1417.73
Episode: 11, Reward: -1507.81
Episode: 12, Reward: -1486.66
Episode: 13, Reward: -1768.95
Episode: 14, Reward: -1736.23
Episode: 15, Reward: -1550.42
Episode: 16, Reward: -1738.48
Episode: 17, Reward: -1238.34
Episode: 18, Reward: -1583.64
Episode: 19, Reward: -1731.01
Episode: 20, Reward: -1604.64
Episode: 21, Reward: -1758.75
Episode: 22, Reward: -1688.66
Episode: 23, Reward: -1327.49
Episode: 24, Reward: -1356.89
Episode: 25, Reward: -1372.26
Episode: 26, Reward: -1370.86
Episode: 27, Reward: -1488.33
Episode: 28, Reward: -1264.92
Episode: 29, Reward: -1223.73
Episode: 30, Reward: -1394.56
Episode: 31, Reward: -1470.93
Episode: 32, Reward: -1280.16
Episode: 33, Reward:

KeyboardInterrupt: 

: 