In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Create CartPole environment
env = gym.make('CartPole-v1')

# Define parameters
num_episodes = 1000
max_steps_per_episode = 500
learning_rate = 0.01
discount_rate = 0.99

# Define neural network architecture for policy
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x), dim=-1)
        return x

# Initialize policy network
policy_model = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)

# Define optimizer
optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

# Helper function to choose action based on policy probabilities
def choose_action(state):
    state = np.array(state) # Convert state to NumPy array
    state = torch.from_numpy(state).float().unsqueeze(0)  # Convert state to tensor and add batch dimension
    action_probs = policy_model(state).squeeze().detach().numpy()
    return np.random.choice(len(action_probs), p=action_probs)


# Function to compute discounted rewards
def compute_discounted_rewards(rewards):
    discounted_rewards = np.zeros_like(rewards, dtype=np.float32)
    running_add = 0
    for t in reversed(range(len(rewards))):
        running_add = running_add * discount_rate + rewards[t]
        discounted_rewards[t] = running_add
    return discounted_rewards

# REINFORCE algorithm
for episode in range(num_episodes):
    episode_states, episode_actions, episode_rewards = [], [], []
    state = env.reset()

    for step in range(max_steps_per_episode):
        action = choose_action(state)
        next_state, reward, done, _ = env.step(action)

        episode_states.append(state)
        episode_actions.append(action)
        episode_rewards.append(reward)

        state = next_state

        if done:
            break

    discounted_rewards = compute_discounted_rewards(episode_rewards)
    episode_states = np.array(episode_states)
    episode_actions = np.array(episode_actions)

    # Convert to PyTorch tensors
    episode_states = torch.tensor(episode_states, dtype=torch.float32)
    episode_actions = torch.tensor(episode_actions, dtype=torch.int64)
    discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)

    # Compute loss
    logits = policy_model(episode_states)
    chosen_action_logits = torch.gather(logits, 1, episode_actions.unsqueeze(1))
    loss = -torch.mean(torch.log(chosen_action_logits) * discounted_rewards)

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if episode % 100 == 0:
        total_reward = np.sum(episode_rewards)
        print(f"Episode: {episode}, Total Reward: {total_reward}")

print("Training finished.")


  import distutils.spawn


Episode: 0, Total Reward: 38.0
Episode: 100, Total Reward: 13.0
Episode: 200, Total Reward: 9.0
Episode: 300, Total Reward: 9.0
Episode: 400, Total Reward: 10.0
Episode: 500, Total Reward: 10.0
Episode: 600, Total Reward: 10.0
Episode: 700, Total Reward: 9.0
Episode: 800, Total Reward: 10.0
Episode: 900, Total Reward: 9.0
Training finished.
