!pip install minerl gym[all]

In [2]:
import random
from collections import deque

import gym
import minerl
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
# Select the MineRL task to train on (here: MineRLNavigate-v0) and build its
# gym environment. The id is kept in `env_name` so it can be reused when an
# environment has to be created again.
env_name = "MineRLNavigate-v0"
env = gym.make(env_name)

In [None]:
# Define a simple neural network for the agent
class Policy(nn.Module):
    """Small fully connected network used as the agent.

    The network is a stack of three linear layers
    (``state_dim -> 128 -> 64 -> action_dim``) with ReLU activations after
    the first two layers and a ``tanh`` on the output, so every output
    component lies in ``[-1, 1]``.

    Args:
        state_dim: Number of input features per state vector.
        action_dim: Number of output components.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        # Layer attribute names are kept as fc1/fc2/fc3 so that saved
        # state_dict keys remain the same.
        self.fc1 = nn.Linear(in_features=state_dim, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=action_dim)

    def forward(self, x):
        """Map a batch of state vectors to tanh-squashed outputs."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden).tanh()

In [None]:
# Hyperparameters

# Network input/output sizes, read from the first axis of each space's shape.
# NOTE(review): this assumes both spaces are flat, array-like spaces with a
# tuple `.shape`. MineRL tasks appear to expose dictionary-structured
# observation/action spaces, for which `.shape[0]` would not work -- confirm
# against the environment before running.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# Optimisation settings
learning_rate = 1e-3  # step size passed to the optimizer
gamma = 0.99          # discount factor applied to the next-state value

# Replay buffer and training-length settings
buffer_size = 10_000  # maximum number of stored transitions
batch_size = 64       # transitions sampled per update
episodes = 100        # number of training episodes

In [None]:
# Build the policy network and the Adam optimizer that updates its weights
agent = Policy(state_dim=state_dim, action_dim=action_dim)
optimizer = optim.Adam(params=agent.parameters(), lr=learning_rate)

# Experience replay: a bounded FIFO of transitions. Once `buffer_size`
# entries are stored, appending a new one discards the oldest.
replay_buffer = deque([], maxlen=buffer_size)

In [None]:
# Training loop
#
# Each step: pick an action (epsilon-greedy), step the environment, store the
# transition, then -- once the buffer holds at least `batch_size` transitions --
# sample a minibatch and take one optimizer step on a TD-style MSE loss.
#
# NOTE(review): the same network output is used both as the action handed to
# `env.step` and as the "Q-values" in the loss, and the loss pulls every
# output component toward one scalar target per sample, without using the
# action actually taken. That is preserved from the original; confirm it is
# the intended algorithm.
# NOTE(review): `torch.FloatTensor(state)` assumes observations are plain
# numeric arrays -- confirm against the environment's observation space.

# Exploration rate for the epsilon-greedy policy. It is constant, so it is set
# once here instead of on every step. Adjust for the exploration-exploitation
# trade-off.
epsilon = 0.05
loss_fn = nn.MSELoss()

for episode in range(episodes):
    state = env.reset()
    episode_reward = 0

    while True:
        # Select action (epsilon-greedy): random with probability epsilon,
        # otherwise the network's output for the current state.
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            # Pure inference: no autograd graph is needed for acting.
            with torch.no_grad():
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                action = agent(state_tensor).numpy()[0]

        # Execute action and observe reward and next state
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward

        # Store transition in replay buffer
        replay_buffer.append((state, action, reward, next_state, done))

        # Learn from a random minibatch once enough transitions are stored
        if len(replay_buffer) >= batch_size:
            batch = random.sample(replay_buffer, batch_size)
            # The stored actions are not used by the update below, so they are
            # not converted to a tensor.
            states, _actions, rewards, next_states, dones = zip(*batch)

            # Stack with NumPy first: building a tensor directly from a tuple
            # of arrays goes through a slow element-by-element path.
            states = torch.as_tensor(np.array(states), dtype=torch.float32)
            next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32)
            rewards = torch.as_tensor(np.array(rewards), dtype=torch.float32)
            dones = torch.as_tensor(np.array(dones), dtype=torch.float32)

            # Compute TD target. It is a fixed regression target, so it is
            # computed without gradient tracking; otherwise the optimizer
            # would also move the target itself.
            with torch.no_grad():
                next_q_values = agent(next_states)
                # (1 - dones) removes the bootstrap term on terminal steps.
                q_target = rewards + gamma * next_q_values.max(dim=1)[0] * (1 - dones)

            q_values = agent(states)

            # Compute loss. The per-sample target is expanded across the
            # output dimension explicitly; this equals the implicit broadcast
            # of a (batch, 1) target against (batch, action_dim) predictions.
            loss = loss_fn(q_values, q_target.unsqueeze(1).expand_as(q_values))

            # Optimize the model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        state = next_state

        if done:
            print(f"Episode: {episode+1}/{episodes}, Reward: {episode_reward}")
            break

In [None]:
# Evaluate the trained agent
#
# Runs one episode in a freshly created environment, always taking the
# network's output as the action (no random exploration), and prints the
# summed reward.
#
# NOTE(review): `env` is rebound here without closing the environment it
# previously referred to -- confirm whether that instance should be closed
# first.
env = gym.make(env_name)
try:
    state = env.reset()
    total_reward = 0
    while True:
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action = agent(state_tensor).numpy()[0]
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        state = next_state
        if done:
            print(f"Evaluation Reward: {total_reward}")
            break
finally:
    # Close the environment even if the rollout raises, so it is not left
    # open after an error.
    env.close()