In [None]:
!pip install gymnasium




In [None]:
!pip install minigrid



In [None]:
!pip install git+https://github.com/openai/CLIP.git


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-w2xhxl7b
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-w2xhxl7b
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import gym
import minigrid

In [None]:
from minigrid.wrappers import *

  and should_run_async(code)


In [None]:
# Instantiate the registered environment "BabyAI-GoToRedBallGrey-v0" with RGB-array rendering.
# max_episode_steps=100 is forwarded to gym.make (presumably a per-episode step limit that
# surfaces as truncation from env.step — TODO confirm against the installed gym/gymnasium version).
env = gym.make("BabyAI-GoToRedBallGrey-v0", render_mode="rgb_array", max_episode_steps=100)


In [None]:
pip install torch torchvision transformers




In [None]:
from clip import clip


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Resize, ToTensor
from collections import namedtuple
from PIL import Image
import random

# Run on CUDA when torch reports an available GPU, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the ViT-B/16 CLIP checkpoint onto `device`; `transform` is the matching
# preprocessing callable returned by clip.load alongside the model
model, transform = clip.load("ViT-B/16", device=device)

# Fully connected Q-network: flattened state in, one value per action out
class DQN(nn.Module):
    """Three-layer perceptron (input -> 128 -> 64 -> output) with ReLU activations.

    Args:
        input_size: number of features per sample once the input is flattened.
        output_size: number of outputs produced per sample (one per action).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        hidden_layers = [
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
        ]
        output_layer = nn.Linear(64, output_size)
        # Everything lives in one Sequential under `self.model`, so parameter
        # names in the state_dict are `model.<index>.weight/bias`.
        self.model = nn.Sequential(*hidden_layers, output_layer)

    def forward(self, x):
        """Collapse every dimension after the first and return the network output.

        Args:
            x: tensor whose first dimension is the batch dimension.

        Returns:
            Tensor of shape (x.size(0), output_size).
        """
        batch_dim = x.size(0)
        flattened = x.view(batch_dim, -1)
        return self.model(flattened)

# Record type for one environment step kept in the replay buffer
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'done'))

# Fixed-capacity ring buffer of transitions with uniform random sampling
class ReplayBuffer:
    """Circular store of `Transition` records.

    Once `capacity` records are held, each new push overwrites the oldest one.

    Attributes:
        capacity: maximum number of transitions retained.
        memory: list holding the stored transitions (length <= capacity).
        position: index that the next push will write to.
    """

    def __init__(self, capacity):
        """Create an empty buffer.

        Raises:
            ValueError: if `capacity` is not a positive number; such a buffer
                could never accept a push.
        """
        if capacity <= 0:
            raise ValueError(f"capacity must be positive, got {capacity}")
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

    def push(self, *args):
        """Store one transition, overwriting the oldest entry when full.

        Args:
            *args: the five `Transition` fields in order
                (state, action, next_state, reward, done).

        Raises:
            TypeError: if the arguments do not match the `Transition` fields.
                The buffer is left untouched in that case.
        """
        # Build the record first so a malformed call cannot leave a None
        # placeholder behind in `memory`.
        transition = Transition(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return `batch_size` distinct transitions chosen uniformly at random.

        Raises:
            ValueError: (from random.sample) if `batch_size` exceeds the
                number of stored transitions.
        """
        return random.sample(self.memory, batch_size)

# Define a data transformation pipeline using torchvision
#transform = Compose([Resize((32, 32)), ToTensor()])

# Turn one raw observation dict into the three feature tensors the agent consumes
def preprocess_observation(observation):
    """Encode an observation's image and mission with CLIP and wrap its direction.

    Args:
        observation: mapping with the keys 'image' (array accepted by
            PIL.Image.fromarray), 'mission' (text) and 'direction' (number).

    Returns:
        Tuple (image_features, direction, text_features), all on `device`.
        `direction` is a float32 tensor of shape (1, 1); the two feature
        tensors are whatever `model.encode_image` / `model.encode_text`
        return for a batch of one (presumably (1, embed_dim) — confirm).
    """
    # Image branch: PIL image -> CLIP preprocessing -> add a leading batch dimension
    pil_image = Image.fromarray(observation['image'])
    image_batch = transform(pil_image).unsqueeze(0).to(device)

    # Text branch: tokenize the mission string as a batch of one
    mission_tokens = clip.tokenize([observation['mission']]).to(device)

    # CLIP is only used as a frozen feature extractor, so no gradients are tracked
    with torch.no_grad():
        image_features = model.encode_image(image_batch)
        text_features = model.encode_text(mission_tokens)

    # Agent heading as a (1, 1) float tensor so it can be concatenated with the features
    direction = torch.tensor([[observation['direction']]], dtype=torch.float32)

    return image_features.to(device), direction.to(device), text_features.to(device)

# Epsilon-greedy action selection using the online Q-network
def select_action(state):
    """Pick an action for `state` with an epsilon-greedy policy.

    With probability `epsilon` (module-level) a random action is drawn from
    `env.action_space`; otherwise the action with the largest Q-value from
    `dqn` is returned.

    Args:
        state: tensor accepted by `dqn`. The arg-max is taken over the whole
            output, so this is expected to be a batch of one.

    Returns:
        The chosen action (a sample from the action space, or a Python int
        index on the greedy branch).
    """
    if random.random() < epsilon:
        return env.action_space.sample()
    # Inference only: skip autograd bookkeeping, the result is just arg-maxed.
    with torch.no_grad():
        q_values = dqn(state)
    return q_values.argmax().item()

# --- Network dimensions ---
# 1025 is the width of the concatenated state vector built in the training loop
# (presumably 512 image features + 1 direction + 512 text features — confirm)
input_size = 1025
output_size = env.action_space.n  # one Q-value per available action

# --- Online and target Q-networks (the target starts as an exact copy) ---
dqn = DQN(input_size, output_size).to(device)
target_dqn = DQN(input_size, output_size).to(device)
target_dqn.load_state_dict(dqn.state_dict())

# --- Exploration schedule ---
epsilon = 1.0        # starting probability of taking a random action
epsilon_decay = 0.9  # multiplicative decay applied once per episode
min_epsilon = 0.1    # floor for the exploration probability

# --- Learning settings ---
gamma = 0.99                 # discount factor
lr = 0.001                   # Adam learning rate
batch_size = 32              # transitions per gradient step
memory_size = 10000          # replay buffer capacity
target_update_interval = 10  # episodes between target-network syncs

# --- Replay buffer, loss and optimizer ---
memory = ReplayBuffer(memory_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(dqn.parameters(), lr=lr)

# --- Training length ---
num_episodes = 100


In [None]:
def _build_state(obs):
    """Encode one observation dict into a single float32 state row for the Q-network.

    Concatenates the image features, direction and mission features returned by
    `preprocess_observation` along dim 1.
    """
    image_features, direction, text_features = preprocess_observation(obs)
    return torch.cat((image_features, direction, text_features), dim=1).float()


# DQN training: act epsilon-greedily, store transitions, and fit the online
# network against bootstrap targets from the periodically synced target network.
for episode in range(num_episodes):
    # env.reset() returns (observation, info); only the observation is needed
    obs, _ = env.reset()
    state = _build_state(obs)
    total_reward = 0
    done = False

    while not done:
        # Select an action using an epsilon-greedy policy
        action = select_action(state.to(device))

        # env.step returns (obs, reward, terminated, truncated, info).
        # The episode must stop on either flag; otherwise it keeps running
        # past the step limit.
        next_observation, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        next_state = _build_state(next_observation)

        # Store `terminated` (not `done`): a time-limit truncation is not a
        # terminal state, so its target should still bootstrap.
        memory.push(state, action, next_state, reward, terminated)

        # If there are enough samples in the replay buffer, perform a DQN update
        if len(memory.memory) >= batch_size:
            transitions = memory.sample(batch_size)

            # Each stored state is a (1, N) row, so concatenation yields (batch, N)
            state_batch = torch.cat([t.state for t in transitions]).to(device)
            next_state_batch = torch.cat([t.next_state for t in transitions]).to(device)
            action_batch = torch.tensor(
                [t.action for t in transitions], dtype=torch.long, device=device
            )
            reward_batch = torch.tensor(
                [t.reward for t in transitions], dtype=torch.float32, device=device
            )
            done_batch = torch.tensor(
                [t.done for t in transitions], dtype=torch.float32, device=device
            )

            # Q(s, a) of the actions actually taken, from the online network
            q_values_current_state_action_pairs = (
                dqn(state_batch).gather(1, action_batch.unsqueeze(1)).squeeze(1)
            )

            # max_a' Q_target(s', a') from the frozen target network; no
            # gradients should flow through the bootstrap target
            with torch.no_grad():
                q_values_next_state_max_actions = target_dqn(next_state_batch).max(1)[0]

            # Bellman target; the bootstrap term is zeroed for terminal transitions
            expected_q_values_current_state_action_pairs = (
                reward_batch + gamma * q_values_next_state_max_actions * (1 - done_batch)
            )

            # Compute the loss and update the online DQN
            loss = criterion(
                q_values_current_state_action_pairs,
                expected_q_values_current_state_action_pairs,
            )
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        state = next_state
        total_reward += reward

    # Sync the target DQN with the online DQN every N episodes
    if episode % target_update_interval == 0:
        target_dqn.load_state_dict(dqn.state_dict())

    # Decay the exploration rate
    epsilon = max(epsilon * epsilon_decay, min_epsilon)

    # Print the episode's total reward
    print(f"Episode {episode}, Total Reward: {total_reward}")

# Save the trained DQN model
torch.save(dqn.state_dict(), "dqn_model.pth")


Episode 0, Total Reward: -18.265625
Episode 1, Total Reward: -13.709375
Episode 2, Total Reward: -3.3171875
Episode 3, Total Reward: -2.3468750000000003
Episode 4, Total Reward: -2.1921875
Sampling rejected: unreachable object at (1, 1)
Episode 5, Total Reward: 0.0859375
Episode 6, Total Reward: -0.2796875000000001
Episode 7, Total Reward: -2.3609375
Episode 8, Total Reward: -8.0703125
Episode 9, Total Reward: -3.3734375000000005
Episode 10, Total Reward: -11.20625
Episode 11, Total Reward: -1.0953125
Episode 12, Total Reward: -0.15312499999999996
Episode 13, Total Reward: -0.37812500000000004
Episode 14, Total Reward: -0.18125000000000013
Sampling rejected: unreachable object at (6, 5)
Episode 15, Total Reward: -11.825000000000001
Episode 16, Total Reward: -2.3609375
Episode 17, Total Reward: 0.7609375
Episode 18, Total Reward: -4.1468750000000005
Episode 19, Total Reward: -13.765625
Episode 20, Total Reward: -1.4609375
Episode 21, Total Reward: 0.9578125
Episode 22, Total Reward: -0.