<a href="https://colab.research.google.com/github/abyaadrafid/Representation_Learning_RL/blob/main/Learn_Env_From_Representation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1042]:
!apt install swig cmake libopenmpi-dev zlib1g-dev
!pip install stable-baselines[mpi]==2.10.0 box2d box2d-kengz

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libopenmpi-dev is already the newest version (2.1.1-8).
swig is already the newest version (3.0.12-1).
cmake is already the newest version (3.10.2-1ubuntu2.18.04.2).
zlib1g-dev is already the newest version (1:1.2.11.dfsg-0ubuntu2.1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1043]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1044]:
import gym
from gym.spaces import Discrete
import torch
from collections import deque, defaultdict, namedtuple
import random
import numpy as np
import torch.nn as nn
from torch.optim import Adam
from tqdm.notebook import tqdm

In [1045]:
MAX_EPISODES = 1000
MAX_EPISODE_LEN = 100
BATCH_SIZE = 15
EMBEDDING_SIZE = 16
SEED = 0
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [1046]:
env = gym.make('LunarLander-v2')
env.seed(0)
print(env.action_space)
print(env.observation_space)

Discrete(4)
Box(-inf, inf, (8,), float32)


In [1047]:
class DQN(nn.Module):
  def __init__(self, state_size, fc1_size, fc2_size, action_size, seed):
    super(DQN, self).__init__()
    self.seed = torch.manual_seed(seed)
    self.layers = nn.Sequential(
        nn.Linear(state_size, fc1_size),
        nn.ReLU(),
        nn.Linear(fc1_size, fc2_size),
        nn.ReLU(),
        nn.Linear(fc2_size, action_size)
    )
      
  def forward(self, x):
    return self.layers(x)    

In [1048]:
class RandomAgent() :
  def __init__(self, seed : int, action_space : Discrete) :
    self.seed = seed
    self.action_space = action_space

  def act(self, observation = None) :
    return self.action_space.sample()

In [1049]:
class TrainedAgent(nn.Module):
  def __init__(self, path, state_size = env.observation_space.shape[0], fc1_size = 128, fc2_size = 256, action_size = env.action_space.n):
    super(TrainedAgent, self).__init__()
    self.network = DQN(state_size, fc1_size, fc2_size, action_size , 0)
    self._load_weights(path)
  
  def _load_weights(self, path):
    if torch.cuda.is_available() :
      self.network.load_state_dict(torch.load(path))
    else :
      self.network.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
  
  def act(self, state):
    state = torch.tensor(state)
    return np.argmax(self.network(state).cpu().data.numpy())

In [1050]:
collector_config = {
    "seed" : 0,
    "env" : env,
    "agent" : "trained",
    "agent_weights_path" : "/content/drive/MyDrive/dqn_weights.pt",
    "max_episodes" : MAX_EPISODES,
    "max_episode_len" : MAX_EPISODE_LEN,
    "action_space" : env.action_space,
}

In [1051]:
class ExperienceCollector():
  def __init__(self, config : dict):
    self.seed = config.get("seed",0)
    self.env = config.get("env")
    self.agent_type = config.get("agent", "random")
    self.max_episode_len = config.get("max_episode_len", 300)
    self.max_episodes = config.get("max_episodes")
    self.action_space = config.get("action_space", Discrete(4))
    self.memory = deque(maxlen=self.max_episodes)
    self.agent_weights = config.get("agent_weights_path", None)

    self.agent = self._make_agent()
    self.current_episode = -1
  
  def _make_agent(self):
    if self.agent_type == "random" :
      return RandomAgent(self.seed, self.action_space)
    elif self.agent_type == "trained" :
      return TrainedAgent(self.agent_weights)

  def add_episode(self, episode):
    self.memory.append([])
    self.current_episode +=1 
     
    index = self.current_episode
    if self.current_episode >= self.max_episodes :
      self.current_episode -= 1
      index = np.random.randint(0, self.max_episodes)

    self.memory[index].append(episode)

  def sample(self):
    states = torch.zeros((MAX_EPISODE_LEN,env.observation_space.shape[0]))
    actions = torch.full((MAX_EPISODE_LEN, ), fill_value = -1, dtype = torch.float32)
    rewards = torch.zeros((MAX_EPISODE_LEN))
    next_states = torch.zeros((MAX_EPISODE_LEN,env.observation_space.shape[0]))
    dones = torch.ones((MAX_EPISODE_LEN))

    episode = random.sample(self.memory, k=1)
    episode = np.array(episode, dtype=object)
    if episode.size % 5 != 0 :
      return [None]*5
    episode = episode.reshape(-1,5)

    for index, step in enumerate(episode) :
      states[index] = torch.from_numpy(step[0])
      actions[index] = step[1]
      rewards[index] = step[2]
      next_states[index] = torch.from_numpy(step[3])
      dones[index] = step[4]

    return states.to(device), actions.to(device), rewards.to(device), next_states.to(device), dones.to(device)


  def collect(self, num_episodes : int = 0) :
    for _ in range(num_episodes) :
      current_episode = []
      episode_length = 0
      state = self.env.reset()
      done = False

      while not done :
        if episode_length >= self.max_episode_len : break
        action = self.agent.act(state)
        next_state, reward, done, _ = self.env.step(action)
        if done : done = 1 
        else : done = 0
        if state is not None and action is not None and reward is not None and next_state is not None and done is not None :
          current_episode.append([state, action, reward, next_state, done])
          episode_length +=1
        
        state = next_state
      self.add_episode(current_episode)

    print(f'{num_episodes} episodes added to memory')

In [1052]:
collector = ExperienceCollector(collector_config)
collector.collect(1000)

1000 episodes added to memory


In [1053]:
_ = collector.sample()

In [1054]:
class StateEncoder(nn.Module) :
  def __init__(self, state_size : int, embedding_size : int, fc1_size : int = 16):
    super(StateEncoder, self).__init__()
    self.state_enc = nn.Sequential(
        nn.Linear(state_size, fc1_size),
        nn.ReLU(),
        nn.Linear(fc1_size, embedding_size)
    )
  def forward(self, state):
    return self.state_enc(state)

In [1055]:
class ActionEncoder(nn.Module) :
  def __init__(self, embedding_size,fc1_size :int = 16) :
    super(ActionEncoder, self).__init__()
    self.action_enc = nn.Sequential(
      nn.Linear(1, fc1_size),
      nn.ReLU(),
      nn.Linear(fc1_size, embedding_size)
    )
  def forward(self, action) :
    return self.action_enc(action.unsqueeze(-1))


In [1056]:
class StateDecoder(nn.Module) :
  def __init__(self, state_size : int, embedding_size : int, fc1_size : int = 10):
    super(StateDecoder, self).__init__()
    self.state_dec = nn.Sequential(
        nn.Linear(embedding_size, fc1_size),
        nn.ReLU(),
        nn.Linear(fc1_size, state_size)
    )
  def forward(self, state):
    return self.state_dec(state)

In [1057]:
class RewardModel(nn.Module) :
  def __init__(self, embedding_size=16, fc1_size = 8):
    super(RewardModel, self).__init__()
    self.layers = nn.Sequential(
        nn.Linear(embedding_size*2, fc1_size),
        nn.ReLU(),
        nn.Linear(fc1_size, 1)
    )
  
  def forward(self, x):
    return self.layers(x)

In [1058]:
class DonePredictor(nn.Module) :
  def __init__(self, embedding_size=16, fc1_size = 8):
    super(DonePredictor, self).__init__()
    self.layers = nn.Sequential(
        nn.Linear(embedding_size, fc1_size),
        nn.ReLU(),
        nn.Linear(fc1_size, 2)
    )
  
  def forward(self, x):
    return self.layers(x)

In [1059]:
class WorldModel(nn.Module) :
  def __init__(self, embedding_size, hidden_size= 16) :
    super(WorldModel, self).__init__()
    self.hidden_size = hidden_size
    self.gru = nn.GRU(input_size = embedding_size*2, hidden_size = hidden_size)
    self.reward_model = RewardModel(embedding_size)
    self.done_model = DonePredictor(embedding_size)
    self.init_hidden()
    
  def init_hidden(self):
    self.hidden = torch.zeros(1, self.hidden_size).to(device)
  
  def forward(self, encoded_states, actions):
    inputs = torch.cat([encoded_states, actions], dim = -1)
    output, self.hidden = self.gru(inputs, self.hidden)
    rewards = self.reward_model(inputs)
    dones = self.done_model(output)

    return output, rewards.squeeze(-1), dones

In [1060]:
state_encoder = StateEncoder(env.observation_space.shape[0], EMBEDDING_SIZE).to(device)

In [1061]:
state_decoder = StateDecoder(env.observation_space.shape[0], EMBEDDING_SIZE).to(device)

In [1062]:
action_encoder = ActionEncoder(EMBEDDING_SIZE).to(device)

In [1063]:
wm = WorldModel(EMBEDDING_SIZE, hidden_size = 16).to(device)

In [1064]:
transition_loss = nn.MSELoss()
reward_loss = nn.MSELoss()
reconstruction_loss = nn.MSELoss()
done_loss = nn.CrossEntropyLoss()
optimizer = Adam(list(state_encoder.parameters()) + list(wm.parameters()) + list(action_encoder.parameters()) + list(state_decoder.parameters()))

In [1065]:
import time
import statistics

In [1066]:
t_losses, r_losses, rec_losses, losses = [], [], [], []

def learn_one_ep():
  state_encoder.zero_grad()
  action_encoder.zero_grad()
  wm.init_hidden()
  wm.zero_grad()
  states, actions, rewards, next_states, dones = collector.sample()

  if states is None : return 

  encoded_states = state_encoder(states)
  encoded_next_states = state_encoder(next_states)
  encoded_actions = action_encoder(actions)

  predicted_next_states,  predicted_rewards, dones = wm(encoded_states, encoded_actions)
  decoded_next_states = state_decoder(predicted_next_states)
  t_loss = transition_loss(predicted_next_states, encoded_next_states)
  r_loss = reward_loss(predicted_rewards, rewards)
  rec_loss = reconstruction_loss(decoded_next_states, next_states)
  loss = t_loss + r_loss +rec_loss 

  loss.backward(retain_graph=True)
  losses.append(loss.item())
  t_losses.append(t_loss.item())
  r_losses.append(r_loss.item())
  rec_losses.append(rec_loss.item())
  optimizer.step()

In [1067]:
for i in tqdm(range(10000)) :
  if i%1000 == 0:
    collector.collect(10)

  learn_one_ep()

  0%|          | 0/10000 [00:00<?, ?it/s]

10 episodes added to memory
10 episodes added to memory


IndexError: ignored

In [None]:
statistics.mean(losses)

In [None]:
agent = TrainedAgent('/content/drive/MyDrive/dqn_weights.pt').to(device)

In [None]:
def run_episode(state, agent, num_steps =300):
  done = False
  steps = 0
  rewards = 0
  for _ in tqdm(range(num_steps)) :
    action = agent.act(state)
    action = torch.tensor(action, dtype=torch.float32).unsqueeze(0).to(device)
    encoded_action = action_encoder(action)
    encoded_state = state_encoder(state)

    encoded_next_state, reward, done = wm(encoded_action, encoded_state)
    next_state = state_decoder(encoded_next_state)
    state = next_state.clone().detach()

    rewards += reward.detach().item()
    if bool(done.argmax().item()) : break
  
  return rewards


In [None]:
run_episode(torch.Tensor(env.reset()).reshape(1,-1).to(device), agent)